<a href="https://colab.research.google.com/github/rabbitmetrics/cx-analytics/blob/main/notebooks/cx-analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import chromadb
import numpy as np
import pandas as pd
import torch

# Imports needed for similarity search improvement
from sklearn.preprocessing import normalize

# Imports needed for vector store with huggingface
from langchain_huggingface import HuggingFaceEmbeddings

# Imports needed for vector store with distilbert
from transformers import DistilBertTokenizer, DistilBertModel

In [2]:
# Import data: read the sample file and keep only the rows whose first column
# starts with the Halle prefix.
df = pd.read_csv('input_data/soc_sample.csv', header=0)
# na=False: a missing value in the first column counts as "no match" instead of
# putting NaN into the mask, which boolean indexing would reject.
# .copy(): the next cell adds embedding columns to df; working on an explicit
# copy avoids pandas' SettingWithCopyWarning on those assignments.
df = df[df.iloc[:, 0].str.startswith('in the city of Halle', na=False)].copy()

In [3]:
# Attach one embedding column per model to df

# Huggingface: embed every text with the default HuggingFaceEmbeddings model
huggingface_embeddings_model = HuggingFaceEmbeddings()
df['embeddings_huggingface'] = df['text'].apply(huggingface_embeddings_model.embed_query)

# Same vectors passed through sklearn's `normalize` (default norm), one row at
# a time, and stored back as plain Python lists
df['embeddings_huggingface_normalized'] = df['embeddings_huggingface'].apply(
    lambda vector: normalize([vector], axis=1)[0].tolist()
)

# DistilBERT: tokenizer and model used by the embedding function below
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to create embeddings
def create_embedding(text):
    """Return a DistilBERT sentence embedding for `text` as a NumPy array.

    The text is tokenized (with padding and truncation enabled) into PyTorch
    tensors, run through the module-level `model` without gradient tracking,
    and the last hidden state is averaged over dim 1 (presumably the token
    axis). Size-1 dimensions are squeezed away before conversion to NumPy.

    Note: no attention mask is applied to the average, so if a list of texts
    of different lengths is passed, padded positions take part in the mean.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    sentence_vector = hidden_states.mean(dim=1).squeeze()
    return sentence_vector.numpy()

# One DistilBERT sentence vector per row of the 'text' column
df['embeddings_distilbert'] = df['text'].apply(create_embedding)

# Quick sanity check: show the first row together with its embedding columns
first_row = df.head(1)
print(first_row)

                                                    text  \
87953  in the city of Halle the number of females bor...   

                                  embeddings_huggingface  \
87953  [-0.01284471619874239, 0.026470182463526726, 0...   

                       embeddings_huggingface_normalized  \
87953  [-0.012844716066090781, 0.026470182190160445, ...   

                                   embeddings_distilbert  
87953  [-0.13163061, -0.038754307, 0.14012174, -0.263...  


In [4]:
# Chroma client created with library defaults; the collections built in the
# following cell are all registered on this client.
# NOTE(review): with no arguments this is presumably a non-persistent
# (in-memory) client — confirm against the installed chromadb version.
client = chromadb.Client()

In [5]:
# Build one Chroma collection per embedding variant:
#   statbel_huggingface       - unnormalized huggingface embeddings
#   statbel_huggingface_norm  - normalized huggingface embeddings
#   statbel_distilbert        - unnormalized distilbert embeddings

ids = [str(index) for index in df.index]  # Convert index to string for IDs


def _build_collection(name, embedding_column):
    """Create (or reuse) the collection `name` and fill it from df.

    Every row of df is stored with its text as the document, the vector from
    `embedding_column` as the embedding and the original df index as metadata.

    `get_or_create_collection` + `upsert` keep the cell re-runnable in a live
    kernel: `create_collection` raises once the name already exists. Records
    whose ids are no longer in df are NOT removed by a re-run.

    Returns the populated collection.
    """
    collection = client.get_or_create_collection(name=name)
    collection.upsert(
        ids=ids,  # Unique identifiers for each document
        documents=df['text'].tolist(),  # The text to store
        embeddings=df[embedding_column].tolist(),  # The corresponding embeddings
        metadatas=[{"index": index} for index in df.index],  # Optional metadata
    )
    return collection


collection_huggingface = _build_collection("statbel_huggingface", 'embeddings_huggingface')
collection_huggingface_norm = _build_collection("statbel_huggingface_norm", 'embeddings_huggingface_normalized')
collection_distilbert = _build_collection("statbel_distilbert", 'embeddings_distilbert')



In [12]:
# Start asking some questions: run the same query against each collection and
# compare the nearest neighbours that come back.
N_RESULTS = 2  # Number of results to retrieve per collection
query_text = "Let's have a look at all males of age 30 that are married and born in bel. How many are there? Please make sure you get this right: their age has to be 30."

# Generate embeddings for the query, one per embedding variant
query_embedding_huggingface = huggingface_embeddings_model.embed_query(query_text)
query_embedding_huggingface_normalized = normalize(
    np.array(query_embedding_huggingface).reshape(1, -1), axis=1
).flatten()
query_embedding_distilbert = create_embedding(query_text)

# Perform a similarity search for unnormalized huggingface embedding
results_huggingface = collection_huggingface.query(
    query_embeddings=[query_embedding_huggingface],
    n_results=N_RESULTS,
)

# Perform a similarity search for normalized huggingface embedding.
# The normalized query must be matched against the normalized collection;
# querying `collection_huggingface` here would compare it with unnormalized
# stored vectors.
results_huggingface_norm = collection_huggingface_norm.query(
    query_embeddings=[query_embedding_huggingface_normalized],
    n_results=N_RESULTS,
)

# Perform a similarity search for distilbert embedding
results_distilbert = collection_distilbert.query(
    query_embeddings=[query_embedding_distilbert],  # The embedding of the query
    n_results=N_RESULTS,
)

print(results_huggingface)
print('---------')
print(results_huggingface_norm)
print('---------')
print(results_distilbert)

{'ids': [['88788', '88714']], 'embeddings': None, 'documents': [['in the city of Halle the number of males born in bel not married and age 65 is 30', 'in the city of Halle the number of males born in bel not married and age 64 is 29']], 'uris': None, 'data': None, 'metadatas': [[{'index': 88788}, {'index': 88714}]], 'distances': [[0.7273622751235962, 0.7450776100158691]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
---------
{'ids': [['88788', '88714']], 'embeddings': None, 'documents': [['in the city of Halle the number of males born in bel not married and age 65 is 30', 'in the city of Halle the number of males born in bel not married and age 64 is 29']], 'uris': None, 'data': None, 'metadatas': [[{'index': 88788}, {'index': 88714}]], 'distances': [[0.7273622751235962, 0.7450776100158691]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: