<a href="https://colab.research.google.com/github/rabbitmetrics/cx-analytics/blob/main/notebooks/cx-analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
# Extract data from files

df = pd.read_csv('input/soc_sample.csv', header=0)
df = df[df.iloc[:, 0].str.startswith('in the city of Halle')]


huggingface_embeddings_model = HuggingFaceEmbeddings()
df['embeddings_huggingface'] = df['text'].apply(lambda x: huggingface_embeddings_model.embed_query(x))

print(df.shape)
print(df.head)

  from tqdm.autonotebook import tqdm, trange


(1006, 2)
<bound method NDFrame.head of                                                     text  \
87953  in the city of Halle the number of females bor...   
87954  in the city of Halle the number of males born ...   
87955  in the city of Halle the number of females bor...   
87956  in the city of Halle the number of females bor...   
87957  in the city of Halle the number of females bor...   
...                                                  ...   
88954  in the city of Halle the number of females for...   
88955  in the city of Halle the number of females for...   
88956  in the city of Halle the number of females for...   
88957  in the city of Halle the number of males forei...   
88958  in the city of Halle the number of males forei...   

                                  embeddings_huggingface  
87953  [-0.01284471619874239, 0.026470182463526726, 0...  
87954  [-0.0030114194378256798, -0.001073996420018375...  
87955  [-0.013922133482992649, 0.028771555051207542, ...  
879

In [3]:
import chromadb
client = chromadb.Client()
collection_huggingface = client.create_collection(name="statbel_huggingface")

In [4]:
# Generate unique IDs for each entry
ids = [str(index) for index in df.index]  # Convert index to string for IDs

# Add embeddings and corresponding text to ChromaDB
collection_huggingface.add(
    ids=ids,  # Unique identifiers for each document
    documents=df['text'].tolist(), 
    embeddings=df['embeddings_huggingface'].tolist(),
    metadatas=[{"index": index} for index in df.index]  # Optional metadata
)


In [5]:
# Define your query
query_text = "how many married males of age 30 which are born in bel are there in Halle"

# Generate an embedding for the query
query_embedding_huggingface = huggingface_embeddings_model.embed_query(query_text)

# Perform a similarity search
results = collection_huggingface.query(
    query_embeddings=[query_embedding_huggingface],  
    n_results=1
)

results

{'ids': [['88082']],
 'embeddings': None,
 'documents': [['in the city of Halle the number of males born in bel married and age 28 is 29']],
 'uris': None,
 'data': None,
 'metadatas': [[{'index': 88082}]],
 'distances': [[0.18536950647830963]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [6]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Initialize DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to create embeddings
def create_embedding(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the mean of the last hidden state to get the sentence embedding
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Apply the embedding function to the 'text' column
df['embeddings_distilbert'] = df['text'].apply(lambda x: create_embedding(x))

print(df.shape)
print(df.head())

(1006, 3)
                                                    text  \
87953  in the city of Halle the number of females bor...   
87954  in the city of Halle the number of males born ...   
87955  in the city of Halle the number of females bor...   
87956  in the city of Halle the number of females bor...   
87957  in the city of Halle the number of females bor...   

                                  embeddings_huggingface  \
87953  [-0.01284471619874239, 0.026470182463526726, 0...   
87954  [-0.0030114194378256798, -0.001073996420018375...   
87955  [-0.013922133482992649, 0.028771555051207542, ...   
87956  [-0.005704917479306459, 0.019736476242542267, ...   
87957  [-0.004115410149097443, 0.037098776549100876, ...   

                                   embeddings_distilbert  
87953  [-0.13163061, -0.038754307, 0.14012174, -0.263...  
87954  [-0.1245866, 0.05417601, 0.19719774, -0.291879...  
87955  [-0.11988481, -0.041241653, 0.16018592, -0.264...  
87956  [-0.13078669, -0.02642730

In [7]:
# Add embeddings and corresponding text to ChromaDB

collection_distilbert = client.create_collection(name="statbel_distilbert")
collection_distilbert.add(
    ids=ids,  # Unique identifiers for each document
    documents=df['text'].tolist(),  # The text you want to store
    embeddings=df['embeddings_distilbert'].tolist(),  # The corresponding embeddings
    metadatas=[{"index": index} for index in df.index]  # Optional metadata
)

In [8]:
# Generate an embedding for the query
query_embedding_distilbert = create_embedding(query_text)

# Perform a similarity search
results = collection_distilbert.query(
    query_embeddings=[query_embedding_distilbert],  # The embedding of your query
    n_results=5  # Number of results to retrieve
)

results

{'ids': [['88710', '88727', '88664', '88647', '88726']],
 'embeddings': None,
 'documents': [['in the city of Halle the number of males born in bel not married and age 25 is 200',
   'in the city of Halle the number of males born in bel not married and age 30 is 188',
   'in the city of Halle the number of males born in bel not married and age 10 is 247',
   'in the city of Halle the number of males born in bel not married and age 9 is 240',
   'in the city of Halle the number of males born in bel not married and age 15 is 251']],
 'uris': None,
 'data': None,
 'metadatas': [[{'index': 88710},
   {'index': 88727},
   {'index': 88664},
   {'index': 88647},
   {'index': 88726}]],
 'distances': [[8.25822639465332,
   8.612930297851562,
   8.852355003356934,
   8.939790725708008,
   8.962467193603516]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}