<a href="https://colab.research.google.com/github/rabbitmetrics/cx-analytics/blob/main/notebooks/cx-analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np

import chromadb

# Imports needed for vector store with huggingface
from langchain_huggingface import HuggingFaceEmbeddings

# Imports needed for vector store/model predictions with distilbert
from transformers import DistilBertTokenizer, DistilBertModel
import torch

In [2]:
# Pretrained DistilBERT tokenizer + encoder backing the DistilBERT embedding function.
# Both are loaded from the same checkpoint so the vocabulary matches the model.
DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
tokenizer_distilbert = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
model_distilbert = DistilBertModel.from_pretrained(DISTILBERT_CHECKPOINT)

def create_embedding_distilbert(text):
    """Embed text with DistilBERT by mean-pooling the last hidden state.

    The pooling is weighted by the tokenizer's attention mask, so padding
    positions (present when several texts of different length are passed at
    once) do not contribute to the average. For a single string nothing is
    padded and the result is the plain mean over all tokens.

    Parameters
    ----------
    text : str or list of str
        Input passed straight to ``tokenizer_distilbert``.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with singleton dimensions squeezed out: a 1-D
        vector for a single text, one row per text for a batch.
    """
    # Tokenize input text (inputs longer than the tokenizer's limit are truncated)
    inputs = tokenizer_distilbert(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model_distilbert(**inputs)

    hidden_states = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()

In [3]:
# Open the on-disk Chroma store located in the current working directory.
client = chromadb.PersistentClient(path=".")

# One collection per embedding model; each is created on first use and reused afterwards.
collection_hf, collection_distilbert = [
    client.get_or_create_collection(name=collection_name)
    for collection_name in ("statbel_hf", "statbel_distilbert")
]

In [4]:
# Build the vector stores "statbel_hf" and "statbel_distilbert" with
# unnormalized HuggingFace / DistilBERT embeddings.
chroma_db_path = "./chroma.sqlite3"  # location of the store; NOT used as the "already built" signal
hf_embeddings_model = HuggingFaceEmbeddings()

# The sqlite file is created by the PersistentClient itself when the store is
# opened, so on a fresh run it already exists before anything has been indexed.
# Checking for the file would therefore skip the build and leave the collections
# empty. Instead, ask each collection whether it actually contains documents.
collection_hf = client.get_or_create_collection(name="statbel_hf")
collection_distilbert = client.get_or_create_collection(name="statbel_distilbert")

hf_needs_build = collection_hf.count() == 0
distilbert_needs_build = collection_distilbert.count() == 0

if hf_needs_build or distilbert_needs_build:
    # Load the context data and keep only the rows about Halle.
    df = pd.read_csv('input_data/soc_sample.csv', header=0)
    # .copy() so the embedding columns added below are not written onto a slice
    df = df[df.iloc[:, 0].str.startswith('in the city of Halle')].copy()

    ids = [str(index) for index in df.index]  # Convert index to string for IDs
    documents = df['text'].tolist()  # The text stored alongside each embedding
    metadatas = [{"index": index} for index in df.index]  # Optional metadata

    if hf_needs_build:
        # Huggingface embeddings
        df['embeddings_hf'] = df['text'].apply(hf_embeddings_model.embed_query)
        collection_hf.add(
            ids=ids,
            documents=documents,
            embeddings=df['embeddings_hf'].tolist(),
            metadatas=metadatas,
        )

    if distilbert_needs_build:
        # DistilBERT embeddings
        df['embeddings_distilbert'] = df['text'].apply(create_embedding_distilbert)
        collection_distilbert.add(
            ids=ids,
            documents=documents,
            embeddings=df['embeddings_distilbert'].tolist(),
            metadatas=metadatas,
        )



In [5]:
# Start asking some questions ..
# A more specific variant to try instead:
#   "in the city of Halle how many males of age 65 which are born in bel and married?"
query = "in the city of Halle how many males of age 65 are there"


def get_context(query, model, n_results=10):
    """Retrieve the stored sentences closest to ``query`` and join them into one context string.

    Parameters
    ----------
    query : str
        The question to embed and search for.
    model : str
        Which embedding/collection pair to use: ``'huggingface'`` or ``'distilbert'``.
    n_results : int, optional
        Number of nearest documents to retrieve (default 10).

    Returns
    -------
    str
        The retrieved documents, one per line, in the order returned by the collection.
        The same text is also printed, prefixed with the model name.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # The query must be embedded with the same model that built the collection.
    if model == 'huggingface':
        query_embedding = hf_embeddings_model.embed_query(query)
        collection = collection_hf
    elif model == 'distilbert':
        query_embedding = create_embedding_distilbert(query)
        collection = collection_distilbert
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected 'huggingface' or 'distilbert'"
        )

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )

    # results['documents'] holds one list per query embedding; only one was sent.
    context = "\n".join(results['documents'][0])
    print(f'{model}:\n{context}\n')
    return context

# Retrieve (and print) the context for the same question from each vector store,
# HuggingFace first, then DistilBERT, so the two retrievals can be compared.
contexts = {
    backend: get_context(query, backend)
    for backend in ('huggingface', 'distilbert')
}
context_hf = contexts['huggingface']
context_distilbert = contexts['distilbert']

huggingface:
in the city of Halle the number of males born in bel unmarried and age 52 is 75
in the city of Halle the number of males born in bel unmarried and age 65 is 30
in the city of Halle the number of males born in bel unmarried and age 53 is 80
in the city of Halle the number of males born in bel unmarried and age 49 is 78
in the city of Halle the number of males born in bel unmarried and age 50 is 87
in the city of Halle the number of males born in bel unmarried and age 59 is 47
in the city of Halle the number of males born in bel unmarried and age 63 is 30
in the city of Halle the number of males born in bel unmarried and age 51 is 69
in the city of Halle the number of males born in bel unmarried and age 47 is 78
in the city of Halle the number of males born in bel unmarried and age 60 is 37

distilbert:
in the city of Halle the number of males born in bel unmarried and age 65 is 30
in the city of Halle the number of males born in bel unmarried and age 61 is 40
in the city of

In [6]:
from transformers import DistilBertForQuestionAnswering

# Extractive question answering: the model marks a span of the (question, context)
# token sequence as the answer.
inputs = tokenizer_distilbert(query, context_hf, return_tensors='pt', padding=True, truncation=True)
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

# Inference only: no gradients needed
with torch.no_grad():
    outputs = model(**inputs)

# Per-token scores for being the first / last token of the answer
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Picking start and end independently can put the end before the start, which
# yields an empty answer. Instead score every (start, end) pair and keep only
# pairs where end >= start. When the independent maxima already form a valid
# span, this selects exactly the same span.
span_scores = start_scores[0].unsqueeze(1) + end_scores[0].unsqueeze(0)
valid_spans = torch.triu(torch.ones_like(span_scores)).bool()
span_scores = span_scores.masked_fill(~valid_spans, float('-inf'))

# argmax works on the flattened matrix; recover (row, column) = (start, last token)
start_index, last_token_index = divmod(int(torch.argmax(span_scores)), span_scores.size(1))
end_index = last_token_index + 1  # +1 because the slice end is exclusive

# Decode the answer, dropping special tokens such as [CLS]/[SEP] if the span touches them
answer_tokens = inputs['input_ids'][0][start_index:end_index]
answer = tokenizer_distilbert.decode(answer_tokens, skip_special_tokens=True)

print("Answer:", answer)

Answer: 30
