In [None]:
import hashlib
import json
import os
import sqlite3
import time
from contextlib import closing
from typing import List, Dict

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [None]:
def generate_id_for_text(text: str) -> str:
    """Return the SHA-256 hex digest of ``text``.

    The same input string always yields the same 64-character hexadecimal
    ID, which lets callers use it as a stable key for that text.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()

In [None]:
def collate_json_data(data: List[Dict]) -> List[Dict]:
    """Group summary texts from parsed JSON items into one record per unique text.

    Each item is expected to carry a ``'summary'`` iterable of strings and an
    ``'intents'`` iterable of strings. Texts and intents are lower-cased; a text
    that occurs in several items yields a single record whose ``'intents'`` is
    the union of the intents of every item it appeared in.

    Args:
        data: Parsed JSON items.

    Returns:
        A list of ``{'id', 'text', 'intents'}`` dicts in first-seen order.
        ``'id'`` is the deterministic ID of the lower-cased text and
        ``'intents'`` contains no duplicates, in first-seen order.
    """
    collated_data = {}
    for item in data:
        summaries = item['summary']
        if not summaries:
            # Nothing to collate for this item; its intents are not needed.
            continue

        # Normalise once per item. dict.fromkeys removes duplicates while,
        # unlike set(), keeping a stable first-seen order.
        item_intents = list(dict.fromkeys(map(str.lower, item['intents'])))

        for text in summaries:
            text = text.lower()
            text_id = generate_id_for_text(text)
            record = collated_data.setdefault(
                text_id,
                {'id': text_id, 'text': text, 'intents': []},
            )
            # Append only intents this text does not have yet, so a text shared
            # by several items does not accumulate repeated intents.
            known_intents = record['intents']
            for intent in item_intents:
                if intent not in known_intents:
                    known_intents.append(intent)
    return list(collated_data.values())

In [None]:
def create_or_load_pinecone_index(
    index_name: str,
    embedding_dims: int,
    metric: str = "cosine",
    cloud: str = "aws",
    region: str = "us-east-1",
    poll_interval: float = 1,
) -> Pinecone.Index:
    """Return a handle to the Pinecone index ``index_name``, creating it if needed.

    The API key is read from the ``PINECONE_API_KEY`` environment variable.
    When the index does not exist, a serverless index is created and this
    function blocks until Pinecone reports it as ready.

    Args:
        index_name: Name of the index to open or create.
        embedding_dims: Vector dimension used when the index is created.
        metric: Similarity metric used when the index is created.
        cloud: Cloud provider for the serverless spec.
        region: Cloud region for the serverless spec.
        poll_interval: Seconds to sleep between readiness checks.

    Returns:
        The object returned by ``pc.Index(index_name)``.
    """
    pinecone_api_key = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=pinecone_api_key)

    if pc.has_index(index_name):
        print(f"Index {index_name} already exists")
        return pc.Index(index_name)

    pc.create_index(
        name=index_name,
        dimension=embedding_dims,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )
    # Creation is asynchronous: poll until the index reports ready before
    # handing it out.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(poll_interval)
    print(f"Index {index_name} is ready")
    return pc.Index(index_name)

In [None]:
def process_and_upsert_data(index: Pinecone.Index, collated_data: List[Dict], model: SentenceTransformer) -> None:
    """Embed each collated record and write it to the Pinecone index.

    Every record is first looked up by ID. If a vector with that ID already
    exists, its stored metadata is kept and its ``'intents'`` are merged with
    the record's intents; otherwise a new vector is written with the record's
    text and intents as metadata. Intents are de-duplicated in first-seen
    order in both cases.

    Errors for individual records are printed and do not stop the loop, but
    the final assertion fails if any record could not be written.

    Args:
        index: Index handle providing ``fetch`` and ``upsert``.
        collated_data: Records with ``'id'``, ``'text'`` and ``'intents'`` keys.
        model: Embedding model; ``model.encode(text).tolist()`` supplies the vector.

    Raises:
        AssertionError: If the number of written records differs from
            ``len(collated_data)``.
    """
    inserted_count = 0
    upserted_count = 0
    for record in collated_data:
        record_id = record.get('id')
        try:
            response = index.fetch([record_id])
            already_indexed = record_id in response.vectors
            if already_indexed:
                # Copy the stored metadata; tolerate it being empty or having
                # no 'intents' entry instead of failing the whole record.
                metadata = dict(response.vectors[record_id]['metadata'] or {})
                combined_intents = list(metadata.get('intents') or []) + list(record['intents'])
            else:
                metadata = {'text': record['text']}
                combined_intents = record['intents']
            # dict.fromkeys drops duplicates but, unlike set(), keeps a
            # deterministic first-seen order.
            metadata['intents'] = list(dict.fromkeys(combined_intents))

            index.upsert(vectors=[{
                'id': record_id,
                'values': model.encode(record['text']).tolist(),
                'metadata': metadata
            }])
            if already_indexed:
                upserted_count += 1
            else:
                inserted_count += 1
        except Exception as e:
            # Best effort per record: report and continue with the next one.
            print(f"Error processing record {record_id}: {e}")
    print(f"Inserted {inserted_count} records, Upserted {upserted_count} records")
    assert inserted_count + upserted_count == len(collated_data), "Inserted + upserted do not match total data"

In [None]:
def update_database_status(db_path: str, filename: str, status: str) -> None:
    """Set the ``status`` of the ``policy_processed`` row(s) matching ``filename``.

    Args:
        db_path: Path to the SQLite database file.
        filename: Value matched against the ``filename`` column.
        status: New value written to the ``status`` column.
    """
    # sqlite3's own context manager only commits or rolls back; it does not
    # close the connection. closing() guarantees the handle is released.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:  # commit on success, roll back if the UPDATE raises
            conn.execute(
                "UPDATE policy_processed SET status = ? WHERE filename = ?",
                (status, filename),
            )

In [None]:
def load_embedding_model(model_name: str) -> SentenceTransformer:
    """Instantiate the SentenceTransformer identified by ``model_name``.

    The ``HF_API_KEY`` environment variable is passed as the ``token``
    argument; ``None`` is passed when the variable is not set.
    """
    hf_token = os.environ.get('HF_API_KEY')
    return SentenceTransformer(model_name, token=hf_token)

In [None]:
def get_files_to_process(db_path: str) -> List[str]:
    """Return the filenames in ``policy_processed`` whose status is 'parsing done'.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        The ``filename`` value of every matching row.
    """
    # sqlite3's own context manager does not close the connection; closing()
    # releases it once the rows have been read.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            "SELECT filename FROM policy_processed WHERE status = ?",
            ('parsing done',),
        ).fetchall()
    return [row[0] for row in rows]

In [None]:
def main(
    input_dir,
    db_path,
    sleep_time=10,
    index_name="policy-info-index",
    model_name='sentence-transformers/all-mpnet-base-v2',
    embedding_dims=768,
):
    """Poll the database forever and index every file marked 'parsing done'.

    For each pending filename, the JSON file with the same base name is read
    from ``input_dir``, collated, embedded and upserted into the Pinecone
    index, after which the file's status is set to 'index updated'. Files
    whose JSON does not exist are skipped and remain pending, so they are
    retried on the next poll.

    This function never returns; interrupt it to stop.

    Args:
        input_dir: Directory containing the ``<name>.json`` files.
        db_path: Path to the SQLite database with the ``policy_processed`` table.
        sleep_time: Seconds to wait between polls.
        index_name: Name of the Pinecone index to write to.
        model_name: SentenceTransformer model used for embeddings.
        embedding_dims: Vector dimension used if the index has to be created.
    """
    # Created on first use and then reused. Rebuilding them for every file
    # reloads the embedding model and re-checks the index on each iteration.
    index = None
    model = None
    while True:
        for filename in get_files_to_process(db_path):
            json_file = os.path.splitext(filename)[0] + '.json'
            json_file_path = os.path.join(input_dir, json_file)
            if not os.path.exists(json_file_path):
                continue

            # JSON is read as UTF-8 explicitly rather than with the
            # platform's locale encoding.
            with open(json_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            collated_data = collate_json_data(data)
            if index is None:
                index = create_or_load_pinecone_index(index_name, embedding_dims)
            if model is None:
                model = load_embedding_model(model_name)
            process_and_upsert_data(index, collated_data, model)
            update_database_status(db_path, filename, 'index updated')
        time.sleep(sleep_time)

In [None]:
# Load variables from ../data/.env into the process environment.
# Other cells read PINECONE_API_KEY and HF_API_KEY from the environment;
# presumably this file provides them (confirm against your .env).
load_dotenv('../data/.env')

True

In [None]:
# File paths passed to main():
#   input_dir - directory searched for the <name>.json file of each pending document
#   db_path   - SQLite database containing the policy_processed table
input_dir = '../data/policy_docs'
db_path = '../data/chatbot.db'

In [None]:
# Start the indexing loop. main() polls in an endless `while True` loop and
# never returns, so interrupt the kernel to stop this cell.
main(input_dir, db_path)

Index policy-info-index is ready


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Inserted 27 records, Upserted 0 records


KeyboardInterrupt: 

## Query

In [None]:
# Open the same index that main() writes to (same name and 768 dimensions).
# The helper creates the index first if it does not exist yet.
index_name = "policy-info-index"
pc_index = create_or_load_pinecone_index(index_name, embedding_dims=768)

Index policy-info-index already exists


In [None]:
# View index stats (dimension, metric and vector counts per namespace).
pc_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 27}},
 'total_vector_count': 27,
 'vector_type': 'dense'}

In [None]:
# Example record in the {'id', 'text', 'intents'} shape produced by
# collate_json_data(); the same id is fetched from the index in the next cell.
{'id': '954c7b39a4948ec9978d166a3b35c88b42bed9a9525b9598953051edd4f74260',
  'text': 'follow the return process outlined above using the chatbot.',
  'intents': ['exchange', 'return']}

{'id': '954c7b39a4948ec9978d166a3b35c88b42bed9a9525b9598953051edd4f74260',
 'text': 'follow the return process outlined above using the chatbot.',
 'intents': ['exchange', 'return']}

In [None]:
# Fetch the stored vector for the example record by its ID.
response = pc_index.fetch(['954c7b39a4948ec9978d166a3b35c88b42bed9a9525b9598953051edd4f74260'])

In [None]:
# Inspect the metadata stored alongside that vector.
response.vectors['954c7b39a4948ec9978d166a3b35c88b42bed9a9525b9598953051edd4f74260']['metadata']

{'intents': ['return', 'exchange'],
 'text': 'follow the return process outlined above using the chatbot.'}

In [None]:
# Load the same model name that main() uses when indexing, so query
# embeddings are produced by the same model as the stored vectors.
embedding_model = load_embedding_model('sentence-transformers/all-mpnet-base-v2')

In [None]:
# Natural-language question used for the vector query and the reranking below.
query_text = 'what is the return process in the chatbot'

In [None]:
# Embed the question and retrieve up to 50 nearest vectors, restricted by a
# metadata filter to records whose 'intents' include 'return'.
# Metadata is returned (it holds the text); the raw vector values are not.
query_response = pc_index.query(
    vector=embedding_model.encode(query_text).tolist(),
    # Disabled alternative: query with an all-zero 768-dimensional vector.
    # vector=[0.]*768,
    top_k=50,
    include_metadata=True,
    include_values=False,
    filter={
        'intents': {
            '$in': ['return']
        }
    }
)

In [None]:
# Display the raw matches (id, metadata, score) returned by the query.
query_response

{'matches': [{'id': '02fc645b77e893c769ea67cbd7114215d7c8c5aa8c8432d6753f1b6e2bcc5242',
              'metadata': {'intents': ['return', 'refund'],
                           'text': 'the chatbot will generate a return '
                                   'authorization (ra) number.'},
              'score': 0.736194074,
              'values': []},
             {'id': '954c7b39a4948ec9978d166a3b35c88b42bed9a9525b9598953051edd4f74260',
              'metadata': {'intents': ['return', 'exchange'],
                           'text': 'follow the return process outlined above '
                                   'using the chatbot.'},
              'score': 0.68538785,
              'values': []},
             {'id': '5660656bed519ea84ccb8037366965ee8c9de4dd75364a479a6c13dba65af640',
              'metadata': {'intents': ['return', 'refund'],
                           'text': 'contact the chatbot with order details.'},
              'score': 0.513379,
              'values': []},
        

In [None]:
# Reranking
# Keep in mind to transform data for reranking
documents = [
    {"id": x["id"], "text": x["metadata"]["text"]}
    for x in query_response["matches"]
]

pc = Pinecone(os.getenv("PINECONE_API_KEY"))
reranked_documents = pc.inference.rerank(
    model="bge-reranker-v2-m3",
    query=query_text,
    documents=documents,
    top_n=10,
    return_documents=True,
)

In [None]:
# Display the reranked results (index into `documents`, score, document).
reranked_documents

RerankResult(
  model='bge-reranker-v2-m3',
  data=[{
    index=0,
    score=0.8221891,
    document={
        id='02fc645b77e893c769ea67cbd7114215d7c8c5aa8c8432d6753f1b6e2bcc5242',
        text='the chatbot will generate a return authorization (ra) number.'
    }
  },{
    index=1,
    score=0.8101787,
    document={
        id='954c7b39a4948ec9978d166a3b35c88b42bed9a9525b9598953051edd4f74260',
        text='follow the return process outlined above using the chatbot.'
    }
  },{
    index=2,
    score=0.007815889,
    document={
        id='5660656bed519ea84ccb8037366965ee8c9de4dd75364a479a6c13dba65af640',
        text='contact the chatbot with order details.'
    }
  },{
    index=4,
    score=0.0013831174,
    document={
        id='2d1bfaf5c7d248494b311ce557e189653f5bd3f4180ac8b78314121994a3dedd',
        text='there is a 30-day return or exchange policy.'
    }
  },{
    index=12,
    score=0.00090750254,
    document={
        id='688d963cb5dde5337af41ceaabd98f5485192c77d6f27e88