# Chunking to support multi-turn chat

In [None]:
import chromadb
import pandas as pd
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
import argparse

from chromadb.utils import embedding_functions

from chromadb.api.types import (
    Document,
    Documents,
    Embedding,
    Image,
    Images,
    EmbeddingFunction,
    Embeddings,
    is_image,
    is_document,
)
from tqdm import tqdm

In [None]:
# !pip install langchain

In [None]:
# CSV of press releases; later cells read its 'Title', 'Content' and 'Date'
# columns.
data_path = "hpe_press_releases.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# df['Content'].iloc[0]

In [None]:
# Splitter used for the exploratory chunking below: chunk_size=1000 with an
# overlap of 10 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=10,
    chunk_size=1000,
)

In [None]:
# l = text_splitter.split_text(df['Content'].iloc[0])
# [(i,o) for i,o in zip(l,[df['Content'].iloc[0]])][0]
def chunk_text(text_splitter,text):
    """Split ``text`` into chunks with the given splitter.

    Args:
        text_splitter: Any object exposing ``split_text(text)``.
        text: The document text to split.

    Returns:
        Whatever ``text_splitter.split_text(text)`` returns (the chunks).
    """
    pieces = text_splitter.split_text(text)
    return pieces

# chunk_text(text_splitter,df['Content'].iloc[10])

In [None]:
# Re-read the press-release CSV and split every document into chunks.
df = pd.read_csv(data_path)
LEN = len(df)
chunks = []  # all chunks, in document order
inds = []    # inds[k] is the row position of the document that produced chunks[k]
for row_pos, content in enumerate(df['Content']):
    pieces = chunk_text(text_splitter, content)
    chunks.extend(pieces)
    inds.extend([row_pos] * len(pieces))

# Both lists must have the same length: one source index per chunk.
print(len(chunks),len(inds))

In [None]:
# Rebuild the persistent Chroma collection of press-release chunks from scratch.
settings = chromadb.get_settings()
settings.allow_reset = True  # needed for the db.reset() call below
path_to_db='/nvmefs1/test_user/cache/rag_db2/'
# model_path='/nvmefs1/andrew.mendez/chromadb_cache/all-MiniLM-L6-v2'
model_path='/nvmefs1/test_user/cache/vector_model/e5-base-v2'
print(f"creating/resetting db at {path_to_db}...")
db = chromadb.PersistentClient(path=path_to_db, settings=settings)
# Wipe any existing contents so the collection can be created fresh; report
# completion only after the reset has actually happened.
db.reset()
print("Done!")
# model_path=args.emb_model_path
print("Loading {}...".format(model_path))
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_path, device="cuda"
)
print("Done!")
#model_path='/mnt/efs/shared_fs/determined/all-MiniLM-L6-v2/'
#emb_fn.models['all-MiniLM-L6-v2'].save(model_path)
#print("Model saved at:{} ".format(model_path))
collection = db.create_collection(name="HPE_press_releases", embedding_function=emb_fn)

# Splitter used for the indexed chunks (larger than the exploratory one).
# NOTE(review): confirm the embedding model accepts inputs of this length.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=100)
def chunk_text(text_splitter,text):
    """Return the chunks produced by ``text_splitter.split_text(text)``."""
    return text_splitter.split_text(text)

# data_path = csv_path
df = pd.read_csv(data_path)
LEN=df.shape[0]
chunks = []  # all chunks, in document order
inds = []    # inds[k] is the row position of the document that produced chunks[k]
print("Number of docs: ",LEN)
for i, content in enumerate(tqdm(df['Content'])):
    ch = chunk_text(text_splitter, content)
    chunks.extend(ch)
    inds.extend([i] * len(ch))
print("Number of chunks: ",len(chunks))

# Add the chunks in batches instead of one call per chunk: each call embeds
# and writes its whole batch, so far fewer embedding passes and writes are
# needed. IDs and metadata are the same as with one-at-a-time insertion.
ADD_BATCH_SIZE = 64
for start in tqdm(range(0, len(chunks), ADD_BATCH_SIZE)):
    stop = min(start + ADD_BATCH_SIZE, len(chunks))
    # Look up each chunk's source row once rather than once per metadata field.
    rows = [df.iloc[inds[j]] for j in range(start, stop)]
    collection.add(
        documents=chunks[start:stop],
        metadatas=[
            {'Title': row['Title'], 'Content': row['Content'], 'Date': row['Date']}
            for row in rows
        ],
        ids=[f'id{j}' for j in range(start, stop)],
    )



In [None]:
# results

In [None]:
# Question used to retrieve supporting press-release chunks.
query = "How long has Antonio Neri been at HPE?"
# query = "Who is Antonio Neri?"
# query = "What is HPE Greenlake for Large Language Models?"

# Ask the collection for the top two matches for this single query text.
results = collection.query(
    n_results=2,
    query_texts=[query],
)
retrieved_documents = results['documents']
print("query: ",query, "results: ",retrieved_documents)

In [None]:
results

In [None]:
# Build the prompt from the first returned chunk only.
first_chunk = results['documents'][0][0]
# Cap the context at 4500 characters before placing it in the prompt.
results2 = "\n".join([first_chunk])[:4500]
print("len(results2): ",len(results2))
print("results2: ",results2)
prompt = f"[INST]`{results2}`. Using the above information, answer the following question factually: {query}. Answer concisely at most in three sentences. Respond in a natural way, like you are having a conversation with a friend.[/INST]"
# Show the final prompt between two banner lines.
for line_to_show in ("=========prompt=============: ", prompt, "=========end_of_prompt============="):
    print(line_to_show)

In [None]:
# results['documents']

In [None]:
# results.keys()

In [None]:
# [results['metadatas'][0][i]['Content'] for i in range(2)]

In [None]:
# results2 = "\n\n".join([results['metadatas'][0][0]['Content']])
# results2

In [None]:
# results['documents'][0][:2]

In [None]:
# Build the prompt from up to the first three returned chunks, joined by
# newlines and capped at 4500 characters.
leading_chunks = results['documents'][0][:3]
results2 = "\n".join(leading_chunks)[:4500]
print("len(results2): ",len(results2))
print("results2: ",results2)
prompt = f"[INST]`{results2}`. Using the above information, answer the following question factually: {query}. Answer concisely at most in three sentences. Respond in a natural way, like you are having a conversation with a friend.[/INST]"
# Show the final prompt between two banner lines.
for line_to_show in ("=========prompt=============: ", prompt, "=========end_of_prompt============="):
    print(line_to_show)

In [None]:
!ls /nvmefs1/test_user/cache/vector_model/e5-base-v2

In [None]:
results

In [None]:
# results
from datetime import datetime

In [None]:
# Re-rank the retrieved chunks so the most recently dated ones come first,
# then build the context string from the newest few.
date_strings = [meta['Date'] for meta in results['metadatas'][0]]
# Example of the expected input:
# date_strings = ['2017-11-21', '2018-03-19', '2022-01-28', '2023-06-20', '2022-04-27']
# Step 1: Parse strings into datetime objects
date_objects = [datetime.fromisoformat(date_str) for date_str in date_strings]

# Step 2: Format each date as 'YYYY-MM-DD' for display
formatted_dates = [dt.strftime('%Y-%m-%d') for dt in date_objects]

# Step 3: Sort newest-first while keeping track of original indices
sorted_dates_with_indices = sorted(enumerate(zip(date_objects, formatted_dates)),
                                   key=lambda x: x[1][0], reverse=True)

# Extract sorted dates and original indices
sorted_dates = [date_str for _, (_, date_str) in sorted_dates_with_indices]
original_indices = [index for index, _ in sorted_dates_with_indices]

# Print the result
print("Sorted Dates:", sorted_dates)
print("Original Indices:", original_indices)
# Keep at most the three newest chunks. Slicing (instead of indexing
# positions 0, 1 and 2 directly) avoids an IndexError when the query returned
# fewer than three results.
retrieved_docs = results["documents"][0]
results_x = [retrieved_docs[idx] for idx in original_indices[:3]]
# await show_sources(results)
# 2/6/24 (Andrew): Add limit to ensure that any press release does not exceed >14k.
# This assumes TitanML API deployed on A100.
# This will decrease when API is deployed on T4.
# NOTE(review): the cap applied below is 4500 characters, not 14k; confirm
# which limit is intended.
results2 = "\n".join(results_x)
results2 = results2[:4500]
print("len(results2): ",len(results2))
print("results2: ",results2)

In [None]:
# results["documents"][0][[0,1]]