# Chunking to support multi-turn chat

In [1]:
import chromadb
import pandas as pd
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
import argparse

from chromadb.utils import embedding_functions

from chromadb.api.types import (
    Document,
    Documents,
    Embedding,
    Image,
    Images,
    EmbeddingFunction,
    Embeddings,
    is_image,
    is_document,
)
from tqdm import tqdm

In [2]:
# !pip install langchain

In [3]:
# CSV of press releases; later cells read its 'Title', 'Content' and 'Date' columns.
data_path = 'hpe_press_releases.csv'
df = pd.read_csv(data_path)

In [4]:
# Peek at the raw text of the first press release.
df['Content'].iloc[0]

'IN THIS ARTICLE\n\nHPE and the UAE Cyber Security Council will cooperate to enhance cyber skills training in schools and universities\nThe initiative will encourage young Emiratis to study technology at university and help them to acquire skills for a career in the technology industry\n\n\nDubai, United Arab Emirates – Jan. 20, 2022 – Hewlett Packard Enterprise (NYSE: HPE) today announced that it has signed a Memorandum of Understanding (MOU) with the UAE Cyber Security Council to enhance cyber skills training in schools and prepare Emirati youth for careers in the technology industry. HPE and the Cyber Security Council will launch various initiatives nationwide and expect to work with more than 500 students a year.\n\r\n The MoU was signed by Mohammed Hamad Al-Kuwaiti, Head of Cyber Security for the Government of the UAE, and Ahmad Alkhallafi, managing director, UAE, Hewlett Packard Enterprise, at Intersec, the largest security exhibition in the Middle East region, currently underway

In [5]:
# Splitter for the first chunking pass below (chunk_size=1000, chunk_overlap=10).
# NOTE(review): the indexing cell further down rebinds `text_splitter` with
# chunk_size=8000 / chunk_overlap=100, so the two passes produce different chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)

In [6]:
# l = text_splitter.split_text(df['Content'].iloc[0])
# [(i,o) for i,o in zip(l,[df['Content'].iloc[0]])][0]
def chunk_text(text_splitter, text):
    """Split ``text`` into chunks with ``text_splitter.split_text``.

    Args:
        text_splitter: any object exposing ``split_text(str)``.
        text: the document text. ``None`` and NaN are treated as "no text";
            ``pd.read_csv`` produces NaN (a float) for empty cells.

    Returns:
        Whatever ``text_splitter.split_text(text)`` returns, or ``[]`` when
        ``text`` is missing.
    """
    # `text != text` is only true for NaN, which avoids importing math/pandas here.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return text_splitter.split_text(text)

# chunk_text(text_splitter,df['Content'].iloc[10])

In [7]:
# Re-read the press releases and split every article into chunks.
# `inds[k]` is the row number of the article that `chunks[k]` came from.
df = pd.read_csv(data_path)
LEN = len(df.index)
chunks, inds = [], []
for row_idx, content in enumerate(df['Content']):
    pieces = chunk_text(text_splitter, content)
    chunks.extend(pieces)
    inds.extend([row_idx] * len(pieces))

print(len(chunks),len(inds))

1751 1751


In [87]:
# --- Vector store -----------------------------------------------------------
# allow_reset must be enabled before the client is created so db.reset() is permitted.
settings = chromadb.get_settings()
settings.allow_reset = True
path_to_db='/nvmefs1/test_user/cache/rag_db2/'
model_path='/nvmefs1/test_user/cache/vector_model/e5-base-v2'

print(f"creating/resetting db at {path_to_db}...")
db = chromadb.PersistentClient(path=path_to_db, settings=settings)
# reset() clears the database, so every run of this cell starts from an empty store.
db.reset()
print("Done!")  # printed only once the reset has actually happened

# --- Embedding model --------------------------------------------------------
print("Loading {}...".format(model_path))
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_path, device="cuda"
)
print("Done!")
collection = db.create_collection(name="HPE_press_releases", embedding_function=emb_fn)

# --- Chunking ---------------------------------------------------------------
# Larger chunks than the exploratory pass above; `chunk_text` from the earlier
# cell is reused instead of being defined a second time here.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=100)

df = pd.read_csv(data_path)
LEN=df.shape[0]
chunks = []
inds = []  # inds[k] is the row number of the article chunks[k] came from
print("Number of docs: ",LEN)
for i, content in tqdm(enumerate(df['Content']), total=LEN):
    ch = chunk_text(text_splitter, content)
    chunks.extend(ch)
    inds.extend([i] * len(ch))
print("Number of chunks: ",len(chunks))

# --- Indexing ---------------------------------------------------------------
# One metadata record per article, built once rather than three df.iloc
# lookups per chunk.
doc_metadata = df[['Title', 'Content', 'Date']].to_dict('records')

# Add chunks in batches instead of one `add` call per chunk; presumably this
# lets the embedding function process several texts per call (verify on the
# target GPU). Ids stay 'id0', 'id1', ... exactly as before.
ADD_BATCH_SIZE = 64
with tqdm(total=len(chunks)) as progress:
    for start in range(0, len(chunks), ADD_BATCH_SIZE):
        stop = min(start + ADD_BATCH_SIZE, len(chunks))
        collection.add(
            documents=chunks[start:stop],
            # copy so chunks of the same article do not share one dict object
            metadatas=[dict(doc_metadata[doc_idx]) for doc_idx in inds[start:stop]],
            ids=[f'id{n}' for n in range(start, stop)],
        )
        progress.update(stop - start)



creating/resetting db at /nvmefs1/test_user/cache/rag_db2/...
Done!
Loading /nvmefs1/test_user/cache/vector_model/e5-base-v2...
Done!
Number of docs:  149


100%|██████████| 149/149 [00:00<00:00, 4909.86it/s]


Number of chunks:  421


100%|██████████| 421/421 [00:18<00:00, 22.95it/s]


In [152]:
# results

In [159]:
# Question to ask; swap in one of the commented alternatives to try other queries.
query = "How long has Antonio Neri been at HPE?"
# query = "Who is Antonio Neri?"
# query = "What is HPE Greenlake for Large Language Models?"

# Retrieve the two nearest chunks for the question and show their text.
# NOTE(review): n_results=2 here, but saved outputs further down list 15
# retrieved dates, so those outputs come from a different run of this cell.
# NOTE(review): E5 model cards recommend prefixing inputs with "query: " /
# "passage: " -- confirm whether this pipeline should do so.
results = collection.query(query_texts=[query], n_results=2)
print("query: ",query, "results: ",results['documents'])

query:  How long has Antonio Neri been at HPE? results:  [['HOUSTON, Texas – October 4, 2022 – Hewlett Packard Enterprise (NYSE: HPE) will host its Securities Analyst Meeting on October 19, 2022. Join the live webcast to hear Antonio Neri, president and CEO, and Tarek Robbiati, executive vice president and CFO, discuss HPE’s vision, strategy, and financial outlook.\xa0\n\nThe webcast will begin Wednesday, October 19 at 2:30 p.m. CT (3:30 p.m. ET) and will be available at www.hpe.com/investor/SAM2022. Following the executives’ presentations, HPE will hold a live Q&A session.\n\r\nA replay of the webcast will be available at the same website shortly after the call and will remain available for approximately one year. For additional information, see investors.hpe.com.\xa0\n\nAbout Hewlett Packard Enterprise\nHewlett Packard Enterprise (NYSE: HPE) is the global edge-to-cloud company that helps organizations accelerate outcomes by unlocking value from all of their data, everywhere. Built on

In [160]:
# Show the full query response (ids, distances, metadatas, ...).
results

{'ids': [['id195', 'id419']],
 'distances': [[0.2937813699245453, 0.3080323083482148]],
 'metadatas': [[{'Content': 'HOUSTON, Texas – October 4, 2022 – Hewlett Packard Enterprise (NYSE: HPE) will host its Securities Analyst Meeting on October 19, 2022. Join the live webcast to hear Antonio Neri, president and CEO, and Tarek Robbiati, executive vice president and CFO, discuss HPE’s vision, strategy, and financial outlook.\xa0\n\nThe webcast will begin Wednesday, October 19 at 2:30 p.m. CT (3:30 p.m. ET) and will be available at www.hpe.com/investor/SAM2022. Following the executives’ presentations, HPE will hold a live Q&A session.\n\r\nA replay of the webcast will be available at the same website shortly after the call and will remain available for approximately one year. For additional information, see investors.hpe.com.\xa0\n\nAbout Hewlett Packard Enterprise\nHewlett Packard Enterprise (NYSE: HPE) is the global edge-to-cloud company that helps organizations accelerate outcomes by unl

In [158]:
# Use only the first retrieved chunk as context, capped at 4500 characters,
# then wrap it and the question in the [INST] ... [/INST] template.
first_chunk = results['documents'][0][0]
results2 = first_chunk[:4500]
print("len(results2): ",len(results2))
print("results2: ",results2)
prompt = (
    f"[INST]`{results2}`. Using the above information, answer the following question factually: {query}. "
    "Answer concisely at most in three sentences. "
    "Respond in a natural way, like you are having a conversation with a friend.[/INST]"
)
print("=========prompt=============: ")
print(prompt)
print("=========end_of_prompt=============")

len(results2):  2331
results2:  Learning more about Bobby's career path, his advice for others, and what he does in his down time
Leading the charge to ensure HPE’s digital and physical assets are always protected is his primary job. Being passionate about football and a stickler for designer shoes is part of his alter-ego.

We had a chance to catch up with Bobby and get to know him a little better.  What is the best thing about working in IT security?

Information security, or cybersecurity, is a really cool field. It’s an industry that's been around for more than 30 years, and it's still at the beginning. It is fast-paced and continually evolving, so there is always more that we need to understand.  This is also why companies continue to invest heavily in IT security.

What advice would you have for someone who wants to get into IT security?

I would say understand how truly broad a topic it is and don't be intimidated by the technical aspects of it.  When you think of cybersecurity,

In [145]:
# results['documents']

In [146]:
# results.keys()

In [147]:
# [results['metadatas'][0][i]['Content'] for i in range(2)]

In [148]:
# results2 = "\n\n".join([results['metadatas'][0][0]['Content']])
# results2

In [149]:
# results['documents'][0][:2]

In [150]:
# results2 = "\n\n".join([results['metadatas'][0][0]['Content']])
# print("8500//3: ",8500//3)
# Stitch up to three retrieved chunks together (newline-separated), cap the
# context at 4500 characters, then fill the [INST] ... [/INST] template.
leading_chunks = results['documents'][0][:3]
results2 = "\n".join(leading_chunks)[:4500]
print("len(results2): ",len(results2))
print("results2: ",results2)
prompt = (
    f"[INST]`{results2}`. Using the above information, answer the following question factually: {query}. "
    "Answer concisely at most in three sentences. "
    "Respond in a natural way, like you are having a conversation with a friend.[/INST]"
)
print("=========prompt=============: ")
print(prompt)
print("=========end_of_prompt=============")

len(results2):  4500
results2:  Learning more about Bobby's career path, his advice for others, and what he does in his down time
Leading the charge to ensure HPE’s digital and physical assets are always protected is his primary job. Being passionate about football and a stickler for designer shoes is part of his alter-ego.

We had a chance to catch up with Bobby and get to know him a little better.  What is the best thing about working in IT security?

Information security, or cybersecurity, is a really cool field. It’s an industry that's been around for more than 30 years, and it's still at the beginning. It is fast-paced and continually evolving, so there is always more that we need to understand.  This is also why companies continue to invest heavily in IT security.

What advice would you have for someone who wants to get into IT security?

I would say understand how truly broad a topic it is and don't be intimidated by the technical aspects of it.  When you think of cybersecurity,

In [None]:
!ls /nvmefs1/test_user/cache/vector_model/e5-base-v2

In [125]:
# Show the current value of `results`.
# NOTE(review): the saved output of this cell is a plain list of strings, not
# the dict returned by collection.query -- it looks like leftover state from an
# earlier run; re-run to refresh.
results

['Reimagining your technology strategy for sustainable, responsible, and innovative AI.\xa0In a session co-led by Crusoe, leaders with Crusoe, Rescale, and Writer joined HPE to discuss how AI is currently transforming productivity and automating workflows across industries. As technology leaders, the panelists also underscored how the industry needs to align on energy-efficient solutions to minimize impact environmental impact and ensure responsible development and deployment of AI.\u2028\u2028A full list of AI House Davos sessions that HPE drove with leading experts worldwide can be found\xa0here.']

In [121]:
# results
from datetime import datetime

In [143]:
# Number of most-recent documents placed in the prompt context, and the
# character cap applied to the joined context.
MAX_CONTEXT_DOCS = 3
MAX_CONTEXT_CHARS = 4500

# 'Date' strings stored in the metadata of each retrieved chunk,
# e.g. ['2017-11-21', '2018-03-19', '2022-01-28'].
date_strings = [meta['Date'] for meta in results['metadatas'][0]]

# Step 1: parse the strings into datetime objects
date_objects = [datetime.fromisoformat(date_str) for date_str in date_strings]

# Step 2: render each date as YYYY-MM-DD for display
formatted_dates = [dt.strftime('%Y-%m-%d') for dt in date_objects]

# Step 3: sort newest-first while remembering each entry's position in `results`
sorted_dates_with_indices = sorted(enumerate(zip(date_objects, formatted_dates)),
                                   key=lambda x: x[1][0], reverse=True)

# Split the sorted pairs back into parallel lists
sorted_dates = [date_str for _, (_, date_str) in sorted_dates_with_indices]
original_indices = [index for index, _ in sorted_dates_with_indices]

print("Sorted Dates:", sorted_dates)
print("Original Indices:", original_indices)

# Keep the newest documents. Slicing (instead of indexing [0], [1], [2]) avoids
# an IndexError when the query returned fewer than MAX_CONTEXT_DOCS results.
retrieved_docs = results["documents"][0]
results_x = [retrieved_docs[idx] for idx in original_indices[:MAX_CONTEXT_DOCS]]

# 2/6/24 (Andrew): Add limit to ensure that any press release does not exceed >14k.
# This assumes TitanML API deployed on A100.
# This will decrease when API is deployed on T4.
# NOTE(review): the cap applied here is 4500 characters, while the note above
# says 14k -- confirm which limit (and which unit) is intended.
results2 = "\n".join(results_x)
results2 = results2[:MAX_CONTEXT_CHARS]
print("len(results2): ",len(results2))
print("results2: ",results2)

Sorted Dates: ['2024-01-24', '2024-01-23', '2024-01-23', '2024-01-23', '2023-06-06', '2023-06-06', '2023-02-22', '2022-12-07', '2022-10-19', '2022-10-04', '2022-08-30', '2022-06-22', '2022-06-22', '2022-05-18', '2022-03-01']
Original Indices: [14, 2, 3, 7, 6, 8, 0, 12, 5, 1, 10, 4, 11, 13, 9]
len(results2):  3692
results2:  Reimagining your technology strategy for sustainable, responsible, and innovative AI. In a session co-led by Crusoe, leaders with Crusoe, Rescale, and Writer joined HPE to discuss how AI is currently transforming productivity and automating workflows across industries. As technology leaders, the panelists also underscored how the industry needs to align on energy-efficient solutions to minimize impact environmental impact and ensure responsible development and deployment of AI.  A full list of AI House Davos sessions that HPE drove with leading experts worldwide can be found here.
HPE veteran of 28+ years brings proven leadership to accelerate Supercomputing and AI 

In [142]:
# results["documents"][0][[0,1]]