Script Preprocessing Workflow:

1. Upload and Read the "Script" uploaded by the livestreamer.
2. Preprocess the Script into smaller chunks (either by paragraph or sentences)
3. Embed each chunk and store in ChromaDB (Vector Database) for RAG

In [1]:
# Read the script from an uploaded file
def read_script(file_path):
    with open(file_path, 'r') as file:
        script = file.read()
    return script

# Can look into other methods for splitting into chunks (this requires the user to split into paragraphs by 2 lines)
def split_script(script, split_by='paragraph'):
    if split_by == 'paragraph':
        # Split by double newline characters for paragraphs
        chunks = script.split('\n\n')
    elif split_by == 'sentence':
        # Split by periods for sentences
        chunks = script.split('. ')
    else:
        raise ValueError("Unsupported split_by value. Use 'paragraph' or 'sentence'.")
    return chunks

In [2]:
import ollama

# Use nomic-embed-text (local model) for embeddings
# print(ollama.embeddings(model='nomic-embed-text', prompt='The sky is blue because of rayleigh scattering'))

def get_embedding(text):
    res = ollama.embeddings(model='nomic-embed-text', prompt=text)
    return res['embedding']

import chromadb

# Initialize the ChromaDB client
chroma_client = chromadb.Client()

# Create a collection for the script
script_collection_name = "collection"
script_collection = chroma_client.get_or_create_collection(name=script_collection_name)

    
# Embed and store script chunks
def store_script_chunks(chunks):
    
    embeddings = [get_embedding(chunk) for chunk in chunks]
    documents = chunks
    ids = list(str(x) for x in range(len(chunks)))
    script_collection.add(
        documents=documents,
        embeddings=embeddings,
        ids=ids
    )

# Main function to process the script
def process_script(file_path):
    script = read_script(file_path)
    chunks = split_script(script, split_by='paragraph')  # You can also choose 'sentence'
    store_script_chunks(chunks)

# Example usage
file_path = "../data/scripts/example_script" 
process_script(file_path)


In [3]:
script_collection.get()

{'ids': ['0', '1', '2', '3', '4', '5', '6', '7'],
 'embeddings': None,
 'metadatas': [None, None, None, None, None, None, None, None],
 'documents': ['[Product 1: Casual Sneakers]\n"First up, we have our best-selling casual sneakers. These beauties come in sizes 6 through 12, with half sizes available. Priced at just $75, they offer incredible value. Let’s talk about some of their standout features:\nBreathable Mesh Upper: This material allows for maximum airflow, keeping your feet cool and comfortable all day long.\nMemory Foam Insoles: These insoles conform to the shape of your feet, providing personalized comfort with every step.\nVersatile Colors: Choose from classic black, crisp white, and navy blue to match any outfit.\nThese sneakers run true to size, so you can order your usual size with confidence."',
  '\n[Product 2: Running Shoes]\n"Next on our list are the top-rated running shoes, designed for both performance and comfort. Available in sizes 5 through 11, including half siz

In [15]:
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser, JsonOutputParser

local_llm = 'llama3'
llama3_chat = ChatOllama(model=local_llm, temperature=0)
llama3_chat_json = ChatOllama(model=local_llm, temperature=0, format='json')

In [None]:
prompt = ChatPromptTemplate.from_template(
    """
    You are a helpful AI Assistant chatbot that replies to user queries from a livestream.
    
    Based on the following:
    {document},
    answer the user's message.
    
    User Message: {user_message}
    
    Adopt a helpful tone and be concise in your answers.
    """
)
output_parser = StrOutputParser()
json_chain = prompt | llama3_chat_json | JsonOutputParser()

In [5]:
prompt = ChatPromptTemplate.from_template(
    """
    You are a helpful AI Assistant chatbot that replies to user queries from a livestream.
    
    Based on the following:
    {document},
    answer the user's message.
    
    User Message: {user_message}
    
    Adopt a helpful tone and be concise in your answers.
    """
)
output_parser = StrOutputParser()
chain = prompt | llama3_chat | output_parser

In [6]:
document = script_collection.get(ids=['6'])
document

{'ids': ['6'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['\n[Product 7: Slip-Ons]\n"For those who love convenience, our slip-ons are the perfect choice. Priced at $85, these come in sizes 6 through 12. Here are the details:\nSleek Design: Suitable for both casual and semi-formal occasions.\nCushioned Insole: Provides all-day comfort.\nVersatile Colors: Available in grey, navy, and black.\nThese slip-ons have a slightly snug fit, so if you’re between sizes, go up half a size."'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [7]:
document_text = document['documents'][0]
document_text

'\n[Product 7: Slip-Ons]\n"For those who love convenience, our slip-ons are the perfect choice. Priced at $85, these come in sizes 6 through 12. Here are the details:\nSleek Design: Suitable for both casual and semi-formal occasions.\nCushioned Insole: Provides all-day comfort.\nVersatile Colors: Available in grey, navy, and black.\nThese slip-ons have a slightly snug fit, so if you’re between sizes, go up half a size."'

In [8]:
message = "what colours slip ons"

chain.invoke({"document": document_text, "user_message": message})

'Hi there!\n\nAccording to our product details, the Slip-Ons are available in three versatile colors: grey, navy, and black.\n\nHope that helps! Let me know if you have any other questions.'

In [9]:
# retrieve specific parts from script to reply user's message
def get_relevant_document(message):
    res = script_collection.query(query_texts=[message], n_results=1)
    return res

# generate auto reply from retrieved document, if no retrieved documents, return None
def get_auto_reply(message):
    document = get_relevant_document(message)
    if document:
        reply = chain.invoke({"document": document, "user_message": message})
        return reply
    else:
        return None

In [14]:
script_collection.query(query_embeddings=[get_embedding("Boo")], n_results=1)

{'ids': [['1']],
 'distances': [[522.2816162109375]],
 'metadatas': [[None]],
 'embeddings': None,
 'documents': [['\n[Product 2: Running Shoes]\n"Next on our list are the top-rated running shoes, designed for both performance and comfort. Available in sizes 5 through 11, including half sizes, these are priced at $120. Let’s take a closer look at what makes these running shoes a favorite among our customers:\nLightweight Design: Made with advanced materials to reduce weight without sacrificing durability.\nDurable Rubber Outsole: Provides excellent traction on various surfaces, ensuring stability during your runs.\nCushioned Midsole: This feature absorbs impact, protecting your joints and reducing fatigue.\nThese running shoes come in neon green, electric blue, and classic black. They have a snug fit, so if you’re between sizes, it’s best to go for the larger one."']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}