DATA Cleaning

In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet("hf://datasets/dair-ai/emotion/unsplit/train-00000-of-00001.parquet")
df.head(10)
df = df.sample(frac=0.1)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def write_emotions_to_txt(df, output_file):
    # Open the output file in write mode
    with open(output_file, 'w') as f:
        # Group the dataframe by 'label' (emotion types)
        grouped = df.groupby('label')
        emotion_type=None
        
        # Loop through each story type and its corresponding stories
        for emotion_type_id, emotions in grouped:
            # Write the story type as a heading
            if emotion_type_id==0:
                emotion_type="sadness"
            if emotion_type_id==1:
                emotion_type="joy"
            if emotion_type_id==2:
                emotion_type="love"
            if emotion_type_id==3:
                emotion_type="anger"
            if emotion_type_id==4:
                emotion_type="fear"
            if emotion_type_id==5:
                emotion_type="surprise"
            f.write(f"{emotion_type}\n")
            
            # Loop through all stories under this story type and write them
            for emotion in emotions['text']:
                f.write(f"{emotion}\n")  # Add a new line after each story
            
            # Add a couple of blank lines between different story types
            f.write("\n\n")

    print(f"Emotions have been written to {output_file}")

In [4]:
write_emotions_to_txt(df, "emotions.txt")

Emotions have been written to emotions.txt


RAG

In [12]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_community.document_loaders import TextLoader #load the document
from langchain_text_splitters import RecursiveCharacterTextSplitter #for creating chunks from the loaded document
from langchain_openai import OpenAIEmbeddings #for converting chunks into embeddings
from langchain_chroma import Chroma #database for stroring the embeddings

In [13]:
from dotenv import load_dotenv
load_dotenv()

True

In [40]:
import os
dir = os.getcwd()
db_dir = os.path.join(dir,"chroma_db")
print(db_dir)

/Users/joanneliu/Desktop/AI4DM/H2M1/FlaskApp_Template/chroma_db


Create vector DB

In [41]:
#Read the text content from the .txt file and load it as langchain document
loader = TextLoader('emotions.txt')
document = loader.load()

In [42]:
#Split the document into chunks using text splitters 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(document)

print("Document chunk info:\n")
print(f"Number of document chunks: {len(chunks)}")
print(f"Sample chunk: \n{chunks[3].page_content}\n")

Document chunk info:

Number of document chunks: 5163
Sample chunk: 
i feel she is emotionally disturbed and like she said its either me or her

i just feel like i abused the time with the awesomeness that is my photo instructor

i will be off to work again leaving the kids at home my feeling of discontent is replaced with happiness for having this

i will feel weepy or anxious but it s manageable

i feel totally rejected boo hoo never mind

i am left feeling empty and confused

i feel unloved right now

i feel sort of lethargic

i feel like i m constantly apologizing for lame food pictures so here i go again

i get ready for bed that evening i am feeling really really discouraged and vow to do nothing but work on my cv the next day leave this silly writing behind buckle down and find real work

i cant help feeling totally rejected like im the only one no one cares enough to ask

i have been feeling very deprived

ive just been feeling ignored



In [43]:
#create embeddings using openAI embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

In [44]:
#store the embeddings and chunks into Chroma DB
Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_dir)

<langchain_chroma.vectorstores.Chroma at 0x3aa5d3970>

Retrieve and generate

In [45]:
#setting up the DB for retrieval
embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
vectorDB = Chroma(persist_directory=db_dir,embedding_function=embeddings_used)

In [46]:
#setting up Retriver
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [47]:
def getRetriever(dir):
    """
    dir is the directory of the vector DB
    """
    embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorDB = Chroma(persist_directory=dir,embedding_function=embeddings_used)
    retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    return retriever

In [51]:
def poemGeneration_langChain_RAG(msg,theme,retrieverDir):
    """
    msg is the scenario for the story from the pic (hugging face model output);
    type is the genre of the story- Horror, Fantasy, Adventure, Comedy, Mystery, Romance
    retriever is the vector DB with relevant stories from txt version of 
        stories dataset from Hugging face - https://huggingface.co/datasets/ShehryarAzhar/stories
    """
    llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0.2,
            max_tokens=200,
            timeout=None,
            max_retries=2
        )

    system_prompt = (
        "You are a expert poem writer about the theme of {theme}" 
        "Use the following pieces of retrieved context to generate a poem based on the given emotion and around the theme of {theme} "
        "keep the poem to less than 20 words."
        "\n\n"
        "{context}"
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{scenario_lang}"),
        ]
    )

    rag_chain = prompt | llm | StrOutputParser()

    retriever = getRetriever(retrieverDir)

    out_message = rag_chain.invoke({
            "theme" : theme,
            "context":retriever,
            "scenario_lang" : msg,
        })
    
    return out_message

In [52]:
emotion = "i am ever feeling nostalgic about the fireplace i will know that it is still on the property" #example output from huggingface model
story = poemGeneration_langChain_RAG("warm", emotion, db_dir)
print(story)

By the hearth's glow,  
Memories softly burn,  
In the heart's corner,  
The fire's warmth returns.
