##Install Required Packages

In [63]:
!pip install chromadb langchain openai tiktoken --quiet

In [64]:
!pip install langchain-community --quiet

## Importing Dependencies

In [65]:
import pandas as pd
import tiktoken
import os
import openai

#from openai.embeddings_utils import get_embedding

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader

##Mounting Google Drive to Access Data

In [66]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DataSet provided at https://www.kaggle.com/datasets/fahadrehman07/movie-reviews-and-emotion-dataset by FAHAD REHMAN

In [74]:
movie = pd.read_csv('/content/drive/MyDrive/Movies_Reviews_modified_version1.csv')
movie

Unnamed: 0.1,Unnamed: 0,Ratings,Reviews,movie_name,Resenhas,genres,Description,emotion
0,0,3.0,"It had some laughs, but overall the motivation...",Waiting to Exhale,"Riu algumas risadas, mas no geral a motivação ...","['Comedy', 'Drama', 'Romance']","Based on Terry McMillan's novel, this film fol...",anticipation
1,1,4.0,"WAITING TO EXHALE Waiting, and waiting, and wa...",Waiting to Exhale,"ESPERANDO PARA EXALAR Esperando, e esperando, ...","['Comedy', 'Drama', 'Romance']","Based on Terry McMillan's novel, this film fol...",anticipation
2,2,4.0,"Angela Basset was good as expected, but Whitne...",Waiting to Exhale,"Angela Basset foi boa como o esperado, mas Whi...","['Comedy', 'Drama', 'Romance']","Based on Terry McMillan's novel, this film fol...",anticipation
3,3,5.0,"The movie is okay, mediocre might even be the ...",Waiting to Exhale,"O filme é bom, medíocre pode até ser a palavra...","['Comedy', 'Drama', 'Romance']","Based on Terry McMillan's novel, this film fol...",anticipation
4,4,5.0,I got an opportunity to see Waiting To Exhale ...,Waiting to Exhale,Tive a oportunidade de ver Waiting To Exhale p...,"['Comedy', 'Drama', 'Romance']","Based on Terry McMillan's novel, this film fol...",anticipation
...,...,...,...,...,...,...,...,...
46168,46168,10.0,10/10 buying this the second it's out online. ...,Robin Hood,10/10 comprando este no segundo que sai online...,"['Drama', 'Action', 'Romance']",Add a Plot,anticipation
46169,46169,4.0,"""Lady Jayne:Killer"" is a bottom of the barrel ...",Betrayal,"""Lady Jayne: Killer"" é uma parte inferior do f...","['Action', 'Drama', 'Thriller']",Felix and Misela are father and daughter and o...,sadness
46170,46170,4.0,"""Lady Jayne:Killer"" is a bottom of the barrel ...",Betrayal,"""Lady Jayne: Killer"" é uma parte inferior do f...","['Action', 'Drama', 'Thriller']",After being ditched by long term girlfriend Ge...,sadness
46171,46171,5.0,"As thrillers go, there are a few surprises her...",Betrayal,"Como os filmes de suspense, existem algumas su...","['Action', 'Drama', 'Thriller']",Felix and Misela are father and daughter and o...,sadness


## Group by movie_name and combine reviews

Removing unnecesary columns and grouping the reviews to reduce the total length of data to be processed

In [75]:
df = movie
# Group by movie_name and combine reviews
combined_df = df.groupby('movie_name').agg({
    'Description': 'first',  # Assuming the description is the same for all rows of the same movie
    'genres': 'first',  # Assuming genres are the same for all rows of the same movie
    'Reviews': 'first' # Combine all reviews
}).reset_index()

# Create the combined_info column
combined_df['combined_info'] = combined_df.apply(
    lambda row: f"Title: {row['movie_name']}. Overview: {row['Description']} Genres: {row['genres']} Reviews: {row['Reviews']}",
    axis=1
)

# Drop unnecessary columns if desired
combined_df = combined_df[['movie_name', 'combined_info']]

Drop unnecessary columns and clean the data

In [76]:
movie = combined_df
movie = movie.dropna()

Calculate the total length of all combined_info entries

In [78]:
res = 0;
for i in range(len(movie['combined_info'])):
  res += len(movie['combined_info'][i])
res

2361276

Save the cleaned and combined data to a CSV file

In [79]:
movie[['combined_info']].to_csv('movie_updated.csv', index=False)

Display the first few rows of the processed data

In [80]:
pd.read_csv('movie_updated.csv').head()

Unnamed: 0,combined_info
0,Title: +1. Overview: Three college friends hit...
1,Title: 10 Rules for Sleeping Around. Overview:...
2,Title: 10 Things I Hate About You. Overview: A...
3,Title: 100 Girls. Overview: Matt spends the ni...
4,Title: 11/11/11. Overview: Nicole has six mont...


#Setting up openAI API

In [81]:
api_key = 'API_KEY'

Increase CSV field size limit to handle large text data, Load data using CSVLoader, Split data into chunks for embedding and Set up OpenAI embeddings

In [82]:
import csv
import sys
import time

csv.field_size_limit(sys.maxsize)
#data loader
loader = CSVLoader(file_path="movie_updated.csv")
data = loader.load()

#data transformers
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

#embeddings model, this can be a local LLM as well
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

Sample text chunk

In [83]:
texts[1]

Document(metadata={'source': 'movie_updated.csv', 'row': 1}, page_content='combined_info: Title: 10 Rules for Sleeping Around. Overview: A screwball sex comedy following two couples and their 10 rules to a happy, healthy, open relationship. Genres: [\'Comedy\', \'Romance\'] Reviews: This film begins with an attractive young woman named "Cameron Johnson" (Virginia Williams) asking her good friend "Kate Oliver" (Tammin Sursok) for permission to use her summer house in the Hamptons for the weekend. When Kate asks her what her plans are Cameron reveals that she has a date she would like to spend some time with. This shocks Kate because she knows that Cameron is married. It\'s then that Cameron tells her that she and her husband "Vince Johnson" (Jesse Bradford) have an arraignment where they are allowed to sleep around with strangers as long as they follow 10 basic rules. As it so happens, at that exact same moment, Vince is also telling Kate\'s boyfriend "Ben Roberts" (Chris Marquette) abo

Set up batch size and sleep time. Initialize an empty list to store embeddings. Embed the text data in batches

In [84]:
# Batching and Rate Limiting
batch_size = 32  # Adjust as needed
sleep_time = 2  # Adjust as needed

# Initialize an empty list to store embeddings
embeddings_result = []

# Iterate over the texts in batches and embed them
for i in range(0, len(texts), batch_size):
    batch = texts[i: i + batch_size]
    # Ensure all elements in the batch are strings
    batch = [str(text) for text in batch]  # Convert non-string elements to strings
    embeddings_result.extend(embeddings.embed_documents(batch))  # Embed the batch
    time.sleep(sleep_time)  # Introduce a delay after each batch

Reinstall chromadb if needed for vector store operations

In [85]:
!pip install chromadb --quiet

Creating a Vector Store

Store the embedded texts in a Chroma vector store for efficient retrieval.

In [86]:
from langchain.vectorstores import Chroma
docsearch = Chroma.from_documents(
    texts,
    embeddings
)

Performing a Sample Query

Use the vector store to perform a similarity search and retrieve relevant documents based on a user query.


In [87]:
query = "I'm looking for an action movie. What could you suggest to me?"
docs = docsearch.similarity_search(query, k=1)
docs

[Document(metadata={'row': 1361, 'source': 'movie_updated.csv'}, page_content="combined_info: Title: The Quest. Overview: John Douglas, a high-society playboy, is a cynic concerning the women of his social set, and has a pictured ideal of the girl of his dreams. Wising to avoid the upcoming social season, he ...                See full summary\xa0» Genres: ['Action', 'Adventure'] Reviews: A thief is kidnapped by smugglers and then forced to fight in a competition, in which the winner takes home the golden dragon(Which is apparently worth millions, but looks like it was made out of brass) lots of fight sequences ensue. The Quest is proof that action stars need not direct. Jean-Claude Van Damme directs each actor in such a lackluster fashion that they manage to almost become invisible. Roger Moore and James Remar are truly wasted and even the fight sequences are poorly staged. If you really want to see something like this done well, check out Bloodsport and Kickboxer.* out of 4 (Bad)")]

# Setting Up OpenAI Chat Model

Configure the OpenAI GPT-3.5 Turbo model for querying and set up environment variables.


In [88]:
import os
os.environ['OPENAI_API_KEY'] = api_key

In [89]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

Constructing a QA Chain

Build a retrieval-based question-answering chain using the pre-loaded vector store and the chat model.


In [90]:
qa = RetrievalQA.from_chain_type(llm,
                                 chain_type="stuff",
                                 retriever=docsearch.as_retriever(),
                                 return_source_documents=True)

Sample query using the QA chain

In [97]:
query = "I'm looking for a romantic movie with a psitive review. What could you suggest to me?"
result = qa({"query": query})
result['result']

'Based on your preferences and age, you might enjoy the movie "La La Land." It is a romantic musical film that has received positive reviews for its captivating story, beautiful cinematography, and memorable music. Give it a try and see if it resonates with you!'

Display the source document for the query result

In [98]:
result['source_documents'][0]

Document(metadata={'row': 971, 'source': 'movie_updated.csv'}, page_content="combined_info: Title: Roman Holiday. Overview: A bored and sheltered princess escapes her guardians and falls in love with an American newsman in Rome. Genres: ['Comedy', 'Romance'] Reviews: I just don't get it how such movies get so many good reviews and even win Oscars! Now again, what's so good about this movie?! Don't bother answering, I know: Nothing! Boring story with no memorable lines and nothing about it is entertaining. I like Gregory Peck and Hepburn though but the movie just fails to do it for me.I think people just like fairy tales put for them in a realistic mold. I don't mind stories of the like but if were entertaining and can make me bear to the end.I have to say though that I can totally understand those movies were good in their own times being new ideas and when there was like one actor every 20 years but not nowadays, oh nah!")

# Custom Prompt for Movie Recommendations

Define a custom prompt template to provide more detailed movie recommendations.


In [93]:
from langchain.prompts import PromptTemplate

template = """You are a movie recommender system that help users to find anime that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three movies, with a short description of the plot and the reason why the user might like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}

llm=ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

qa = RetrievalQA.from_chain_type(llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for an action movie with animals, any suggestions?"
result = qa({'query':query})
print(result['result'])

1. Princess Mononoke: This anime film follows the story of a young warrior who gets involved in a struggle between the gods of a forest and the humans who consume its resources. The movie features action-packed scenes with various animal spirits and creatures, making it a thrilling watch for fans of action and animals.

2. Wolf Children: In this heartwarming anime movie, a young woman raises her half-wolf, half-human children after the death of their werewolf father. The film beautifully combines elements of action and drama, showcasing the unique bond between humans and animals in a touching and adventurous way.

3. The Boy and the Beast: This anime film tells the story of a young boy who enters the beast world and becomes the disciple of a bear-like warrior. The movie is filled with action-packed sequences, featuring various animal characters and a captivating storyline that explores the connection between humans and beasts.


Adding User Information to the Prompt

Incorporate user-specific information into the prompt to personalize movie recommendations.


In [94]:
from langchain.prompts import PromptTemplate

template_prefix = """You are a movie recommender system that help users to find movie that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix= """Question: {question}
Your response:"""

user_info = user_info.format(age = 25, gender = 'male')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a movie recommender system that help users to find movie that match their preferences. 
Use the following pieces of context to answer the question at the end. 
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 25
Gender: male
Question: {question}
Your response:


Set up the final prompt template

In [95]:
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for an action movie with cats, any suggestions?"
result = qa({'query':query})
print(result['result'])

Based on your preference for action movies with cats, I recommend checking out the movie "Unleashed." It involves a cosmic event turning a dog and cat into two perfect guys, which might be an interesting twist for you.


In [96]:
result['source_documents']

[Document(metadata={'row': 1486, 'source': 'movie_updated.csv'}, page_content="combined_info: Title: Unleashed. Overview: When a cosmic event turns Emma's dog and cat into two perfect guys, Emma reconsiders her outlook on dating, hilariously works out her trust issues, and ultimately learns to love herself. Genres: ['Action', 'Crime'] Reviews: This was a terrible movie. Some of the parts are so stupid & silly that I found myself laughing out loud. This movie has such an absolutely absurd plot that it is a waste of 8 dollars and 2 hours. Granted, Jet Li kicks some serious a$$, but it is nonetheless a terrible plot. The entire middle section of the movie contains almost no fighting - but instead, the movie tries to build up some stupid love drama. The movie starts out very promising - with some intense fight scenes, but then quickly goes downhill. Jet Li's acting is one for the record books - in some scenes of the movie - you may honestly think that this is a movie about mental retardati