In [1]:
import os
from dotenv import load_dotenv
import nltk
from openai import OpenAI
import pandas as pd
import tiktoken
from typing import List

nltk.download('punkt')
nltk.download('stopwords')
load_dotenv()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iabdu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iabdu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Set the model name for our LLMs.
OPENAI_MODEL = "gpt-3.5-turbo"
EMBED_MODEL = "text-embedding-ada-002"
# Store the API key in a variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)
MAX_TOKENS = 1536

def prep(text: str):
    return text.replace("\n", " ").replace("\r", " ").replace("\t", " ")

def tokenize(text: List[str]):
    encoding = tiktoken.encoding_for_model(EMBED_MODEL)
    return encoding.encode(text)

def embed(tokens: List[int]):
    response = client.embeddings.create(input=tokens,model=EMBED_MODEL)
    return response.data[0].embedding

def chunk_text(text:str):
    current_chunk = []
    current_para = ""
    chunks = []
    paras = []
    current_len = 0
    sentences = nltk.sent_tokenize(text)
    chunks_of_tokens = []
    
    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize(sentence)
        sentence_token_len = len(sentence_tokens)
        
        # Check if adding the next sentence exceeds the max token limit
        if current_len + sentence_token_len > MAX_TOKENS:
            # Add the current chunk to the list and start a new one
            paras.append(current_para)
            current_para = ""
            chunks_of_tokens.append(current_chunk)
            embeddings = embed(current_chunk)
            chunks.append(embeddings)
            current_chunk = []
            current_len = 0
        
        # Add the sentence to the current chunk
        current_para += " " + sentence
        current_chunk.extend(sentence_tokens)
        current_len += sentence_token_len
    
    # Add the last chunk if it's not empty
    if current_chunk:
        paras.append(current_para)
        chunks_of_tokens.append(current_chunk)
        embeddings = embed(current_chunk)
        chunks.append(embeddings)

    return paras, chunks, chunks_of_tokens

def create_embeddings(filename: str):
    with open(filename, "r") as file:
        text = file.read()
    text = prep(text)
    return chunk_text(text)
    
def create_embeddings_prompt(prompt:str):
    prompt = prep(prompt)
    return chunk_text(prompt)

def vectorize_chunks(paras: List, chunks: List, **kwargs):
    vectors = []
    for i in range(len(chunks)):
        if "filename" in kwargs:
            vectors.append({"id": f"{i}", "values": chunks[i], "metadata": {"file": filename, "para": f"{paras[i]}"}})
        else:
            vectors.append({"id": f"{i}", "values": chunks[i], "metadata": {"para": f"{paras[i]}"}})
        
    return vectors

In [3]:
import time

vector_list = []
def vectorize_movie_list(row):
    paras, chunks, chunks_of_tokens  = create_embeddings_prompt(row["new_column"])
    vectors = vectorize_chunks(paras, chunks)
    return vectors


In [4]:
movies_df = pd.read_csv('Resources/rotten_tomatoes_movies.csv')
movies_df.columns
pd.set_option('display.max_columns', None)

In [5]:
columns_to_drop = ['rotten_tomatoes_link', 'authors', 'streaming_release_date', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
movies_df.drop(columns=columns_to_drop, inplace=True)

In [6]:
movies_df.dtypes

movie_title               object
movie_info                object
critics_consensus         object
content_rating            object
genres                    object
directors                 object
actors                    object
original_release_date     object
runtime                  float64
production_company        object
tomatometer_status        object
tomatometer_rating       float64
tomatometer_count        float64
audience_status           object
audience_rating          float64
audience_count           float64
dtype: object

In [7]:
movies_df.fillna("",inplace=True)
movies_df["new_column"] = movies_df["movie_title"] + movies_df["movie_info"] + movies_df["critics_consensus"] + movies_df["content_rating"] + movies_df["genres"] + movies_df["directors"] + movies_df["actors"] + movies_df["original_release_date"] + str(movies_df["runtime"]) + movies_df["production_company"] + str(movies_df["tomatometer_rating"]) + str(movies_df["tomatometer_count"]) + str(movies_df["audience_rating"]) + str(movies_df["audience_count"])

  movies_df.fillna("",inplace=True)


In [8]:
df_short = movies_df.copy()
df_short=df_short[["new_column"]]
df_short.head()

Unnamed: 0,new_column
0,Percy Jackson & the Olympians: The Lightning T...
1,Please GiveKate (Catherine Keener) and her hus...
2,"10A successful, middle-aged Hollywood songwrit..."
3,12 Angry Men (Twelve Angry Men)Following the c...
4,"20,000 Leagues Under The SeaIn 1866, Professor..."


In [9]:
movies_df["vectors"]=df_short.apply(vectorize_movie_list,axis=1)
movies_df.head()

In [10]:
movies_df.to_csv('Resources/movies_vectors.csv', index=False)

In [11]:
movies_vectors = pd.read_csv('Resources/movies_vectors.csv')
movies_vectors.head()

Unnamed: 0,movie_title,movie_info,critics_consensus,content_rating,genres,directors,actors,original_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,new_column,vectors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,Percy Jackson & the Olympians: The Lightning T...,"[{'id': '0', 'values': [0.0041449228301644325,..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,90.0,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,Please GiveKate (Catherine Keener) and her hus...,"[{'id': '0', 'values': [-0.006923696491867304,..."
2,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,122.0,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,"10A successful, middle-aged Hollywood songwrit...","[{'id': '0', 'values': [-0.01198294386267662, ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,95.0,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,12 Angry Men (Twelve Angry Men)Following the c...,"[{'id': '0', 'values': [0.00045957480324432254..."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,127.0,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,"20,000 Leagues Under The SeaIn 1866, Professor...","[{'id': '0', 'values': [0.020719904452562332, ..."


In [12]:
from pinecone import Pinecone

pc = Pinecone(api_key="db471cdc-9b70-4057-afc0-de7ff20c73d7")
index = pc.Index("new2")

In [13]:
import ast

# Extract vectors from the DataFrame
formatted_vectors = []

x = 0
vectors = movies_vectors["vectors"].tolist()
for vector in vectors:
    v1 = ast.literal_eval(vector[1:-1])

    if isinstance(v1, dict):
        v1["id"] = str(x)
        formatted_vectors.append(v1)
    elif isinstance(v1, tuple):
        for v in v1:
            v["id"] = str(x) + chr(65 + int(v["id"]))
            formatted_vectors.append(v)
    else:
        print(f"Error at Index: {x}. Not a dictionary or tuple.")
    x += 1
    

In [14]:
import itertools

def chunks(iterable, batch_size):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

# Upsert data with 10 vectors per upsert request
# x = 0
# for ids_vectors_chunk in chunks(formatted_vectors, batch_size=100):
#     index.upsert(vectors=ids_vectors_chunk)
#     print(x)
#     x+=1

In [15]:
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain

query_responses=[]

def ask_a_question(prompt):
    # convert the prompt to chunks of  embeddings
    paras, chunks, chunks_of_tokens  = create_embeddings_prompt(prompt)
    print(f"Embeddings: {chunks[0]}")
    # vectorize the embeddings
    prompt_vectors = vectorize_chunks(paras, chunks)
    print(f"Vectorized: {prompt_vectors[0]}")
    # search the index for the best match using semantic search
    query_response = index.query(
        top_k=1,
        vector=prompt_vectors[0]["values"]
    )
    query_responses.append(query_response)
    print(f"Query response: {query_response}")
    # get the id of the best match
    best_id = query_response["matches"][0]["id"]
    print(f"Best ID: {best_id}")
    # fetch the best match from the index
    result = index.fetch(ids=[best_id])
    # get the paragraph of interest from the result metadata
    para_of_interest = result["vectors"][best_id]["metadata"]["para"]
    print(f"Para of interest: {para_of_interest}")
    # Initialize the langchain chat model.
    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.0)
    # turn the para_of_interest into a Document
    document = Document(page_content=para_of_interest)
    # Create the QA chain using the LLM.
    chain = load_qa_chain(llm)
    # Pass the para_of_interest and the prompt to the chain, and print the result.
    question = "If you can't find the answer in the provided document, say, I just don't know the answer to that, otherwise, answer the question. " + prompt
    result = chain.invoke({"input_documents": [document], "question": question})
    return result["output_text"], para_of_interest

In [16]:
# query_responses=[]

# questions = ["In the movie 'The Shawshank Redemption,' what is the name of the protagonist played by Tim Robbins?",
#             "Which 1994 film won the Academy Award for Best Picture?",
#             "Who directed the 1975 thriller film 'Jaws'?",
#             "In the movie 'Forrest Gump,' what does Forrest say life is like?",
#             "Which actor played the lead role in the 'Rocky' film series?",
#             "What is the name of the fictional land in the movie 'The Wizard of Oz'",
#             "In the movie 'Jurassic Park,' what kind of dinosaurs were the main attraction of the park?",
#             "Who played the iconic character Jack Dawson in the 1997 film 'Titanic'",
#             "Which 1980 comedy film features the iconic line 'Here's Johnny!'?",
#             "In the movie 'The Godfather,' what is the name of Vito Corleone's youngest son, played by Al Pacino?"]

# answers = []
# for question in questions:
#     answers.append(ask_a_question(question))

In [17]:
# # Tomato status = [NA, Rotten, Fresh, Certified Fresh]
# # Audience Status = [NA, Spilled, Upright]

# # NA = Anything with NA
# # negative = Rotten + Spilled
# # neutral = Rotten + Upright or Fresh + Spilled
# # positive = Fresh + Upright or Certified Fresh + Spilled
# # very_positive = Certified Fresh + Upright

# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# # Initialize the analyzer
# analyzer = SentimentIntensityAnalyzer()

# # Define custom tomato and audience statuses
# tomato_status = ["NA", "Rotten", "Fresh", "Certified Fresh"]
# audience_status = ["NA", "Spilled", "Upright"]

# # Define custom sentiment categories
# custom_sentiment_categories = {
#     "NA": "NA",
#     "negative": "negative",
#     "neutral": "neutral",
#     "positive": "positive",
#     "very_positive": "very_positive",
# }

# # Sample sentences
# reviews = [
#     {"text": "The tomato is rotten.", "tomato_status": "Rotten", "audience_status": "NA"},
#     {"text": "The tomato is fresh and the audience is spilled.", "tomato_status": "Fresh", "audience_status": "Spilled"},
#     {"text": "Certified fresh tomato and upright audience.", "tomato_status": "Certified Fresh", "audience_status": "Upright"},
#     {"text": "NA tomato and audience status.", "tomato_status": "NA", "audience_status": "NA"},
# ]

# # Analyze sentiment for each review
# for review in reviews:
#     text = review["text"]
#     tomato_status = review["tomato_status"]
#     audience_status = review["audience_status"]

#     # Analyze sentiment of the review text
#     scores = analyzer.polarity_scores(text)

#     # Map VADER scores to custom sentiment categories
#     if tomato_status == "NA" or audience_status == "NA":
#         custom_sentiment = "NA"
#     elif tomato_status == "Rotten" and audience_status == "Spilled":
#         custom_sentiment = "negative"
#     elif (tomato_status == "Rotten" and audience_status == "Upright") or (tomato_status == "Fresh" and audience_status == "Spilled"):
#         custom_sentiment = "neutral"
#     elif (tomato_status == "Fresh" and audience_status == "Upright") or (tomato_status == "Certified Fresh" and audience_status == "Spilled"):
#         custom_sentiment = "positive"
#     else:  # tomato_status == "Certified Fresh" and audience_status == "Upright"
#         custom_sentiment = "very_positive"

#     print(f"Review: {text}")
#     print(f"Tomato Status: {tomato_status}, Audience Status: {audience_status}")
#     print(f"Sentiment Scores: {scores}")
#     print(f"Custom Sentiment: {custom_sentiment}")
#     print()

In [18]:
# # Analyze sentiment for each review
# def analyze_review(text, tomato_status, audience_status):

#     # Analyze sentiment of the review text
#     scores = analyzer.polarity_scores(text)

#     # Map VADER scores to custom sentiment categories
#     if tomato_status == "NA" or audience_status == "NA":
#         custom_sentiment = "NA"
#     elif tomato_status == "Rotten" and audience_status == "Spilled":
#         custom_sentiment = "negative"
#     elif (tomato_status == "Rotten" and audience_status == "Upright") or (tomato_status == "Fresh" and audience_status == "Spilled"):
#         custom_sentiment = "neutral"
#     elif (tomato_status == "Fresh" and audience_status == "Upright") or (tomato_status == "Certified Fresh" and audience_status == "Spilled"):
#         custom_sentiment = "positive"
#     else:  # tomato_status == "Certified Fresh" and audience_status == "Upright"
#         custom_sentiment = "very_positive"

#     print(f"Review: {text}")
#     print(f"Tomato Status: {tomato_status}, Audience Status: {audience_status}")
#     print(f"Sentiment Scores: {scores}")
#     print(f"Custom Sentiment: {custom_sentiment}")
#     print()

#     return scores, custom_sentiment

In [19]:
import gradio as gr
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load movie dataset
movie_data = movies_vectors

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

def recommend_movies(user_input):
    # Process user input

    answer, para = ask_a_question(user_input)

    if answer == "I just don't know the answer to that, otherwise, answer the question.":
        return "I'm sorry, I could not find any relevant information, Please try again?"
    
    return answer

    #Find the movie uding para
    #movie = movies_vectors[movies_vectors["new_column"] == para]

    # review = movie["critics_consensus"].values[0]
    # tomato_status = movie["tomatometer_rating"].values[0]
    # audience_status = movie["audience_rating"].values[0]

    # #Analyze sentiment of the review
    # sentiment_score, custom_sentiment = analyze_review(review, tomato_status, audience_status)

    # # Apply sentiment analysis to user_input
    # #sentiment_score = analyzer.polarity_scores(user_input)
    # # Determine sentiment category based on sentiment score
    # if sentiment_score['compound'] >= 0.05:
    #     sentiment = "positive"
    # elif sentiment_score['compound'] <= -0.05:
    #     sentiment = "negative"
    # else:
    #     sentiment = "neutral"
    
    # # Based on sentiment, recommend movies
    # if sentiment == "positive":
    #     recommended_movies = movie_data[movie_data['tomatometer_status'] == 'Certified Fresh']['movie_title'].tolist()
    # elif sentiment == "negative":
    #     recommended_movies = movie_data[movie_data['tomatometer_status'] == 'Rotten']['movie_title'].tolist()
    # else:
    #     recommended_movies = movie_data['movie_title'].tolist()[:5]  # Return top 5 movies regardless of sentiment

    # Convert list of movie titles into a single string
    recommended_movies_string = "\n".join(recommended_movies)

    return recommended_movies_string

# Create Gradio interface
input_text = gr.Textbox(lines=3, label="Enter your thoughts about a movie:")
output_text = gr.Label(num_top_classes=5, label="Recommended Movies:")
gr.Interface(fn=recommend_movies, inputs=input_text, outputs=output_text, title="Movie Recommendation System").launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Embeddings: [0.004772534128278494, -0.005569576285779476, -0.01398388110101223, -0.023898310959339142, -0.024546312168240547, -0.016031567007303238, -0.017975572496652603, 0.00587413739413023, -0.01815701462328434, -0.0055404165759682655, 0.03693610802292824, 0.020995263010263443, 0.005887097213417292, 1.8525640825828305e-06, 0.01986773870885372, 0.02178582362830639, 0.032192736864089966, -0.0012522636679932475, 0.031648412346839905, -0.015085484832525253, 0.007950983941555023, -0.0024543071631342173, -0.006823460105806589, -0.022407906129956245, 0.0001318278955295682, 0.004727174062281847, 0.011838994920253754, -0.022563425824046135, 0.01643332839012146, -0.024961033836007118, 0.013867241330444813, -0.018519895151257515, 0.007795462850481272, -0.0008399724611081183, -0.02423527091741562, -0.01145019382238388, 0.02278374694287777, -0.008028743788599968, 0.006168978288769722, 0.0052455756813287735, 0.03084489144384861, 0.01991957798600197, -0.015772366896271706, -0.014670763164758682, -