In [1]:
from pinecone import Pinecone, ServerlessSpec

from pinecone import Pinecone

pc = Pinecone(api_key="db471cdc-9b70-4057-afc0-de7ff20c73d7")
index = pc.Index("quickstart")

In [2]:
index.upsert(
    vectors=[
        {
            "id": "vec1", 
            "values": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], 
            "metadata": {"genre": "drama"}
        }, {
            "id": "vec2", 
            "values": [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], 
            "metadata": {"genre": "action"}
        }, {
            "id": "vec3", 
            "values": [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3], 
            "metadata": {"genre": "drama"}
        }, {
            "id": "vec4", 
            "values": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4], 
            "metadata": {"genre": "action"}
        }
    ],
    namespace= "ns1"
)

{'upserted_count': 4}

In [3]:
index.query(
    namespace="ns1",
    vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3],
    top_k=2,
    include_values=True,
    include_metadata=True,
    filter={"genre": {"$eq": "action"}}
)

{'matches': [{'id': 'vec4',
              'metadata': {'genre': 'action'},
              'score': 0.0799999237,
              'values': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]},
             {'id': 'vec2',
              'metadata': {'genre': 'action'},
              'score': 0.0800000429,
              'values': [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]}],
 'namespace': 'ns1',
 'usage': {'read_units': 6}}

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, RagTokenForGeneration
from dotenv import load_dotenv
import os
from openai import OpenAI
import tiktoken
from typing import List
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iabdu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load environment variables.
variables_to_clear = ['OPENAI_API_KEY',
                      'LANGCHAIN_TRACING_V2',
                        'LANGCHAIN_ENDPOINT',
                        'LANGCHAIN_API_KEY',
                        'LANGCHAIN_PROJECT',
                        'PINECONE_API_KEY',
                        'PINECONE_ENVIRONMENT',
                        'PINECONE_INDEX']
for var in variables_to_clear:
    if var in os.environ:
        del os.environ[var]

load_dotenv()

True

In [6]:
# Set the model name for our LLMs.
OPENAI_MODEL = "gpt-3.5-turbo-preview"
EMBED_MODEL = "text-embedding-ada-002"
# Store the API key in a variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)
MAX_TOKENS = 512

def prep(text: str):
    return text.replace("\n", " ").replace("\r", " ").replace("\t", " ")

def tokenize(text: List[str]):
    encoding = tiktoken.encoding_for_model(EMBED_MODEL)
    return encoding.encode(text)

def embed(tokens: List[int]):
    response = client.embeddings.create(input=tokens,model=EMBED_MODEL)
    return response.data[0].embedding

def chunk_text(text:str):
    current_chunk = []
    current_para = ""
    chunks = []
    paras = []
    current_len = 0
    sentences = nltk.sent_tokenize(text)
    chunks_of_tokens = []
    
    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize(sentence)
        sentence_token_len = len(sentence_tokens)
        
        # Check if adding the next sentence exceeds the max token limit
        if current_len + sentence_token_len > MAX_TOKENS:
            # Add the current chunk to the list and start a new one
            paras.append(current_para)
            current_para = ""
            chunks_of_tokens.append(current_chunk)
            embeddings = embed(current_chunk)
            chunks.append(embeddings)
            current_chunk = []
            current_len = 0
        
        # Add the sentence to the current chunk
        current_para += " " + sentence
        current_chunk.extend(sentence_tokens)
        current_len += sentence_token_len
    
    # Add the last chunk if it's not empty
    if current_chunk:
        paras.append(current_para)
        chunks_of_tokens.append(current_chunk)
        embeddings = embed(current_chunk)
        chunks.append(embeddings)

    return paras, chunks, chunks_of_tokens

def create_embeddings(filename: str):
    with open(filename, "r") as file:
        text = file.read()
    text = prep(text)
    return chunk_text(text)
    
def create_embeddings_prompt(prompt:str):
    prompt = prep(prompt)
    return chunk_text(prompt)

def vectorize_chunks(paras: List, chunks: List, **kwargs):
    vectors = []
    for i in range(len(chunks)):
        if "filename" in kwargs:
            vectors.append({"id": f"{i}", "values": chunks[i], "metadata": {"file": filename, "para": f"{paras[i]}"}})
        else:
            vectors.append({"id": f"{i}", "values": chunks[i], "metadata": {"para": f"{paras[i]}"}})
        
    return vectors

In [31]:
import time

vector_list = []
def vectorize_movie_list(row):
    #print(row)
    paras, chunks, chunks_of_tokens  = create_embeddings_prompt(row["new_column"])
    #print(paras[0][0:10])
    vectors = vectorize_chunks(paras, chunks)
    #vector_list.extend(vectors)
    #time.sleep(10)
    return vectors


In [8]:
movies_df = pd.read_csv('Resources/rotten_tomatoes_movies.csv')
movies_df.columns

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'],
      dtype='object')

In [9]:
movie_reviews_df = pd.read_csv('Resources/rotten_tomatoes_critic_reviews.csv')
movie_reviews_df.columns

Index(['rotten_tomatoes_link', 'critic_name', 'top_critic', 'publisher_name',
       'review_type', 'review_score', 'review_date', 'review_content'],
      dtype='object')

In [10]:
df_movies = pd.merge(movies_df, movie_reviews_df, on='rotten_tomatoes_link', how='inner')
pd.set_option('display.max_columns', None)
df_movies.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [11]:
#columns_to_drop = ['rotten_tomatoes_link', 'authors', 'streaming_release_date', 'tomatometer_status', 'audience_status', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count', 'critic_name', 'top_critic', 'publisher_name', 'review_type', 'review_score', 'review_date']
#df_movies.drop(columns=columns_to_drop, inplace=True)
columns_to_drop = ['rotten_tomatoes_link', 'authors', 'streaming_release_date', 'tomatometer_status', 'audience_status', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
movies_df.drop(columns=columns_to_drop, inplace=True)

In [12]:
movies_df.dtypes

movie_title               object
movie_info                object
critics_consensus         object
content_rating            object
genres                    object
directors                 object
actors                    object
original_release_date     object
runtime                  float64
production_company        object
tomatometer_rating       float64
tomatometer_count        float64
audience_rating          float64
audience_count           float64
dtype: object

In [32]:
#df_movies.fillna("",inplace=True)
#df_movies["new_column"] = df_movies["movie_title"] + df_movies["movie_info"] + df_movies["critics_consensus"] + df_movies["content_rating"] + df_movies["genres"] + df_movies["directors"] + df_movies["actors"] + df_movies["original_release_date"] + str(df_movies["runtime"]) + df_movies["production_company"] + str(df_movies["tomatometer_rating"]) + str(df_movies["tomatometer_count"]) + str(df_movies["audience_rating"]) + str(df_movies["audience_count"]) + df_movies["review_content"]
movies_df.fillna("",inplace=True)
movies_df["new_column"] = movies_df["movie_title"] + movies_df["movie_info"] + movies_df["critics_consensus"] + movies_df["content_rating"] + movies_df["genres"] + movies_df["directors"] + movies_df["actors"] + movies_df["original_release_date"] + str(movies_df["runtime"]) + movies_df["production_company"] + str(movies_df["tomatometer_rating"]) + str(movies_df["tomatometer_count"]) + str(movies_df["audience_rating"]) + str(movies_df["audience_count"])

In [14]:
movies_df.__len__

<bound method DataFrame.__len__ of                                              movie_title  \
0      Percy Jackson & the Olympians: The Lightning T...   
1                                            Please Give   
2                                                     10   
3                        12 Angry Men (Twelve Angry Men)   
4                           20,000 Leagues Under The Sea   
...                                                  ...   
17707                                          Zoot Suit   
17708                                           Zootopia   
17709                                    Zorba the Greek   
17710                                               Zulu   
17711                                          Zulu Dawn   

                                              movie_info  \
0      Always trouble-prone, the life of teenager Per...   
1      Kate (Catherine Keener) and her husband Alex (...   
2      A successful, middle-aged Hollywood songwriter...   
3   

In [25]:
movies_df_short = movies_df.loc[0:10].copy()
movies_df_short

Unnamed: 0,movie_title,movie_info,critics_consensus,content_rating,genres,directors,actors,original_release_date,runtime,production_company,tomatometer_rating,tomatometer_count,audience_rating,audience_count,new_column
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,Percy Jackson & the Olympians: The Lightning T...
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,90.0,Sony Pictures Classics,87.0,142.0,64.0,11574.0,Please GiveKate (Catherine Keener) and her hus...
2,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,122.0,Waner Bros.,67.0,24.0,53.0,14684.0,"10A successful, middle-aged Hollywood songwrit..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,95.0,Criterion Collection,100.0,54.0,97.0,105386.0,12 Angry Men (Twelve Angry Men)Following the c...
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,127.0,Disney,89.0,27.0,74.0,68918.0,"20,000 Leagues Under The SeaIn 1866, Professor..."
5,"10,000 B.C.",Mammoth hunter D'Leh (Steven Strait) has long ...,With attention strictly paid to style instead ...,PG-13,"Action & Adventure, Classics, Drama",Roland Emmerich,"Steven Strait, Camilla Belle, Cliff Curtis, Jo...",2008-03-07,109.0,Warner Bros. Pictures,8.0,149.0,37.0,411140.0,"10,000 B.C.Mammoth hunter D'Leh (Steven Strait..."
6,The 39 Steps,"While on vacation in London, Canadian Richard ...","Packed with twists and turns, this essential e...",NR,"Action & Adventure, Classics, Mystery & Suspense",Alfred Hitchcock,"Robert Donat, Madeleine Carroll, Godfrey Tearl...",1935-08-01,80.0,Gaumont British Distributors,96.0,51.0,86.0,23890.0,"The 39 StepsWhile on vacation in London, Canad..."
7,3:10 to Yuma,"Dan Evans (Van Heflin), a drought-plagued Ariz...",,NR,"Classics, Drama, Western",Delmer Daves,"Glenn Ford, Van Heflin, Felicia Farr, Leora Da...",1957-08-07,92.0,Columbia Pictures,96.0,28.0,79.0,9243.0,"3:10 to YumaDan Evans (Van Heflin), a drought-..."
8,Charly (A Heartbeat Away),"Cultural differences, past loves and personal ...",,PG,"Comedy, Drama, Romance",Adam Thomas Anderegg,"Heather Beers, Gary Neilson, Lisa McCammon, Ja...",2002-09-27,103.0,Excel Entertainment,20.0,10.0,87.0,4819.0,"Charly (A Heartbeat Away)Cultural differences,..."
9,Abraham Lincoln,The 16th U.S. president (Walter Huston) is por...,,NR,"Classics, Drama",D.W. Griffith,"Walter Huston, Una Merkel, Kay Hammond, Ian Ke...",1930-11-08,97.0,United Artists,82.0,11.0,40.0,457.0,Abraham LincolnThe 16th U.S. president (Walter...


In [26]:
df_short2 = movies_df_short.copy()
df_short2=df_short2[["new_column"]]
df_short2.head()

Unnamed: 0,new_column
0,Percy Jackson & the Olympians: The Lightning T...
1,Please GiveKate (Catherine Keener) and her hus...
2,"10A successful, middle-aged Hollywood songwrit..."
3,12 Angry Men (Twelve Angry Men)Following the c...
4,"20,000 Leagues Under The SeaIn 1866, Professor..."


In [33]:
movies_df_short["vectors"]=movies_df_short.apply(vectorize_movie_list,axis=1)

BadRequestError: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [None]:
movies_df_short.head(3)


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [None]:
def preprocess_text(text):
    if pd.isnull(text):  # Handle NaN values
        return []
    # Tokenization
    tokens = word_tokenize(text)
    # Removing punctuation and stop words, and stemming
    tokens = [ps.stem(word.lower()) for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens


In [None]:
df_movies['review_content_tokens'] = df_movies['review_content'].apply(preprocess_text)

# Feature Engineering (example: extracting year from 'original_release_date')
df_movies['release_year'] = pd.to_datetime(df_movies['original_release_date']).dt.year

In [None]:
train_df, test_df = train_test_split(df_movies, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [None]:
MAX_SEQ_LENGTH = 100  # Example sequence length

# Define special token for out-of-vocabulary words
OOV_TOKEN = '<OOV>'
OOV_INDEX = 0  # Define index for the <OOV> token

# Function to filter out non-numeric tokens and replace them with the special token
def filter_tokens(tokens):
    return [int(token) if token.isdigit() else OOV_INDEX for token in tokens]

# Padding sequences after filtering tokens
train_sequences = pad_sequences(train_df['review_content_tokens'].apply(filter_tokens), maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')
val_sequences = pad_sequences(val_df['review_content_tokens'].apply(filter_tokens), maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')
test_sequences = pad_sequences(test_df['review_content_tokens'].apply(filter_tokens), maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')

In [None]:
def split_data(df):
    # Split the dataset into training, validation, and test sets
    train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # Split into 80% training/validation and 20% test
    train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)  # Split training/validation into 80% training and 20% validation
    return train_df, val_df, test_df

In [None]:
# Initialize tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")

# Initialize retriever
retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)

# Initialize generator
generator = RagSequenceForGeneration.from_pretrained("facebook/rag-token-base", retriever=retriever)

# Initialize token generator
token_generator = RagTokenForGeneration.from_pretrained("facebook/rag-token-base", retriever=retriever)