In [2]:
# import pandas as pd
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import PorterStemmer
# import string
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, RagTokenForGeneration
# from dotenv import load_dotenv
# import os
# from openai import OpenAI
# import tiktoken
# from typing import List
# from langchain_openai import ChatOpenAI
# from langchain.docstore.document import Document
# from langchain.chains.question_answering import load_qa_chain
# from pinecone import Pinecone, ServerlessSpec
# #nltk.download('punkt')
# #nltk.download('stopwords')

In [3]:
# # Load environment variables.
# variables_to_clear = ['OPENAI_API_KEY',
#                       'LANGCHAIN_TRACING_V2',
#                         'LANGCHAIN_ENDPOINT',
#                         'LANGCHAIN_API_KEY',
#                         'LANGCHAIN_PROJECT',
#                         'PINECONE_API_KEY',
#                         'PINECONE_ENVIRONMENT',
#                         'PINECONE_INDEX']
# for var in variables_to_clear:
#     if var in os.environ:
#         del os.environ[var]

# load_dotenv()

In [4]:
import os
from dotenv import load_dotenv
import nltk
from openai import OpenAI
import pandas as pd
import tiktoken
from typing import List

# Fetch the NLTK data packages this notebook relies on (no-op when already present,
# as the captured output shows). 'punkt' is presumably what nltk.sent_tokenize needs;
# 'stopwords' backs the stopwords.words('english') lookup in a later cell.
nltk.download('punkt')
nltk.download('stopwords')
# Load environment variables (e.g. from a local .env file) so the os.getenv calls
# in later cells can find the API keys and index name.
load_dotenv()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iabdu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iabdu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Set the model name for our LLMs.
# OPENAI_MODEL is the chat model handed to ChatOpenAI in ask_a_question;
# EMBED_MODEL is used both to pick the tiktoken encoding and to request embeddings.
OPENAI_MODEL = "gpt-3.5-turbo"
EMBED_MODEL = "text-embedding-ada-002"
# Store the API key in a variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared OpenAI client used by embed().
client = OpenAI(api_key=OPENAI_API_KEY)
# Token budget per chunk: chunk_text starts a new chunk once adding the next
# sentence would push the running token count above this value.
MAX_TOKENS = 1536

def prep(text: str):
    """Return ``text`` with every newline, carriage return and tab replaced by a space.

    Each control character maps to exactly one space, so runs such as ``"\\r\\n"``
    become two spaces; nothing is collapsed or stripped.
    """
    to_space = str.maketrans({"\n": " ", "\r": " ", "\t": " "})
    return text.translate(to_space)

def tokenize(text: str) -> List[int]:
    """Encode ``text`` with the tiktoken encoding registered for ``EMBED_MODEL``.

    Args:
        text: A single string to encode (``chunk_text`` passes one sentence at a time).

    Returns:
        The token ids produced by ``encoding.encode`` — presumably a list of ints,
        which is what ``embed`` is annotated to accept.
    """
    encoding = tiktoken.encoding_for_model(EMBED_MODEL)
    return encoding.encode(text)

def embed(tokens: List[int]):
    """Request an embedding for ``tokens`` from ``EMBED_MODEL`` via the shared client.

    Args:
        tokens: Token ids for one chunk of text.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    api_result = client.embeddings.create(model=EMBED_MODEL, input=tokens)
    first_item = api_result.data[0]
    return first_item.embedding

def chunk_text(text:str):
    """Split ``text`` into sentence-aligned chunks of at most ``MAX_TOKENS`` tokens and embed each.

    Sentences (from ``nltk.sent_tokenize``) are packed greedily: a new chunk is
    started as soon as adding the next sentence would exceed ``MAX_TOKENS``.
    Sentences are never split, so a single sentence longer than ``MAX_TOKENS``
    ends up alone in its own oversized chunk.

    Args:
        text: The text to chunk.

    Returns:
        A tuple ``(paras, chunks, chunks_of_tokens)`` of three parallel lists:
        the chunk text (each sentence is prefixed with a space, so every entry
        starts with one), the embedding returned by ``embed`` for that chunk,
        and the chunk's token ids. All three are empty when ``text`` contains
        no sentences.
    """
    paras = []
    chunks_of_tokens = []

    current_para = ""
    current_chunk = []

    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = tokenize(sentence)

        # Close the running chunk when this sentence would overflow it. The
        # `current_chunk` guard matters: without it an over-long *first* sentence
        # would flush an empty chunk and send an empty input to the embeddings API.
        if current_chunk and len(current_chunk) + len(sentence_tokens) > MAX_TOKENS:
            paras.append(current_para)
            chunks_of_tokens.append(current_chunk)
            current_para = ""
            current_chunk = []

        # Add the sentence to the current chunk
        current_para += " " + sentence
        current_chunk.extend(sentence_tokens)

    # Add the last chunk if it's not empty
    if current_chunk:
        paras.append(current_para)
        chunks_of_tokens.append(current_chunk)

    # Embed chunks in order, one request per chunk.
    chunks = [embed(chunk_tokens) for chunk_tokens in chunks_of_tokens]

    return paras, chunks, chunks_of_tokens

def create_embeddings(filename: str):
    """Read the text file at ``filename``, normalise its whitespace and chunk/embed it.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform's default encoding).

    Returns:
        Whatever ``chunk_text`` returns for the prepared file contents.
    """
    with open(filename, "r") as source:
        raw_text = source.read()
    return chunk_text(prep(raw_text))
    
def create_embeddings_prompt(prompt:str):
    """Normalise the whitespace in ``prompt`` and return ``chunk_text``'s result for it."""
    return chunk_text(prep(prompt))

def vectorize_chunks(paras: List, chunks: List, **kwargs):
    """Build one upsert-style vector dict per chunk.

    Args:
        paras: Paragraph text for each chunk; ``paras[i]`` belongs to ``chunks[i]``.
        chunks: Embedding values for each chunk.
        **kwargs: Optional ``filename``; when given it is stored under the
            ``"file"`` metadata key of every vector.

    Returns:
        A list of dicts shaped ``{"id", "values", "metadata"}`` where ``id`` is the
        chunk's position as a string and ``metadata["para"]`` is the paragraph
        converted to a string.

    Raises:
        IndexError: if ``paras`` is shorter than ``chunks``.
    """
    vectors = []
    for i, values in enumerate(chunks):
        if "filename" in kwargs:
            # Read the value from kwargs: there is no `filename` variable in scope
            # here, so referencing the bare name raised NameError.
            metadata = {"file": kwargs["filename"], "para": f"{paras[i]}"}
        else:
            metadata = {"para": f"{paras[i]}"}
        vectors.append({"id": f"{i}", "values": values, "metadata": metadata})

    return vectors

In [6]:
# import time

# vector_list = []
# def vectorize_movie_list(row):
#     paras, chunks, chunks_of_tokens  = create_embeddings_prompt(row["new_column"])
#     vectors = vectorize_chunks(paras, chunks)
#     return vectors


In [7]:
# movies_df = pd.read_csv('Resources/rotten_tomatoes_movies.csv')
# movies_df.columns
# pd.set_option('display.max_columns', None)

In [8]:
#movie_reviews_df = pd.read_csv('Resources/rotten_tomatoes_critic_reviews.csv')
#movie_reviews_df.columns

In [9]:
# columns_to_drop = ['rotten_tomatoes_link', 'authors', 'streaming_release_date', 'tomatometer_status', 'audience_status', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
# movies_df.drop(columns=columns_to_drop, inplace=True)

In [10]:
# movies_df.dtypes

In [11]:
# movies_df.fillna("",inplace=True)
# movies_df["new_column"] = movies_df["movie_title"] + movies_df["movie_info"] + movies_df["critics_consensus"] + movies_df["content_rating"] + movies_df["genres"] + movies_df["directors"] + movies_df["actors"] + movies_df["original_release_date"] + str(movies_df["runtime"]) + movies_df["production_company"] + str(movies_df["tomatometer_rating"]) + str(movies_df["tomatometer_count"]) + str(movies_df["audience_rating"]) + str(movies_df["audience_count"])

In [12]:
# df_short2 = movies_df.copy()
# df_short2=df_short2[["new_column"]]
# df_short2.head()

In [13]:
#movies_df["vectors"]=df_short2.apply(vectorize_movie_list,axis=1)
#movies_df.head()

In [14]:
# Load the movie table from CSV. Its "vectors" column comes back as plain strings
# (see the preview below); a later cell parses them with ast.literal_eval.
movies_vectors = pd.read_csv('Resources/movies_vectors.csv')
movies_vectors.head()

Unnamed: 0,movie_title,movie_info,critics_consensus,content_rating,genres,directors,actors,original_release_date,runtime,production_company,tomatometer_rating,tomatometer_count,audience_rating,audience_count,new_column,vectors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,Percy Jackson & the Olympians: The Lightning T...,"[{'id': '0', 'values': [0.004161064513027668, ..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,90.0,Sony Pictures Classics,87.0,142.0,64.0,11574.0,Please GiveKate (Catherine Keener) and her hus...,"[{'id': '0', 'values': [-0.006923696491867304,..."
2,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,122.0,Waner Bros.,67.0,24.0,53.0,14684.0,"10A successful, middle-aged Hollywood songwrit...","[{'id': '0', 'values': [-0.01198294386267662, ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,95.0,Criterion Collection,100.0,54.0,97.0,105386.0,12 Angry Men (Twelve Angry Men)Following the c...,"[{'id': '0', 'values': [0.00045957480324432254..."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,127.0,Disney,89.0,27.0,74.0,68918.0,"20,000 Leagues Under The SeaIn 1866, Professor...","[{'id': '0', 'values': [0.020719904452562332, ..."


In [None]:
# import json
# vectors = movies_vectors["vectors"].tolist()
# for vector in vectors:
#     replacement_1 = vector.replace("'", '"')
#     head_section = replacement_1[0:replacement_1.find('"para": \"')+len('"para": \"')]
#     tail_section = replacement_1[replacement_1.find('"}}]'):replacement_1.find('"}}]')+len('"}}]')]
#     middle_section = replacement_1[replacement_1.find('"para": \"')+len('"para": \"'):replacement_1.find('"}}]')]
#     middle_section_replacement = middle_section.replace('"', "'")
#     final = head_section + middle_section_replacement + tail_section
#     print(final)
#     v1 = json.loads(final)

In [None]:
#movies_df.to_csv('Resources/movies_vectors.csv', index=False)

In [16]:
from pinecone import Pinecone

# Create the Pinecone client and index handle from environment variables:
# PINECONE_API_KEY supplies the credential, PINECONE_INDEX the index name.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(os.getenv("PINECONE_INDEX"))

In [None]:
# #index.upsert(
#     vectors=[
#         {
#             "id": "vec1", 
#             "values": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], 
#             "metadata": {"genre": "drama"}
#         }, {
#             "id": "vec2", 
#             "values": [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], 
#             "metadata": {"genre": "action"}
#         }, {
#             "id": "vec3", 
#             "values": [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3], 
#             "metadata": {"genre": "drama"}
#         }, {
#             "id": "vec4", 
#             "values": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4], 
#             "metadata": {"genre": "action"}
#         }
#     ],
#     namespace= "ns1"
# )

In [None]:
# index.query(
#     namespace="ns1",
#     vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3],
#     top_k=2,
#     include_values=True,
#     include_metadata=True,
#     filter={"genre": {"$eq": "action"}}
# )

In [17]:
import ast

# Flatten the stringified per-row vector lists into one list of vector dicts,
# rewriting each "id" so it is unique across the whole table:
#   - a row with a single vector gets the row number ("12")
#   - a row with several vectors gets row number + letter ("12A", "12B", ...),
#     the letter being derived from the vector's original per-row id.
formatted_vectors = []

vectors = movies_vectors["vectors"].tolist()
for row_number, vector in enumerate(vectors):
    # Parse the whole literal instead of slicing off the brackets: the result is
    # then always a list, whether the row holds zero, one or many vectors.
    # (Slicing made literal_eval return a dict or a tuple depending on the count
    # and raised SyntaxError for an empty list.)
    row_vectors = ast.literal_eval(vector)

    if not isinstance(row_vectors, list) or not all(isinstance(v, dict) for v in row_vectors):
        print(f"Error at Index: {row_number}. Not a list of dictionaries.")
        continue

    if len(row_vectors) == 1:
        only_vector = row_vectors[0]
        only_vector["id"] = str(row_number)
        formatted_vectors.append(only_vector)
    else:
        for v in row_vectors:
            # chr(65) is "A": per-row id 0 -> "A", 1 -> "B", ...
            v["id"] = str(row_number) + chr(65 + int(v["id"]))
            formatted_vectors.append(v)
    

In [None]:
import itertools

# SECURITY: never hardcode the Pinecone API key in the notebook. Read it from the
# environment (populated by load_dotenv) like the earlier Pinecone cell does.
# A key that has been committed in a notebook should be treated as leaked and rotated.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# This rebinds `index` to the "new2" index, replacing the handle created earlier
# from PINECONE_INDEX; every later cell that uses `index` talks to "new2".
index = pc.Index("new2")

def chunks(iterable, batch_size):
    """Yield successive tuples holding up to ``batch_size`` items from ``iterable``.

    The final tuple may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def next_batch():
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling next_batch until it returns an empty tuple.
    yield from iter(next_batch, ())

# Upsert data with 100 vectors per upsert request (see batch_size below).
# x counts the batches sent so far and is printed as a simple progress indicator.
x = 0
for ids_vectors_chunk in chunks(formatted_vectors, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk)
    print(x)
    x+=1

In [20]:
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain

# Raw Pinecone query responses, appended to by every ask_a_question call.
query_responses=[]

def ask_a_question(prompt, verbose=True):
    """Answer ``prompt`` using the best-matching paragraph stored in the Pinecone index.

    The prompt is embedded, the index is queried for its nearest vectors, and the
    ``para`` metadata of the top match is passed, together with the question, to a
    LangChain question-answering chain. Only the first chunk of the prompt is used
    for the search.

    Args:
        prompt: The question to answer.
        verbose: When True (the default) print the intermediate values
            (embedding, vector, query response, best id, paragraph).

    Returns:
        The chain's ``output_text``, or the sentence
        "I just don't know the answer to that." when the index returns no matches.

    Raises:
        ValueError: if ``prompt`` yields nothing to embed (e.g. an empty string).
    """
    fallback_answer = "I just don't know the answer to that."

    def log(message):
        if verbose:
            print(message)

    # convert the prompt to chunks of embeddings
    paras, chunks, chunks_of_tokens  = create_embeddings_prompt(prompt)
    if not chunks:
        # Without this guard the chunks[0] lookups below fail with a bare IndexError.
        raise ValueError("prompt must contain some text to embed")
    log(f"Embeddings: {chunks[0]}")
    # vectorize the embeddings
    prompt_vectors = vectorize_chunks(paras, chunks)
    log(f"Vectorized: {prompt_vectors[0]}")
    # search the index for the best match using semantic search; asking for the
    # metadata here avoids a second round trip to fetch the matched vector
    query_response = index.query(
        top_k=2,
        vector=prompt_vectors[0]["values"],
        include_metadata=True
    )
    query_responses.append(query_response)
    log(f"Query response: {query_response}")
    matches = query_response["matches"]
    if not matches:
        # Nothing to ground an answer on (e.g. empty index): give the same reply
        # the LLM is instructed to give when the document lacks the answer.
        log("No matches returned by the index.")
        return fallback_answer
    # get the id and paragraph of the best match
    best_match = matches[0]
    best_id = best_match["id"]
    log(f"Best ID: {best_id}")
    para_of_interest = best_match["metadata"]["para"]
    log(f"Para of interest: {para_of_interest}")
    # Initialize the langchain chat model.
    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.0)
    # turn the para_of_interest into a Document
    document = Document(page_content=para_of_interest)
    # Create the QA chain using the LLM.
    chain = load_qa_chain(llm)
    # Pass the para_of_interest and the prompt to the chain, and return the result.
    question = "If you can't find the answer in the provided document, say, I just don't know the answer to that, otherwise, answer the question. " + prompt
    result = chain.invoke({"input_documents": [document], "question": question})
    return result["output_text"]

In [23]:
# Reset the log of raw Pinecone responses that ask_a_question appends to,
# so it only contains the responses for this batch of questions.
query_responses=[]

# Sample questions to run through the retrieval + QA pipeline.
questions = ["what does inigo montoya say?",
            "what is the love boat television show about?",
            "who is the captain of the love boat?",
            "what are the symptoms of antibiotic overuse?",
            "what is covid-19?",
            "what is the a-team?",
            "who are the members of the a-team?",
            "How many skeletons did the necromancer raise?",
            "What will langsmith help us do?"]

# Ask each question in order; answers[i] is the reply to questions[i].
# If a call raises, answers keeps the replies collected up to that point.
answers = []
for question in questions:
    answers.append(ask_a_question(question))

Embeddings: [-0.0145847387611866, -0.006895051337778568, 0.0028357261326164007, -0.018353987485170364, -0.02163098193705082, 0.009992726147174835, -0.014001067727804184, -0.0257799681276083, -0.029197607189416885, -0.0003889673971571028, 0.05144742131233215, 0.006863406393676996, 0.0034176388289779425, -0.0011444870615378022, 0.010196659713983536, 0.0036321203224360943, 0.04196804389357567, -0.0012429376365616918, 0.011223357170820236, -0.004855719394981861, 0.017341352999210358, 0.034317031502723694, 0.0014231371460482478, -0.027327047660946846, -0.005724193528294563, 0.003431703196838498, 0.020716799423098564, -0.030857199802994728, 0.019169719889760017, -0.022193556651473045, 0.015456728637218475, 0.006244574673473835, -0.008544097654521465, -0.0011805270332843065, -0.037129905074834824, -0.052656956017017365, 0.024612626060843468, -0.048015717417001724, -0.007967459037899971, 0.0012921628076583147, 0.014176872558891773, -0.013037659227848053, -0.004205242730677128, -0.0289163198322

ProtocolError: Failed to connect; did you specify the correct index name?

In [None]:
# These names are only imported in a commented-out cell at the top of the notebook,
# so import them here to keep this cell runnable on a fresh kernel.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stop words as a set, plus a Porter stemmer instance.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [None]:
# train_test_split is only imported in a commented-out cell at the top of the
# notebook, so import it here to keep this cell runnable on a fresh kernel.
from sklearn.model_selection import train_test_split

# NOTE(review): df_movies is not defined in any visible cell of this notebook —
# confirm which DataFrame is meant to be split before running this cell.
# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets (80% / 10% / 10% overall).
train_df, test_df = train_test_split(df_movies, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [None]:
def split_data(df, test_size=0.2, val_size=0.2, random_state=42):
    """Split ``df`` into training, validation and test sets.

    The test set is carved off first; the validation set is then taken from what
    remains. With the defaults this gives 64% train / 16% validation / 20% test.

    Args:
        df: The dataset to split (anything ``train_test_split`` accepts).
        test_size: Share of ``df`` held out as the test set.
        val_size: Share of the remaining (non-test) rows used for validation.
        random_state: Seed passed to both splits so the result is reproducible.

    Returns:
        A tuple ``(train_df, val_df, test_df)``.
    """
    # Imported locally: the notebook-level sklearn import is commented out, so the
    # bare name would raise NameError on a fresh kernel.
    from sklearn.model_selection import train_test_split

    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    train_df, val_df = train_test_split(train_val_df, test_size=val_size, random_state=random_state)
    return train_df, val_df, test_df