In [107]:
# %pip install tiktoken
# %pip install nltk
# %pip install faiss-cpu

In [None]:
import os
import pickle

In [1]:
from sqlalchemy import create_engine
import pandas as pd
# SQLite database that holds the movie tables (path is relative to this notebook).
conn_string = "sqlite:///../database/moviesdb.db"

# Set echo=True instead to see intermediate processing messages as well.
sqlite_engine = create_engine(conn_string, echo=False)

# Pull the whole movie_script_urls table into a DataFrame.
get_script_urls_sql = "Select * from movie_script_urls"
df = pd.read_sql(sql=get_script_urls_sql, con=sqlite_engine)

In [6]:
# Preview the first rows of the movie_script_urls table loaded from the database.
df.head()

Unnamed: 0,movie_id,movie_title,movie_script_url
0,m0,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.html
1,m1,1492: conquest of paradise,http://www.hundland.org/scripts/1492-ConquestO...
2,m2,15 minutes,http://www.dailyscript.com/scripts/15minutes.html
3,m3,2001: a space odyssey,http://www.scifiscripts.com/scripts/2001.txt
4,m4,48 hrs.,http://www.awesomefilm.com/script/48hours.txt


In [2]:
import requests
from bs4 import BeautifulSoup
errored_urls = []  # URLs whose download failed; download_script appends to this list


def download_script(url, timeout=30):
    """Download the page at ``url`` and return its text content.

    The response body is parsed with BeautifulSoup's ``html.parser`` and the
    markup-free text is returned.

    Args:
        url: Address of the movie script page.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the whole download run.

    Returns:
        The extracted text, or an empty string when the download failed. On
        failure the URL is also appended to the module-level ``errored_urls``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error status (403, 404, ...) means the body is an error page,
        # not a script, so treat it like any other failed download.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.text
    except Exception:
        # `except Exception` (not a bare `except:`) keeps the best-effort
        # behaviour while still letting KeyboardInterrupt stop a long run.
        errored_urls.append(url)
        print("Error occured for : ", url )
        return ""



In [121]:
## Downloading every script is slow, so the result is cached as a pickle file.
## The download below runs only when that file does not exist yet; otherwise this
## cell just defines the path and the DataFrame is loaded from the pickle file.

movie_scripts_file_path = '../indexes/movie_scripts_pandas_df.pickle'

if not os.path.exists(movie_scripts_file_path):
    df['movie_script'] = df['movie_script_url'].apply(download_script)
    # Make sure the target directory exists before writing the cache.
    os.makedirs(os.path.dirname(movie_scripts_file_path), exist_ok=True)
    df.to_pickle(movie_scripts_file_path)

In [122]:
# Load the cached scripts DataFrame. The pickle file (movie_scripts_pandas_df.pickle) can also be
# downloaded directly from this link:
# https://drive.google.com/drive/folders/1rWUQbW4gyOvU96Uo8vFSbJH4VTNjAg7g?usp=share_link
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a source you trust.
df = pd.read_pickle(movie_scripts_file_path)

In [123]:
# Peek at the first two rows of the DataFrame loaded from the pickle file.
df.head(2)

Unnamed: 0,movie_id,movie_title,movie_script_url,movie_script
0,m0,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.html,\n\n10 Things I Hate About You script by Karen...
1,m1,1492: conquest of paradise,http://www.hundland.org/scripts/1492-ConquestO...,


In [124]:
# Whitespace-separated word count of every script.
df['total_words'] = (
    df['movie_script']
    .str.split()
    .str.len()
)

In [125]:
# Keep only rows with a usable script: drop "403 Forbidden" pages and
# rows whose script contains no words at all.
not_forbidden = ~(df['movie_script'].str.contains('403 Forbidden'))
has_words = ~(df['total_words']==0)
xdf = df[not_forbidden & has_words].reset_index(drop=True)

xdf.head(2)

Unnamed: 0,movie_id,movie_title,movie_script_url,movie_script,total_words
0,m0,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.html,\n\n10 Things I Hate About You script by Karen...,18604
1,m2,15 minutes,http://www.dailyscript.com/scripts/15minutes.html,\n\n15 Minutes script by John Hertzfeld\n\n\n\...,25215


In [92]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter,NLTKTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

In [94]:
from langchain.document_loaders import DataFrameLoader
# Turn the filtered DataFrame into documents, using the movie_script column as each
# document's page content.
# NOTE(review): the remaining columns presumably end up in each document's metadata
# (a later cell reads metadata['movie_title']) — confirm against the loader docs.
loader = DataFrameLoader(xdf, page_content_column="movie_script")
documents = loader.load()


In [97]:
# Report how many documents the loader produced.
print("Total docs with scripts found: ", len(documents))

Total docs with scripts found:  432


In [96]:
# NLTKTextSplitter needs nltk together with its "punkt" tokenizer data.
# The data is fetched only when it is missing, so this cell also works on a
# fresh environment without uncommenting anything by hand.
import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Split every script into smaller chunks (chunk_size=4000, chunk_overlap=200).
text_splitter= NLTKTextSplitter(chunk_size=4000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

In [102]:
# Overall number of chunks produced by the splitter.
print("Total docs after splitting documents based on conditions: ", len(docs))

# Chunks belonging to one example movie, to show how a single script was split up.
example_title = '10 things i hate about you'
q = [chunk for chunk in docs if chunk.metadata['movie_title'] == example_title]
print("For example, the number of separate docs for one of the movie is now increased to: ", len(q))


Total docs after splitting documents based on conditions:  19552
For example, the number of separate docs for one of the movie is now increased to:  45


In [103]:
# Embedding model that is passed to FAISS.from_documents when the index is built.
# NOTE(review): presumably reads the OpenAI API key from the environment (OPENAI_API_KEY) — confirm before running.
embeddings = OpenAIEmbeddings()

In [117]:
# Note - Building the index uses up around 5-10$ of OpenAI quota for embeddings due to the large number of documents.
# To save on OpenAI queries, the index is therefore built only when its pickle file does not exist yet;
# otherwise this cell just defines the path and the index is loaded from the pickle file.
# The pickle files are already saved at this link, so you can download them instead of rebuilding:
# https://drive.google.com/drive/folders/1rWUQbW4gyOvU96Uo8vFSbJH4VTNjAg7g?usp=share_link

index_file_path = '../indexes/langchain_faiss_index_scripts_432.pickle'

if not os.path.exists(index_file_path):
    db = FAISS.from_documents(docs, embeddings)

    # Make sure the target directory exists before writing the index.
    os.makedirs(os.path.dirname(index_file_path), exist_ok=True)
    with open(index_file_path, 'wb') as handle:
        pickle.dump(db, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [119]:
# Load the FAISS index from its pickle file. The pickle files can be downloaded from the link below
# and placed in the indexes directory under movies_qna:
# https://drive.google.com/drive/folders/1rWUQbW4gyOvU96Uo8vFSbJH4VTNjAg7g?usp=share_link
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a source you trust.
with open(index_file_path, 'rb') as index_file:
    db = pickle.load(index_file)

In [127]:

# Ask a sample question and run a similarity search for it against the index.
query = "becca has relationship with whom in 10 things i hate about you?"
sim_docs = db.similarity_search(query)

In [None]:
# Display the results of the similarity search.
sim_docs