In [1]:
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from tqdm import tqdm
from imdb import Cinemagoer
import os
import getpass
import warnings
from langchain_google_genai import GoogleGenerativeAI
warnings.filterwarnings("ignore")

In [2]:
def scrape(filename):
    ia = Cinemagoer()
    response = ia.search_movie(filename)
    url=f"https://www.imdb.com/title/tt{response[0].movieID}/reviews/?ref_=tt_ov_rt"
    filename="data/"+filename
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    url+="reviews?sort=totalVotes&dir=desc&ratingFilter=0"
    driver.get(url)
    sel = Selector(text = driver.page_source)
    review_counts = sel.css('.lister .header span::text').extract_first().replace(',','').split(' ')[0]
    more_review_pages = min((int(review_counts)+24)//25,101)-1
    i = 0
    j=0
    t = tqdm(range(int(more_review_pages)))  
    while (i<more_review_pages):
        try:
            css_selector = 'load-more-trigger'
            driver.find_element(By.ID, css_selector).click()
            time.sleep(1)
            i+=1
            j=0
            t.update(1)
        except:
            time.sleep(0.2)
            j+=1
            if (j==20):
                i+=1
                j=0
            continue
    t.close()
    rating_list = []
    review_date_list = []
    review_title_list = []
    author_list = []
    review_list = []
    review_url_list = []
    error_url_list = []
    error_msg_list = []
    reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')

    for d in tqdm(reviews):
        try:
            sel2 = Selector(text = d.get_attribute('innerHTML'))
            try:
                rating = sel2.css('.rating-other-user-rating span::text').extract_first()
            except:
                rating = np.NaN
            try:
                review = sel2.css('.text.show-more__control::text').extract_first()
            except:
                review = np.NaN
            try:
                review_date = sel2.css('.review-date::text').extract_first()
            except:
                review_date = np.NaN
            try:
                author = sel2.css('.display-name-link a::text').extract_first()
            except:
                author = np.NaN
            try:
                review_title = sel2.css('a.title::text').extract_first()
            except:
                review_title = np.NaN
            try:
                review_url = sel2.css('a.title::attr(href)').extract_first()
            except:
                review_url = np.NaN
            rating_list.append(rating)
            review_date_list.append(review_date)
            review_title_list.append(review_title)
            author_list.append(author)
            review_list.append(review)
            review_url_list.append(review_url)
        except Exception as e:
            error_url_list.append(url)
            error_msg_list.append(e)
    review_df = pd.DataFrame({
        'Review_Date':review_date_list,
        'Author':author_list,
        'Rating':rating_list,
        'Review_Title':review_title_list,
        'Review':review_list,
        'Review_Url':review_url
        })
    review_df.to_csv(f'{filename}.csv', index=False)

In [3]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WikipediaLoader, CSVLoader
def build_retriever(filename):
    wiki=WikipediaLoader(query=f"{filename} (film)", load_max_docs=1, doc_content_chars_max=1_000_000).load()
    csv=CSVLoader(file_path=f"data/{filename}.csv", source_column="Review_Url").load()
    wikipedia_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 0,
        length_function = len,
        is_separator_regex= False,
        separators = ["\n==", "\n", " "] 
    )

    csv_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 50,
        length_function = len,
        is_separator_regex= False,
        separators = ["\n", " "] 
    )

    chunked_wikipedia_docs = wikipedia_text_splitter.transform_documents(wiki)
    chunked__csv_docs = csv_text_splitter.transform_documents(csv)

    store = LocalFileStore("./shared_cache/")
    core_embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    embedder = CacheBackedEmbeddings.from_bytes_store(
        core_embeddings_model, store, namespace=core_embeddings_model.model
    )
    faiss_retriever = FAISS.from_documents(chunked__csv_docs, embedder).as_retriever()

    wiki_bm25_retriever = BM25Retriever.from_documents(
        chunked_wikipedia_docs
    )
    wiki_bm25_retriever.k = 1

    # set up FAISS vector store
    wiki_faiss_store = FAISS.from_documents(
        chunked_wikipedia_docs, embedder
    )

    wiki_faiss_retriever = wiki_faiss_store.as_retriever(search_kwargs={"k": 1})

    # set up ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=[wiki_bm25_retriever, wiki_faiss_retriever],
        weights=[0.25, 0.75] 
    )
    return faiss_retriever,ensemble_retriever

In [4]:
from langchain.agents.agent_toolkits import create_retriever_tool

def get_retriever_tool(csv_retriever, wiki_retriever, name):
    wiki_retriever_tool = create_retriever_tool(
        csv_retriever, 
        "Wikipedia",
        f"Searches and returns documents regarding the plot, history, and cast of the {name} movie"
    )

    csv_retriever_tool = create_retriever_tool(
        wiki_retriever,
        "PublicReviews",
        f"Searches and returns documents regarding public reviews of the {name} movie"
    )

    retriever_tool = [wiki_retriever_tool, csv_retriever_tool]
    return retriever_tool

In [5]:
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
def create_agent(llm, retriever_tool, verbose=False):
    return create_conversational_retrieval_agent(
        llm,
        retriever_tool,
        verbose=verbose
    )

In [6]:
def query(agent, query):
    response=agent.invoke({"input":query})
    return response["output"]

In [7]:
def build_agent(filename):
    if "GOOGLE_API_KEY" not in os.environ:
        os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key") 
    llm = GoogleGenerativeAI(model="gemini-1.0-pro", google_api_key=os.environ["GOOGLE_API_KEY"])
    if not os.path.exists(f"data/{filename}.csv"):
        scrape(filename)
        print(f"Scraped {filename}")
    csv_retriever, wiki_retriever=build_retriever(filename)
    print("Built Retriever")
    retriever_tool=get_retriever_tool(csv_retriever, wiki_retriever, filename)
    print("Built Retriever Tool")
    agent=create_agent(llm, retriever_tool)
    print("Built Agent")
    return agent

In [8]:
agent=build_agent("Inside Out")

Built Retriever
Built Retriever Tool
Built Agent


In [9]:
print(query(agent, "What do people think about Inside Out?"))

Starting agent call
Inside Out has received generally positive reviews from critics. On Rotten Tomatoes, the film has a 98% approval rating based on 341 reviews, with an average rating of 8.20/10. The site's consensus reads, "Smart, funny, and emotionally resonant, Inside Out is a Pixar masterpiece that explores the complexities of the human mind with wit and heart." On Metacritic, the film has a score of 85 out of 100, based on 47 critics, indicating "universal acclaim".

Audiences have also responded positively to Inside Out. On Rotten Tomatoes, the film has an audience score of 90%, based on over 100,000 user ratings. On Metacritic, the film has a user score of 8.3 out of 10, based on over 500 user ratings.

Overall, Inside Out has been praised for its originality, humor, and emotional depth. Many critics have hailed it as one of Pixar's best films.


In [10]:
agents={}
filename="Inside Out"
agents[filename]=build_agent(filename)

Built Retriever
Built Retriever Tool
Built Agent


In [11]:
filename="Barbie"
agents[filename]=build_agent(filename)

Built Retriever
Built Retriever Tool
Built Agent


In [12]:
filename="Oppenheimer"
agents[filename]=build_agent(filename)

Built Retriever
Built Retriever Tool
Built Agent


In [13]:
from langchain.agents import Tool
from langchain.agents import ZeroShotAgent, AgentExecutor
def create_bot(agents):
    if "GOOGLE_API_KEY" not in os.environ:
        os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key") 
    llm = GoogleGenerativeAI(model="gemini-1.0-pro", google_api_key=os.environ["GOOGLE_API_KEY"])
    tools = []
    for name,agent in agents.items():
        tools.append(Tool(
            name = f"{name}Info",
            func=agent.invoke,
            description=f"useful for when you need to answer questions about {name}. Input should be a fully formed question."
        ))
    prefix = """Answer the following questions as best you can. Give in detail explanations for you answers. If you have exerpts you can quote those too.  You have access to the following tools:"""
    suffix = """Begin!"

    Question: {input}
    {agent_scratchpad}"""

    prompt = ZeroShotAgent.create_prompt(
        tools, 
        prefix=prefix, 
        suffix=suffix, 
        input_variables=["input", "agent_scratchpad"]
        )

    from langchain import LLMChain

    llm_chain = LLMChain(llm=llm, prompt=prompt)

    bot = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
    bot_chain = AgentExecutor.from_agent_and_tools(agent=bot,
                                                    tools=tools, 
                                                    verbose=True,
                                                    handle_parsing_errors="Cannot find information"
                                                   )
    return bot_chain

def query_bot(bot, query):
    return bot.invoke({"input":query})["output"]

In [14]:
bot=create_bot(agents)
print(query_bot(bot, "I want to watch a movie with my young daughter.Should we watch Oppenheimer or Inside Out?"))



[1m> Entering new AgentExecutor chain...[0m
Starting agent call


  warn_deprecated(


[32;1m[1;3mThought: I need to learn about Oppenheimer and Inside Out
Action: Inside OutInfo
Action Input: What is Inside Out?[0mStarting agent call

Observation: [36;1m[1;3m{'input': 'What is Inside Out?', 'chat_history': [HumanMessage(content='What is Inside Out?'), AIMessage(content='Inside Out is a 2015 American computer-animated comedy-drama adventure film produced by Pixar Animation Studios and released by Walt Disney Pictures. The film was directed by Pete Docter and co-directed by Ronnie del Carmen, and produced by Jonas Rivera. The screenplay was written by Docter, del Carmen, Meg LeFauve, and Josh Cooley. The film stars the voices of Amy Poehler, Bill Hader, Lewis Black, Mindy Kaling, Phyllis Smith, Richard Kind, Diane Lane, and Kyle MacLachlan.\n\nInside Out is set in the mind of an 11-year-old girl named Riley Andersen, where five personified emotions—Joy, Sadness, Anger, Fear, and Disgust—help her navigate the challenges of growing up. The film explores the importance 