# CineInsight - Movie Recommendation and Sentiment Analysis
### Section 008 - Team 3

## Download Base Datasets from Kaggle

In [1]:
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d tmdb/tmdb-movie-metadata
! kaggle datasets download -d andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews
! unzip tmdb-movie-metadata.zip
! unzip clapper-massive-rotten-tomatoes-movies-and-reviews.zip

Downloading tmdb-movie-metadata.zip to /content
 90% 8.00M/8.89M [00:01<00:00, 12.8MB/s]
100% 8.89M/8.89M [00:01<00:00, 8.53MB/s]
Downloading clapper-massive-rotten-tomatoes-movies-and-reviews.zip to /content
100% 152M/152M [00:07<00:00, 27.2MB/s]
100% 152M/152M [00:07<00:00, 22.1MB/s]
Archive:  tmdb-movie-metadata.zip
  inflating: tmdb_5000_credits.csv   
  inflating: tmdb_5000_movies.csv    
Archive:  clapper-massive-rotten-tomatoes-movies-and-reviews.zip
  inflating: rotten_tomatoes_movie_reviews.csv  
  inflating: rotten_tomatoes_movies.csv  


## Movie Recommender

### Bags of Words Formation

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import string

# Create the dataframes
credits_df = pd.read_csv('/content/tmdb_5000_credits.csv')
movies_df = pd.read_csv('/content/tmdb_5000_movies.csv')

# Join the dataframes
credits_df.columns = ['id', 'title', 'cast', 'crew']
movies_df = movies_df.merge(credits_df, on='id')
movies_df.drop('title_x', axis=1, inplace=True)
movies_df.drop('title_y', axis=1, inplace=True)
org_movies_df = movies_df.copy(deep=True)

# Process 'overview' column: remove all punctuations, lowercase all letters
movies_df['overview'] = movies_df['overview'].str.replace('[^\w\s]', '', regex=True).str.lower()
movies_df['overview'].fillna('', inplace=True)

# Process 'genres', 'keywords', 'cast', 'crew' columns
## 1. Transform stringified objects to literal Python objects
features = ['genres', 'keywords', 'cast', 'crew']
for f in features:
  movies_df[f] = movies_df[f].apply(literal_eval)

str_filter_pattern = r'[' + re.escape(string.punctuation) + '\s]'

## 2. Retrieve all genres and keywords
def get_items(x):
    if isinstance(x, list):
      items = []
      for i in x:
        name = re.sub(str_filter_pattern, '', i["name"])
        items.append(name.lower())
      return items
    # In case null/missing values, return empty list
    return []
movies_df['genres'] = movies_df['genres'].apply(get_items)
movies_df['keywords'] = movies_df['keywords'].apply(get_items)

## 3. Retrieve first 15 actors/actresses from the cast
def get_cast(x):
  if isinstance(x, list):
    members = []
    for i in x[:15]:
      name = re.sub(str_filter_pattern, '', i['name'])
      members.append(name.lower())
    return members
  # In case null/missing values, return empty list
  return []
movies_df['cast'] = movies_df['cast'].apply(get_cast)

## 4. Retrieve the director of from the crew
def get_director(x):
  for i in x:
    if i['job'] == 'Director':
      return re.sub(str_filter_pattern, '', i["name"]).lower()
  return ''
movies_df['director'] = movies_df['crew'].apply(get_director)

# Create a new movie_meta data frame
# Each row is a movie id, title with its bag of words
titles_df = movies_df['original_title']
id_df = movies_df["id"]

metadata_merged_df = movies_df.apply(lambda row: row['overview'] + ' ' + ' '.join(row['genres']) + ' ' + ' '.join(row['keywords']) + ' ' + ' '.join(row['cast']) + ' ' + row['director'], axis=1)

movie_meta = pd.DataFrame({
    'id': id_df,
    'title': titles_df,
    'bag_of_words': metadata_merged_df
})

movie_meta.head(10)

Unnamed: 0,id,title,bag_of_words
0,19995,Avatar,in the 22nd century a paraplegic marine is dis...
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believed to be dead has ...
2,206647,Spectre,a cryptic message from bonds past sends him on...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,john carter is a warweary former military capt...
5,559,Spider-Man 3,the seemingly invincible spiderman goes up aga...
6,38757,Tangled,when the kingdoms most wantedand most charming...
7,99861,Avengers: Age of Ultron,when tony stark tries to jumpstart a dormant p...
8,767,Harry Potter and the Half-Blood Prince,as harry begins his sixth year at hogwarts he ...
9,209112,Batman v Superman: Dawn of Justice,fearing the actions of a godlike super hero le...


### Stopword Removal & Lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (stopwords and WordNet)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for text preprocessing
def preprocess_text(text):
    # Tokenize the text data
    tokens = word_tokenize(text)

    # Remove punctuation
    # tokens = [token for token in tokens if token not in string.punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Initialize WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Preprocess the bag of words column
movie_meta['preprocessed_bag_of_words'] = movie_meta['bag_of_words'].apply(preprocess_text)
movie_meta['sentiment_score'] = -1

movie_meta.to_csv("/content/drive/MyDrive/6363-CineInsight/movie_word_bags.csv", index=False)

# Display the preprocessed bag of words data
movie_meta.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,title,bag_of_words,preprocessed_bag_of_words,sentiment_score
0,19995,Avatar,in the 22nd century a paraplegic marine is dis...,22nd century paraplegic marine dispatched moon...,-1
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believed to be dead has ...,captain barbossa long believed dead come back ...,-1
2,206647,Spectre,a cryptic message from bonds past sends him on...,cryptic message bond past sends trail uncover ...,-1
3,49026,The Dark Knight Rises,following the death of district attorney harve...,following death district attorney harvey dent ...,-1
4,49529,John Carter,john carter is a warweary former military capt...,john carter warweary former military captain w...,-1


### Vectorization and Similarity Computation

#### Compute Term Frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer to tokenize and count the frequency of terms
count_vectorizer = CountVectorizer()

# Fit and transform the preprocessed bag of words to obtain term frequencies
tf_matrix = count_vectorizer.fit_transform(movie_meta['preprocessed_bag_of_words'])

# Convert the TF matrix into a DataFrame
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

# Concatenate the titles with the TF DataFrame
tf_df = pd.concat([movie_meta['title'], tf_df], axis=1)

# Display the TF DataFrame
tf_df.head(5)

Unnamed: 0,title,00,007,10,100,1000,100000,1000000,1000foot,10191,...,übertarget,đỗthịhảiyến,špelacolja,γη,юлияснигирь,پیمانمعادی,卧底肥妈,徐帆,绝地奶霸,超级妈妈
0,Avatar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: At World's End,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Dark Knight Rises,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,John Carter,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Compute Inverse Document Frequency

In [None]:
# Compute total number of documents
total_documents = len(movie_meta)

# Create a list to store the IDF values for each term
idf_values = []

# Iterate over each term in the vocabulary
for term in count_vectorizer.get_feature_names_out():
    # Compute the number of documents containing the term
    num_documents_with_term = sum(movie_meta['preprocessed_bag_of_words'].str.contains(term))

    # Compute IDF value for the term
    idf_value = np.log(total_documents / (1 + num_documents_with_term))  # Adding 1 to avoid division by zero

    # Append IDF value to the list
    idf_values.append(idf_value)

# Create a DataFrame to store the IDF values
idf_df = pd.DataFrame({'Term': count_vectorizer.get_feature_names_out(), 'IDF': idf_values})

# Display the IDF DataFrame
idf_df.head(5)

Unnamed: 0,Term,IDF
0,0,3.479784
1,7,5.837939
2,10,4.186537
3,100,5.532557
4,1000,6.079101


#### Compute TF-IDF Scores

In [None]:
# Compute TF-IDF scores
tfidf_matrix = tf_matrix.multiply(idf_values)

# Convert TF-IDF matrix to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

# Concatenate titles with TF-IDF DataFrame
tfidf_df = pd.concat([movie_meta['title'], tfidf_df], axis=1)

# Display the TF-IDF DataFrame
tfidf_df.head(10)

Unnamed: 0,title,00,007,10,100,1000,100000,1000000,1000foot,10191,...,übertarget,đỗthịhảiyến,špelacolja,γη,юлияснигирь,پیمانمعادی,卧底肥妈,徐帆,绝地奶霸,超级妈妈
0,Avatar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Pirates of the Caribbean: At World's End,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Spectre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Dark Knight Rises,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,John Carter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Spider-Man 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Tangled,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Avengers: Age of Ultron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Harry Potter and the Half-Blood Prince,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Batman v Superman: Dawn of Justice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the 'overview' text to a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_meta['preprocessed_bag_of_words'])

#### Compute Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print(cosine_sim[:5, :5])

[[1.69794685e+03 1.02205840e+01 1.45235923e+01 2.22687808e+01
  8.52333503e+01]
 [1.02205840e+01 1.40187913e+03 4.47954492e+01 1.78634708e+00
  2.30787830e+01]
 [1.45235923e+01 4.47954492e+01 1.47262912e+03 8.31127840e+00
  1.48654804e+01]
 [2.22687808e+01 1.78634708e+00 8.31127840e+00 2.66235115e+03
  1.17033028e+01]
 [8.52333503e+01 2.30787830e+01 1.48654804e+01 1.17033028e+01
  2.02796843e+03]]


### Get Movie Recommendations

In [12]:
def get_recommendations(title, cosine_sim, df, n=10):
    # Get the index of the movie that matches the title
    idx = df.index[df['title'] == title].tolist()[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top n most similar movies (excluding itself)
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top n most similar movies
    return df[['title', 'bag_of_words']].iloc[movie_indices]

# get_recommendations('Kung Fu Panda')

## Review Sentiment Analyzer

In [2]:
import pandas as pd
import re
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nlp = spacy.load("en_core_web_sm")

# Step 1: Load the dataset
data = pd.read_csv("rotten_tomatoes_movie_reviews.csv")

# Step 2: Data Preprocessing

# Assuming 'reviewText' is the column containing review text
review_texts = data['reviewText']
print(review_texts.count())

# Replace NaN values with empty strings
review_texts_filled = review_texts.fillna('')

# Text Cleaning
review_texts_cleaned = review_texts_filled.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x).lower()))

# Tokenization
review_texts_tokenized = review_texts_cleaned.apply(word_tokenize)

# Remove Stop Words
stop_words = set(stopwords.words('english'))
review_texts_filtered = review_texts_tokenized.apply(lambda x: [word for word in x if word not in stop_words])

# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize tokens
review_texts_lemmatized = review_texts_filtered.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Convert the tokenized and stemmed lists back to strings
review_texts_joined = review_texts_lemmatized.apply(lambda x: ' '.join(x))

# Text Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
preprocessed_data = vectorizer.fit_transform(review_texts_joined)

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
X = vectorizer.fit_transform(review_texts_joined)
y = data['scoreSentiment']

# Step 4: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train Naive Bayes Classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

# Step 6: Model Evaluation
y_pred = naive_bayes_classifier.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


1375738
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.80      0.39      0.52     96088
    POSITIVE       0.76      0.95      0.84    192905

    accuracy                           0.76    288993
   macro avg       0.78      0.67      0.68    288993
weighted avg       0.77      0.76      0.74    288993



## Web Scraper

### Install the Chromium Browser

In [3]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update
apt-get install chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.GLTnkChJdr/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.oenz5e1OQn/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.IdJJxPFGZq/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian bust



### Configure Selenium WebDriver

In [6]:
# Configure Selenium WebDriver for headless Chrome

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Set up Chrome Service
service = Service(executable_path=r'/usr/bin/chromedriver')

# Set up Chrome options
chrome_options = Options()
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize the Chrome webdrivers with the specified options
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.implicitly_wait(5)

### Movie Review Scraper & Analyzer

In [7]:
import time
import numpy as np
from datetime import datetime
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException
from IPython.display import clear_output


# Function to log messages
def log_message(message):
  # Get the current date and time
  current_time = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
  # Format the log message with date and time
  log_message = f"[{current_time}] {message}"
  # Print the log message to the console
  clear_output(wait=True)
  print(log_message)
  with open("/content/drive/MyDrive/6363-CineInsight/log_file.txt", 'a') as f:
    f.write(log_message + '\n')

# Preprocessing functions
def preprocess_text(text):
  text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())  # Clean text
  tokens = word_tokenize(text)  # Tokenize text
  stop_words = set(stopwords.words('english'))
  tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize tokens
  return ' '.join(tokens)

def get_sentiment_analysis_score(title):
  log_message(f"Performing sentiment analysis on reviews of: {title}")

  # Step 1: Search for the Movie
  # Before submitting the search, filter for only 'Titles' results
  driver.get("https://www.imdb.com/")
  filter_dropdown = driver.find_element(By.XPATH, "//form[@id='nav-search-form']/div/div/label")
  filter_dropdown.click()
  time.sleep(1)
  title_only_option = driver.find_element(By.XPATH, "//span[@id='navbar-search-category-select-contents']/ul/li[2]/span")
  title_only_option.click()
  time.sleep(1)
  search_box = driver.find_element(By.ID, "suggestion-search")
  search_box.send_keys(title)
  search_box.submit()

  # Step 2: Retrive Movie Page URL(s)
  # First matching result may not be the desired one, so fetch all matching urls and choose first one having user reviews
  try:
    search_urls = driver.find_elements(By.LINK_TEXT, title)
    movie_urls = [url.get_attribute("href") for url in search_urls]
    log_message(f"Sentiment Analysis - Searched and found movie: {title}")
  except NoSuchElementException:
    return 0

  reviews_found = False
  for url in movie_urls:
    print(url)
    driver.get(url)
    try:
      # Step 3: Access User Reviews
      # If this url has user review url, go to reviews and break out of loop
      review_url = driver.find_element(By.XPATH, "//div[@id='__next']/main/div/section/section/div[3]/section/section/div[3]/div[2]/div[2]/ul/li/a").get_attribute("href")
      driver.get(review_url)
      reviews_found = True
      log_message(f"Sentiment Analysis - Accessed user reviews of: {title}")
      break
    except NoSuchElementException:
      # Otherwise review url not found, continue to next search result url
      continue
  if not reviews_found:
    log_message(f"Sentiment Analysis - No reviews found for: {title}")
    return 0

  # Step:4 Load more reviews
  max_clicks = 4
  click_count = 0
  while click_count < max_clicks:
    try:
      load_more_button = driver.find_element(By.ID, "load-more-trigger")
      load_more_button.click();
      log_message(f"Sentiment Analysis - More reviews of {title} loaded {click_count + 1} time(s)")
      click_count += 1
      time.sleep(2)
    except (ElementNotInteractableException, NoSuchElementException):
      log_message(f"Sentiment Analysis - No more reviews for {title} to load")
      break

  # Step 5: Read User Reviews
  user_review_elements = driver.find_elements(By.CSS_SELECTOR, ".text")
  user_reviews = [review_element.text for review_element in user_review_elements]
  log_message(f"Sentiment Analysis - Found {len(user_reviews)} reviews for {title}")
  if (len(user_reviews) == 0):
    return 0

  sentiment_scores = []
  # Print the user reviews (for demonstration)
  for review in user_reviews:
      processed_review = preprocess_text(review)
      processed_review_vectorized = vectorizer.transform([processed_review])
      processed_review_vectorized = processed_review_vectorized.reshape(1, -1)
      sentiment_label = naive_bayes_classifier.predict(processed_review_vectorized)[0]
      sentiment_score = 1 if sentiment_label == 'POSITIVE' else 0
      sentiment_scores.append(sentiment_score)

  # Aggregate sentiment scores
  overall_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)
  log_message(f"Sentiment Analysis - Overall sentiment score for {title}: {overall_sentiment_score}")

  return overall_sentiment_score

### Movie Scraper

In [44]:
import re
import time
import string
import random
import json
import sys
from datetime import datetime


# Function to remove all punctuations and whitespaces from a string
def clean_str(_string):
  return re.sub(r'[' + re.escape(string.punctuation) + '\s]', '', _string).lower()

# Read last checkpoint to continue from there
checkpoint = {
    "last_page": 1,
    "last_idx": -1
}
try:
    # Open the JSON file for reading
    with open('/content/drive/MyDrive/6363-CineInsight/checkpoint.json', 'r') as file:
        # Attempt to load JSON data from the file
        try:
            last_checkpoint = json.load(file)
            # Check if the loaded JSON data is empty
            if not last_checkpoint:
                print("The JSON file is empty.")
            else:
                checkpoint = last_checkpoint
        except json.decoder.JSONDecodeError:
            print("Checkpoint not found. Using default values.")
except FileNotFoundError:
    print("Checkpoint not yet created. Using default values.")

# Navigate to the website and scrap movie metadata
# Total of 500 pages, each consists of 20 movies
for p_idx in range(checkpoint["last_page"], 501):
  log_message(f"Continue to page {p_idx}")

  # Navigate to the page
  driver.get(f"https://www.themoviedb.org/movie?page={p_idx}&language=en-US")
  movie_urls = driver.find_elements(By.XPATH, f"//div[@id='page_{p_idx}']/div/div[2]/h2/a")
  movie_urls = [url.get_attribute('href') for url in movie_urls]

  # Loop through each movie page
  if checkpoint["last_idx"] == 19:
    checkpoint["last_idx"] = -1
    continue

  for m_idx in range(checkpoint["last_idx"] + 1, len(movie_urls)):
    # Extract the movie id
    movie_id = re.search(r'/movie/(\d+)', movie_urls[m_idx]).group(1)
    movie_id = "" if not movie_id else movie_id

    driver.get(movie_urls[m_idx])
    # Title
    title = driver.find_element(By.XPATH, "//section[@id='original_header']/div[2]/section/div/h2/a").text
    title = "" if not title else title

    # Setiment analysis for this title
    sentiment_score = get_sentiment_analysis_score(title)
    driver.get(movie_urls[m_idx])

    # Scrape the movie meta data
    log_message(f"Learning data of '{title}' - id {movie_id} (page: {p_idx}, index: {m_idx})")
    # Overview
    overview = ""
    try:
      overview = driver.find_element(By.XPATH, "//section[@id='original_header']/div[2]/section/div[3]/div/p").text
    except NoSuchElementException:
      overview = driver.find_element(By.XPATH, "//section[@id='original_header']/div[2]/section/div[2]/div/p").text
    overview = '' if not overview else re.sub('[^\w\s]', '', overview).lower() # remove punctuations but keep spaces between words

    # Genres
    genres = driver.find_elements(By.XPATH, "//section[@id='original_header']/div[2]/section/div/div/span[3]/a")
    genres = '' if not genres else " ".join([clean_str(genre.text) for genre in genres])

    # Keywords
    keywords = driver.find_elements(By.XPATH, "//div[@id='media_v4']/div/div/div[2]/div/section/div/div/section[2]/ul/li/a")
    keywords = '' if not keywords else " ".join([clean_str(keyword.text) for keyword in keywords])

    # Director
    profiles = driver.find_elements(By.XPATH, "//section[@id='original_header']/div[2]/section/div[3]/ol/li")
    director = ""
    try:
      for profile in profiles:
        if "Director" in profile.find_element(By.CLASS_NAME, "character").text:
          director = clean_str(profile.find_element(By.TAG_NAME, "a").text)
          break
    except NoSuchElementException:
      director = ""
    director = "" if not director else director

    # Cast
    full_cast_url = driver.find_element(By.XPATH, "//div[@id='media_v4']/div/div/div/div/section/p/a").get_attribute('href')
    driver.get(full_cast_url)
    try:
      top_actors = driver.find_elements(By.XPATH, "//div[@id='media_v4']/div/div/section/ol/li/div/div/p/a")
    except NoSuchElementException:
      top_actors = ""
    top_actors = '' if not top_actors else " ".join([clean_str(actor.text) for actor in top_actors[:15]])

    # Form the bag of words
    bag_of_words = f"{overview} {genres} {keywords} {top_actors} {director}"

    # Create a data frame from the scraped meta data
    movie_df = pd.DataFrame({"id": [movie_id], "title": [title], "bag_of_words": [bag_of_words]})
    movie_df["processed_bag_of_words"] = movie_df["bag_of_words"].apply(preprocess_text)
    movie_df["sentiment_score"] = sentiment_score

    # Append to existing movie data csv file
    movie_df.to_csv('/content/drive/MyDrive/6363-CineInsight/movie_word_bags.csv', mode='a', header=False, index=False)

    # Save checkpoint
    checkpoint["last_page"] = p_idx
    checkpoint["last_idx"] = m_idx
    with open('/content/drive/MyDrive/6363-CineInsight/checkpoint.json', 'w') as file:
        json.dump(checkpoint, file)
    if m_idx == 19:
      checkpoint["last_idx"] = -1

    # Wait to avoid being blocked by TMDB. Randomly between 10 - 30 seconds
    wait_time = random.randint(5, 10)
    for i in range(wait_time, 0, -1):
      # Clear the output
      clear_output(wait=True)
      # Print the current countdown time with logging
      log_message(f"Movie '{title}' learned and analyzed. Learning next movie in {i} second(s)")
      # Wait for 1 second
      time.sleep(1)

driver.quit()

[04-12-2024 23:39:58] Movie 'Inri. La película de la Semana Santa de Estepa' learned and analyzed. Learning next movie in 8 second(s)


KeyboardInterrupt: 

## Recommendations after Learning New Data

### Vectorization and Similarity Computation

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

movie_data = pd.read_csv('/content/drive/MyDrive/6363-CineInsight/movie_word_bags.csv', lineterminator='\n')

# Check and drop duplicates that have sentiment score -1 (unrated)
grouped_indices = movie_data.groupby('id').apply(lambda x: x.index.tolist())

indices_to_keep = []

for indices in grouped_indices:
  if len(indices) > 1:
    for i in indices:
      if movie_data.loc[i, 'sentiment_score'] != -1:
        indices_to_keep.append(i)
        break
  else:
    indices_to_keep.append(indices[0])

final_data = movie_data.loc[indices_to_keep]
final_data = final_data.reset_index(drop=True)

final_data.to_csv('/content/drive/MyDrive/6363-CineInsight/movie_word_bags.csv', index=False)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(final_data['preprocessed_bag_of_words'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

### Make Updated Recommendations

In [42]:
def get_rated_recommendations(title, cosine_sim, df, n=10):
    # Get the index of the movie that matches the title
    idx = df.index[df['title'] == title].tolist()[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top n most similar movies (excluding itself)
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Perform sentiment analysis for any unrated movie record
    updated = False
    for i in movie_indices:
      sentiment_score = df.loc[i, 'sentiment_score']
      title = df.loc[i, 'title']
      if sentiment_score == -1:
        sentiment_score = get_sentiment_analysis_score(title)
        df.loc[i, 'sentiment_score'] = sentiment_score
        updated = True
    if updated:
      df.to_csv('/content/drive/MyDrive/6363-CineInsight/movie_word_bags.csv', mode='w', index=False)

    # Return the top n most similar movies with sentiment score
    recommendations = df[['title', 'bag_of_words', 'sentiment_score']].iloc[movie_indices]

    return recommendations.sort_values(by='sentiment_score', ascending=False)

In [46]:
get_recommendations("Batman Begins", cosine_similarities, final_data, n=10)

Unnamed: 0,title,bag_of_words
69,The Dark Knight,batman raises the stakes in his war on crime w...
3487,The Dark Knight Rises,following the death of district attorney harve...
165,Batman Returns,having defeated the joker batman now faces the...
4899,The Batman,in his second year of fighting crime batman un...
183,Batman Forever,the dark knight of gotham city confronts a das...
4357,Batman v Superman: Dawn of Justice,fearing the actions of a godlike super hero le...
124,Batman,the dark knight of gotham city begins his war ...
4012,Teenage Mutant Ninja Turtles,when a kingpin threatens new york city a group...
3126,Defendor,a crooked cop a mob boss and the young girl th...
184,Batman & Robin,along with crimefighting partner robin and new...


In [47]:
get_rated_recommendations("Batman Begins", cosine_similarities, final_data, n=10)

Unnamed: 0,title,bag_of_words,sentiment_score
124,Batman,the dark knight of gotham city begins his war ...,0.935484
165,Batman Returns,having defeated the joker batman now faces the...,0.92
4012,Teenage Mutant Ninja Turtles,when a kingpin threatens new york city a group...,0.894309
4357,Batman v Superman: Dawn of Justice,fearing the actions of a godlike super hero le...,0.88
69,The Dark Knight,batman raises the stakes in his war on crime w...,0.87
3126,Defendor,a crooked cop a mob boss and the young girl th...,0.85
3487,The Dark Knight Rises,following the death of district attorney harve...,0.808
183,Batman Forever,the dark knight of gotham city confronts a das...,0.739837
4899,The Batman,in his second year of fighting crime batman un...,0.66
184,Batman & Robin,along with crimefighting partner robin and new...,0.528
