In [None]:
# !pip install selenium

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Web Scraping from Rotten Tomatoes

In [None]:
def scrape_rotten_tomatoes(movie_names):
    """Scrape Rotten Tomatoes scores and info-panel fields for each title.

    Every title is converted to a URL slug (lower-cased, spaces replaced by
    underscores, all remaining non-word characters removed) and the page
    ``https://www.rottentomatoes.com/m/<slug>`` is loaded in headless Chrome.

    Parameters
    ----------
    movie_names : iterable of str
        Movie titles to look up.

    Returns
    -------
    dict
        Maps slug -> dict with the integer entries "critic score" and
        "audience score" plus one string entry per <dt>/<dd> pair found on
        the page. Movies whose page cannot be loaded or parsed, or whose
        audience score is 0, are left out.
    """
    # Configure Chrome options
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument("--headless")  # Run in background

    # Initialize the driver
    driver = webdriver.Chrome(options=options)

    full_results = {}
    try:
        for movie in movie_names:
            movie = movie.lower().replace(" ", "_")
            movie = re.sub(r'[^\w_]', '', movie)
            url = f"https://www.rottentomatoes.com/m/{movie}"
            results = {}
            try:
                driver.get(url)

                # Scores are rendered in <rt-text> elements identified by their slot.
                score_element = driver.find_element(By.CSS_SELECTOR, 'rt-text[slot="criticsScore"]')
                audience_element = driver.find_element(By.CSS_SELECTOR, 'rt-text[slot="audienceScore"]')

                # int() raises on an empty/non-numeric score, which skips the movie below.
                results["critic score"] = int(score_element.text.strip().replace("%", ""))
                results["audience score"] = int(audience_element.text.strip().replace("%", ""))

                # Pair every <dt> (field name) with its <dd> (field value).
                keys = driver.find_elements(By.CSS_SELECTOR, "dl div dt")
                values = driver.find_elements(By.CSS_SELECTOR, "dl div dd")

                for key, value in zip(keys, values):
                    key_text = key.text.strip()
                    # A value may consist of several rt-text / rt-link items.
                    value_items = value.find_elements(
                        By.CSS_SELECTOR,
                        'rt-text[data-qa="item-value"], rt-link[data-qa="item-value"]',
                    )
                    item_texts = (item.text.strip() for item in value_items)
                    results[key_text] = ", ".join(text for text in item_texts if text)

                # Keep the movie only when its audience score is non-zero.
                # NOTE(review): the original comment mentioned Box Office, but the
                # check has always been on the audience score -- confirm the intent.
                if results["audience score"]:
                    full_results[movie] = results

            except Exception as e:
                # Best-effort scrape: report the failure and move on to the next title.
                print(f"Skipping {movie!r}: {type(e).__name__}")
    finally:
        # Release the browser even if the loop is interrupted.
        driver.quit()
    return full_results

In [None]:
# See another notebook.

# 2. Sentiment Analysis on Movie Reviews

In [None]:
# Subset used for the text-mining analysis: movies whose US gross is at or
# above the 90th percentile.
box_office = movies_cleaned["Box Office (Gross USA)"]
threshold = box_office.quantile(0.9)

# The index is written too; readers of this file see it as "Unnamed: 0".
top_10_percent_movies = movies_cleaned.loc[box_office >= threshold]
top_10_percent_movies.to_csv("top_10_percent_movies.csv")

In [None]:
import pandas as pd

# Reload the exported subset and discard the saved index column.
top_10 = pd.read_csv("top_10_percent_movies.csv").drop(columns=["Unnamed: 0"])

In [None]:
# Titles of the top-decile movies; the scraping loops below build review-page URLs from them.
top_10_movies = top_10["movie_name"].tolist()
top_10_movies

### Web Scraping for Critics and Audience Reviews

In [None]:
def scrape_critics_reviews(url, max_clicks=1):
    """Return the critic review texts shown on a Rotten Tomatoes reviews page.

    Parameters
    ----------
    url : str
        Address of the reviews page to load.
    max_clicks : int, default 1
        Maximum number of times the "Load More" button is clicked.

    Returns
    -------
    list of str
        Text of each ``<p class="review-text">`` found inside a
        ``review-text-container`` div. Containers without such a paragraph
        are skipped.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Initialize driver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Click "Load More" repeatedly
        clicks = 0
        while clicks < max_clicks:
            try:
                rt_button = driver.find_element(By.CSS_SELECTOR, 'rt-button[data-loadmoremanager="btnLoadMore:click"]')
                # Click through JavaScript instead of a native WebDriver click.
                driver.execute_script("arguments[0].click();", rt_button)
                time.sleep(3)  # Wait for new items to load
                clicks += 1
                print(f"Clicked 'Load More' {clicks} times...")
            except (NoSuchElementException, ElementClickInterceptedException):
                print("No more 'Load More' button or couldn't click it.")
                break

        # After loading all content
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Close the browser even when loading or clicking fails.
        driver.quit()

    critics_reviews = []
    for container in soup.find_all('div', class_='review-text-container'):
        paragraph = container.find('p', class_='review-text')
        if paragraph is None:
            # One malformed container should not discard the whole page.
            continue
        critics_reviews.append(paragraph.get_text(strip=True))
    return critics_reviews

In [None]:
def scrape_audience_reviews(url, max_clicks=1):
    """Return the audience review texts shown on a Rotten Tomatoes reviews page.

    Parameters
    ----------
    url : str
        Address of the audience reviews page to load.
    max_clicks : int, default 1
        Maximum number of times the "Load More" button is clicked.

    Returns
    -------
    list of str
        Text of the first ``<p>`` whose class matches ``audience-reviews``
        inside each ``review-text-container`` div. Containers without such a
        paragraph are skipped.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Initialize driver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Click "Load More" repeatedly
        clicks = 0
        while clicks < max_clicks:
            try:
                rt_button = driver.find_element(By.CSS_SELECTOR, 'rt-button[data-loadmoremanager="btnLoadMore:click"]')
                # Click through JavaScript instead of a native WebDriver click.
                driver.execute_script("arguments[0].click();", rt_button)
                time.sleep(3)  # Wait for new items to load
                clicks += 1
                print(f"Clicked 'Load More' {clicks} times...")
            except (NoSuchElementException, ElementClickInterceptedException):
                print("No more 'Load More' button or couldn't click it.")
                break

        # After loading all content
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Close the browser even when loading or clicking fails.
        driver.quit()

    audience_reviews = []
    paragraph_class = re.compile(r"audience-reviews")
    for container in soup.find_all('div', class_='review-text-container'):
        paragraph = container.find('p', {"class": paragraph_class})
        if paragraph is None:
            # One malformed container should not discard the whole page.
            continue
        audience_reviews.append(paragraph.get_text(strip=True))
    return audience_reviews

In [None]:
# Critic reviews per movie title; titles whose page cannot be scraped are left out.
critics_reviews = {}

for title in top_10_movies:
    # Same slug rule as the score scraper: lower-case, underscores, word characters only.
    slug = re.sub(r"[^\w_]", "", title.lower().replace(" ", "_"))
    url = f"https://www.rottentomatoes.com/m/{slug}/reviews"

    try:
        reviews = scrape_critics_reviews(url, max_clicks=1)
    except Exception:
        print(f"Skipping {title!r}: could not scrape {url}")
        continue

    critics_reviews[title] = reviews
    print(f"Fetched {len(reviews)} reviews for {title!r}")

In [None]:
# Audience reviews per movie title; titles whose page cannot be scraped are left out.
audience_reviews = {}

for title in top_10_movies:
    # Same slug rule as the score scraper: lower-case, underscores, word characters only.
    slug = re.sub(r"[^\w_]", "", title.lower().replace(" ", "_"))
    url = f"https://www.rottentomatoes.com/m/{slug}/reviews?type=user"

    try:
        reviews = scrape_audience_reviews(url, max_clicks=1)
    except Exception:
        print(f"Skipping {title!r}: could not scrape {url}")
        continue

    audience_reviews[title] = reviews
    print(f"Fetched {len(reviews)} reviews for {title!r}")

In [None]:
# Keep only the movies for which both review types were scraped.
common = set(critics_reviews).intersection(audience_reviews)

critics_reviews = {movie: critics_reviews[movie] for movie in common}
audience_reviews = {movie: audience_reviews[movie] for movie in common}

### Sentiment Analysis

In [None]:
# Build {word: [emotion, ...]} from the NRC word-level emotion lexicon.
nrc = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
count = 46  # number of leading lines skipped before the tab-separated data starts
emotion_dict = dict()
with open(nrc, 'r') as f:
    for line_number, line in enumerate(f):
        if line_number < count:
            continue
        fields = line.strip().split('\t')
        if len(fields) < 3:
            # Blank or truncated line (e.g. a trailing newline at end of file).
            continue
        word, emotion, flag = fields[0], fields[1], fields[2]
        # Only rows flagged 1 associate the word with the emotion.
        if int(flag) == 1:
            emotion_dict.setdefault(word, []).append(emotion)

In [None]:
def emotion_analyzer(text, emotion_dict=emotion_dict):
    """Score a text against an emotion lexicon.

    Parameters
    ----------
    text : str
        Text to analyze; it is split on whitespace and each token is looked
        up exactly as written (no lower-casing or punctuation stripping).
    emotion_dict : dict, optional
        Maps word -> list of emotion names. Defaults to the module-level
        lexicon as it was when this function was defined.

    Returns
    -------
    dict
        Maps every emotion that occurs in the lexicon to the number of
        matching tokens divided by the total number of tokens in ``text``
        (0 for emotions that never match, and for empty text).
    """
    emotion_count = {
        emotion: 0
        for emotions in emotion_dict.values()
        for emotion in emotions
    }

    # Tokenise once; the original re-split the text on every lexicon hit.
    words = text.split()
    total_words = len(words)
    for word in words:
        for emotion in emotion_dict.get(word) or ():
            emotion_count[emotion] += 1 / total_words
    return emotion_count

In [None]:
def convert_to_string(reviews_dict): # critics_reviews/audience_reviews
    """Join each movie's reviews into one string and score it with emotion_analyzer.

    Takes a mapping of title -> list of review strings and returns a mapping
    of title -> the emotion scores of the space-joined reviews.
    """
    return {
        title: emotion_analyzer(" ".join(reviews))
        for title, reviews in reviews_dict.items()
    }

In [None]:
# Score the joined review text of every movie: {title: {emotion: score}}.
critics_emotions = convert_to_string(critics_reviews)
audience_emotions = convert_to_string(audience_reviews)

In [None]:
def convert_to_df(emotions_dict): # critics_emotions/ audience_emotions
    """Turn a {movie: {emotion: score}} mapping into a DataFrame indexed by movie.

    Parameters
    ----------
    emotions_dict : dict
        Maps movie title -> dict of emotion scores. A value that is not a
        dict contributes a row with no emotion values.

    Returns
    -------
    pandas.DataFrame
        One row per movie and one column per emotion, with the index named
        "movie". An empty input yields an empty frame with that index name
        instead of raising.
    """
    rows = []
    for movie, emotions in emotions_dict.items():
        row = {'movie': movie}
        if isinstance(emotions, dict):
            row.update(emotions)
        rows.append(row)

    frame = pd.DataFrame(rows)
    if 'movie' not in frame.columns:
        # No rows means no "movie" column, so set_index would raise KeyError.
        return pd.DataFrame(index=pd.Index([], name='movie'))
    return frame.set_index('movie')

In [None]:
# Rebinds both names from dicts to DataFrames indexed by movie. This cell
# expects the dict form, so re-run the cell that builds the dicts before re-running it.
critics_emotions = convert_to_df(critics_emotions)
audience_emotions = convert_to_df(audience_emotions)

In [None]:
# Keep only the polarity columns and prefix them with their source.
critics_emotions = (
    critics_emotions[['positive', 'negative']]
    .rename(columns={"positive": "critics_positive", "negative": "critics_negative"})
)
critics_emotions

In [None]:
# Keep only the polarity columns and prefix them with their source.
audience_emotions = (
    audience_emotions[['positive', 'negative']]
    .rename(columns={"positive": "audience_positive", "negative": "audience_negative"})
)
audience_emotions

In [None]:
pct_cols = ['critics_positive','critics_negative','audience_positive','audience_negative']

# Join critics and audience scores on the shared "movie" index
# (df_merged is a second name for the same frame), then convert the
# fractions to percentages.
combined_emotions = df_merged = critics_emotions.merge(audience_emotions, on="movie")
combined_emotions[pct_cols] = combined_emotions[pct_cols].mul(100)
combined_emotions

In [None]:
# Reload the saved sentiment table and add critics-minus-audience gaps.
# NOTE(review): no cell shown in this notebook writes combined_emotions.csv --
# confirm where the file comes from.
combined_emotions = (
    pd.read_csv("combined_emotions.csv")
    .set_index('movie')
    .assign(
        pos_diff=lambda df: df["critics_positive"] - df["audience_positive"],
        neg_diff=lambda df: df["critics_negative"] - df["audience_negative"],
    )
)
combined_emotions

In [None]:
# Attach the sentiment scores to the cleaned movie table, drop the columns
# not used in the sentiment analysis, and keep the first row per movie name.
unused_columns = ["Critic Score", 'Audience Score', 'Original Language', 'Runtime', 'Box Office (Gross USA)', 'Famous Director', 'Release Year', 'Release Season']
sentiment = (
    pd.merge(movies_cleaned, combined_emotions, left_on="Movie Name", right_on="movie", how="inner")
    .set_index('Movie Name')
    .drop(columns=unused_columns)
    .loc[lambda df: ~df.index.duplicated(keep='first')]
)
sentiment

In [None]:
# Mean sentiment percentages per rating category.
rating_sentiment_cols = ["audience_positive", "audience_negative",
                         "critics_positive", "critics_negative"]
sentiment.groupby("Rating")[rating_sentiment_cols].mean()

In [None]:
# Split the comma-separated Genre string into a list, then explode it so
# every (movie, genre) pair gets its own row.
genre_lists = sentiment["Genre"].str.split(", ")
df_genre = sentiment.assign(Genre=genre_lists).explode("Genre")
df_genre.head()

### Visualizations

In [None]:
# Critics vs audience sentiment, one panel per polarity.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column suffix, label text, marker colour); None keeps matplotlib's default colour.
panels = [
    ("positive", "Positive", None),
    ("negative", "Negative", "r"),
]

for ax, (suffix, label, color) in zip(axes, panels):
    critics = combined_emotions[f"critics_{suffix}"]
    audience = combined_emotions[f"audience_{suffix}"]

    ax.scatter(critics, audience, alpha=0.7, color=color)

    # Parity line y = x. Both ends use the same limit; the previous version
    # ran to (max critics, max audience), which is only the diagonal when the
    # two maxima happen to be equal.
    limit = max(critics.max(), audience.max())
    ax.plot([0, limit], [0, limit], 'k--', linewidth=1)

    ax.set_xlabel(f'Critics {label} (%)')
    ax.set_ylabel(f'Audience {label} (%)')
    ax.set_title(f'Critics vs Audience {label} Sentiment')
    ax.grid(True)

# Adjust layout and display
plt.tight_layout()
plt.show()

In [None]:
# Sentiment by genre: one horizontal bar chart per sentiment column.
features = [
    "audience_positive",
    "audience_negative",
    "critics_positive",
    "critics_negative"
]
titles = [
    "Mean Audience Positive Sentiment by Genre",
    "Mean Audience Negative Sentiment by Genre",
    "Mean Critics Positive Sentiment by Genre",
    "Mean Critics Negative Sentiment by Genre"
]

# Compute all per-genre means in one pass, then sort each column for its panel.
genre_means = df_genre.groupby("Genre")[features].mean()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, feature, title in zip(axes.ravel(), features, titles):
    means = genre_means[feature].sort_values()
    ax.barh(means.index, means.values)
    ax.set(title=title, xlabel="Average %", ylabel="Genre")

fig.tight_layout()
plt.show()

In [None]:
# 1) distance of every movie from the median point
critics_pos = combined_emotions['critics_positive']
audience_pos = combined_emotions['audience_positive']
xc = critics_pos.median()
yc = audience_pos.median()

dx = critics_pos - xc
dy = audience_pos - yc
dist = np.sqrt(dx*dx + dy*dy)

# 2) only movies at or beyond the 80th-percentile distance get a text label
cutoff = dist.quantile(0.80)


def _quadrant_color(xval, yval):
    """Label colour for the quadrant (relative to the medians) a point falls in."""
    if xval >= xc:
        if yval >= yc:
            return 'blue'
        if yval < yc:
            return 'red'
    elif xval < xc and yval >= yc:
        return 'green'
    return 'purple'


fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(critics_pos, audience_pos, alpha=0.5)
ax.axvline(xc, color='gray', linestyle='--')
ax.axhline(yc, color='gray', linestyle='--')

for movie, row in combined_emotions.iterrows():
    if not dist[movie] < cutoff:
        xval = row['critics_positive']
        yval = row['audience_positive']
        ax.text(xval, yval, movie, fontsize=8, color=_quadrant_color(xval, yval),
                ha='center', va='center')

ax.set_xlabel("Critics Positive (%)")
ax.set_ylabel("Audience Positive (%)")
ax.set_title("Quadrant Plot (only outer 20% labeled)")
plt.tight_layout()
plt.show()

# 3. Recommendation LSI Models

### Web Scraping for LSI

In [None]:
!pip install imdbpy

In [None]:
top_10_percent = pd.read_csv("top_10_percent_movies.csv")

# Keep only the leading "Month D, YYYY" part of the release date, dropping
# any trailing text such as a release-type label.
release_date_pattern = r"^([A-Za-z]+\s\d{1,2},\s\d{4})"
top_10_percent["Release Date (Theaters)"] = (
    top_10_percent["Release Date (Theaters)"].str.extract(release_date_pattern)
)

# Parse the cleaned date and keep its year.
release_dates = pd.to_datetime(top_10_percent["Release Date (Theaters)"])
top_10_percent["Release Year"] = release_dates.dt.year

# [title, year] pairs for the IMDb lookup.
top_10_list = top_10_percent[["movie_name", "Release Year"]].values.tolist()

In [None]:
# Inspect the [movie_name, Release Year] pairs that are passed to the IMDb lookup.
top_10_list

In [None]:
import imdb
import time
import random
from requests.exceptions import HTTPError, ConnectionError, Timeout, ReadTimeout, RequestException

def _coerce_year(value):
    """Return ``value`` as an int year, or None when it is missing or not numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None


def get_movie_details(movie_titles, max_retries=5, initial_backoff=2):
    """Look up the first IMDb plot for each movie.

    Parameters
    ----------
    movie_titles : iterable of sequences
        Each item is ``[title]`` or ``[title, year]``. The year may be an
        int, a float or a numeric string; anything else (for example
        "Unknown" or NaN) is treated as "no year given".
    max_retries : int, default 5
        Attempts per movie before giving up.
    initial_backoff : float, default 2
        Base delay in seconds; the delay doubles with each failed attempt and
        is scaled by a random factor between 0.5 and 1.5.

    Returns
    -------
    dict
        Maps the IMDb title -> first plot entry. Movies with no search
        result, no plot, or that keep failing are omitted. Two movies that
        resolve to the same IMDb title share one key (the later one wins).
    """
    results = dict()
    for movie in movie_titles:
        # Create a new IMDb instance for each movie to avoid session issues
        ia = imdb.IMDb()
        print(f"Searching for: {movie[0]}")

        # Normalise once so "1995", 1995 and 1995.0 all match IMDb's year.
        wanted_year = _coerce_year(movie[1]) if len(movie) > 1 else None

        # Retry with exponential backoff
        for attempt in range(max_retries):
            try:
                search_results = ia.search_movie(movie[0])

                # Prefer the first search hit whose year matches.
                found_movie = None
                if wanted_year:
                    for result in search_results:
                        if 'year' in result and _coerce_year(result['year']) == wanted_year:
                            found_movie = result
                            break

                # If no year match found, use the first result
                if not found_movie and search_results:
                    found_movie = search_results[0]

                if not found_movie:
                    print(f"No results found for {movie[0]}")
                    break

                # Add a small delay before fetching details
                time.sleep(1)

                movie_data = ia.get_movie(found_movie.movieID)
                title = movie_data.get('title')

                if 'plot' in movie_data and movie_data['plot']:
                    results[title] = movie_data['plot'][0]
                    print(f"Successfully retrieved data for: {title}")
                else:
                    print(f"No plot found for {title}")

                # Break out of retry loop
                break

            except Exception as e:
                # Any failure (network or parsing) is retried.
                if attempt < max_retries - 1:
                    # Exponential increase with jitter
                    backoff_time = initial_backoff * (2 ** attempt) * (0.5 + random.random())
                    print(f"Error accessing IMDb for {movie[0]}: {str(e)[:100]}... Retrying in {backoff_time:.2f} seconds...")
                    time.sleep(backoff_time)
                else:
                    print(f"Failed to get data for {movie[0]} after {max_retries} attempts: {str(e)[:100]}...")

        # Add a delay between movies to avoid rate limiting
        time.sleep(2)

    return results

In [None]:
# Fetch one IMDb plot per top-decile movie. Slow: network calls plus deliberate sleeps per movie.
comparison_docs = get_movie_details(top_10_list)

In [None]:
# Cache the scraped plots on disk so the web scraping does not have to be repeated.
comparison_docs_df = pd.DataFrame.from_dict(comparison_docs, orient='index')

# Move the titles out of the index and give both columns readable names.
comparison_movies = (
    comparison_docs_df
    .reset_index()
    .rename(columns={"index": "Movie Name", 0: "Summary"})
)

# The default integer index carries no information, so it is not written.
comparison_movies.to_csv("comparison_movie_sums.csv", index=False)

## Getting Comparison Movies for LSI

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException
from bs4 import BeautifulSoup
import time

def movie_web_scrapping_lsi(start_url, max_clicks=10):
    """Collect titles and years from an IMDb search-results page.

    Parameters
    ----------
    start_url : str
        IMDb search URL to open.
    max_clicks : int, default 10
        Maximum number of times the "see more" button is clicked to load
        further results.

    Returns
    -------
    dict
        Maps title -> year text ("Unknown" when no year element is found).
        Entries are keyed by title, so movies sharing a title collapse into
        one entry (the later one wins).
    """
    # Setup Chrome options
    options = Options()
    options.add_argument("--headless")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Initialize driver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(start_url)

        # Click "Load More" repeatedly
        clicks = 0
        while clicks < max_clicks:
            try:
                load_more_button = driver.find_element(By.CSS_SELECTOR, "button.ipc-see-more__button")
                driver.execute_script("arguments[0].click();", load_more_button)
                time.sleep(1)  # Wait for new items to load
                clicks += 1
                print(f"Clicked 'Load More' {clicks} times...")
            except (NoSuchElementException, ElementClickInterceptedException):
                print("No more 'Load More' button or couldn't click it.")
                break

        # After loading all content
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Close the browser even when loading or clicking fails.
        driver.quit()

    # title -> year mapping
    movie_data = {}

    # Find all movie containers
    movie_containers = soup.find_all("div", {"class": "ipc-metadata-list-summary-item__c"})

    for container in movie_containers:
        title_element = container.find("h3", {"class": "ipc-title__text"})
        if not title_element:
            continue

        # Headings look like "<rank>. <title>". Split on the FIRST period only
        # so titles that contain periods themselves are kept whole.
        _rank, separator, name = title_element.get_text().partition(".")
        if not separator:
            # No rank prefix: skip the entry rather than guess at its format.
            continue
        title = name.strip()

        # Try the exact class string first, then fall back to the plain
        # metadata-item class in case the generated prefix has changed.
        # NOTE(review): the fallback assumes the first metadata item is the
        # year -- confirm against the live page.
        year_element = (
            container.find("span", {"class": "sc-5179a348-7 idrYgr dli-title-metadata-item"})
            or container.find("span", class_="dli-title-metadata-item")
        )
        year = year_element.get_text() if year_element else "Unknown"

        movie_data[title] = year

    return movie_data


In [None]:
# movies from 1970 - 1990
# Per the URL query: English-language feature films released 1970-01-01 to 1990-01-01.
# Up to 315 "load more" clicks, each followed by a one-second pause.
reference_movies_lsi = movie_web_scrapping_lsi("https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01,1990-01-01&languages=en", max_clicks = 315)

In [None]:
# Cache the scraped title/year pairs on disk so the web scraping does not have to be repeated.
# Titles move out of the index into "Movie Name"; the single value column becomes "Year".
reference_movies = (
    pd.DataFrame.from_dict(reference_movies_lsi, orient='index')
    .reset_index()
    .rename(columns={"index": "Movie Name", 0: "Year"})
)

# The default integer index carries no information, so it is not written.
reference_movies.to_csv("reference_movies.csv", index=False)

In [None]:
# Reload the scraped reference list as [Movie Name, Year] rows for get_movie_details.
# NOTE(review): Year was scraped as text and may contain "Unknown" -- confirm its dtype after the CSV round trip.
lsi_reference_movies = pd.read_csv("reference_movies.csv")
lsi_reference_movies_list = lsi_reference_movies.values.tolist()

In [None]:
# Run it in batches
# This run covers rows 8000 onward and is saved below as batch 11. The earlier
# batches are not shown here -- presumably made by editing the slice and re-running.
reference_movie_sums_11 = get_movie_details(lsi_reference_movies_list[8000:])

In [None]:
# Cache this batch of plots on disk so the web scraping does not have to be repeated.
# Titles move out of the index into "Movie Name"; the plot column becomes "Summary".
reference_movie_sums_csv = (
    pd.DataFrame.from_dict(reference_movie_sums_11, orient='index')
    .reset_index()
    .rename(columns={"index": "Movie Name", 0: "Summary"})
)

# The default integer index carries no information, so it is not written.
reference_movie_sums_csv.to_csv("reference_movies_sum_11.csv", index=False)

In [None]:
# Stitch the 11 per-batch CSVs back into one table and save it.
filenames = ["reference_movies_sum_{}.csv".format(i) for i in range(1, 12)]
reference_movie_sums_df = list(map(pd.read_csv, filenames))
reference_movie_sums_final = pd.concat(reference_movie_sums_df, ignore_index=True)
# The index is written too, so the file gains an unnamed leading column.
reference_movie_sums_final.to_csv("reference_movie_sums.csv")

## Build LSI

In [None]:
import nltk

# NLTK resources to fetch, one per download call.
common_corpora = [
    'gutenberg',
    'genesis',
    'inaugural',
    'nps_chat',
    'webtext',
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'reuters',
    'movie_reviews',
    'treebank',
    'tagsets',
]

for resource_id in common_corpora:
    nltk.download(resource_id)


In [None]:
!pip install numpy==1.24.3
!pip install -U gensim
# !pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize,word_tokenize
from nltk.book import *
# import gensim.summarization
from gensim import corpora, models, similarities


### Reference Docs

In [None]:
import pandas as pd

In [None]:
# Reload the merged plot summaries. This rebinds `reference_movies`, which earlier held the title/year table.
reference_movies = pd.read_csv("reference_movie_sums.csv")

In [None]:
# Parallel lists: position i in each list refers to the same reference movie.
reference_movies_names = reference_movies["Movie Name"].tolist()
reference_movies_sums = reference_movies["Summary"].tolist()

In [None]:
# reference_movies.to_csv("Reference Movies.csv")

In [None]:
# Build the LSI model over the reference plot summaries.
def _summary_tokens(document):
    """Lower-case, whitespace-split, and keep purely alphanumeric non-stopword tokens."""
    return [
        word
        for word in document.lower().split()
        if word.isalnum() and word not in STOPWORDS
    ]


documents = list(reference_movies_sums)
texts = [_summary_tokens(document) for document in documents]

# Token <-> id mapping, then a bag-of-words vector for every summary.
dictionary_summary = corpora.Dictionary(texts)
corpus_summary = list(map(dictionary_summary.doc2bow, texts))

lsi_ref_summary = models.LsiModel(corpus_summary, id2word=dictionary_summary, num_topics=150)

In [None]:
# Reload the cached comparison plots as two parallel lists (names and summaries).
comparison_movies = pd.read_csv("comparison_movie_sums.csv")
comparison_movies_names = comparison_movies["Movie Name"].tolist()
comparison_movies_sums = comparison_movies["Summary"].tolist()

In [None]:
from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities
import warnings

In [None]:
# For every comparison movie, find the single most similar reference summary in LSI space.
index = similarities.MatrixSimilarity(lsi_ref_summary[corpus_summary])
warnings.filterwarnings("ignore")  # silences all warnings for the rest of the session

table_data = []
for i, movie_summary in enumerate(comparison_movies_sums):
    # Same token filter as the one used to build the reference corpus.
    words = [
        word
        for word in movie_summary.lower().split()
        if word not in STOPWORDS and word.isalnum()
    ]
    vec_lsi = lsi_ref_summary[dictionary_summary.doc2bow(words)]

    # Rank reference documents by descending similarity; ties keep corpus order.
    sims = sorted(enumerate(index[vec_lsi]), key=lambda item: item[1], reverse=True)
    best_idx, best_score = sims[0]
    most_similar = sims[0]

    # Store the result
    table_data.append({
        "Movie": comparison_movies_names[i],
        "Most Similar Movie": reference_movies_names[best_idx],
        "Similarity Score": best_score
    })

similarity_df = pd.DataFrame(table_data)

In [None]:
# Same search as the previous cell, but a reference movie with the same name
# as the comparison movie is never reported as its own best match.
index = similarities.MatrixSimilarity(lsi_ref_summary[corpus_summary])
warnings.filterwarnings("ignore")  # silences all warnings for the rest of the session

table_data = []
for i, movie_summary in enumerate(comparison_movies_sums):
    words = [
        word
        for word in movie_summary.lower().split()
        if word not in STOPWORDS and word.isalnum()
    ]
    vec_bow = dictionary_summary.doc2bow(words)
    vec_lsi = lsi_ref_summary[vec_bow]
    sims = index[vec_lsi]
    sim_list = list(enumerate(sims))
    sorted_sims = sorted(sim_list, key=lambda item: item[1], reverse=True)

    # First ranked reference whose name differs from the comparison movie's.
    most_similar = next(
        (
            (ref_idx, similarity)
            for ref_idx, similarity in sorted_sims
            if comparison_movies_names[i] != reference_movies_names[ref_idx]
        ),
        None,
    )

    if most_similar is None:
        print(f"No unique match found for: {comparison_movies_names[i]}")
        continue

    table_data.append({
        "Movie": comparison_movies_names[i],
        "Most Similar Movie": reference_movies_names[most_similar[0]],
        "Similarity Score": most_similar[1]
    })

similarity_df = pd.DataFrame(table_data)

In [None]:
# Persist the best-match table; the DataFrame index is written as the first column.
similarity_df.to_csv("lsi_output.csv")