# Step 1: Load Data

In [1]:
import pandas as pd

# Read the prepared movie-plot subset (path is relative to the notebook's working directory)
file_path = "data/movie_plots_subset.csv"
df = pd.read_csv(file_path)

# Preview the first five rows as the cell's rich output
df.head(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1951,The Day the Earth Stood Still,American,Robert Wise,"Michael Rennie, Patricia Neal",science fiction,https://en.wikipedia.org/wiki/The_Day_the_Eart...,"When a flying saucer lands in Washington, D.C...."
1,1981,The Burning,American,Tony Maylam,"Brian Matthews, Holly Hunter, Jason Alexander",horror,https://en.wikipedia.org/wiki/The_Burning_(film),"One night at Camp Blackfoot, several campers p..."
2,2012,Nobel Chor,Bengali,Suman Ghosh,"Mithun Chakraborty, Saswata Chatterjee, Sudipt...",suspense / drama,https://en.wikipedia.org/wiki/Nobel_Chor,"The first Asian Nobel Laureate, Rabindranath T..."
3,1952,Trent's Last Case,British,Herbert Wilcox,"Michael Wilding, Margaret Lockwood, Orson Welles",detective,https://en.wikipedia.org/wiki/Trent%27s_Last_C...,A major international financier is found dead ...
4,1977,Aafat,Bollywood,Atma Ram,"Navin Nischol, Leena Chandavarkar, Amjad Khan,...",unknown,https://en.wikipedia.org/wiki/Aafat,Inspector Amar and Inspector Chhaya are after ...


# Step 2: Data Cleaning

In [13]:
# Report the dataset dimensions as (rows, columns)
print(f"Dataset Shape: {df.shape}")

# Print column names, dtypes and non-null counts (df.info() writes to stdout itself)
df.info()

# Count the missing values in each column
print("\nMissing Values:\n", df.isnull().sum())

# Replace the placeholder genre "unknown" with "Miscellaneous".
# Series.replace with a scalar only touches cells that are exactly "unknown".
df["Genre"] = df["Genre"].replace("unknown", "Miscellaneous")
# The original message started with a stray, unbalanced quote character; removed here.
print("\nReplaced 'unknown' in Genre with 'Miscellaneous'.")

Dataset Shape: (500, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      500 non-null    int64 
 1   Title             500 non-null    object
 2   Origin/Ethnicity  500 non-null    object
 3   Director          500 non-null    object
 4   Cast              481 non-null    object
 5   Genre             500 non-null    object
 6   Wiki Page         500 non-null    object
 7   Plot              500 non-null    object
 8   combined_text     500 non-null    object
 9   cleaned_text      500 non-null    object
dtypes: int64(1), object(9)
memory usage: 39.2+ KB

Missing Values:
 Release Year         0
Title                0
Origin/Ethnicity     0
Director             0
Cast                19
Genre                0
Wiki Page            0
Plot                 0
combined_text        0
cleaned_text         0
dtype: int64

'Repl

# Step 2b: Feature Engineering

In [5]:
# Build the text that will be embedded for each movie: the genre, followed by
# two spaces, followed by the plot summary.
df["combined_text"] = df["Genre"] + "  " + df["Plot"]

In [14]:
import re

def clean_text(text):
    """
    Normalise a piece of text for downstream processing.

    The text is:
    - converted to lowercase
    - stripped of every character that is neither a word character
      (letters, digits, underscore) nor whitespace

    Whitespace is left untouched, so removing a punctuation mark that sat
    between two spaces leaves both spaces in place.

    Parameters:
    - text: The text to clean. Missing values (None or float NaN, as pandas
      uses for empty cells) are treated as empty text; any other non-string
      value is converted with str() first.

    Returns:
    - str: The cleaned text ("" for missing input).
    """

    # Missing cell: None, or NaN (the only float that is not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Tolerate non-string values (e.g. numbers) instead of failing on .lower()
    if not isinstance(text, str):
        text = str(text)

    # To lower case
    text = text.lower()

    # Remove punctuation: drop anything that is not a word character or whitespace
    text = re.sub(r'[^\w\s]', '', text)

    return text

In [15]:
# Run clean_text over every combined description and store the result in a new column
df["cleaned_text"] = df["combined_text"].apply(clean_text)

# Print the first five titles next to their cleaned text
print(df.head()[["Title", "cleaned_text"]])

                           Title  \
0  The Day the Earth Stood Still   
1                    The Burning   
2                     Nobel Chor   
3              Trent's Last Case   
4                          Aafat   

                                        cleaned_text  
0  science fiction  when a flying saucer lands in...  
1  horror  one night at camp blackfoot several ca...  
2  suspense  drama  the first asian nobel laureat...  
3  detective  a major international financier is ...  
4  miscellaneous  inspector amar and inspector ch...  


# Step 3: Load Pretrained BERT Model

In [16]:
from sentence_transformers import SentenceTransformer
import torch

# Load the pretrained sentence-transformer checkpoint named 'all-mpnet-base-v2'
print("Loading BERT model... (This may take a few seconds)")
model = SentenceTransformer("all-mpnet-base-v2")

# Encode every cleaned description into a single embedding tensor and move the
# result to the CPU right away, so later cells can work with it there
print("Generating BERT embeddings... This may take ~30 sec.")
movie_embeddings = model.encode(
    df["cleaned_text"].tolist(),
    convert_to_tensor=True,
).to("cpu")

# Write the tensor to disk so it can be reloaded without re-encoding
torch.save(movie_embeddings, "bert_movie_embeddings.pt")

print("Embedding generation complete. Saved as 'bert_movie_embeddings.pt'.")

Loading BERT model... (This may take a few seconds)
Generating BERT embeddings... This may take ~30 sec.
Embedding generation complete. Saved as 'bert_movie_embeddings.pt'.


# Step 4: Define the Recommendation System

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def movie_recommendation_system(user_input, df, model, movie_embeddings, top_n=5, min_threshold=50):
    """
    Recommend the movies whose embedded text is most similar to a free-text query.

    The query is encoded with ``model``, compared against every row of
    ``movie_embeddings`` by cosine similarity, and the best ``top_n`` matches
    that reach ``min_threshold`` are returned, best first.

    Parameters:
    - user_input (str): The user's movie description.
    - df (DataFrame): The movie dataset. It must have a "Title" column, and its
      i-th row must describe the same movie as the i-th embedding.
    - model (SentenceTransformer): Model used to encode ``user_input``.
    - movie_embeddings (Tensor): Precomputed embeddings, one row per movie.
    - top_n (int): Maximum number of recommendations to return. Zero or a
      negative value yields no recommendations.
    - min_threshold (float): Minimum score a movie needs in order to be
      returned. Scores are cosine similarities multiplied by 100, so they can
      be negative for dissimilar texts.

    Returns:
    - List of (movie title, score) tuples sorted by descending score, with the
      score rounded to one decimal place. The list is empty when no movie
      reaches ``min_threshold``. For missing or blank input a single
      placeholder tuple with score 0.0 is returned instead.

    Raises:
    - ValueError: If ``df`` and ``movie_embeddings`` have a different number of
      rows (e.g. embeddings reloaded from a file built for another dataset).
    """

    # Edge case: missing, non-string or whitespace-only input
    if not isinstance(user_input, str) or not user_input.strip():
        return [("!!! No input provided. Please describe your movie preferences. !!!", 0.0)]

    # Edge case: nothing requested. Without this guard the slice below would be
    # [-0:], which selects every movie instead of none.
    if top_n <= 0:
        return []

    # Titles are looked up by position, so the two inputs must line up row by row
    if len(df) != movie_embeddings.shape[0]:
        raise ValueError(
            f"df has {len(df)} rows but movie_embeddings has "
            f"{movie_embeddings.shape[0]}; regenerate the embeddings for this dataset."
        )

    # Convert user input into an embedding (a single-row array)
    user_embedding = model.encode([user_input], convert_to_tensor=True).to("cpu").numpy()

    # Compute cosine similarity with the precomputed movie embeddings
    movie_embeddings_np = movie_embeddings.cpu().numpy()  # Ensure tensor is on CPU
    similarities = cosine_similarity(user_embedding, movie_embeddings_np).flatten() * 100

    # argsort is ascending: take the last top_n positions and reverse them
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    # Keep only candidates that reach the threshold; float() turns the numpy
    # scalar into a plain Python float before rounding
    titles = df["Title"]
    filtered_results = [(titles.iloc[idx], round(float(similarities[idx]), 1))
                        for idx in top_indices if similarities[idx] >= min_threshold]

    return filtered_results

# Step 5: Test the Recommendation System

In [21]:
def test_recommendation_system(test_cases):
    """
    Tests the movie recommendation system with various inputs.

    Parameters:
    - test_cases (dict, list, or tuple): A collection of test cases where each test case
      contains a title and a query string.
    """

    # Ensure test_cases is converted into a dictionary if given as a list or tuple
    if isinstance(test_cases, (list, tuple)):
        test_cases = {f"Test {i+1}": query for i, query in enumerate(test_cases)}

    print("\n ~ ~ ~ ~ Running Movie Recommendation Tests ~ ~ ~ ~\n")

    for test_name, query in test_cases.items():
        print(f"🔹 **{test_name}**")
        print(f"   ➡ Query: `{query}`")

        result = movie_recommendation_system(query, df, model, movie_embeddings, top_n=5)

        # **Branching logic: Handle empty/no recommendations separately**
        if not result or (len(result) == 1 and result[0][1] == 0.0):
            print("   🚫 **No recommendations available. Try a different query.**")
        else:
            print("   🔽 **Top Recommendations:**")
            for i, (movie, score) in enumerate(result, start=1):
                print(f"      {i}. {movie} (Recommendation Score: {score:.1f}/100)")

        print("-" * 50)

    print("\n ~ ~ ~ All tests completed! ~ ~ ~")


In [24]:
# Driver cell: reload the saved embeddings and run the sample queries

# Load stored embeddings. map_location="cpu" remaps the tensor's storage to the
# CPU while the file is being deserialised, so the load itself does not depend
# on the device the tensor was saved from (moving it afterwards with .to("cpu")
# is too late if that device is unavailable).
# NOTE: torch.load unpickles the file; only load files you created yourself.
print("Loading precomputed embeddings(bert_movie_embeddings.pt)...")
movie_embeddings = torch.load("bert_movie_embeddings.pt", map_location="cpu")
print("✅ Embeddings loaded successfully.✅\n")


# Sample queries covering a typical, a very short, a long, an empty and a nonsense input
test_cases_dict = {
    "Basic Query": "A fun animated adventure movie about animals.",
    "Short Query": "Action movie",
    "Long Query": "A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists.",
    "Empty Query": "",
    "Irrelevant Query": "asdfghjkl qwerty lorem ipsum blah blah"
}

test_recommendation_system(test_cases_dict)

Loading precomputed embeddings(bert_movie_embeddings.pt)...
✅ Embeddings loaded successfully.✅


 ~ ~ ~ ~ Running Movie Recommendation Tests ~ ~ ~ ~

🔹 **Basic Query**
   ➡ Query: `A fun animated adventure movie about animals.`
   🔽 **Top Recommendations:**
      1.  Blinky Bill the Movie (Recommendation Score: 55.2/100)
--------------------------------------------------
🔹 **Short Query**
   ➡ Query: `Action movie`
   🔽 **Top Recommendations:**
      1. Fight Club – Members Only (Recommendation Score: 56.9/100)
      2. Gallowwalkers (Recommendation Score: 51.3/100)
--------------------------------------------------
🔹 **Long Query**
   ➡ Query: `A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists.`
   🔽 **Top Recommendations:**
      1. The Bone Collector (Recommendation Score: 55.0/100)
      2. The Arnelo Affair (Recommendation Score: 52.5/100)
      3. Young and Innocent (Recommendation Score: 51.3/100)
      4. Striking Distanc