# Step 1: Load Data

In [1]:
import pandas as pd

# Load the prepared movie-plot subset from a CSV path given relative to the
# notebook's working directory.
file_path = "data/movie_plots_subset.csv"
df = pd.read_csv(file_path)

# Keep `df.head()` as the cell's last expression so the notebook renders the
# first five rows as the cell output.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1951,The Day the Earth Stood Still,American,Robert Wise,"Michael Rennie, Patricia Neal",science fiction,https://en.wikipedia.org/wiki/The_Day_the_Eart...,"When a flying saucer lands in Washington, D.C...."
1,1981,The Burning,American,Tony Maylam,"Brian Matthews, Holly Hunter, Jason Alexander",horror,https://en.wikipedia.org/wiki/The_Burning_(film),"One night at Camp Blackfoot, several campers p..."
2,2012,Nobel Chor,Bengali,Suman Ghosh,"Mithun Chakraborty, Saswata Chatterjee, Sudipt...",suspense / drama,https://en.wikipedia.org/wiki/Nobel_Chor,"The first Asian Nobel Laureate, Rabindranath T..."
3,1952,Trent's Last Case,British,Herbert Wilcox,"Michael Wilding, Margaret Lockwood, Orson Welles",detective,https://en.wikipedia.org/wiki/Trent%27s_Last_C...,A major international financier is found dead ...
4,1977,Aafat,Bollywood,Atma Ram,"Navin Nischol, Leena Chandavarkar, Amjad Khan,...",unknown,https://en.wikipedia.org/wiki/Aafat,Inspector Amar and Inspector Chhaya are after ...


# Step 2: Data Cleaning

In [2]:
# Report the frame's (rows, columns) dimensions.
print(f"Dataset Shape: {df.shape}")

# Summarise every column: dtype and non-null count.
df.info()

# Count the missing entries per column.
print("\nMissing Values:\n", df.isna().sum())

# Relabel the placeholder genre "unknown" (exact, case-sensitive match only)
# as "Miscellaneous"; every other genre value is left as it is.
df["Genre"] = df["Genre"].replace(to_replace="unknown", value="Miscellaneous")
print("\n'Replaced 'unknown' in Genre with 'Miscellaneous'.")

Dataset Shape: (500, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      500 non-null    int64 
 1   Title             500 non-null    object
 2   Origin/Ethnicity  500 non-null    object
 3   Director          500 non-null    object
 4   Cast              481 non-null    object
 5   Genre             500 non-null    object
 6   Wiki Page         500 non-null    object
 7   Plot              500 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.4+ KB

Missing Values:
 Release Year         0
Title                0
Origin/Ethnicity     0
Director             0
Cast                19
Genre                0
Wiki Page            0
Plot                 0
dtype: int64

'Replaced 'unknown' in Genre with 'Miscellaneous'.


## Step 2': Feature Engineering

In [3]:
# Build one text field per movie for embedding: the genre label, a two-space
# separator, then the plot summary.
df["combined_text"] = df["Genre"] + "  " + df["Plot"]

In [4]:
import re

def clean_text(text):
    """
    Normalises a piece of text before it is embedded.

    Cleans the input text by:
    - Converting to lowercase
    - Removing every character that is neither a word character (letters,
      digits, underscore) nor whitespace, i.e. punctuation and symbols

    Whitespace is left untouched, so removing a symbol that had a space on
    both sides (e.g. "a / b") leaves a double space behind.

    Parameters:
    - text: The value to clean. Normally a str. None and float NaN (the
      pandas missing-value marker) are treated as empty text; any other
      non-string value is converted with str() first.

    Returns:
    - The cleaned, lowercased string ("" for missing values).
    """
    # Missing values would otherwise crash on `.lower()`. NaN is the only
    # float that is not equal to itself, hence the self-comparison.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    lowered = text.lower()

    # Drop punctuation/symbols; `\w` keeps letters, digits and underscores.
    return re.sub(r'[^\w\s]', '', lowered)

In [5]:
# Run every combined description through `clean_text` and keep the result in
# its own column.
df["cleaned_text"] = df["combined_text"].map(clean_text)

# Preview the first five titles next to their cleaned text.
print(df.loc[:, ['Title', 'cleaned_text']].head())

                           Title  \
0  The Day the Earth Stood Still   
1                    The Burning   
2                     Nobel Chor   
3              Trent's Last Case   
4                          Aafat   

                                        cleaned_text  
0  science fiction  when a flying saucer lands in...  
1  horror  one night at camp blackfoot several ca...  
2  suspense  drama  the first asian nobel laureat...  
3  detective  a major international financier is ...  
4  miscellaneous  inspector amar and inspector ch...  


# Step 3: Load Pretrained Sentence-Transformer Model and Generate Embeddings

In [6]:
from sentence_transformers import SentenceTransformer
import torch

# Instantiate the pretrained sentence-embedding model by checkpoint name.
# NOTE(review): the checkpoint name says "mpnet" while the progress messages
# say "BERT" — confirm which wording is intended.
print("Loading BERT model... (This may take a few seconds)")
model = SentenceTransformer('all-mpnet-base-v2')

# Encode every cleaned description into a single embedding tensor and move it
# to the CPU in the same step, so later computations run on CPU data.
print("Generating BERT embeddings... This may take ~30 sec.")
movie_embeddings = model.encode(
    df["cleaned_text"].tolist(),
    convert_to_tensor=True,
).to("cpu")

# Persist the tensor to disk; the recommendation step reloads it from this file.
torch.save(movie_embeddings, "bert_movie_embeddings.pt")

print("Embedding generation complete. Saved as 'bert_movie_embeddings.pt'.")

Loading BERT model... (This may take a few seconds)
Generating BERT embeddings... This may take ~30 sec.
Embedding generation complete. Saved as 'bert_movie_embeddings.pt'.


# Step 4: Define the Recommendation System

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the embeddings that the embedding-generation step saved to disk.
# This line only loads the file; nothing here regenerates the embeddings, so
# that step must have been run at least once beforehand.
DEFAULT_MOVIE_EMBEDDINGS = torch.load("bert_movie_embeddings.pt")

def rank_recommendations(user_input, df=df, model=model, movie_embeddings=DEFAULT_MOVIE_EMBEDDINGS, top_n=5, min_threshold=30):
    """
    Computes similarity scores and ranks movie recommendations.

    The defaults for `df`, `model` and `movie_embeddings` are captured when
    this cell is executed; re-run the cell after rebuilding any of them.

    Parameters:
    - user_input: Free-text description of what the user wants to watch.
      None, non-string and whitespace-only values are treated as "no input".
    - df: DataFrame with a "Title" column; titles are looked up by row
      position. Assumes row i of `df` corresponds to row i of
      `movie_embeddings` — TODO confirm when passing custom values.
    - model: Object whose `encode(list_of_str, convert_to_tensor=True)`
      returns a tensor.
    - movie_embeddings: Tensor of precomputed movie embeddings.
    - top_n: Maximum number of recommendations. Values <= 0 select nothing.
    - min_threshold: Minimum score a movie needs in order to be returned.

    Returns:
    - List of tuples containing (movie title, recommendation score), best
      match first. The score is cosine similarity * 100 rounded to two
      decimals, so it lies in [-100, 100].
    - If the input is missing or nothing passes the threshold, a one-element
      list holding (message, 0.0) is returned instead.
    """
    # Missing, non-string or whitespace-only input cannot be embedded meaningfully.
    if not isinstance(user_input, str) or not user_input.strip():
        return [("!!! No input provided. Please describe your movie preferences. !!!", 0.0)]

    # A non-positive top_n must select nothing. The former `[-top_n:]` slice
    # turned top_n=0 into `[0:]`, i.e. it returned every movie.
    limit = max(int(top_n), 0)

    # Convert user input into an embedding
    user_embedding = model.encode([user_input], convert_to_tensor=True).to("cpu").numpy()

    # Compute cosine similarity; the raw value lies in [-1, 1], so the scaled
    # score lies in [-100, 100].
    movie_embeddings_np = movie_embeddings.cpu().numpy()
    similarities = cosine_similarity(user_embedding, movie_embeddings_np).flatten() * 100

    # Indices ordered by descending score, truncated to the requested count.
    top_indices = np.argsort(similarities)[::-1][:limit]

    # Keep only the candidates that reach the similarity threshold
    recommendations = [(df.iloc[idx]["Title"], round(float(similarities[idx]), 2))
                       for idx in top_indices if similarities[idx] >= min_threshold]

    return recommendations if recommendations else [("No relevant recommendations found. Try a different query.", 0.0)]



In [8]:
def print_recommendations(recommendations):
    """
    Pretty-prints a list of (title, score) recommendation tuples.

    An empty/falsy list, or a single entry whose score equals 0.0 (the
    placeholder form), prints the "no recommendations" notice instead of a
    ranked list.
    """
    # A lone entry scored 0.0 is a placeholder, not a real recommendation.
    is_placeholder = (
        bool(recommendations)
        and len(recommendations) == 1
        and recommendations[0][1] == 0.0
    )

    if recommendations and not is_placeholder:
        print("\n**🔽Movie Recommendations:**\n")
        for entry in recommendations:
            title, score = entry
            print(f"   '{title}', Recommendation score {score:.2f}/100,")
        print("\n")
        return

    print("\n**🚫No recommendations available. Try a different query.**\n")

In [9]:
def movie_recommendation_system(user_input):
    """
    Ranks movies for a free-text query and prints the formatted result.

    The recommendations are printed only; nothing is returned.
    """
    print_recommendations(rank_recommendations(user_input))

# Step 5: Test the Recommendation System

In [10]:
def unit_tests_recommendation_system():
    """
    Runs predefined test cases for the movie recommendation system.

    Each case is a labelled query (typical, short, long, empty and nonsense
    input) that is passed to `movie_recommendation_system`. The results are
    printed for manual inspection; nothing is asserted and nothing is
    returned.
    """
    test_cases_dict = {
        "Basic Query": "I like action movies set in space",
        "Short Query": "Action movie",
        "Long Query": "A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists.",
        "Empty Query": "",
        "Irrelevant Query": "asdfghjkl qwerty lorem ipsum blah blah"
    }

    print("\n ~ ~ ~ Running Movie Recommendation Unit Tests ~ ~ ~ \n")

    for test_name, query in test_cases_dict.items():
        print(f"**🔹Test Case: {test_name}**")
        print(f"'User Input:' \"{query}\"\n")

        # movie_recommendation_system prints its results and returns None,
        # so there is no value worth binding here.
        movie_recommendation_system(query)

        print("-" * 80)  # Separator for readability

    print("\n~ ~ ~ All test cases completed! ~ ~ ~ \n")

In [11]:
# Driver cell for the system test: run every predefined query through the
# recommender and print the results for inspection.

# Run the test cases
unit_tests_recommendation_system()



 ~ ~ ~ Running Movie Recommendation Unit Tests ~ ~ ~ 

**🔹Test Case: Basic Query**
'User Input:' "I like action movies set in space"


**🔽Movie Recommendations:**

   'The Doomsday Machine', Recommendation score 36.37/100,
   'Kaizoku Sentai Gokaiger vs. Space Sheriff Gavan: The Movie', Recommendation score 34.27/100,


--------------------------------------------------------------------------------
**🔹Test Case: Short Query**
'User Input:' "Action movie"


**🔽Movie Recommendations:**

   'Fight Club – Members Only', Recommendation score 56.86/100,
   'Gallowwalkers', Recommendation score 51.30/100,
   'Speedway Junky', Recommendation score 48.81/100,
   'Anjathe', Recommendation score 47.72/100,
   'Gopi Kishan', Recommendation score 47.11/100,


--------------------------------------------------------------------------------
**🔹Test Case: Long Query**
'User Input:' "A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists."


**🔽Movi

In [12]:
# Ad-hoc query: ask for a romantic movie and print the ranked matches.
user_input1 = "I want to see a romantic movie with my girl friend in weekend."
movie_recommendation_system(user_input1)


**🔽Movie Recommendations:**

   'Cool...Sakkath Hot Maga', Recommendation score 43.14/100,
   'Ala Modalaindi', Recommendation score 40.78/100,
   'Om Shanti Oshana', Recommendation score 40.75/100,
   'Jaan-E-Mann', Recommendation score 40.13/100,
   'Just Married', Recommendation score 39.17/100,




In [13]:
# Ad-hoc query: ask for a sci-fi movie and print the ranked matches.
user_input2 = "Can you recommend a sci-fi movie with my friends for tonight?"
movie_recommendation_system(user_input2)


**🔽Movie Recommendations:**

   'The Doomsday Machine', Recommendation score 45.42/100,
   'Predators', Recommendation score 37.14/100,
   'Assassination Classroom: Graduation', Recommendation score 35.84/100,
   'Just Married', Recommendation score 34.13/100,
   'Love Bite', Recommendation score 32.69/100,


