In [15]:
# Step 1: Load data
import pandas as pd

# Read the movie-plot subset CSV into a DataFrame
file_path = "data/movie_plots_subset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (kept as the last expression so the cell displays it)
df.head(n=5)


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1951,The Day the Earth Stood Still,American,Robert Wise,"Michael Rennie, Patricia Neal",science fiction,https://en.wikipedia.org/wiki/The_Day_the_Eart...,"When a flying saucer lands in Washington, D.C...."
1,1981,The Burning,American,Tony Maylam,"Brian Matthews, Holly Hunter, Jason Alexander",horror,https://en.wikipedia.org/wiki/The_Burning_(film),"One night at Camp Blackfoot, several campers p..."
2,2012,Nobel Chor,Bengali,Suman Ghosh,"Mithun Chakraborty, Saswata Chatterjee, Sudipt...",suspense / drama,https://en.wikipedia.org/wiki/Nobel_Chor,"The first Asian Nobel Laureate, Rabindranath T..."
3,1952,Trent's Last Case,British,Herbert Wilcox,"Michael Wilding, Margaret Lockwood, Orson Welles",detective,https://en.wikipedia.org/wiki/Trent%27s_Last_C...,A major international financier is found dead ...
4,1977,Aafat,Bollywood,Atma Ram,"Navin Nischol, Leena Chandavarkar, Amjad Khan,...",unknown,https://en.wikipedia.org/wiki/Aafat,Inspector Amar and Inspector Chhaya are after ...


In [16]:
# List the columns available in the dataset.
# The most important columns for recommendation are Title, Plot and Genre.
available_columns = df.columns
print(available_columns)

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')


In [17]:
# Count missing values (NaN) per column.
# In the recorded run Title, Genre and Plot have none; only Cast has gaps.
missing_per_column = df.isna().sum()
print(missing_per_column)

Release Year         0
Title                0
Origin/Ethnicity     0
Director             0
Cast                19
Genre                0
Wiki Page            0
Plot                 0
dtype: int64


In [19]:
# Genre uses the placeholder "unknown"; relabel it as "Miscellaneous"
df["Genre"] = df["Genre"].replace(to_replace="unknown", value="Miscellaneous")

# Feature engineering

In [20]:
# Build the text that gets embedded: genre followed by the plot, separated by
# a single space (the previous `" " + " "` inserted an accidental double space).
df["combined_text"] = df["Genre"] + " " + df["Plot"]

In [21]:
import re

def clean_text(text):
    """
    Normalise a text snippet before it is embedded.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is trimmed.

    Parameters:
    - text (str): Raw text. None and float NaN are treated as missing;
      other non-string values are converted with str().

    Returns:
    - str: The cleaned text, or "" when the input is missing.
    """
    # Missing values have no .lower(); NaN is the only float not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # To lower case
    text = str(text).lower()

    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Deleting punctuation such as " / " leaves doubled spaces behind; collapse them
    return re.sub(r'\s+', ' ', text).strip()

In [22]:
# Normalise every combined genre + plot string with clean_text
df["cleaned_text"] = df["combined_text"].map(clean_text)

# Spot-check the result next to the movie titles
cleaned_preview = df.loc[:, ["Title", "cleaned_text"]].head()
print(cleaned_preview)

                           Title  \
0  The Day the Earth Stood Still   
1                    The Burning   
2                     Nobel Chor   
3              Trent's Last Case   
4                          Aafat   

                                        cleaned_text  
0  science fiction  when a flying saucer lands in...  
1  horror  one night at camp blackfoot several ca...  
2  suspense  drama  the first asian nobel laureat...  
3  detective  a major international financier is ...  
4  miscellaneous  inspector amar and inspector ch...  


In [23]:
# Step 2: Generate Sentence Embeddings using BERT
from sentence_transformers import SentenceTransformer
import torch

# Load the pre-trained "all-mpnet-base-v2" Sentence Transformer checkpoint.
# ("sentence-t5-base" is an alternative checkpoint that was tried here.)
model = SentenceTransformer("all-mpnet-base-v2")

print("Generating BERT embeddings... This may take a moment.")

# Encode every cleaned description as a tensor and move the result to the CPU
corpus_texts = df["cleaned_text"].tolist()
movie_embeddings = model.encode(corpus_texts, convert_to_tensor=True).cpu()

# Persist the embeddings so later cells can reload them instead of re-encoding
torch.save(movie_embeddings, "bert_movie_embeddings.pt")


Generating BERT embeddings... This may take a moment.


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def movie_recommendation_system(user_input, df, model, movie_embeddings, top_n=5, min_threshold=50):
    """
    Recommend the movies whose embedded text is most similar to a free-text query.

    The query is embedded with `model`, compared against `movie_embeddings`
    by cosine similarity, and the similarity is multiplied by 100 to form the
    recommendation score. Only the `top_n` best matches that reach
    `min_threshold` are returned.

    Parameters:
    - user_input (str): The user's movie description.
    - df (DataFrame): The movie dataset; must have a "Title" column whose row
      order matches the rows of `movie_embeddings`.
    - model (SentenceTransformer): Model used to embed `user_input`.
    - movie_embeddings (Tensor): Precomputed embeddings, one row per movie.
    - top_n (int): Maximum number of recommendations to return.
    - min_threshold (float): Minimum score (cosine similarity x 100) a movie
      needs in order to be recommended.

    Returns:
    - List of (movie title, recommendation score) tuples, best match first,
      with scores as plain Python floats rounded to one decimal.
    - A single (message, 0.0) placeholder tuple when the input is missing or
      blank, or when no movie reaches `min_threshold`.
    - An empty list when `top_n` is zero or negative.
    """
    no_match = [("= = = No highly relevant recommendations found. Try refining your input. = = =", 0.0)]

    # Edge case: missing, non-string or whitespace-only input
    if not isinstance(user_input, str) or not user_input.strip():
        return [("!!! No input provided. Please describe your movie preferences. !!!", 0.0)]

    # A slice of [-0:] would select the whole corpus, so handle non-positive top_n explicitly
    if top_n <= 0:
        return []

    corpus = movie_embeddings.cpu().numpy()
    if corpus.shape[0] == 0:
        # Nothing to compare against
        return no_match

    # Convert user input into an embedding
    user_embedding = model.encode([user_input], convert_to_tensor=True)

    # Cosine similarity of the query against every movie, scaled by 100
    similarities = cosine_similarity(user_embedding.cpu().numpy(), corpus).flatten() * 100

    # Indices of the top_n highest scores, best first
    top_indices = similarities.argsort()[::-1][:top_n]

    # float() turns the NumPy float32 into a Python float so that the rounded
    # value prints as 51.3 rather than 51.29999923706055
    titles = df["Title"]
    filtered_results = [
        (titles.iloc[idx], round(float(similarities[idx]), 1))
        for idx in top_indices
        if similarities[idx] >= min_threshold
    ]

    # An empty list here means even the best match is below the threshold
    return filtered_results or no_match


In [26]:
# Load stored embeddings (optional, speeds up repeated runs)
movie_embeddings = torch.load("bert_movie_embeddings.pt")

# Test queries
test_query_basic = "I like action movies set in space"
test_query_long = "A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists."

# Get recommendations
recommended_movies_basic_query = movie_recommendation_system(test_query_basic, df, model, movie_embeddings)
recommended_movies_long_query = movie_recommendation_system(test_query_long, df, model, movie_embeddings)

# Print results
print("🔹 Basic Query Test:")
for i, (movie, score) in enumerate(recommended_movies_basic_query, start=1):
    print(f"{i}. {movie} (Recommendation Score: {score}/100)")

print("\n🔹 Long Query Test:")
for i, (movie, score) in enumerate(recommended_movies_long_query, start=1):
    print(f"{i}. {movie} (Recommendation Score: {score}/100)")


🔹 Basic Query Test:
1. = = = No highly relevant recommendations found. Try refining your input. = = = (Recommendation Score: 0.0/100)

🔹 Long Query Test:
1. The Bone Collector (Recommendation Score: 55.0/100)
2. The Arnelo Affair (Recommendation Score: 52.5/100)
3. Young and Innocent (Recommendation Score: 51.29999923706055/100)
4. Striking Distance (Recommendation Score: 51.20000076293945/100)
5. Romeo Is Bleeding (Recommendation Score: 51.0/100)


In [29]:
def test_recommendation_system(test_cases):
    """
    Tests the movie recommendation system with various inputs.

    Parameters:
    - test_cases (dict, list, or tuple): A collection of test cases where each test case
      contains a title and a query string.
    """

    # Ensure test_cases is converted into a dictionary if given as a list or tuple
    if isinstance(test_cases, (list, tuple)):
        test_cases = {f"Test {i+1}": query for i, query in enumerate(test_cases)}

    print("\n ~ ~ ~ ~ Running Movie Recommendation Tests ~ ~ ~ ~\n")

    for test_name, query in test_cases.items():
        print(f"🔹 **{test_name}**")
        print(f"   ➡ Query: `{query}`")

        result = movie_recommendation_system(query, df, model, movie_embeddings, top_n=5)

        if result:
            print("   🔽 **Top Recommendations:**")
            for i, (movie, score) in enumerate(result, start=1):  # Unpack only movie and score
                print(f"      {i}. {movie} (Recommendation Score: {score:.1f}/100)")
        else:
            print("= = = No recommendations found. = = = ")

        print("-" * 50)

    print("\n ~ ~ ~ All tests completed! ~ ~ ~")


In [30]:
# Named scenarios (label -> query text) covering typical, empty, nonsense and
# very short inputs.
test_cases_dict = dict(
    [
        ("Basic Query", "A fun animated adventure movie about animals."),
        ("Long Query", "A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists."),
        ("Empty Query", ""),
        ("Irrelevant Query", "asdfghjkl qwerty lorem ipsum blah blah"),
        ("Short Query", "Action movie"),
    ]
)

test_recommendation_system(test_cases_dict)


 ~ ~ ~ ~ Running Movie Recommendation Tests ~ ~ ~ ~

🔹 **Basic Query**
   ➡ Query: `A fun animated adventure movie about animals.`
   🔽 **Top Recommendations:**
      1.  Blinky Bill the Movie (Recommendation Score: 55.2/100)
--------------------------------------------------
🔹 **Long Query**
   ➡ Query: `A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists.`
   🔽 **Top Recommendations:**
      1. The Bone Collector (Recommendation Score: 55.0/100)
      2. The Arnelo Affair (Recommendation Score: 52.5/100)
      3. Young and Innocent (Recommendation Score: 51.3/100)
      4. Striking Distance (Recommendation Score: 51.2/100)
      5. Romeo Is Bleeding (Recommendation Score: 51.0/100)
--------------------------------------------------
🔹 **Empty Query**
   ➡ Query: ``
   🔽 **Top Recommendations:**
      1. !!! No input provided. Please describe your movie preferences. !!! (Recommendation Score: 0.0/100)
-----------------------------