The dataset can be downloaded from the following link (also referenced in the notebook):
https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/data
- Step 1: Download the dataset and upload it to the environment
- Step 2: Execute the second cell of the Colab notebook.

At run time, provide your query as user input and the
notebook will run to completion.

In [1]:
# Step 1 load data
import pandas as pd

# Read the movie-plot subset CSV (path is relative to the working directory).
file_path = "data/movie_plots_subset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a sanity check on the parsed columns.
df.head(n=5)


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1951,The Day the Earth Stood Still,American,Robert Wise,"Michael Rennie, Patricia Neal",science fiction,https://en.wikipedia.org/wiki/The_Day_the_Eart...,"When a flying saucer lands in Washington, D.C...."
1,1981,The Burning,American,Tony Maylam,"Brian Matthews, Holly Hunter, Jason Alexander",horror,https://en.wikipedia.org/wiki/The_Burning_(film),"One night at Camp Blackfoot, several campers p..."
2,2012,Nobel Chor,Bengali,Suman Ghosh,"Mithun Chakraborty, Saswata Chatterjee, Sudipt...",suspense / drama,https://en.wikipedia.org/wiki/Nobel_Chor,"The first Asian Nobel Laureate, Rabindranath T..."
3,1952,Trent's Last Case,British,Herbert Wilcox,"Michael Wilding, Margaret Lockwood, Orson Welles",detective,https://en.wikipedia.org/wiki/Trent%27s_Last_C...,A major international financier is found dead ...
4,1977,Aafat,Bollywood,Atma Ram,"Navin Nischol, Leena Chandavarkar, Amjad Khan,...",unknown,https://en.wikipedia.org/wiki/Aafat,Inspector Amar and Inspector Chhaya are after ...


In [2]:
# Identify available columns
# The most important columns for recommendation are Title, Plot and Genre
print(df.columns)

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')


In [3]:
# Count missing values per column.
# Title, Genre and Plot have no NaN in this dataset (see the counts printed below).
print(df.isna().sum())

Release Year         0
Title                0
Origin/Ethnicity     0
Director             0
Cast                19
Genre                0
Wiki Page            0
Plot                 0
dtype: int64


In [4]:
# Rows whose Genre is exactly "unknown" get the placeholder label "Miscellaneous".
df["Genre"] = df["Genre"].replace(to_replace="unknown", value="Miscellaneous")

# Feature engineering

In [5]:
# Join genre and plot into one text field, separated by two spaces.
df["combined_text"] = df["Genre"].str.cat(df["Plot"], sep="  ")

In [6]:
import re

def clean_text(text):
    """Return ``text`` lower-cased with all punctuation deleted.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is removed outright; nothing is inserted in its
    place, so existing whitespace is kept exactly as it was.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text.lower())

In [7]:
# Normalise the combined text (lower-cased, punctuation stripped) into a new column.
df["cleaned_text"] = df["combined_text"].map(clean_text)

# Show the cleaned text next to each title for a quick visual check.
print(df.loc[:, ["Title", "cleaned_text"]].head())

                           Title  \
0  The Day the Earth Stood Still   
1                    The Burning   
2                     Nobel Chor   
3              Trent's Last Case   
4                          Aafat   

                                        cleaned_text  
0  science fiction  when a flying saucer lands in...  
1  horror  one night at camp blackfoot several ca...  
2  suspense  drama  the first asian nobel laureat...  
3  detective  a major international financier is ...  
4  miscellaneous  inspector amar and inspector ch...  


In [8]:
# Step 2 (TF-IDF Vectorization)
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer for the cleaned movie text.
#   ngram_range=(1, 2)     : features are single words and two-word phrases (bigrams)
#   stop_words='english'   : drop common English words such as 'the' and 'and'
#   min_df=2 / max_df=0.85 : ignore terms found in fewer than 2 documents
#                            or in more than 85% of the documents
#   sublinear_tf=True      : use 1 + log(tf) instead of the raw term count
#   smooth_idf=True        : add-one smoothing of document frequencies
#   norm='l2'              : scale every document vector to unit length
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
    max_df=0.85,
    sublinear_tf=True,
    smooth_idf=True,
    norm='l2',
)

# fit_transform() does two things in one pass:
#   fit       - learns the vocabulary (unigrams and bigrams) over all movies
#   transform - turns each movie's text into its TF-IDF vector
tfidf_matrix = vectorizer.fit_transform(df["cleaned_text"])

# Shape is (number of movies, number of vocabulary terms kept by the vectorizer).
print(tfidf_matrix.shape)

(500, 10427)


In [9]:
# Step 3 Compute Similarity and construct similarity matrix with cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of all movies.
# Called with a single argument, cosine_similarity compares the matrix with itself.
similarity_matrix = cosine_similarity(tfidf_matrix)

# The result is square: one row and one column per movie.
print(similarity_matrix.shape)


(500, 500)


In [10]:
# Step 4 Set up recommendation system
def movie_recommendation_system(user_input, df, vectorizer, tfidf_matrix, similarity_matrix, top_n=5):
    """
    Recommends movies based on a user-provided text description.

    The query is normalised with ``clean_text`` (the same cleaning applied to
    the movie text before the vectorizer was fitted), converted to a TF-IDF
    vector and compared with every movie by cosine similarity.

    Parameters:
    - user_input (str): The user's movie description (e.g., "I like action movies set in space")
    - df (DataFrame): The movie dataset; needs a "Title" column and the same
      row order as ``tfidf_matrix``
    - vectorizer (TfidfVectorizer): The trained TF-IDF vectorizer
    - tfidf_matrix (sparse matrix): The TF-IDF matrix for all movies
    - similarity_matrix (array): The movie-to-movie cosine similarity matrix.
      Kept for backward compatibility; it is not used, because the query is
      compared directly against ``tfidf_matrix``.
    - top_n (int): Number of recommendations to return (default=5)

    Returns:
    - List of ``(title, similarity)`` tuples, best match first, with the
      similarity rounded to 4 decimals. A missing or blank query yields a
      single ``(message, 0.0)`` tuple; ``top_n <= 0`` yields an empty list.

    Note: if the query shares no terms with the fitted vocabulary, every
    similarity is 0.0 and the returned movies are not meaningful matches.
    """
    # Edge case: missing, non-string or blank input
    if not isinstance(user_input, str) or not user_input.strip():
        return [("!!!No input provided. Please describe your movie preferences.!!!", 0.0)]

    # Edge case: nothing requested. Without this guard the slice [-0:] below
    # would select the whole array and return every movie.
    if top_n <= 0:
        return []

    # Clean the query exactly like the corpus so both tokenise the same way
    # (e.g. "sci-fi" -> "scifi"), then project it into TF-IDF space.
    user_vector = vectorizer.transform([clean_text(user_input)])

    # Similarity between the query and every movie, as a flat 1-D array
    user_similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Indices of the top_n highest scores, highest first
    top_movie_indices = user_similarities.argsort()[-top_n:][::-1]

    # Convert scores to plain Python floats so they print as 0.1082 rather
    # than np.float64(0.1082) on recent NumPy versions.
    titles = df["Title"]
    return [
        (titles.iloc[idx], round(float(user_similarities[idx]), 4))
        for idx in top_movie_indices
    ]


In [11]:
# Step 5: smoke-test the recommender with a short and a long query
test_query = "I like action movies set in space"
recommended_movies_basic_query = movie_recommendation_system(
    test_query, df, vectorizer, tfidf_matrix, similarity_matrix
)

test_long_query = "I want to watch a movie about a detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists."
recommended_movies_long_query = movie_recommendation_system(
    test_long_query, df, vectorizer, tfidf_matrix, similarity_matrix
)

# Print both ranked lists in the same "rank. title (Similarity: score)" format.
for heading, results in (
    ("🔹 Basic Query Test:", recommended_movies_basic_query),
    ("\n\n🔹 Long Query Test:", recommended_movies_long_query),
):
    print(heading)
    for rank, (title, similarity) in enumerate(results, start=1):
        print(f"{rank}. {title} (Similarity: {similarity})")


🔹 Basic Query Test:
1. Kaizoku Sentai Gokaiger vs. Space Sheriff Gavan: The Movie (Similarity: 0.1082)
2. Wu Kong (Similarity: 0.1041)
3. Be Kind Rewind (Similarity: 0.0853)
4. Kamen Rider Kabuto: GOD SPEED LOVE (Similarity: 0.0851)
5. Sandook (Similarity: 0.0782)


🔹 Long Query Test:
1. Father Brown, Detective (Similarity: 0.1551)
2. Meet Maxwell Archer (Similarity: 0.1126)
3. Nobel Chor (Similarity: 0.0851)
4. Scream and Scream Again (Similarity: 0.0703)
5. The Precipice Game (Similarity: 0.0689)


In [12]:
def test_recommendation_system():
    """Print the top-5 recommendations for a normal, a long and an empty query.

    Each scenario prints its heading, one numbered line per returned
    ``(title, score)`` tuple, and a blank separator. Nothing is asserted; the
    output is meant to be inspected by eye.
    """
    scenarios = (
        # Normal input
        ("🔹 Basic Query Test:", "A fun animated adventure movie about animals."),
        # Edge case: long descriptive input
        ("🔹 Long Query Test:", "A detective solving a mystery in a dark and rainy city, filled with suspense, drama, and unexpected twists."),
        # Edge case: empty input
        ("🔹 Empty Query Test:", ""),
    )

    for heading, query in scenarios:
        recommendations = movie_recommendation_system(
            query, df, vectorizer, tfidf_matrix, similarity_matrix, top_n=5
        )

        print(heading)
        for position, entry in enumerate(recommendations, start=1):
            print(f"{position}. {entry}")
        print("\n")


In [13]:
# Run the test scenarios (normal, long and empty query) and print their results
test_recommendation_system()

🔹 Basic Query Test:
1. (' Blinky Bill the Movie', 0.1001)
2. ('The Adventurers', 0.0734)
3. ('The Roots of Heaven', 0.0698)
4. ('Poochudava', 0.0685)
5. ('Pudsey: The Movie', 0.0607)


🔹 Long Query Test:
1. ('Father Brown, Detective', 0.1694)
2. ('Meet Maxwell Archer', 0.1229)
3. ('The Precipice Game', 0.0753)
4. ('Nobel Chor', 0.0745)
5. ('The Riddle', 0.0683)


🔹 Empty Query Test:
1. ('!!!No input provided. Please describe your movie preferences.!!!', 0.0)


