### STEP 1 – Install / check libraries (Colab)

In [1]:
import sys

import numpy as np
import pandas as pd
import sklearn

# Report the interpreter and library versions so the run can be reproduced later.
for label, version in (
    ("Python", sys.version),
    ("NumPy", np.__version__),
    ("pandas", pd.__version__),
    ("scikit-learn", sklearn.__version__),
):
    print(f"{label}:", version)


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
NumPy: 2.0.2
pandas: 2.2.2
scikit-learn: 1.6.1


### STEP 2 – Download the MovieLens dataset in Colab


In [2]:
# -nc (no-clobber) skips the download when the archive is already present, so
# re-running this cell does not pile up ml-latest-small.zip.1, .zip.2, ... copies
# that the unzip step below would never read.
!wget -nc https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# -o overwrites previously extracted files without prompting.
!unzip -o ml-latest-small.zip


--2025-11-19 17:10:27--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: â€˜ml-latest-small.zipâ€™


2025-11-19 17:10:27 (4.35 MB/s) - â€˜ml-latest-small.zipâ€™ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


### STEP 3 – Load movies.csv with pandas

In [4]:
# The archive is unpacked into the current working directory (/content on
# Colab), so a relative path works there and in any other environment.
movies_path = "ml-latest-small/movies.csv"
movies = pd.read_csv(movies_path)

# Preview the first rows (rich display of the last expression).
movies.head(10)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [6]:
# Sanity check: (rows, columns) of the raw frame and its column labels.
movies.shape, movies.columns


((9742, 3), Index(['movieId', 'title', 'genres'], dtype='object'))

### STEP 4 – Create a combined text feature

We'll use title + genres as our textual description of each movie.

In [7]:
# Work on a copy so the raw `movies` frame stays untouched and this cell can be
# re-run without side effects.
movies_df = movies.copy()

# Defensive: treat missing titles/genres as empty text.
movies_df['title'] = movies_df['title'].fillna('')
movies_df['genres'] = movies_df['genres'].fillna('')

# Genres look like "Adventure|Animation|Children". MovieLens marks movies
# without any genre with the placeholder "(no genres listed)"; blank it first,
# otherwise its words ("genres", "listed") become TF-IDF terms and make
# unrelated genre-less movies look similar. Then replace '|' with spaces so the
# field reads like normal text.
movies_df['genres_clean'] = (
    movies_df['genres']
    .replace('(no genres listed)', '')
    .str.replace('|', ' ', regex=False)
)

# Combine title + cleaned genres into one text field
movies_df['combined'] = movies_df['title'] + " " + movies_df['genres_clean']

movies_df[['movieId', 'title', 'genres', 'combined']].head(10)


Unnamed: 0,movieId,title,genres,combined
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy
5,6,Heat (1995),Action|Crime|Thriller,Heat (1995) Action Crime Thriller
6,7,Sabrina (1995),Comedy|Romance,Sabrina (1995) Comedy Romance
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck (1995) Adventure Children
8,9,Sudden Death (1995),Action,Sudden Death (1995) Action
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye (1995) Action Adventure Thriller


### STEP 5 – Vectorize using TF-IDF

Now we use TfidfVectorizer from scikit-learn.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer.
# stop_words='english' removes common words like "the", "and".
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and transform the combined text into a sparse TF-IDF
# matrix with one row per movie and one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movies_df['combined'])

# Only the last expression of a cell is displayed automatically, so the shape
# has to be printed explicitly to be visible.
print("TF-IDF matrix shape (movies, terms):", tfidf_matrix.shape)

# Dense view of the first movie's vector (mostly zeros: the matrix is sparse).
tfidf_matrix[:1].toarray()



array([[0., 0., 0., ..., 0., 0., 0.]])

### STEP 6 – Cosine similarity matrix

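Concept (super short):

Each movie = TF-IDF vector. The cosine similarity between two of these vectors (1 = identical direction, 0 = no shared terms) tells us how alike the two movies' descriptions are.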

In [14]:
from sklearn.metrics.pairwise import linear_kernel  # fast cosine for sparse matrices

# Compute the similarity between all pairs of movies.
# TfidfVectorizer L2-normalises every row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity.
# NOTE: the result is a dense (n_movies x n_movies) array; memory grows
# quadratically with the number of movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Only the last expression of a cell is displayed automatically, so print the
# shape explicitly.
print("Similarity matrix shape:", cosine_sim.shape)

# Similarity rows of the first ten movies.
cosine_sim[:10]

array([[1.        , 0.35797045, 0.12863151, ..., 0.        , 0.05346934,
        0.01147169],
       [0.35797045, 1.        , 0.12368123, ..., 0.        , 0.        ,
        0.        ],
       [0.12863151, 0.12368123, 1.        , ..., 0.        , 0.        ,
        0.01044372],
       ...,
       [0.24126041, 0.27336141, 0.10694894, ..., 0.        , 0.        ,
        0.        ],
       [0.13065291, 0.14803698, 0.11894515, ..., 0.        , 0.0292986 ,
        0.        ],
       [0.19745459, 0.22372699, 0.12554709, ..., 0.        , 0.03092479,
        0.        ]])

### STEP 7 – Build a lookup from title → index

We need a way to quickly go from a movie title to its row index in movies_df.

In [16]:
# Row positions must line up with the rows of the similarity matrix, so make
# sure the frame has a clean 0..n-1 index.
# drop=True keeps this cell idempotent: a plain reset_index() inserts an extra
# 'index' column on the first run, 'level_0' on the second, and raises on the
# third.
movies_df = movies_df.reset_index(drop=True)

# Build a mapping from lower-cased title to row position.
title_to_index = pd.Series(movies_df.index, index=movies_df['title'].str.lower())

# A Series does not deduplicate its index on its own: looking up a title that
# appears more than once would return several positions instead of one.
# Keep only the LAST occurrence of each title so every lookup yields a single
# integer.
title_to_index = title_to_index[~title_to_index.index.duplicated(keep='last')]

# Quick check for a known title (case-insensitive)
title_to_index['toy story (1995)']


np.int64(0)

### STEP 8 – Define get_recommendations(title, n=10)

Now the fun part: given a title, find similar movies by cosine similarity.

In [17]:
def get_recommendations(title, n=10):
    """
    Given a movie title (string), return a DataFrame of the top-n most similar movies.

    Relies on the notebook-level objects ``title_to_index`` (lower-cased title ->
    row position), ``cosine_sim`` (pairwise similarity matrix) and ``movies_df``.

    Parameters
    ----------
    title : str
        Movie title; matched case-insensitively against the full title.
    n : int, default 10
        Maximum number of recommendations to return. Values below 1 give an
        empty result.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title``, ``genres`` and ``similarity``, ordered from most to
        least similar and never containing the query movie itself.
        ``None`` when the title has no exact match; in that case up to ten
        titles containing the text are printed as suggestions.
    """
    title_lower = title.lower()

    if title_lower not in title_to_index:
        # No exact match: offer titles that contain the text. regex=False makes
        # this a literal substring search, so input such as "(" or "c++" is not
        # interpreted as a (possibly invalid) regular expression.
        matches = movies_df[
            movies_df['title'].str.lower().str.contains(title_lower, regex=False)
        ]
        print(f"Title '{title}' not found as exact match.")
        if not matches.empty:
            print("Did you mean one of these?")
            print(matches['title'].head(10).to_string(index=False))
        return None

    # Row position of the requested movie. If the lookup table still contains a
    # repeated title, indexing returns a Series; use its last entry so that a
    # single integer is always obtained.
    idx = title_to_index[title_lower]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[-1]
    idx = int(idx)

    # Similarity of every movie to the requested one.
    sims = np.asarray(cosine_sim[idx], dtype=float)

    # Rank from most to least similar. A stable sort keeps tied scores in row
    # order, matching what sorted(..., reverse=True) produces.
    order = np.argsort(-sims, kind='stable')

    # Remove the query movie by position rather than assuming it ranks first:
    # another row with identical text also scores 1.0 and may sort ahead of it.
    order = order[order != idx][:max(int(n), 0)]

    # Return titles and similarity scores as a DataFrame
    results = movies_df.iloc[order][['title', 'genres']].copy()
    results['similarity'] = sims[order]

    return results


### STEP 9 – Test the recommender

Now let's try it with a few example movies.


In [28]:
recommendations = get_recommendations("Notebook, The (2004)", n=3)
recommendations


Unnamed: 0,title,genres,similarity
5352,P.S. (2004),Comedy|Drama|Fantasy|Romance,0.415773
5834,D.E.B.S. (2004),Action|Comedy,0.333216
5261,Before Sunset (2004),Drama|Romance,0.294783


In [29]:
def interactive_recommender():
    """
    Run a simple console loop that asks for movie titles and prints recommendations.

    Each entered title is passed to ``get_recommendations(title, n=10)``. When
    that returns ``None`` (no exact match) the loop simply asks again. Entering
    'q', 'quit' or 'exit' (any case) ends the loop, as does a closed input
    stream or a keyboard/kernel interrupt. Blank input is ignored.

    Returns
    -------
    None
    """
    print("🎬 Movie Recommender – type a title, or 'q' to quit.\n")
    while True:
        try:
            user_input = input("Enter a movie title: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or execution interrupted: leave cleanly instead of
            # surfacing a traceback.
            print("\nGoodbye! 👋")
            break

        if user_input.lower() in ('q', 'quit', 'exit'):
            print("Goodbye! 👋")
            break

        if not user_input:
            # An empty string is a substring of every title, so looking it up
            # would only print arbitrary "did you mean" suggestions.
            continue

        recs = get_recommendations(user_input, n=10)
        if recs is None:
            continue  # go back and ask again

        print("\nTop recommendations:")
        for rank, row in enumerate(recs.itertuples(index=False), start=1):
            print(f"{rank:2d}. {row.title}  |  {row.genres}  (similarity: {row.similarity:.3f})")
        print("\n" + "-"*60 + "\n")

# Run this to start the loop.
# NOTE: the call blocks waiting for keyboard input, so "Run All" pauses here
# until 'q' (or 'quit' / 'exit') is entered.
interactive_recommender()


ðŸŽ¬ Movie Recommender â€“ type a title, or 'q' to quit.

Enter a movie title: terminator
Title 'terminator' not found as exact match.
Did you mean one of these?
        Terminator 2: Judgment Day (1991)
                   Terminator, The (1984)
Terminator 3: Rise of the Machines (2003)
              Terminator Salvation (2009)
                Terminator Genisys (2015)
Enter a movie title: love bird
Title 'love bird' not found as exact match.
Enter a movie title: ghost
Title 'ghost' not found as exact match.
Did you mean one of these?
                              Ghost (1990)
Ghost in the Shell (KÃ´kaku kidÃ´tai) (1995)
           Ghost and Mrs. Muir, The (1947)
        Ghost and the Darkness, The (1996)
              Ghosts of Mississippi (1996)
                 Blackbeard's Ghost (1968)
Ghostbusters (a.k.a. Ghost Busters) (1984)
                    Ghostbusters II (1989)
  Ghost Dog: The Way of the Samurai (1999)
                        Ghost World (2001)
Enter a movie title: q
Good