# Movie Recommendation System

|Activity | Value|
|---------|------|
|Developer| HTET AUNG LYNN|
|Finalized date| 14-Aug-2025|
|Libraries| sklearn, difflib, ipywidgets, pandas, numpy, gdown, os, re|


In [1]:
# import major libraries
import os
from gdown import download
import numpy as np
import pandas as pd
from urllib import request
import re  # python regular expression
import difflib # compare sequences for strings and list

import ipywidgets as widgets
from IPython.display import display

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Defined functions

In [2]:
def clean_title(title):
    """Return ``title`` with every character other than ASCII letters,
    digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

def find_similar_movies(movie_id):
    """Recommend movies liked by the users who also liked ``movie_id``.

    Reads two module-level DataFrames: ``ratings`` (columns ``userId``,
    ``movieId``, ``rating``) and ``mvf`` (columns ``movieId``, ``title``,
    ``genres``).  "Liked" means a rating strictly greater than 4.

    Parameters
    ----------
    movie_id
        ``movieId`` of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        At most 20 rows with columns ``movieId``, ``score``, ``title`` and
        ``genres``, ordered by ``score`` descending.  ``score`` is the share
        of similar users who liked a movie divided by the share of all
        candidate-movie fans who liked it, so values above 1 mark movies that
        are over-represented among similar users.
    """
    cols = ['movieId', 'score', 'title', 'genres']

    # Users who liked the target movie ("similar users")
    similar_users = ratings.query("movieId == @movie_id and rating > 4")["userId"].unique()
    # Every movie those similar users liked
    similar_user_recs = ratings.query("userId in @similar_users and rating > 4")["movieId"]

    # Fraction of similar users who liked each movie
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    # Keep only movies liked by more than 10% of similar users (drops weak recommendations)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    # All "liked" ratings of the candidate movies, from any user
    candidate_ids = similar_user_recs.index
    all_users = ratings.query("movieId in @candidate_ids and rating > 4")
    # Fraction of those users who liked each candidate movie
    all_users_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()

    # One row per candidate movie: share among similar users vs. share among all users
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Compute recommendation score
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

    # nlargest already returns rows sorted by score, so no separate sort is needed.
    top = rec_percentages.nlargest(n=20, columns="score")
    # value_counts() only names its index "movieId" on pandas >= 2; make the
    # key an explicit column so the merge works on any pandas version.
    top = top.rename_axis("movieId").reset_index()

    # The movie table in this notebook is `mvf`; the previous reference to
    # `df` pointed at a name that is never defined here.
    return top.merge(mvf, on='movieId')[cols]


def search(title, n_mov):
    """Return the ``n_mov`` catalogue rows whose titles best match ``title``.

    Uses the module-level ``vectorizer`` and the ``tfidf`` matrix it produced,
    and reads rows from the module-level ``mvf`` DataFrame.

    Parameters
    ----------
    title
        Free-text movie title; cleaned with ``clean_title`` before matching.
    n_mov
        Number of rows to return.  Zero or a negative value yields an empty
        result.

    Returns
    -------
    pandas.DataFrame
        Rows of ``mvf`` ordered from most to least similar, re-indexed from 1.
    """
    title = clean_title(title)  # same normalisation as the indexed titles
    query_vec = vectorizer.transform([title])  # query as a TF-IDF vector
    # One cosine-similarity score per row of `tfidf`, flattened to 1-D
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Reverse first, then take the head.  The previous `[-n_mov:][::-1]`
    # returned the *whole* catalogue for n_mov == 0 because `[-0:]` is `[0:]`.
    top_n = max(int(n_mov), 0)
    indices = np.argsort(similarity)[::-1][:top_n]

    # Present the hits with a 1-based rank as the index
    results = mvf.iloc[indices].reset_index(drop=True).rename(lambda x: x+1)
    return results

In [3]:
def similar_movies(features, vectorizer, mv_name, range_):
    """Return the movies most similar to ``mv_name`` by combined text features.

    Works on the module-level ``mvd`` DataFrame.  The columns named in
    ``features`` are joined into one text per movie, vectorised with
    ``vectorizer`` and compared by cosine similarity.

    Parameters
    ----------
    features
        Column names of ``mvd`` to combine; their values must be strings
        once missing entries are replaced by ``''``.
    vectorizer
        Object with a ``fit_transform`` method (a ``TfidfVectorizer`` in this
        notebook).  It is re-fitted on every call.
    mv_name
        Movie title to look up; matched approximately and case-insensitively.
    range_
        Number of rows to return.  Zero or a negative value yields an empty
        result.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres``, ``director`` and ``popularity`` of the
        most similar movies (the matched movie normally comes first),
        re-indexed from 1.

    Raises
    ------
    IndexError
        If no title in ``mvd`` is close enough to ``mv_name``.
    """
    # Output features
    out_features = ['title', 'genres', 'director', 'popularity']

    # Missing values would break the string join below.  This writes back to
    # the shared `mvd`, so the returned rows also show '' instead of NaN.
    mvd[features] = mvd[features].fillna('')
    combined_features = mvd[features].agg('__'.join, axis=1)  # one text per movie
    feature_vectors = vectorizer.fit_transform(combined_features)  # sparse matrix

    # Match case-insensitively: lower-case both the candidates and the input
    movie_titles = [x.lower() for x in mvd['title'].unique()]
    close_matches = difflib.get_close_matches(mv_name.strip().lower(), movie_titles, n=1)
    if not close_matches:
        # Same exception type as the former bare `[0]`, but with a usable message
        raise IndexError(f"no movie title resembles {mv_name!r}")
    matched_one = close_matches[0]

    # First row whose lower-cased title equals the match; its 'index' column
    # is used as the row position in the feature matrix.
    # NOTE(review): assumes mvd['index'] equals the row position and that mvd
    # has a default RangeIndex - confirm against the CSV.
    is_match = mvd['title'].str.lower() == matched_one
    mv_idx = int(mvd.loc[is_match, 'index'].iloc[0])

    # Only the matched movie's row of the similarity matrix is needed, so
    # compare that single vector against all movies instead of all pairs.
    similarity_scores = cosine_similarity(feature_vectors[mv_idx], feature_vectors).ravel()

    # Descending by score; the stable sort keeps equal scores in row order
    ranked = np.argsort(-similarity_scores, kind="stable")
    indices = ranked[:max(int(range_), 0)]

    return mvd.loc[indices, out_features].reset_index(drop=True).rename(lambda x: x+1)

In [4]:
# def user_interface(df, vectorizer, tfidf):
def user_interface(based_on):
    """Build and display the search widgets for one recommendation mode.

    Parameters
    ----------
    based_on
        ``"ratings"`` routes a query to ``search``; any other value routes it
        to ``similar_movies`` using the module-level ``features`` and
        ``vectorizer``.  Those globals are looked up when a search runs, not
        when the widgets are built.

    Returns
    -------
    None
        The widgets are shown with ``display``.
    """

    def search_operation(_sender=None):
        """
        Handle search operation when button is clicked or Enter is pressed
        """
        title = movie_input.value.strip().lower()

        # The widgets below are reached through the closure; they exist by
        # the time a click or submit can invoke this handler.
        with movie_list:  # route prints/displays into the output area
            movie_list.clear_output()
            if len(title) <= 2:
                print("Please enter at least 3 characters")
                return
            try:
                if based_on == "ratings":
                    result = search(title, range_slider.value)
                else:
                    result = similar_movies(features, vectorizer, title, range_slider.value)
                display(result)
            except Exception as e:
                # UI boundary: show the failure to the user rather than raising
                print(f"Search error: {e}")

    # Text input widget
    movie_input = widgets.Text(
        value='',                                # empty initial value
        placeholder='Type a movie title...',     # placeholder message
        description='Movie Title:',              # description
        style={'description_width': '100px'},    # description width
        layout=widgets.Layout(width='300px'),    # layout for text box
    )

    # Label shown above the slider
    n_movies = widgets.Label("No. of recommended movies:")

    # Search button
    search_button = widgets.Button(description='Search', button_style='primary')

    # Slider for the number of results.  It starts at 1 because asking for
    # zero movies is never meaningful.
    range_slider = widgets.IntSlider(min=1, max=20, step=1, value=2)
    min_label = widgets.Label("min")
    max_label = widgets.Label("max:20")

    # Horizontal rows: text box + button, and the labelled slider.
    # Named `range_box` so the builtin `range` is not shadowed.
    search_box = widgets.HBox([movie_input, search_button])
    range_box = widgets.HBox([min_label, range_slider, max_label])

    # Output area for results (captured by search_operation via closure)
    movie_list = widgets.Output()

    # Trigger a search ONLY on button click or Enter in the text box.
    # NOTE(review): Text.on_submit is reported as deprecated in ipywidgets 8 -
    # confirm before upgrading.
    search_button.on_click(search_operation)
    movie_input.on_submit(search_operation)

    display(search_box, n_movies, range_box, movie_list)

## Based on ratings

### Download data

In [5]:
# URL of the movie catalogue CSV; it is read directly by pandas in a later
# cell, so nothing is written to disk for it here.
mv_file = "https://raw.githubusercontent.com/htetaunglynn94/portfolio_projects/refs/heads/main/data/mv.csv"
root = os.getcwd()
# Disabled alternative: save the catalogue locally instead of reading the URL
# path = os.path.join(root, "movies.csv")
# request.urlretrieve(mv_file, path)

# Disabled: download of a helper module from Google Drive
# # Download class file
# class_file = "https://drive.google.com/uc?export=download&id=1aeS4F5QWJhmGWFhqNGId2XUuboG5NFF_"
# root = os.getcwd()
# path = os.path.join(root, "Uinterface.py")
# download(class_file, path, quiet=False)

# The ratings file is very large and, per the author's note, cannot be read
# directly because of Google Drive's virus-scan step, so it is fetched with
# gdown (install with '!pip install gdown').
rating = "https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1"
# Saved as ratings.csv in the current working directory
path = os.path.join(root, "ratings.csv")
download(rating, path, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1
From (redirected): https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1&confirm=t&uuid=2ccf6106-16b2-40d5-96b0-673d5e546286
To: /content/ratings.csv
100%|██████████| 678M/678M [00:09<00:00, 72.7MB/s]


'/content/ratings.csv'

### Data loading

In [6]:
# Load the movie catalogue straight from the URL held in mv_file;
# `mvf` is the table that search() reads its result rows from.
mvf = pd.read_csv(mv_file)
mvf.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Load the ratings file downloaded above; the relative path resolves against
# the current working directory, which is where the download cell saved it.
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


## Based on selected features

In [8]:
# Movie metadata table used by similar_movies() as the global `mvd`
url = "https://raw.githubusercontent.com/htetaunglynn94/portfolio_projects/refs/heads/main/data/movies.csv"
mvd = pd.read_csv(url)
mvd.sample(3)  # show three random rows as a sanity check

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
3257,3257,7000000,Thriller Drama Crime,,1359,based on novel wall street psychopath white co...,en,American Psycho,A wealthy New York investment banking executiv...,45.310443,...,102.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,I think my mask of sanity is about to slip.,American Psycho,7.3,2066,Christian Bale Willem Dafoe Jared Leto Josh Lu...,"[{'name': 'Suzanne Smith', 'gender': 1, 'depar...",Mary Harron
2506,2506,0,Action Adventure Drama,,38717,sport independent film,en,Madison,"In 1971, air-conditioner repairman and boat en...",0.212108,...,99.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A town's future is riding with one man.,Madison,5.3,3,Jim Caviezel Jake Lloyd Mary McCormack Bruce D...,"[{'name': 'William Bindley', 'gender': 0, 'dep...",William Bindley
1978,1978,24000000,Action Comedy Drama,http://readytorumble.warnerbros.com/flash.html,20697,wrestling sport,en,Ready to Rumble,Two slacker wrestling fans are devastated by t...,3.113009,...,107.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,They're headed to the big time...face first.,Ready to Rumble,4.7,49,David Arquette Scott Caan Oliver Platt Rose Mc...,"[{'name': 'Steven Reuther', 'gender': 2, 'depa...",Brian Robbins


## Comparison

`TfidfVectorizer` converts a collection of text documents into a matrix of `TF-IDF` features, where:

- `TF` (Term Frequency) → How often a word appears in a document
- `IDF` (Inverse Document Frequency) → How unique that word is across all documents

It’s basically a way to transform text into numerical vectors while down-weighting common words like “the” and “is”.

__Based on Ratings__

In [9]:
# Normalise the catalogue titles the same way search() normalises a query
mvf['clean_title'] = mvf['title'].apply(clean_title)
# Consider for unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1,2)) #(unigram, bigram)
# Fit on the cleaned titles; search() reads `vectorizer` and `tfidf` as globals
tfidf = vectorizer.fit_transform(mvf['clean_title']) # output is sparse matrix
user_interface(based_on = "ratings")

HBox(children=(Text(value='', description='Movie Title:', layout=Layout(width='300px'), placeholder='Type a mo…

Label(value='No. of recommended movies:')

HBox(children=(Label(value='min'), IntSlider(value=2, max=20), Label(value='max:20')))

Output()

__Based on features__

In [10]:
# Based on features
# Columns of `mvd` that similar_movies() joins into one text per movie
features = ['title','genres','overview','director','cast','production_companies','keywords','tagline']
# NOTE(review): this rebinds the global `vectorizer` that search() also reads,
# so the ratings-based widget shown earlier will pick up this new object on its
# next search - consider a separate name if both widgets must stay usable.
vectorizer = TfidfVectorizer()
user_interface(based_on = "selected_features")

HBox(children=(Text(value='', description='Movie Title:', layout=Layout(width='300px'), placeholder='Type a mo…

Label(value='No. of recommended movies:')

HBox(children=(Label(value='min'), IntSlider(value=2, max=20), Label(value='max:20')))

Output()