# Movie Recommendation System

In [1]:
# import major libraries
import os
from gdown import download
import numpy as np
import pandas as pd
from urllib import request
import re  # python regular expression

import ipywidgets as widgets
from IPython.display import display

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Based on ratings

### Download data

In [37]:
# Movie metadata: a CSV hosted on GitHub. Only the URL is recorded here; it is
# passed to pandas in the data-loading section, so no local copy is written.
mv_file = "https://raw.githubusercontent.com/htetaunglynn94/portfolio_projects/refs/heads/main/data/mv.csv"
root = os.getcwd()

# Ratings: a large file hosted on Google Drive (the recorded run shows 678M).
# Per the original author's note, Drive's virus-scan step prevents reading it
# directly, so it is fetched with gdown's `download` (needs `pip install gdown`).
rating = "https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1"
path = os.path.join(root, "ratings.csv")

# Skip the transfer when the file is already on disk so that re-running the
# notebook does not fetch hundreds of megabytes again. Delete the local file
# to force a fresh download.
if not os.path.exists(path):
    download(rating, path, quiet=False)
path

Downloading...
From (original): https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1
From (redirected): https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1&confirm=t&uuid=084260f3-91a7-4203-9c14-6f7f37c75035
To: /content/ratings.csv
100%|██████████| 678M/678M [00:08<00:00, 83.3MB/s]


'/content/ratings.csv'

### Defined functions

In [7]:
def clean_title(title):
    """Return `title` stripped of everything except ASCII letters, digits and spaces."""
    disallowed = "[^a-zA-Z0-9 ]"
    cleaned = re.sub(disallowed, "", title)
    return cleaned

def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=20):
    """Recommend movies that fans of `movie_id` like unusually often.

    Reads the module-level `ratings` frame (columns `userId`, `movieId`,
    `rating`) and the movie frame `df` (columns `movieId`, `title`, `genres`).

    Parameters
    ----------
    movie_id : value compared against `ratings['movieId']`
        The movie to find recommendations for.
    min_rating : number, default 4
        A rating counts as a "like" only when it is strictly greater than this.
    min_share : float, default 0.10
        Candidates must be liked by strictly more than this fraction of the
        similar users; weaker candidates are dropped.
    top_n : int, default 20
        Maximum number of recommendations returned.

    Returns
    -------
    pandas.DataFrame
        Columns `movieId`, `score`, `title`, `genres`, highest score first.
        `score` is the share of similar users who liked the movie divided by
        the share of all candidate-likers who liked it.
    """
    cols = ['movieId', 'score', 'title', 'genres']

    # Users who liked the query movie ...
    similar_users = ratings.query("movieId == @movie_id and rating > @min_rating")["userId"].unique()
    # ... and every movie those same users liked.
    similar_user_recs = ratings.query("userId in @similar_users and rating > @min_rating")["movieId"]

    # Fraction of similar users who liked each movie; drop weak candidates.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Likes of the candidate movies across the whole user base.
    candidate_ids = similar_user_recs.index
    all_users = ratings.query("movieId in @candidate_ids and rating > @min_rating")
    # Fraction of those users who liked each candidate movie.
    all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    # One row per candidate movie: like-share among similar users vs. among all users.
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Recommendation score: how much more popular the movie is among similar users.
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

    # nlargest already returns rows ordered by descending score, so no separate sort is needed.
    top = rec_percentages.nlargest(n=top_n, columns="score")
    # Turn the movie-id index into an explicit `movieId` column before merging.
    # The index name produced by value_counts() differs between pandas versions,
    # so merging on an implicitly named index level is fragile.
    top = top.rename_axis("movieId").reset_index()
    return top.merge(df, on='movieId')[cols]


def search(title, n_mov):
    """Return the `n_mov` movies whose cleaned titles best match `title`.

    Uses the module-level fitted `vectorizer`, the title matrix `tfidf` and the
    movie frame `df`; rows of `tfidf` are looked up positionally in `df`.

    Parameters
    ----------
    title : str
        Free-text query; it is passed through `clean_title` before vectorizing.
    n_mov : int
        Number of rows to return. Zero or a negative value yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows of `df`, most similar first.
    """
    # Negative counts make no sense; treat them like zero.
    n_mov = max(n_mov, 0)

    title = clean_title(title)
    # Project the query into the same TF-IDF space as the movie titles.
    query_vec = vectorizer.transform([title])
    # cosine_similarity returns a (1, n_titles) array; flatten to one score per title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argsort is ascending: reverse to best-first, then truncate.
    # The previous `[-n_mov:][::-1]` returned every row for n_mov == 0,
    # because `-0` is `0` and `[0:]` is the full array.
    indices = np.argsort(similarity)[::-1][:n_mov]
    return df.iloc[indices]


def user_interface():
    """Build and display the title-search UI.

    The UI consists of a text box plus a Search button, a slider selecting how
    many results to show, and an output area. Clicking the button or pressing
    Enter in the text box runs `search` and renders the result in the output area.
    Takes no arguments and returns nothing.
    """

    def search_operation():
        """
        Handle search operation when button is clicked or Enter is pressed
        """
        title = movie_input.value.strip()

        # `movie_input`, `range_slider` and `movie_list` are closure variables
        # created further down in user_interface(); they exist by the time a
        # callback fires. Output produced inside the `with` block is captured
        # by the Output widget.
        with movie_list:
            movie_list.clear_output()
            if len(title) > 2:
                try:
                    display(search(title, range_slider.value))
                except Exception as e:
                    print(f"Search error: {e}")
            else:
                print("Please enter at least 3 characters")

    # Text input widget
    movie_input = widgets.Text(value = '',                          # empty initial value
                            placeholder = 'Type a movie title...',  # placeholder message
                            description = 'Movie Title:',           # description
                            style = {'description_width': '100px'}, # description width
                            layout = widgets.Layout(width='300px')) # layout for text box

    # Label shown above the slider
    n_movies = widgets.Label("No. of recommended movies:")

    # Search button
    search_button = widgets.Button(description='Search',
                                button_style='primary')

    # Slider widget. The minimum is 1: asking for 0 movies is meaningless, and
    # with the slice `[-0:]` in search() it would display the entire catalogue.
    range_slider = widgets.IntSlider(min=1, max=20, step=1, value=2)
    min_label = widgets.Label("min")
    max_label = widgets.Label("max")

    # Horizontal layouts: text box + button on one row, slider with its labels on another.
    # (The slider row is not called `range`, to avoid shadowing the builtin.)
    search_box = widgets.HBox([movie_input, search_button])
    slider_row = widgets.HBox([min_label, range_slider, max_label])

    # Output area for results
    movie_list = widgets.Output()

    # Trigger a search ONLY on button click and on Enter in the text box.
    search_button.on_click(lambda c: search_operation())
    # NOTE(review): Text.on_submit is reported as deprecated in ipywidgets 8;
    # it is kept because the suggested replacement (observing `value`) fires on
    # different events - confirm against the installed version before changing.
    movie_input.on_submit(lambda s: search_operation())

    display(search_box, n_movies, slider_row, movie_list)

### Data loading

In [38]:
# Read the movie catalogue from the URL stored in `mv_file` and preview the first rows.
df = pd.read_csv(filepath_or_buffer=mv_file)
df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


`re.sub(pattern, replacement, string)`

* `[^...]` means "NOT any of these characters"
* `a-zA-Z` means all lowercase and uppercase letters
* `0-9` means all digits
* the space before the closing `]` keeps spaces, so words stay separated

In [8]:
# Store a punctuation-free copy of every title; the TF-IDF vectorizer is fitted on this column.
df['clean_title'] = df['title'].map(clean_title)
df.sample(n=5)

Unnamed: 0,movieId,title,genres,clean_title
50989,182185,Elmer Elephant (1936),Animation,Elmer Elephant 1936
61739,206775,Girl From Nowhere (2017),Thriller,Girl From Nowhere 2017
24594,121857,Too Tough to Die: A Tribute to Johnny Ramone (...,Documentary,Too Tough to Die A Tribute to Johnny Ramone 2006
54655,190081,Lucia (1968),Drama,Lucia 1968
15632,82283,Secret Ceremony (1968),Drama|Thriller,Secret Ceremony 1968


`TfidfVectorizer` converts a collection of text documents into a matrix of `TF-IDF` features, where:

- `TF` (Term Frequency) → How often a word appears in a document
- `IDF` (Inverse Document Frequency) → How unique that word is across all documents

It’s basically a way to transform text into numerical vectors while down-weighting common words like “the” and “is”.

In [9]:
# Use single words (unigrams) and adjacent word pairs (bigrams) as features.
title_ngrams = (1, 2)
vectorizer = TfidfVectorizer(ngram_range=title_ngrams)

# Learn the vocabulary from the cleaned titles and encode them; the result is a sparse matrix.
tfidf = vectorizer.fit_transform(df['clean_title'])
tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 446566 stored elements and shape (62423, 170073)>

In [39]:
# Load the ratings file from the working directory (the download section saves it there).
ratings = pd.read_csv(filepath_or_buffer="ratings.csv")
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [27]:
# Show the interactive search UI. It relies on `df`, `vectorizer` and `tfidf`
# from the cells above, so those cells must have been run first.
user_interface()

Unnamed: 0,movieId,title,genres
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
