# Movie Recommendation System

## Based on ratings

In [1]:
# import major libraries
import os
from gdown import download
import numpy as np
import pandas as pd
from urllib import request
import re  # python regular expression

import ipywidgets as widgets
from IPython.display import display

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### Download data

In [2]:
# Download movie data
mv_file = "https://raw.githubusercontent.com/htetaunglynn94/portfolio_projects/refs/heads/main/data/mv.csv"
# root = os.getcwd()
# path = os.path.join(root, "movies.csv")
# request.urlretrieve(mv_file, path)

# # Download class file
# class_file = "https://drive.google.com/uc?export=download&id=1aeS4F5QWJhmGWFhqNGId2XUuboG5NFF_"
# root = os.getcwd()
# path = os.path.join(root, "Uinterface.py")
# download(class_file, path, quiet=False)

# File size is very large and cannot read due to google virus scanning method
# need '!pip install gdown'
rating = "https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1"
path = os.path.join(root, "ratings.csv")
download(rating, path, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1
From (redirected): https://drive.google.com/uc?export=download&id=12SjCQWIAmb1TxZ1OLt5Cp7gcXffs9bt1&confirm=t&uuid=9e680c5d-488e-4b89-9108-ee7131b93988
To: /home/htetaunglynn/git/portfolio_projects/machine_learning/logistic_regression/ratings.csv
100%|████████████████████████████████████████| 678M/678M [01:07<00:00, 10.1MB/s]


'/home/htetaunglynn/git/portfolio_projects/machine_learning/logistic_regression/ratings.csv'

### Defined functions

In [66]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","",title)

def find_similar_movies(movie_id):
    similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] >= 5)] \
                    ['userId'].unique()
    similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]['movieId']

    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]
    all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(20).merge(df, left_index=True, right_on='movieId')[['score', 'title', 'genres', 'rating']]


def search(title, n_mov):
    title = clean_title(title)
    query_vec =vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten() # shape: (, 62423)
    indices = np.argsort(similarity)[-n_mov:][::-1] # sort the similarity in reverse order
    results = df.iloc[indices] # start from the end and go backward one step at a time
    return results


# def user_interface(df, vectorizer, tfidf):
def user_interface():

    def search_operation():
        """
        Handle search operation when button is clicked or Enter is pressed
        """
        title = movie_input.value.strip()
        # print(title)

        # can use due to global variables
        with movie_list:  # setup and clean up operations automatically
            movie_list.clear_output()
            if len(title) > 2:
                try:
                    display(search(title, range_slider.value))
                except Exception as e:
                    print(f"Search error: {e}")
            else:
                print("Please enter at least 3 characters")

    # Text input widgets
    movie_input = widgets.Text(value = 'toy story',                             # empty initial value
                            placeholder = 'Type a movie title...',  # place holder message
                            description = 'Movie Title:',           # description
                            style = {'description_width': '100px'}, # description width
                            layout = widgets.Layout(width='300px')) # layout for text box

    # Label
    n_movies = widgets.Label("No. of recommended movies:")    # label before the range

    # Create search button
    search_button = widgets.Button(description='Search',      # search button
                                button_style='primary')


    # Slider widget
    range_slider = widgets.IntSlider(min=0, max=10, step=1, value=2)
    min_label = widgets.Label("min")
    max_label = widgets.Label("max")


    # Create horizontal layout
    # Put input text box and button at the same row
    search_box = widgets.HBox([movie_input, search_button])
    range = widgets.HBox([min_label, range_slider, max_label])

    # Output area for results
    movie_list = widgets.Output()  # declare as global variable

    # Connect ONLY button click and Enter key
    search_button.on_click(lambda c: search_operation())
    movie_input.on_submit(lambda s: search_operation())

    display(search_box, n_movies, range, movie_list)

### Data loading

In [4]:
# Load movie file
df = pd.read_csv(mv_file)
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


`re.sub(pattern, replacement, string)`

* `[^...]` means "NOT any of these characters"
* `a-zA-Z` means all lowercase and uppercase letters
* `0-9`**bold text** means all digits

In [55]:
df['clean_title'] = df['title'].apply(clean_title)
df.sample(5)

Unnamed: 0,movieId,title,genres,clean_title
60829,204254,Schlock (1973),Comedy|Horror|Sci-Fi,Schlock 1973
33629,143601,Two People (1973),Drama|Romance,Two People 1973
61149,205094,El Americano: The Movie (2016),Animation|Children,El Americano The Movie 2016
9176,27343,When Strangers Appear (2001),Action|Mystery|Thriller,When Strangers Appear 2001
34248,144980,Friends (1971),Drama|Romance,Friends 1971


In [56]:
# Consider for unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1,2)) #(unigram, bigram)
tfidf = vectorizer.fit_transform(df['clean_title']) # output is sparse matrix

In [65]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [67]:
user_interface()

HBox(children=(Text(value='toy story', description='Movie Title:', layout=Layout(width='300px'), placeholder='…

Label(value='No. of recommended movies:')

HBox(children=(Label(value='min'), IntSlider(value=2, max=10), Label(value='max')))

Output()

In [62]:
from IPython.display import HTML

display(HTML('''
  <div>
    <img src="https://upload.wikimedia.org/wikipedia/en/6/60/Toy_Story_1995.jpg" width="200">
    <p><a href="https://www.imdb.com/title/tt0114709/" target="_blank">IMDb Link</a></p>
  </div>
'''))


### Detail calculation

In [9]:
# Test with sample data

a = [5,2,4,1,9,3,8,1]
#    0,1,2,3,4,5,6  <-- indices
# Original array with indices:
# Index: 0  1  2  3  4  5  6
# Value: 5  2  4  1  9  3  8

# Sorted by value:       1, 2, 3, 4, 5, 6
# Corresponding indices: 3, 1, 5, 2, 0, 4

b = np.argsort(a)[::-1]
[a[i] for i in b]

[9, 8, 5, 4, 3, 2, 1, 1]

In [36]:
title = 'Toy Story 1995'
title = clean_title(title)
# print(title)
query_vec = vectorizer.transform([title])
# print(query_vec)
similarity = cosine_similarity(query_vec, tfidf).flatten() # shape: (, 62423)
indices = np.argsort(similarity)[-5:][::-1]
print(indices)
results = df.iloc[indices] # start from the end and go backward one step at a time
display(results)

[    0 14813  3021 59767 20497]


Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [7]:
# Interactive search box widget
import ipywidgets as widgets
from IPython.display import display

# Text input widgets
movie_input = widgets.Text(value = '',                             # empty initial value
                           placeholder = 'Type a movie title...',  # place holder message
                           description = 'Movie Title:',           # description
                           style = {'description_width': '100px'}, # description width
                           layout = widgets.Layout(width='300px')) # layout for text box

# Label
n_movies = widgets.Label("No. of recommended movies:")    # label before the range

# Create search button
search_button = widgets.Button(description='Search',      # search button
                               button_style='primary')


# Slider widget
range_slider = widgets.IntSlider(min=0, max=10, step=1, value=2)
min_label = widgets.Label("min")
max_label = widgets.Label("max")


# Create horizontal layout
# Put input text box and button at the same row
search_box = widgets.HBox([movie_input, search_button])
range = widgets.HBox([min_label, range_slider, max_label])

# Output area for results
movie_list = widgets.Output()  # declare as global variable

# Connect ONLY button click and Enter key
search_button.on_click(lambda c: search_operation())
movie_input.on_submit(lambda s: search_operation())

display(search_box, n_movies, range, movie_list)

HBox(children=(Text(value='', description='Movie Title:', layout=Layout(width='300px'), placeholder='Type a mo…

Label(value='No. of recommended movies:')

HBox(children=(Label(value='min'), IntSlider(value=2, max=10), Label(value='max')))

Output()

the hulk
the hulk
the hulk
the hulk
hello
hello
hello
hello
hellosdfasdf
hellosdfasdf


In [38]:
ratings.shape

(25000095, 4)

In [39]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [40]:
movie_id = 1

In [41]:
conds = (ratings['movieId'] == movie_id) & (ratings['rating'] > 4)
len(ratings[conds]['userId'].unique())

18835

In [42]:
len(ratings[conds]['userId'])

18835

In [43]:
similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] >= 5)]['userId'].unique()
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530])

In [44]:
similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]['movieId']
similar_user_recs

Unnamed: 0,movieId
5101,1
5105,34
5111,110
5114,150
5127,260
...,...
24998388,3706
24998389,3735
24998391,3763
24998392,4187


In [45]:
similar_user_recs.value_counts()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,13506
318,5599
260,5464
356,4690
296,4628
...,...
121432,1
115414,1
110750,1
92726,1


In [46]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,1.000000
318,0.414556
260,0.404561
356,0.347253
296,0.342663
...,...
121432,0.000074
115414,0.000074
110750,0.000074
92726,0.000074


In [47]:
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs
# Need to fix results

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,1.000000
318,0.414556
260,0.404561
356,0.347253
296,0.342663
...,...
1259,0.102991
7361,0.101881
1206,0.101362
1307,0.101066


In [48]:
all_users = ratings[ratings['movieId'].isin(similar_user_recs.index) & (ratings['rating'] > 4 )]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000055,162541,4973,4.5,1240950790
25000057,162541,4993,5.0,1240952610
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613


In [49]:
all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())
all_users_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,0.345282
296,0.287220
2571,0.246217
356,0.237370
593,0.227930
...,...
1387,0.047886
1307,0.046195
745,0.037362
78499,0.035445


In [50]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [51]:
rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.125844,7.946323
318,0.414556,0.345282,1.200632
260,0.404561,0.224195,1.804507
356,0.347253,0.237370,1.462916
296,0.342663,0.287220,1.193030
...,...,...,...
1259,0.102991,0.049349,2.087002
7361,0.101881,0.105172,0.968704
1206,0.101362,0.087500,1.158430
1307,0.101066,0.046195,2.187804


In [52]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [69]:
rec_percentages.head(10).merge(df, left_index=True, right_on='movieId')[['score', 'title', 'genres','rating']]

KeyError: "['rating'] not in index"