<a href="https://colab.research.google.com/github/itsyoru/movie-recommendation-system/blob/main/recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the movie catalogue from movies.csv (path is relative to the working directory).
movies = pd.read_csv("movies.csv")

In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
import re

def clean_title(title):
  """Return ``title`` with every character other than ASCII letters, digits and spaces removed.

  Characters are deleted, not replaced, so whitespace that surrounded a
  removed symbol is left as-is (e.g. "A & B" becomes "A  B").
  """
  disallowed = "[^a-zA-Z0-9 ]"
  stripped = re.sub(disallowed, "", title)
  return stripped

In [None]:
# Store a cleaned copy of every title (via clean_title); the TF-IDF vectorizer is fitted on this column.
movies['clean_title'] = movies['title'].apply(clean_title)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2): index single words and adjacent word pairs of each title.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles and keep the resulting matrix (one row per movie) for search().
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
  """Return the ``top_n`` rows of ``movies`` whose titles best match ``title``.

  The query is cleaned with ``clean_title``, projected with the fitted
  ``vectorizer`` and compared to every row of ``tfidf`` by cosine similarity.

  Args:
    title: Free-text movie title typed by the user.
    top_n: Maximum number of matches to return (default 5). It is clamped to
      the number of indexed titles, so small catalogues do not raise.

  Returns:
    A slice of ``movies`` ordered from most to least similar, so
    ``results.iloc[0]`` is the best match.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never ask argpartition for more elements than exist (it raises ValueError).
  top_n = min(top_n, similarity.size)
  if top_n <= 0:
    return movies.iloc[0:0]

  # argpartition only guarantees that the last top_n positions hold the
  # largest values; it does not order them, so rank the candidates explicitly.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
  return movies.iloc[ranked]

In [None]:
import ipywidgets as widgets
import IPython.display as ipd

# Text box for the title query; matches are rendered into the Output widget.
movie_input = widgets.Text(value='', description='Movie Title:', disabled=False)

movie_list = widgets.Output()

def on_type(data):
  """Refresh ``movie_list`` with search results for the text box's new value.

  ``data`` is the change dictionary passed by ``observe``; its ``"new"`` entry
  is the current text. Values of five characters or fewer only clear the
  output and show nothing.
  """
  with movie_list:
    movie_list.clear_output()
    query = data["new"]
    if len(query) <= 5:
      return
    display(search(query))

# Re-run the handler whenever the widget's value changes.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)


Text(value='', description='Movie Title:')

Output()

In [None]:
# Load the user ratings from ratings.csv (path is relative to the working directory).
ratings = pd.read_csv("ratings.csv")

In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
738061,5033,2353,3.0,1032808165
738062,5033,2455,3.0,1032808566
738063,5033,2501,2.0,1032807392
738064,5033,2640,5.0,1032808165


In [None]:
ratings.dtypes

Unnamed: 0,0
userId,int64
movieId,int64
rating,float64
timestamp,int64


In [None]:
# Example movie used for the step-by-step walkthrough below (movieId 1 is "Toy Story (1995)" in movies).
movie_id = 1

In [None]:
# Distinct users who rated the chosen movie above 4.
similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] > 4)]['userId'].unique()

In [None]:
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  120,  127,  143,
        152,  158,  162,  186,  188,  211,  229,  230,  249,  259,  297,
        298,  302,  329,  355,  359,  369,  371,  381,  392,  428,  435,
        447,  468,  477,  484,  513,  537,  540,  541,  551,  553,  561,
        582,  609,  611,  623,  624,  631,  644,  653,  654,  670,  683,
        686,  694,  697,  709,  733,  741,  749,  752,  765,  768,  773,
        785,  793,  796,  803,  805,  807,  811,  830,  834,  856,  904,
        905,  911,  927,  947,  950,  956,  966,  969,  986, 1007, 1010,
       1013, 1036, 1065, 1079, 1092, 1096, 1101, 1118, 1123, 1138, 1140,
       1141, 1143, 1146, 1150, 1167, 1169, 1171, 1176, 1179, 1192, 1198,
       1199, 1200, 1228, 1230, 1240, 1268, 1273, 1304, 1305, 1313, 1334,
       1336, 1344, 1378, 1395, 1397, 1398, 1422, 1445, 1448, 1476, 1477,
       1478, 1480, 1494, 1502, 1510, 1527, 1540, 1548, 1558, 1560, 1569,
       1585, 1610, 1635, 1652, 1653, 1676, 1681, 16

In [None]:
# movieId of every rating above 4 given by those users (one entry per rating row).
similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings["rating"] > 4)]['movieId']

In [None]:
similar_user_recs

Unnamed: 0,movieId
5101,1
5105,34
5111,110
5114,150
5127,260
...,...
733872,78499
733874,81834
733875,81847
733876,88125


In [None]:
# Convert per-movie counts into the fraction of similar users who rated the movie above 4.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies rated above 4 by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [None]:
similar_user_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,1.000000
318,0.410628
260,0.364734
356,0.326087
296,0.323671
...,...
1259,0.103865
377,0.103865
111,0.101449
1527,0.101449


In [None]:
# Ratings above 4 from any user, restricted to the candidate movies kept in similar_user_recs.
all_users = (ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)])

In [None]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
79,2,318,5.0,1141417181
...,...,...,...,...
738037,5033,1258,5.0,1032807392
738038,5033,1259,5.0,1032806990
738045,5033,1307,5.0,1032806615
738056,5033,2028,5.0,1032806550


In [None]:
# Fraction of users who rated each candidate above 4. The denominator is the number of
# distinct users present in all_users, not every user in ratings.
all_user_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

all_user_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,0.347289
296,0.286551
2571,0.240781
356,0.231887
593,0.223644
...,...
1259,0.046421
377,0.044035
1580,0.043818
78499,0.036659


In [None]:
# Place both fractions side by side, aligned on the movieId index.
rec_percentage = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentage.columns = ['similar', 'all']

In [None]:
rec_percentage

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124295
318,0.410628,0.347289
260,0.364734,0.210629
356,0.326087,0.231887
296,0.323671,0.286551
...,...,...
1259,0.103865,0.046421
377,0.103865,0.044035
111,0.101449,0.073536
1527,0.101449,0.060521


In [None]:
# Ratio of the two fractions: values above 1 mean the movie is rated above 4 more often
# among similar users than among the wider pool in all_users.
rec_percentage['score'] = rec_percentage['similar'] / rec_percentage['all']

In [None]:
# Highest score first.
rec_percentage  = rec_percentage.sort_values('score', ascending=False)

In [None]:
rec_percentage

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124295,8.045375
3114,0.287440,0.051193,5.614816
2355,0.118357,0.024729,4.786211
78499,0.142512,0.036659,3.887460
588,0.253623,0.066377,3.820925
...,...,...,...
2571,0.251208,0.240781,1.043304
2858,0.169082,0.165293,1.022925
4973,0.108696,0.107375,1.012297
58559,0.130435,0.135358,0.963629


In [None]:
# Attach the movies columns (title, genres, clean_title) to the first ten rows of the sorted table.
rec_percentage.head(10).merge(movies, left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124295,8.045375,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.28744,0.051193,5.614816,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.118357,0.024729,4.786211,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.142512,0.036659,3.88746,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.253623,0.066377,3.820925,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.217391,0.061822,3.5164,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.166667,0.055098,3.024934,34,Babe (1995),Children|Drama,Babe 1995
1047,0.147343,0.050108,2.940481,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
359,0.253623,0.086551,2.930333,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
4780,0.190821,0.065727,2.903254,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001


In [None]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like more often than other users do.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Args:
        movie_id: ``movieId`` of the movie to base recommendations on.
        min_rating: A rating counts as a "like" when it is strictly greater
            than this value (default 4).
        min_share: A movie becomes a candidate when strictly more than this
            fraction of the similar users liked it (default 0.10).
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, ordered by
        descending score. ``score`` is the share of similar users who liked the
        movie divided by the share of users in the wider pool who liked it.
        The frame is empty when nobody liked ``movie_id`` or no movie clears
        ``min_share``.
    """
    empty_result = pd.DataFrame(columns=["score", "title", "genres"])

    # Every step below only looks at "liked" ratings, so filter once.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: avoid dividing counts by zero further down.
        return empty_result

    # Share of similar users who liked each movie.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]
    if similar_user_recs.empty:
        return empty_result

    # Share of users in the wider pool (anyone who liked a candidate) who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    rec_percentage = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentage.columns = ["similar", "all"]

    rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]
    rec_percentage = rec_percentage.sort_values("score", ascending=False)
    return rec_percentage.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [None]:
# Text box pre-filled with an example title; recommendations are rendered into the Output widget.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` for the text box's new value.

    ``data`` is the change dictionary passed by ``observe``; its ``"new"`` entry
    is the current text. For text longer than five characters, the first row
    returned by ``search`` supplies the ``movieId`` handed to
    ``find_similar_movies``. Shorter text only clears the output.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matches = search(query)
        display(find_similar_movies(matches.iloc[0]["movieId"]))

# Re-run the handler whenever the widget's value changes.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:
import pickle

# Persist the fitted TfidfVectorizer with pickle so it can be reloaded without refitting.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the TF-IDF matrix in SciPy's compressed sparse .npz format.
from scipy import sparse
sparse.save_npz("tfidf.npz", tfidf)

# Save the movies dataframe, now including the clean_title column.
# NOTE(review): this writes to the same "movies.csv" path the notebook reads at the top,
# so the input file is replaced by the augmented copy. Confirm that is intended.
movies.to_csv("movies.csv", index=False)


In [None]:
from google.colab import files
# Hand the three files written by the previous cell to google.colab's files.download helper.
files.download('vectorizer.pkl')
files.download('tfidf.npz')
files.download('movies.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>