<a href="https://colab.research.google.com/github/intheblueside/machine_learning_projects/blob/main/movie_rec/movie_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A simple movie recommendation system

In [1]:
# import necessary libraries
import unicodedata

import pandas as pd

In [2]:
# Load the movie catalogue from movies.csv (the path is relative to the notebook's working directory).
movies = pd.read_csv('movies.csv')

In [3]:
# Preview the movie catalogue.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
# regular expressions
import re

def clean_title(title):
  """Normalize a movie title for text search.

  Accented letters are folded to their base ASCII letter (e.g. "é" -> "e"),
  then every character other than ASCII letters, digits and spaces is removed.

  Args:
    title: raw title string, e.g. "Toy Story (1995)".

  Returns:
    The cleaned string, e.g. "Toy Story 1995".
  """
  # NFD splits an accented letter into base letter + combining mark; the regex
  # below then drops only the mark instead of deleting the whole letter.
  title = unicodedata.normalize("NFD", title)
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [6]:
# Store the normalized form of every title in a new "clean_title" column (this modifies `movies` in place).
movies["clean_title"] = movies["title"].apply(clean_title)

In [7]:
# Preview again to check the added clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


Building the model
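- term frequency (TF): each title is stored as a vector marking which terms it contains (0/1)
- inverse document frequency (IDF): terms that appear in many titles get a lower weight
- each title vector is TF * IDF
- a search is vectorized the same way and compared against the title matrix to find the most similar titles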

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF (term frequency - inverse document frequency) vectorizer;
# ngram_range=(1,2) indexes single words as well as pairs of adjacent words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the title matrix used by search().
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
  """Return the movies whose titles best match `title`, best match first.

  Args:
    title: free-text search string; it is normalized with clean_title first.
    top_n: maximum number of rows to return (default 5).

  Returns:
    Rows of `movies`, ordered from most to least similar to the query.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])  # project the query into the fitted TF-IDF space
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # argpartition raises when asked for more elements than exist.
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition picks the top_n scores in linear time but leaves them in
  # arbitrary order, so rank that handful explicitly (highest score first).
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
  return movies.iloc[ranked]

In [None]:
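# indices: positions of the best-matching titles (how many is set inside search)
# similarity: cosine similarity between the search terms and every movie title
# query_vec: the search terms converted into a TF-IDF vector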

In [11]:
# create widgets
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title: ",
    disabled=False,
)
# Output area that holds the search results
movie_list = widgets.Output()

def on_type(data):
  """Observer callback: refresh the result list when the text box value changes."""
  with movie_list:
    movie_list.clear_output()
    typed = data['new']
    if len(typed) <= 5:
      return  # ignore inputs of 5 characters or fewer
    display(search(typed))

movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

In [12]:
# Load the user ratings from ratings.csv (the path is relative to the notebook's working directory).
ratings = pd.read_csv('ratings.csv')

In [13]:
# Preview the ratings table.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
778723,5264,173197,3.5,1544403047
778724,5264,173291,4.0,1501709831
778725,5264,173729,3.5,1508433951
778726,5264,175141,4.0,1538074303


In [14]:
# Column types of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [None]:
# find the users who liked the same movie, then see which other movies those users liked

In [15]:
# Example movie to walk through; movieId 1 is "Toy Story (1995)" in the movies table.
movie_id = 1

In [16]:
# distinct ids of the users who rated movie_id at 5 or above
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)] ["userId"].unique()

In [17]:
# Inspect the selected user ids.
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  120,  127,  143,
        152,  158,  162,  186,  188,  211,  229,  230,  249,  259,  297,
        298,  302,  329,  355,  359,  369,  371,  381,  392,  428,  435,
        447,  468,  477,  484,  513,  537,  540,  541,  551,  553,  561,
        582,  609,  611,  623,  624,  631,  644,  653,  654,  670,  683,
        686,  694,  697,  709,  733,  741,  749,  752,  765,  768,  773,
        785,  793,  796,  803,  805,  807,  811,  830,  834,  856,  904,
        905,  911,  927,  947,  950,  956,  966,  969,  986, 1007, 1010,
       1013, 1036, 1065, 1079, 1092, 1096, 1101, 1118, 1123, 1138, 1140,
       1141, 1143, 1146, 1150, 1167, 1169, 1171, 1176, 1179, 1192, 1198,
       1199, 1200, 1228, 1230, 1240, 1268, 1273, 1304, 1305, 1313, 1334,
       1336, 1344, 1378, 1395, 1397, 1398, 1422, 1445, 1448, 1476, 1477,
       1478, 1480, 1494, 1502, 1510, 1527, 1540, 1548, 1558, 1560, 1569,
       1585, 1610, 1635, 1652, 1653, 1676, 1681, 16

In [18]:
# movieId of every rating above 4 given by one of the similar users (one entry per rating row)
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [19]:
# Inspect the movieIds liked by the similar users.
similar_user_recs

5101         1
5105        34
5111       110
5114       150
5127       260
          ... 
775504    1617
775505    1663
775510    1784
775517    2396
775521    5060
Name: movieId, Length: 29014, dtype: int64

In [20]:
# Number of above-4 ratings each movie received from the similar users, most frequent first.
similar_user_recs.value_counts()

movieId
1         439
318       179
260       163
356       145
296       141
         ... 
112370      1
159690      1
166918      1
637         1
3524        1
Name: count, Length: 4846, dtype: int64

In [21]:
# Convert the counts into the fraction of similar users who liked each movie,
# then keep only the movies liked by more than 10% of them.
# NOTE: this overwrites similar_user_recs, so re-run the cell that builds it
# before re-running this one.
similar_user_recs = (
    similar_user_recs.value_counts() / len(similar_users)
).loc[lambda share: share > .1]

In [22]:
# Inspect the per-movie share among similar users (only values above 0.1 remain).
similar_user_recs

movieId
1       1.000000
318     0.407745
260     0.371298
356     0.330296
296     0.321185
          ...   
111     0.102506
1527    0.102506
1278    0.102506
2918    0.100228
1961    0.100228
Name: count, Length: 93, dtype: float64

In [23]:
# every rating above 4, from any user, for one of the candidate movies
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [24]:
# per candidate movie: number of above-4 ratings divided by the number of distinct users
# who gave at least one candidate movie a rating above 4
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [25]:
# Inspect the per-movie share among the wider user group.
all_users_recs

movieId
318      0.346727
296      0.285626
2571     0.239229
356      0.231773
593      0.224938
           ...   
377      0.043911
1580     0.043496
1278     0.037283
78499    0.036247
2355     0.024648
Name: count, Length: 93, dtype: float64

In [26]:
# Put both shares side by side, aligned on movieId; `keys` names the two columns.
rec_percent = pd.concat(
    [similar_user_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

In [27]:
# Inspect the two shares side by side.
rec_percent

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124482
318,0.407745,0.346727
260,0.371298,0.211889
356,0.330296,0.231773
296,0.321185,0.285626
...,...,...
111,0.102506,0.075808
1527,0.102506,0.060066
1278,0.102506,0.037283
2918,0.100228,0.051988


In [28]:
# score = share among similar users / share among the wider group;
# a value above 1 means similar users like the movie more often than the wider group does
rec_percent["score"] = rec_percent["similar"] / rec_percent["all"]

In [29]:
# Rank the candidates from highest to lowest score.
rec_percent = rec_percent.sort_values("score", ascending=False)

In [30]:
rec_percent  # a higher score means a stronger recommendation

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124482,8.033278
3114,0.293850,0.051574,5.697615
2355,0.120729,0.024648,4.898145
78499,0.145786,0.036247,4.022024
588,0.255125,0.066073,3.861269
...,...,...,...
79132,0.129841,0.122618,1.058902
2858,0.177677,0.168186,1.056431
3578,0.116173,0.111226,1.044476
58559,0.129841,0.134217,0.967392


In [31]:
# Attach title and genres to the ten highest-scoring movies by matching the movieId index against movies["movieId"].
rec_percent.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124482,8.033278,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.29385,0.051574,5.697615,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.120729,0.024648,4.898145,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.145786,0.036247,4.022024,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.255125,0.066073,3.861269,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.214123,0.061723,3.46908,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.164009,0.054681,2.999379,34,Babe (1995),Children|Drama,Babe 1995
4780,0.193622,0.065452,2.958248,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
359,0.252847,0.086785,2.913478,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
1047,0.143508,0.04971,2.886902,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971


In [32]:
def find_similar_movies(movie_id, fan_rating=5, like_rating=4, min_share=.1, top_n=10):
  """Recommend movies that fans of `movie_id` like unusually often.

  Args:
    movie_id: movieId of the film the recommendations are based on.
    fan_rating: a user counts as a fan when they rated `movie_id` at or above this value.
    like_rating: a rating strictly above this value counts as a "like".
    min_share: keep only movies liked by more than this fraction of the fans.
    top_n: maximum number of rows to return.

  Returns:
    DataFrame with "score", "title" and "genres" columns, highest score first.
    score = (share of fans who liked the movie) / (share of comparison users
    who liked it), where the comparison users are all users who liked at
    least one of the candidate movies.
  """
  # all "like" ratings, computed once and reused for both shares
  liked = ratings[ratings["rating"] > like_rating]

  #finding rec from similar users
  is_fan_row = (ratings["movieId"] == movie_id) & (ratings["rating"] >= fan_rating)
  similar_users = ratings.loc[is_fan_row, "userId"].unique()
  similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

  #keep only movies liked by more than min_share of the similar users
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > min_share]

  #how common the same movies are among all users who liked any candidate
  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  # align both shares on movieId; `keys` names the resulting columns
  rec_percent = pd.concat([similar_user_recs, all_users_recs], axis=1, keys=["similar", "all"])
  rec_percent["score"] = rec_percent["similar"] / rec_percent["all"]
  rec_percent = rec_percent.sort_values("score", ascending=False)

  #only show 3 columns
  return rec_percent.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]



In [33]:
# Text box for the movie title the recommendations are based on
movie_name_input = widgets.Text(
    value="Toy Story",
    description="Movie Title: ",
    disabled=False,
)

# Output area that holds the recommendation table
recommendation_list = widgets.Output()

# NOTE: this definition reuses the name on_type from the search-widget cell.
def on_type(data):
  """Observer callback: show recommendations for the first search hit of the typed title."""
  with recommendation_list:
    recommendation_list.clear_output()
    typed = data["new"]
    if len(typed) <= 5:
      return  # ignore inputs of 5 characters or fewer
    first_hit = search(typed).iloc[0]
    display(find_similar_movies(first_hit["movieId"]))

movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title: ')

Output()