<a href="https://colab.research.google.com/github/karimaljundi/recommend-movies-model/blob/main/movie_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re


In [2]:
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
def clean_title(title):
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
movies["clean-title"] = movies["title"].apply(clean_title)

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean-title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies['clean-title'])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
  title = clean_title(title)
  query_vec=vectorizer.transform([title])
  similarity= cosine_similarity(query_vec, tfidf).flatten()
  indicies = np.argpartition(similarity, -5)[-5:]
  results= movies.iloc[indicies]
  return results

In [8]:
ratings = pd.read_csv('ratings.csv')

In [9]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
494376,3397,784,2.0,1.058077e+09
494377,3397,785,3.0,1.058076e+09
494378,3397,788,3.5,1.058077e+09
494379,3397,832,2.5,1.058079e+09


In [10]:
movie_id = 1

In [11]:
similar_users = ratings[(ratings["movieId"]==movie_id) & (ratings["rating"] ==5)]["userId"].unique()
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  120,  127,  143,
        152,  158,  162,  186,  188,  211,  229,  230,  249,  259,  297,
        298,  302,  329,  355,  359,  369,  371,  381,  392,  428,  435,
        447,  468,  477,  484,  513,  537,  540,  541,  551,  553,  561,
        582,  609,  611,  623,  624,  631,  644,  653,  654,  670,  683,
        686,  694,  697,  709,  733,  741,  749,  752,  765,  768,  773,
        785,  793,  796,  803,  805,  807,  811,  830,  834,  856,  904,
        905,  911,  927,  947,  950,  956,  966,  969,  986, 1007, 1010,
       1013, 1036, 1065, 1079, 1092, 1096, 1101, 1118, 1123, 1138, 1140,
       1141, 1143, 1146, 1150, 1167, 1169, 1171, 1176, 1179, 1192, 1198,
       1199, 1200, 1228, 1230, 1240, 1268, 1273, 1304, 1305, 1313, 1334,
       1336, 1344, 1378, 1395, 1397, 1398, 1422, 1445, 1448, 1476, 1477,
       1478, 1480, 1494, 1502, 1510, 1527, 1540, 1548, 1558, 1560, 1569,
       1585, 1610, 1635, 1652, 1653, 1676, 1681, 16

In [12]:
similar_users_recs = ratings[(ratings["userId"].isin(similar_users) & (ratings["rating"] ==5))]['movieId']
similar_users_recs

5101         1
5105        34
5111       110
5114       150
5127       260
          ... 
486745    1060
486747    1079
486749    1083
486764    1391
486767    1476
Name: movieId, Length: 14394, dtype: int64

In [13]:
similar_users_recs = similar_users_recs.value_counts()/len(similar_users)
similar_users_recs = similar_users_recs[similar_users_recs > 0.1]

In [14]:
similar_users_recs

movieId
1        1.000000
318      0.398524
260      0.343173
527      0.276753
296      0.276753
           ...   
2716     0.103321
2324     0.103321
58559    0.103321
904      0.103321
223      0.103321
Name: count, Length: 73, dtype: float64

In [15]:
all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index) & (ratings["rating" ] > 4))]

In [16]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
72,2,110,5.0,1.141417e+09
76,2,260,5.0,1.141417e+09
79,2,318,5.0,1.141417e+09
82,2,356,4.5,1.141417e+09
...,...,...,...,...
494321,3397,223,5.0,1.058076e+09
494324,3397,260,4.5,1.058132e+09
494329,3397,296,4.5,1.058080e+09
494331,3397,318,5.0,1.058084e+09


In [17]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_recs

movieId
318     0.351573
296     0.289974
2571    0.242136
593     0.233617
527     0.230013
          ...   
1259    0.048493
1580    0.048165
380     0.047837
1234    0.046855
2355    0.027195
Name: count, Length: 73, dtype: float64

In [18]:
rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124836
318,0.398524,0.351573
260,0.343173,0.221822
527,0.276753,0.230013
296,0.276753,0.289974
...,...,...
2716,0.103321,0.058650
2324,0.103321,0.082569
58559,0.103321,0.136959
904,0.103321,0.060944


In [19]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [20]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [21]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124836,8.010499
3114,0.243542,0.054718,4.450847
2355,0.110701,0.027195,4.070600
588,0.210332,0.068152,3.086219
595,0.202952,0.065858,3.081640
...,...,...,...
2858,0.136531,0.166121,0.821881
5952,0.121771,0.151048,0.806173
7153,0.125461,0.159240,0.787876
58559,0.103321,0.136959,0.754392


In [22]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean-title
0,1.0,0.124836,8.010499,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.243542,0.054718,4.450847,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110701,0.027195,4.0706,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
580,0.210332,0.068152,3.086219,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.202952,0.065858,3.08164,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
1047,0.162362,0.054063,3.003198,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
375,0.129151,0.047837,2.699793,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,True Lies 1994
33,0.140221,0.057339,2.445461,34,Babe (1995),Children|Drama,Babe 1995
1120,0.125461,0.055701,2.252399,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
2305,0.118081,0.05308,2.224591,2396,Shakespeare in Love (1998),Comedy|Drama|Romance,Shakespeare in Love 1998


In [40]:
def rec_movies(movie_name):
  movie_id = search(movie_name).iloc[0]['movieId']
  similar_users = ratings[(ratings["movieId"]==movie_id) & (ratings["rating"] ==5)]["userId"].unique()
  similar_users_recs = ratings[(ratings["userId"].isin(similar_users) & (ratings["rating"] ==5))]['movieId']
  similar_users_recs = similar_users_recs.value_counts()/len(similar_users)
  similar_users_recs = similar_users_recs[similar_users_recs > 0.1]
  all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index) & (ratings["rating" ] > 4))]
  all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
  rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
  rec_percentages.columns = ["similar", "all"]
  rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
  rec_percentages = rec_percentages.sort_values("score", ascending=False)
  return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [45]:
rec_movies("Matrix")

Unnamed: 0,score,title,genres
6809,48.571429,"Matrix Revolutions, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
10971,34.615385,Crank (2006),Action|Thriller
10553,26.470588,Underworld: Evolution (2006),Action|Fantasy|Horror
6247,23.294118,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
4264,22.5,"Fast and the Furious, The (2001)",Action|Crime|Thriller
13879,20.0,Law Abiding Citizen (2009),Drama|Thriller
5111,18.947368,Resident Evil (2002),Action|Horror|Sci-Fi|Thriller
7248,17.142857,Hellboy (2004),Action|Adventure|Fantasy|Horror
9284,17.142857,"Animatrix, The (2003)",Action|Animation|Drama|Sci-Fi
6631,16.666667,Underworld (2003),Action|Fantasy|Horror
