In [67]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate


In [68]:
movies = pd.read_csv("movies_metadata.csv",
                     usecols=["id","overview","title","vote_average","vote_count","release_date"],low_memory=False)
rating = pd.read_csv("ratings_small.csv")
movies.head()

Unnamed: 0,id,overview,release_date,title,vote_average,vote_count
0,862,"Led by Woody, Andy's toys live happily in his ...",30/10/1995,Toy Story,7.7,5415.0
1,8844,When siblings Judy and Peter discover an encha...,15/12/1995,Jumanji,6.9,2413.0
2,15602,A family wedding reignites the ancient feud be...,22/12/1995,Grumpier Old Men,6.5,92.0
3,31357,"Cheated on, mistreated and stepped on, the wom...",22/12/1995,Waiting to Exhale,6.1,34.0
4,11862,Just when George Banks has recovered from his ...,10/02/1995,Father of the Bride Part II,5.7,173.0


In [69]:
movies.shape

(100004, 6)

In [70]:
movies.isnull().sum()

id              54538
overview        55492
release_date    54625
title           54544
vote_average    54544
vote_count      54544
dtype: int64

In [71]:
movies = movies.dropna()
movies.duplicated().sum()

28

In [72]:
movies = movies.drop_duplicates()
movies = movies.rename(columns={"id":"movieId"})
movies["movieId"] = movies["movieId"].astype("int64")
movies = movies.reset_index(drop=True)
movies["overview"].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [73]:
movies["overview"].loc[1]

"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."

In [74]:
movies["overview"] = movies["overview"].str.replace(r"[^\w\s]"," ",regex=True).str.replace(r"[\d]"," ",regex=True)
tfidf = TfidfVectorizer(stop_words="english", min_df = 4)
tfidf_matrix = tfidf.fit_transform(movies["overview"])
tfidf_matrix.shape

(44407, 23499)

In [75]:
similarity = cosine_similarity(tfidf_matrix,tfidf_matrix)
similarity.shape

MemoryError: Unable to allocate 4.05 GiB for an array with shape (544068475,) and data type float64

In [None]:
similarity[1]

array([0.01616292, 1.        , 0.04922993, ..., 0.        , 0.02371782,
       0.01128353])

In [None]:
index = movies[movies["movieId"] == 8844].index[0]

In [None]:
# tfidf.get_feature_names()

feature_names = tfidf.get_feature_names_out()

feature_names

array(['aa', 'aamir', 'aaron', ..., 'не', 'но', 'по'], dtype=object)

In [None]:
tfidf_matrix.toarray() # the scores at the intersection of documents and terms.

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
similarity_scores = pd.DataFrame(similarity[index],
                                 columns=["similarity"])
movie_indices = similarity_scores.sort_values("similarity", ascending=False)[1:11].index
movies['title'].iloc[movie_indices]

21441         Table No. 21
43372    Liar Game: Reborn
40696         Snowed Under
44206                 Quiz
34895             The Mend
17118       The Dark Angel
8770               Quintet
6137             Brainscan
9467             Word Wars
8049               Masques
Name: title, dtype: object

**Creating User Movie DataFrame**

In [None]:
df.shape

(6346678, 28)

In [None]:
df["title"].nunique()

42275

In [None]:
values_pd = df["title"].value_counts() # the number of comments for each movie

values_pd

title
Beauty and the Beast    1233
Alice in Wonderland     1166
Jane Eyre                939
A Christmas Carol        852
Les Misérables           828
                        ... 
Tercera Llamada            1
The Thompsons              1
Princesse Tam Tam          1
#chicagoGirl               1
Je hais les enfants!       1
Name: count, Length: 42275, dtype: int64

In [None]:
rare_movies = values_pd[values_pd < 5].index

rare_movies

Index(['Red White & Blue', 'Burton and Taylor', 'My One and Only',
       'A Star for Two', 'The Goodbye Kiss', 'I Live My Life', 'Kicks',
       'That Summer of White Roses', 'Ilsa, Harem Keeper of the Oil Sheiks',
       'The Year of Living Vicariously',
       ...
       'The Case Against 8', 'Maps to the Stars', 'Backstairs',
       'Annabel Takes a Tour', 'Andy Hardy Meets Debutante', 'Tercera Llamada',
       'The Thompsons', 'Princesse Tam Tam', '#chicagoGirl',
       'Je hais les enfants!'],
      dtype='object', name='title', length=3495)

In [None]:
user_title_df = df_.groupby(["userId","title"])["rating"].mean().unstack().notnull()
user_title_df.shape

(671, 38780)

In [None]:
user_title_df.head()

title,!Women Art Revolution,#1 Cheerleader Camp,#Horror,"$1,000 on the Black","$100,000 for Ringo",$5 a Day,$9.99,$ellebrity,'49-'17,'71,...,Приключения Шерлока Холмса и доктора Ватсона: Двадцатый век начинается,Семь кабинок,Совершенно серьезно,Убить дракона,Юленька,هیچ کجا هیچ کس,‘Rameau’s Nephew’ by Diderot (Thanx to Dennis Young) by Wilma Schoen,’Round Midnight,…And the Fifth Horseman Is Fear,ファンタスティポ
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,True,False,True,...,False,True,False,False,False,True,False,False,False,False
5,True,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [None]:
user_title_df.columns

Index(['!Women Art Revolution', '#1 Cheerleader Camp', '#Horror',
       '$1,000 on the Black', '$100,000 for Ringo', '$5 a Day', '$9.99',
       '$ellebrity', ''49-'17', ''71',
       ...
       'Приключения Шерлока Холмса и доктора Ватсона: Двадцатый век начинается',
       'Семь кабинок', 'Совершенно серьезно', 'Убить дракона', 'Юленька',
       'هیچ کجا هیچ کس',
       '‘Rameau’s Nephew’ by Diderot (Thanx to Dennis Young) by Wilma Schoen',
       '’Round Midnight', '…And the Fifth Horseman Is Fear', 'ファンタスティポ'],
      dtype='object', name='title', length=38780)

In [None]:
sample_guy = user_title_df.sample(1,random_state=45).index[0]
random_user_df = user_title_df[user_title_df.index == sample_guy] # observation units belonging to the sample.
movies_watched = random_user_df.dropna(axis=1).columns.tolist() # the movies that the sample has voted for
movies_watched_df = user_title_df[movies_watched]
user_movie_count = movies_watched_df.notnull().sum(axis=1) # the number of movies each user has watched in the sample
user_movie_count.max()

38780

In [None]:
users_same_movies = user_movie_count[user_movie_count > (movies_watched_df.shape[1] * 60 ) / 100].index # people who watched more than 60% of the movies that the sample watched

users_same_movies

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       662, 663, 664, 665, 666, 667, 668, 669, 670, 671],
      dtype='int64', name='userId', length=671)

In [None]:
filted_df = movies_watched_df[movies_watched_df.index.isin(users_same_movies)]

filted_df

title,!Women Art Revolution,#1 Cheerleader Camp,#Horror,"$1,000 on the Black","$100,000 for Ringo",$5 a Day,$9.99,$ellebrity,'49-'17,'71,...,Приключения Шерлока Холмса и доктора Ватсона: Двадцатый век начинается,Семь кабинок,Совершенно серьезно,Убить дракона,Юленька,هیچ کجا هیچ کس,‘Rameau’s Nephew’ by Diderot (Thanx to Dennis Young) by Wilma Schoen,’Round Midnight,…And the Fifth Horseman Is Fear,ファンタスティポ
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,True,False,True,...,False,True,False,False,False,True,False,False,False,False
5,True,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
668,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
669,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
670,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [None]:
corr_df = filted_df.T.corr().unstack().drop_duplicates() # the correlations between users
corr_df.sort_values(ascending=False).head(20)

userId  userId
1       1         1.000000
151     369       0.897083
279     400       0.863863
151     400       0.857359
369     400       0.850806
151     279       0.843812
279     369       0.839704
252     329       0.835211
329     459       0.834939
92      590       0.832829
191     513       0.831038
        317       0.824203
108     225       0.822903
82      400       0.815009
225     375       0.801133
82      191       0.800207
375     568       0.797001
151     590       0.795936
64      657       0.795863
        513       0.793077
dtype: float64

In [None]:
movies_similarity = movies_similarity.sort_values(ascending=False).reset_index()
movies_similarity.columns = ["movieId","movies_similarity"]
movies_similarity.head()

NameError: name 'movies_similarity' is not defined

In [None]:
#df = pd.merge(movies, rating, how="inner", on="movieId")
#df.head()

KeyError: 'movieId'

In [None]:
print("Movie DataFrame columns:", movies.columns)
print("Rating DataFrame columns:", rating.columns)

Movie DataFrame columns: Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'movieId'],
      dtype='object')
Rating DataFrame columns: Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [None]:
movie_ids = [130219, 356, 4422, 541]

movies = ["The Dark Knight (2011)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Forrest Gump (1994)",
          "Blade Runner (1982)"]
sample_df = df[df.movieId.isin(movie_ids)]
sample_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,movieId,userId,rating,timestamp
544168,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,11010,tt0110877,it,Il postino,Simple Italian postman learns to love poetry w...,...,Released,,The Postman,False,7.6,181.0,356,2,3.0,835355628
544169,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,11010,tt0110877,it,Il postino,Simple Italian postman learns to love poetry w...,...,Released,,The Postman,False,7.6,181.0,356,3,5.0,1298862167
544170,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,11010,tt0110877,it,Il postino,Simple Italian postman learns to love poetry w...,...,Released,,The Postman,False,7.6,181.0,356,4,5.0,949919763
544171,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,11010,tt0110877,it,Il postino,Simple Italian postman learns to love poetry w...,...,Released,,The Postman,False,7.6,181.0,356,5,4.0,1163374152
544172,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,11010,tt0110877,it,Il postino,Simple Italian postman learns to love poetry w...,...,Released,,The Postman,False,7.6,181.0,356,7,3.0,851868188


In [None]:
sample_df.shape

(137622, 28)

In [None]:
user_movie_df = sample_df.pivot_table(index=["userId"],columns = ["title"], values ="rating")
user_movie_df.shape

(392, 247)

In [None]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(sample_df[['userId','movieId','rating']],reader)


In [None]:
trainset, testset = train_test_split(data, test_size=.25)
svd_model = SVD()
svd_model.fit(trainset)
predictions = svd_model.test(testset)

In [None]:
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.12   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.124500144501593, details={'was_impossible': False})

In [None]:
svd_model.predict(uid=1.0, iid=356, verbose=True)

user: 1.0        item: 356        r_ui = None   est = 4.00   {'was_impossible': False}


Prediction(uid=1.0, iid=356, r_ui=None, est=3.9988149607301637, details={'was_impossible': False})

In [None]:
sample_df[sample_df["userId"] == 1]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,movieId,userId,rating,timestamp


In [None]:
param_grid = {'n_epochs': [5, 10, 20],
              'lr_all': [0.002, 0.005, 0.007]}

In [None]:
gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)


In [None]:
gs.fit(data)


NameError: name 'data' is not defined

In [None]:
gs.best_score['rmse']

AttributeError: 'GridSearchCV' object has no attribute 'best_score'

In [None]:
gs.best_params['rmse']

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

In [None]:
# dir(svd_model)
svd_model.n_epochs

NameError: name 'svd_model' is not defined

In [None]:
svd_model = SVD(**gs.best_params['rmse'])
data = data.build_full_trainset()
svd_model.fit(data)

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

In [None]:
def suggest(df,user_id,sug):
    
    didnt_watch = df["movieId"][~(df["userId"] == user_id)].drop_duplicates().values.tolist()
    temp_dict={}
    
    for i in didnt_watch:
        
        temp_dict[i] = svd_model.predict(uid=user_id, iid=i)[3]
        
    suggestions = pd.DataFrame(temp_dict.items(),columns=["movieId",'possible_rate']).sort_values(by="possible_rate", ascending=False).head(sug)
    merged = pd.merge(suggestions,movie[["movieId","title"]], how="inner", on="movieId")
    
    return merged


In [None]:
suggest(df,21,15).sort_values(by="title", ascending=False)