Movies Recomendation System

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import wordcloud
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
sns.set_style('whitegrid')
sns.set(font_scale=1.5)
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from scipy.sparse.linalg import svds
from surprise import Reader, Dataset, SVD, evaluate

# Gathering and Preparing Data

In [2]:
ratings = pd.read_csv('ratings.csv')

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
len(ratings.movieId.unique())

9724

In [5]:
tags = pd.read_csv('tags.csv')

In [6]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [7]:
movies = pd.read_csv('movies.csv')

In [8]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

In [9]:
final = movies.merge(ratings, on='movieId')

In [10]:
final.title = final.title.str.slice(0, -7)

In [11]:
final

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1,4.0,964982703
1,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",5,4.0,847434962
2,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",7,4.5,1106635946
3,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",15,2.5,1510577970
4,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic,"['Action', 'Animation', 'Comedy', 'Fantasy']",184,4.0,1537109082
100832,193583,No Game No Life: Zero,"['Animation', 'Comedy', 'Fantasy']",184,3.5,1537109545
100833,193585,Flint,['Drama'],184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple,"['Action', 'Animation']",184,3.5,1537110021


In [18]:
finalf = final.sample(frac =.25) 
if (0.25*(len(final))== len(rows)): 
    print( "Cool") 
    print(len(final), len(rows)) 


Cool
100836 25209


# EDA

In [None]:
final['title'] = final['title'].fillna("").astype('str')
title_corpus = ' '.join(final['title'])
title_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='black', height=2000, width=4000).generate(title_corpus)

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

In [None]:
tags.tag = tags.tag.fillna("").astype('str')
title_corpus = ' '.join(tags['tag'])
title_wordcloud2 = WordCloud(stopwords=STOPWORDS, background_color='black', height=2000, width=4000).generate(title_corpus)

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud2)
plt.axis('off')
plt.show()

In [None]:
ratings

In [None]:
plt.hist(ratings['rating'], bins=5, ec='black')
plt.xlabel('Rating')
plt.ylabel('Count')

In [None]:
sns.distplot(ratings['rating'].fillna(ratings['rating'].median()))

In [None]:
# Make a census of the genre keywords
genre_labels = set()
for s in movies['genres'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))

# Function that counts the number of times each of the genre keywords appear
def count_word(dataset, ref_col, census):
    keyword_count = dict()
    for s in census: 
        keyword_count[s] = 0
    for census_keywords in dataset[ref_col].str.split('|'):        
        if type(census_keywords) == float and pd.isnull(census_keywords): 
            continue        
        for s in [s for s in census_keywords if s in census]: 
            if pd.notnull(s): 
                keyword_count[s] += 1
    #______________________________________________________________________
    # convert the dictionary in a list to sort the keywords by frequency
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

# Calling this function gives access to a list of genre keywords which are sorted by decreasing frequency
keyword_occurences, dum = count_word(movies, 'genres', genre_labels)
keyword_occurences

In [None]:
# Define the dictionary used to produce the genre wordcloud
genres = dict()
trunc_occurences = keyword_occurences[0:18]
for s in trunc_occurences:
    genres[s[0]] = s[1]

# Create the wordcloud
genre_wordcloud = WordCloud(width=1000,height=400, background_color='white')
genre_wordcloud.generate_from_frequencies(genres)

# Plot the wordcloud
f, ax = plt.subplots(figsize=(16, 8))
plt.imshow(genre_wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
movies['genres'].astype(str)

# Model Building

## Content Based

In [12]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(final['genres'])
tfidf_matrix.shape

(100836, 177)

In [None]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

In [None]:
titles = movies.title
indicies = pd.Series(movies.index, index=movies['title'])

def genre_rec(title):
    idx = indicies[title]
    similar_score = list(enumerate(cosine_sim[idx]))
    similar_score = sorted(similar_score, key=lambda x: x[1], reverse=True)
    similar_score = similar_score[1:21]
    movie_indices = [i[0] for i in similar_score]
    return titles.iloc[movie_indices]

In [None]:
genre_rec('Toy Story').head(5)

# Matrix Factorization

In [37]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]

In [39]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
R = Ratings.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

  """Entry point for launching an IPython kernel.


In [41]:
U, sigma, Vt = svds(Ratings_demeaned, k = 50)

In [42]:
sigma = np.diag(sigma)

In [43]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [44]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167328,0.402751,0.840184,-0.076281,-0.551337,2.504091,-0.890114,-0.026443,0.196974,1.593259,...,-0.023453,-0.019967,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211459,0.006658,0.033455,0.017419,0.18343,-0.062473,0.083037,0.024158,0.04933,-0.15253,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003588,0.030518,0.046393,0.008176,-0.006247,0.107328,-0.012416,0.003779,0.007297,-0.059362,...,0.005909,0.006209,0.00561,0.00561,0.005909,0.00561,0.005909,0.005909,0.005909,0.008004
3,2.051549,-0.387104,-0.252199,0.087562,0.130465,0.27021,0.477835,0.040313,0.025858,-0.017365,...,0.004836,0.004172,0.0055,0.0055,0.004836,0.0055,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778511,0.065749,0.111744,0.273144,0.584426,0.25493,0.128788,-0.085541,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127


In [51]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [None]:
already_rated, predictions = recommend_movies(preds, 20, movies, ratings, 20)

In [54]:
already_rated

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
87,20,2300,5.0,1054147430,"Producers, The (1968)",['Comedy']
36,20,1025,5.0,1054038107,"Sword in the Stone, The (1963)","['Animation', 'Children', 'Fantasy', 'Musical']"
231,20,5991,5.0,1054037184,Chicago (2002),"['Comedy', 'Crime', 'Drama', 'Musical']"
43,20,1073,5.0,1054038071,Willy Wonka & the Chocolate Factory (1971),"['Children', 'Comedy', 'Fantasy', 'Musical']"
45,20,1097,5.0,1054038065,E.T. the Extra-Terrestrial (1982),"['Children', 'Drama', 'Sci-Fi']"
...,...,...,...,...,...,...
221,20,5504,0.5,1054037842,Spy Kids 2: The Island of Lost Dreams (2002),"['Adventure', 'Children']"
189,20,4821,0.5,1054037528,Joy Ride (2001),"['Adventure', 'Thriller']"
216,20,5419,0.5,1054037857,Scooby-Doo (2002),"['Adventure', 'Children', 'Comedy', 'Fantasy',..."
171,20,4367,0.5,1054147289,Lara Croft: Tomb Raider (2001),"['Action', 'Adventure']"


In [56]:
predictions.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
2284,3175,Galaxy Quest (1999),"['Adventure', 'Comedy', 'Sci-Fi']"
817,1136,Monty Python and the Holy Grail (1975),"['Adventure', 'Comedy', 'Fantasy']"
424,500,Mrs. Doubtfire (1993),"['Comedy', 'Drama']"
4118,6377,Finding Nemo (2003),"['Adventure', 'Animation', 'Children', 'Comedy']"


In [60]:
# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset for 5-fold evaluation
data.split(n_folds=5)

In [61]:
svd = SVD()

# Compute the RMSE of the SVD algorithm.
evaluate(svd, data, measures=['RMSE'])




Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.8656
------------
Fold 2
RMSE: 0.8728
------------
Fold 3
RMSE: 0.8699
------------
Fold 4
RMSE: 0.8742
------------
Fold 5
RMSE: 0.8767
------------
------------
Mean RMSE: 0.8719
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8656276533828758,
                             0.872824269233869,
                             0.8699245474898156,
                             0.8741670684125324,
                             0.8767303790021899]})

In [62]:
# Compute the RMSE of the SVD algorithm.
evaluate(svd, data, measures=['MAE'])


Evaluating MAE of algorithm SVD.

------------
Fold 1
MAE:  0.6703
------------
Fold 2
MAE:  0.6722
------------
Fold 3
MAE:  0.6679
------------
Fold 4
MAE:  0.6726
------------
Fold 5
MAE:  0.6746
------------
------------
Mean MAE : 0.6715
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.6703177076559269,
                             0.6721668455049675,
                             0.6678736349357063,
                             0.6725904817695088,
                             0.6745607736122694]})

In [63]:
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a242b40b8>

In [64]:
svd.predict(20, 1)

Prediction(uid=20, iid=1, r_ui=None, est=4.317584259418656, details={'was_impossible': False})