In [2]:
# importing the packages required for the model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Reading users file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [3]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(3883, 127)

In [5]:
#now we will calculate cosine similarity
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.14193614, 0.09010857, 0.1056164 ],
       [0.14193614, 1.        , 0.        , 0.        ],
       [0.09010857, 0.        , 1.        , 0.1719888 ],
       [0.1056164 , 0.        , 0.1719888 , 1.        ]])

In [6]:
#now we will build a fucntion to show top 20 suggestion
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
print(indices)

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

title
Toy Story (1995)                         0
Jumanji (1995)                           1
Grumpier Old Men (1995)                  2
Waiting to Exhale (1995)                 3
Father of the Bride Part II (1995)       4
                                      ... 
Meet the Parents (2000)               3878
Requiem for a Dream (2000)            3879
Tigerland (2000)                      3880
Two Family House (2000)               3881
Contender, The (2000)                 3882
Length: 3883, dtype: int64


In [7]:
#output of the above implementation
genre_recommendations('Good Will Hunting (1997)')

25                                        Othello (1995)
26                                   Now and Then (1995)
29     Shanghai Triad (Yao a yao yao dao waipo qiao) ...
30                                Dangerous Minds (1995)
35                               Dead Man Walking (1995)
39                       Cry, the Beloved Country (1995)
42                                    Restoration (1995)
52                                       Lamerica (1994)
54                                        Georgia (1995)
56                          Home for the Holidays (1995)
61                             Mr. Holland's Opus (1995)
66                                       Two Bits (1995)
77                            Crossing Guard, The (1995)
79          White Balloon, The (Badkonake Sefid ) (1995)
81                       Antonia's Line (Antonia) (1995)
82       Once Upon a Time... When We Were Colored (1995)
89                    Journey of August King, The (1995)
92                             

In [8]:
#from here we are going to implement the collaborative model
# # Fill NaN values in user_id and movie_id column with 0
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

In [9]:
# Randomly sample 1% of the ratings dataset
small_data = ratings.sample(frac=0.02)
# Check the sample info
print(small_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20004 entries, 238817 to 396269
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   user_id   20004 non-null  int64
 1   movie_id  20004 non-null  int64
 2   rating    20004 non-null  int64
dtypes: int64(3)
memory usage: 625.1 KB
None


In [10]:
#splitting the data set in testing and training dataset
#from sklearn import cross_validation as cv
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(small_data, test_size=0.2)
print(train_data,test_data)

        user_id  movie_id  rating
656344     3955      1541       3
544036     3353      3608       3
682285     4085      1836       1
233302     1420       590       5
448186     2761      1081       4
...         ...       ...     ...
649348     3913      1395       1
692007     4140       169       1
671308     4033      1391       4
472932     2907       527       4
93377       624      3526       5

[16003 rows x 3 columns]         user_id  movie_id  rating
512474     3163         1       5
915402     5533      2019       5
641687     3859      2231       4
776583     4637       785       4
8189         56       474       4
...         ...       ...     ...
713743     4277      1694       4
520956     3216       728       3
34265       231      1748       4
997257     6022      2150       3
157439     1011      3916       4

[4001 rows x 3 columns]


In [11]:
# Create two user-item matrices, one for training and another for testing
# #train_data_matrix = train_data.as_matrix(columns = ['userid', 'movieid', 'rating'])
train_data_matrix = train_data.to_numpy()
test_data_matrix = test_data.to_numpy()

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix.shape)

(16003, 3)
(4001, 3)


In [12]:
from sklearn.metrics.pairwise import pairwise_distances
#using this method we will calculate pearson correlation
# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[1.         0.75555098 0.99763259 0.99961778]
 [0.75555098 1.         0.7988123  0.77337267]
 [0.99763259 0.7988123  1.         0.99915246]
 [0.99961778 0.77337267 0.99915246 1.        ]]


In [13]:
# this code will calculate the Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])

[[ 1.         -0.00881376  0.00992702]
 [-0.00881376  1.         -0.06550328]
 [ 0.00992702 -0.06550328  1.        ]]


In [14]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [18]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [19]:
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 45.040703015587475
Item-based CF RMSE: 52.07785567484252


In [20]:
# RMSE on the train data
print('User-based RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print('Item-based RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

User-based RMSE: 22.06841533059549
Item-based RMSE: 4.013679375168611
