In [1]:
import pandas as pd
import numpy as np
# Read the three MovieLens tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
    )
)

In [2]:
# Display the movies table (movieId, title, genres) read from movies.csv.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
# Drop the timestamp column from the ratings table.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# Drop the timestamp column from the tags table.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
tags = tags.drop(columns='timestamp', errors='ignore')
tags

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA
...,...,...,...
3678,606,7382,for katie
3679,606,7936,austere
3680,610,3265,gun fu
3681,610,3265,heroic bloodshed


In [5]:
# Attach each user's rating to the tags that same user gave the same movie.
# Default (inner) join: tags without a matching rating are left out.
tagsRatings = tags.merge(ratings, on=['userId', 'movieId'])
tagsRatings

Unnamed: 0,userId,movieId,tag,rating
0,2,60756,funny,5.0
1,2,60756,Highly quotable,5.0
2,2,60756,will ferrell,5.0
3,2,89774,Boxing story,5.0
4,2,89774,MMA,5.0
...,...,...,...,...
3471,606,6107,World War II,4.0
3472,606,7382,for katie,4.5
3473,610,3265,gun fu,5.0
3474,610,3265,heroic bloodshed,5.0


In [6]:
# Add title and genres to every tag/rating row by joining on the movie id.
df = movies.merge(tagsRatings, on='movieId')

In [7]:
# Display the merged frame: one row per (movie, user, tag) with that user's rating.
df

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,4.0
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,star wars,4.0
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,anime,3.5
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,comedy,3.5
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,gintama,3.5


# Data Cleaning

In [8]:
# Count the missing values in each column of the merged frame.
df.isna().sum()

movieId    0
title      0
genres     0
userId     0
tag        0
rating     0
dtype: int64

In [9]:
# Number of rows for each distinct rating value.
df['rating'].value_counts()

4.0    999
5.0    883
3.5    577
4.5    496
3.0    274
2.0    102
2.5     80
1.0     31
1.5     26
0.5      8
Name: rating, dtype: int64

In [10]:
# Every half-star rating value from 0.5 up to 5.0, in ascending order.
ratingList = [half_stars / 2 for half_stars in range(1, 11)]

def makeNotFloat(oldList, columnName):
    """Recode ``df[columnName]`` from the values in ``oldList`` to 5, 10, ..., 50.

    The i-th entry of ``oldList`` is replaced by the i-th entry of
    ``[5, 10, ..., 50]``; values that are not in ``oldList`` are left as they
    are. Only the first ten entries of ``oldList`` are used.

    Parameters
    ----------
    oldList : sequence
        Existing values of the column, in the order they should map onto
        5, 10, ..., 50.
    columnName : str
        Name of the column of the notebook-level ``df`` to recode.

    Returns
    -------
    None
        The notebook-level ``df`` is updated; nothing is returned.
    """
    binaryList = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    newVals = dict(zip(oldList, binaryList))
    # Assign the recoded column back instead of calling replace(inplace=True)
    # on df[columnName]: that acts on a temporary Series and is not guaranteed
    # to reach df (it never does under pandas copy-on-write).
    df[columnName] = df[columnName].replace(newVals)

# Summarise column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 3475
Data columns (total 6 columns):
movieId    3476 non-null int64
title      3476 non-null object
genres     3476 non-null object
userId     3476 non-null int64
tag        3476 non-null object
rating     3476 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 190.1+ KB


In [11]:
#makeNotFloat(ratingList, 'rating')

In [12]:
# Rating distribution again; the makeNotFloat call above is commented out,
# so the values are still the original half-star floats.
df['rating'].value_counts()

4.0    999
5.0    883
3.5    577
4.5    496
3.0    274
2.0    102
2.5     80
1.0     31
1.5     26
0.5      8
Name: rating, dtype: int64

In [13]:
# Cast the ratings to integers. astype(int) truncates, so each half-star
# rating is folded into the whole star below it (4.5 -> 4, 0.5 -> 0).
whole_star_rating = df['rating'].astype(int)
df['rating'] = whole_star_rating
df['rating'].value_counts()

4    1495
5     883
3     851
2     182
1      57
0       8
Name: rating, dtype: int64

# Testing Models

In [14]:
#user_34_prediction = knn_means.predict('100', '222')
#user_34_prediction

In [15]:
#predictions[:10]

In [16]:
#user = 222
#item = 222
#knn_baseline.predict(user, item)

In [17]:
#from surprise import BaselineOnly
#from surprise.model_selection import cross_validate

In [18]:
#cross_validate(BaselineOnly(), data, verbose = True)

In [19]:
#from surprise import KNNBasic
#KNN = KNNBasic().fit(train)

In [20]:
#KNN.get_neighbors(iid=item, k=1)

In [21]:
import time

from surprise import accuracy
from surprise import BaselineOnly, NormalPredictor
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import SVD, NMF, SlopeOne, CoClustering
from surprise.similarities import cosine, msd, pearson


def _timed_rmse(label, algo, trainset, testset):
    """Fit ``algo``, score it on ``testset`` and time the whole run.

    Parameters
    ----------
    label : str
        Name used as the prefix of the summary string.
    algo : surprise algorithm instance
        Object exposing ``fit(trainset)`` and ``test(testset)``.
    trainset, testset
        The two halves returned by ``train_test_split``.

    Returns
    -------
    tuple
        ``(summary, predictions)`` where ``summary`` is
        ``'<label>: <rmse> | Time Elapsed: <seconds>/sec'`` and ``predictions``
        is whatever ``algo.test`` returned.
    """
    start = time.time()
    algo.fit(trainset)
    predictions = algo.test(testset)
    # accuracy.rmse is called before the clock stops, so its cost is included
    # in the elapsed time.
    thePrediction = f'{label}: {accuracy.rmse(predictions)}'
    end = time.time()
    store = f'{thePrediction} | Time Elapsed: {np.round(end - start, 2)}/sec'
    return store, predictions


# Load the ratings DataFrame into surprise.
# The data contains 0.5-star ratings, so the scale is given explicitly instead
# of relying on Reader()'s default of (1, 5).
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings, reader)
train, test = train_test_split(data, test_size=0.2)

# Item-based Pearson similarity, shared by all four KNN models.
sim_pearson = {'name': 'pearson', 'user_based': False}

basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knnZ = knns.KNNWithZScore(sim_options=sim_pearson)
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
normPred = NormalPredictor()
baseline = BaselineOnly()
# Named `nmf`, not `NMF`: rebinding the class name to an instance made this
# cell fail on a second run ("'NMF' object is not callable").
nmf = NMF()
slopeOne = SlopeOne()
cluster = CoClustering()

# (label, model) pairs in the order they are evaluated and reported.
models = [
    ('KNNBasic', basic_pearson),
    ('KNNWithMeans', knn_means),
    ('KNNWithZScore', knnZ),
    ('KNNBaseline', knn_baseline),
    ('SVD', svd),
    ('NormalPredictor', normPred),
    ('BaselineOnly', baseline),
    ('NMF', nmf),
    ('SlopeOne', slopeOne),
    ('CoClustering', cluster),
]

rmseScores = []
for label, algo in models:
    # `predictions` ends up holding the predictions of the last model run.
    store, predictions = _timed_rmse(label, algo, train, test)
    rmseScores.append(store)

rmseScores

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9722
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9071
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9107
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8852
RMSE: 0.8958
RMSE: 1.4300
Estimating biases using als...
RMSE: 0.8782
RMSE: 0.9274
RMSE: 0.9024
RMSE: 0.9447


['KNNBasic: 0.9721855123482787 | Time Elapsed: 94.19/sec',
 'KNNWithMeans: 0.9071457876455937 | Time Elapsed: 82.8/sec',
 'KNNWithZScore: 0.9106858517937164 | Time Elapsed: 95.72/sec',
 'KNNBaseline: 0.8851829360744152 | Time Elapsed: 83.16/sec',
 'SVD: 0.8958117524800123 | Time Elapsed: 8.83/sec',
 'NormalPredictor: 1.4299830461457264 | Time Elapsed: 0.59/sec',
 'BaselineOnly: 0.8782438626339207 | Time Elapsed: 0.49/sec',
 'NMF: 0.927398030526231 | Time Elapsed: 17.04/sec',
 'SlopeOne: 0.902414949562756 | Time Elapsed: 29.53/sec',
 'CoClustering: 0.9447301092732988 | Time Elapsed: 7.38/sec']

In [22]:
# DON'T run this cell. SVDpp takes WAY too long to run.

#from surprise.prediction_algorithms import SVDpp
#SVDpp = SVDpp()
#SVDpp.fit(train)
#predictions = SVDpp.test(test)
#thePrediction = f'SVDpp: {accuracy.rmse(predictions)}'

BaselineOnly has the lowest RMSE (0.878), with KNNBaseline (0.885) and SVD (0.896) close behind. BaselineOnly and SVD also finish in seconds, whereas every KNN model takes well over a minute to fit and test; that runtime holds KNNBaseline back despite its second-best score.

In [29]:
# Scratch check of input(): read two values and echo each one back.
num = input ("Enter number :")
print(num)
name1 = input("Enter name : ")
print(name1)
  
# Print the type of each value; input() returns str even for numeric text,
# so a typed rating must be converted before it is used as a number.
print ("type of number", type(num))
print ("type of name", type(name1))

Enter number :5
5
Enter name : johnny
johnny
type of number <class 'str'>
type of name <class 'str'>


In [30]:
# Display the ratings table (userId, movieId, rating).
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [31]:
# test_list = []
# Build a trainset from all of `data` (no hold-out split) and report how many
# distinct users and items it contains.
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [55]:
def recommended_movies(user_ratings, movie_title_df, n):
    """Print the titles of the first ``n`` entries of ``user_ratings``.

    Parameters
    ----------
    user_ratings : iterable
        Items whose first element is a movie id (for example
        ``(movieId, predicted_rating)`` pairs), already in the order they
        should be shown.
    movie_title_df : pandas.DataFrame
        Lookup table with ``movieId`` and ``title`` columns.
    n : int
        Maximum number of recommendations to print; ``n <= 0`` prints nothing.

    Returns
    -------
    None
        The recommendations are printed, not returned.
    """
    for idx, rec in enumerate(user_ratings, start=1):
        # Stop on the count itself rather than on `n == 0`, which a
        # non-positive n never reached (the whole list was then printed).
        if idx > n:
            break
        movie_id = int(rec[0])
        matches = movie_title_df.loc[movie_title_df['movieId'] == movie_id, 'title']
        # Print the title text, not the repr of the matching Series
        # (which also carried the index, name and dtype).
        if matches.empty:
            title = f'<unknown movieId {movie_id}>'
        else:
            title = matches.iloc[0]
        print('Recommendation # ', idx, ': ', title, '\n')

In [77]:
def movie_recommender(movie_df, num_of_rated_movies, genre=None):
    """Collect ratings interactively, refit SVD and print five recommendations.

    Random movies are shown one at a time until ``num_of_rated_movies`` of them
    have been rated. The new ratings are stored under user id 1000, added to
    the notebook-level ``ratings`` table, and an SVD model is fitted on the
    combined data. The five unrated movies with the highest predicted rating
    for that user are printed, followed by the model's RMSE on ``test``.

    Relies on these notebook-level names: ``ratings``, ``reader``, ``test``,
    ``movies``, ``Dataset``, ``SVD``, ``accuracy`` and ``recommended_movies``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Movies to sample prompts from; needs ``movieId`` and ``genres`` columns.
    num_of_rated_movies : int
        Number of ratings to collect before recommending.
    genre : str, optional
        If given, only movies whose ``genres`` text contains it are offered.

    Returns
    -------
    tuple
        Always ``(None, None)``, kept for backward compatibility; all results
        are printed.

    Raises
    ------
    ValueError
        If there is no movie to offer (for example, nothing matches ``genre``).
    """
    userID = 1000
    rating_columns = ['userId', 'movieId', 'rating']
    rating_list = []
    print(f'Thank you for participating! In order to obtain your recommendations, please rate {num_of_rated_movies} movies.')

    if genre:
        # regex=False: the genre is matched literally, so regex
        # metacharacters such as parentheses cannot break the filter.
        candidates = movie_df[movie_df['genres'].str.contains(genre, regex=False)]
    else:
        candidates = movie_df
    if candidates.empty:
        raise ValueError(f'No movies available to rate (genre={genre!r}).')

    rated_ids = set()
    while num_of_rated_movies > 0:
        # Never offer a movie that has already been rated in this session.
        pool = candidates[~candidates['movieId'].isin(list(rated_ids))]
        if pool.empty:
            print('There are no more movies left to rate.')
            break
        movie = pool.sample(1)
        print(movie)

        # Ask about this movie until the answer is 'n' or a number in 1-5.
        # input() returns text, and blank or non-numeric text would otherwise
        # only fail later, when the ratings are loaded.
        while True:
            answer = input('On a scale of 1 - 5, how would you rate this movie? press n if you have not seen this movie. Press enter to submit your answer: \n').strip().lower()
            if answer == 'n':
                rating = None
                break
            try:
                rating = float(answer)
            except ValueError:
                rating = float('nan')
            if 1 <= rating <= 5:
                break
            print('Please enter a number from 1 to 5, or n to skip this movie.')
        if rating is None:
            continue

        movie_id = movie['movieId'].values[0]
        rating_list.append({'userId': userID, 'movieId': movie_id, 'rating': rating})
        rated_ids.add(movie_id)
        num_of_rated_movies -= 1

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    if rating_list:
        new_rows = pd.DataFrame(rating_list, columns=rating_columns)
        new_rating_df = pd.concat([ratings[rating_columns], new_rows], ignore_index=True)
    else:
        new_rating_df = ratings[rating_columns]
    new_data = Dataset.load_from_df(new_rating_df, reader)
    svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
    svd.fit(new_data.build_full_trainset())
    # NOTE: the model was fitted on every rating, including the rows in
    # `test`, so this RMSE is optimistic and only a rough sanity check.
    predictions = svd.test(test)
    thePrediction = f'SVD: {accuracy.rmse(predictions)}'

    # Predicted rating for every known movie the user has not just rated.
    moviesList = [
        (m_id, svd.predict(userID, m_id).est)
        for m_id in ratings['movieId'].unique()
        if m_id not in rated_ids
    ]
    ranked_movies = sorted(moviesList, key=lambda x: x[1], reverse=True)

    recommended_movies(ranked_movies, movies, 5)
    print(thePrediction)
    return None, None

In [78]:
# Interactive run: rate 4 randomly chosen comedies, then print the recommendations.
movie_recommender(movies, 4, 'Comedy')

Thank you for participating! In order to obtain your recommendations, please rate 4 movies.
      movieId                           title          genres
6799    60737  Watching the Detectives (2007)  Comedy|Romance
On a scale of 1 - 5, how would you rate this movie? press n if you have not seen this movie. Press enter to submit your answer: 
2
      movieId               title          genres
2309     3061  Holiday Inn (1942)  Comedy|Musical
On a scale of 1 - 5, how would you rate this movie? press n if you have not seen this movie. Press enter to submit your answer: 
2
      movieId               title  genres
1895     2518  Night Shift (1982)  Comedy
On a scale of 1 - 5, how would you rate this movie? press n if you have not seen this movie. Press enter to submit your answer: 
2
      movieId             title  genres
2532     3392  She-Devil (1989)  Comedy
On a scale of 1 - 5, how would you rate this movie? press n if you have not seen this movie. Press enter to submit your answer:

(None, None)

In [None]:
# Display the movies table again.
movies

In [42]:
# Summarise column dtypes and non-null counts of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
userId     100836 non-null int64
movieId    100836 non-null int64
rating     100836 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB
