In [112]:
import pandas as pd
import numpy as np

class UserItemData:
    """Ratings table read from a tab-separated file, with optional filtering.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pd.read_csv(..., sep="\t")``.
    from_date, to_date : str
        Optional bounds written as ``"day.month.year"`` (e.g. ``"12.1.2007"``).
        Both bounds are exclusive. Either one may be given on its own.
        Filtering reads the ``date_day``, ``date_month`` and ``date_year``
        columns of the table.
    min_ratings : int
        When positive, only movies with strictly more than ``min_ratings``
        rows (counted after date filtering) are kept.

    Attributes
    ----------
    df : pandas.DataFrame
        The (filtered) ratings table.
    """

    def __init__(self, path, from_date = "", to_date = "", min_ratings = 0):
        self.path = path
        self.from_date = from_date
        self.to_date = to_date
        self.min_ratings = min_ratings
        self.df = pd.read_csv(self.path, sep="\t")

        if self.from_date != "" or self.to_date != "":
            # Collapse (year, month, day) into one integer so a date range is a
            # plain numeric comparison; this works for same-year ranges and for
            # ranges spanning any number of years.
            row_key = (self.df.date_year * 10000
                       + self.df.date_month * 100
                       + self.df.date_day)
            keep = pd.Series(True, index=self.df.index)
            if self.from_date != "":
                keep &= row_key > self._date_key(self.from_date)
            if self.to_date != "":
                keep &= row_key < self._date_key(self.to_date)
            self.df = self.df[keep]

        if self.min_ratings > 0:
            per_movie = self.df.groupby('movieID').size()
            frequent = per_movie.index[per_movie > self.min_ratings]
            self.df = self.df[self.df.movieID.isin(frequent)]

    @staticmethod
    def _date_key(text):
        """Turn ``"day.month.year"`` into the sortable integer ``yyyymmdd``."""
        day, month, year = (int(part) for part in text.split("."))
        return year * 10000 + month * 100 + day

    def nrating(self):
        """Return the number of rating rows left after filtering."""
        return len(self.df)

In [114]:
# Report the row count of the unfiltered table, then of a subset restricted by
# date range and minimum number of ratings per movie.
for filter_kwargs in (
    {},
    {"from_date": '12.1.2007', "to_date": '16.2.2008', "min_ratings": 100},
):
    uim = UserItemData(path = "user_ratedmovies.dat", **filter_kwargs)
    print(uim.nrating())

855598
72724


In [3]:
class MovieData:
    """Movie table read from a tab-separated, ISO-8859-1 encoded file."""

    def __init__(self, path):
        """Remember ``path`` and load its contents into ``self.df``."""
        self.path = path
        self.df = pd.read_csv(path, encoding="ISO-8859-1", sep="\t")

    def get_title(self, n):
        """Return the ``title`` of the first row whose ``id`` equals ``n``.

        An IndexError is raised when no row has that id.
        """
        has_id = self.df.id == n
        matching_titles = self.df.loc[has_id, 'title']
        return matching_titles.values[0]

In [115]:
# Load the movie table and print the title stored under id 1.
md = MovieData('movies.dat')
print(md.get_title(1))

Toy story


In [116]:
import random

class RandomPredictor():
    """Predictor that assigns every known movie a uniformly random integer score.

    Parameters
    ----------
    min_random, max_random : int
        Inclusive bounds of the generated scores.
    """

    def __init__(self, min_random, max_random):
        self.min_random = min_random
        self.max_random = max_random
        self.df = pd.DataFrame()
        # Snapshot of the most recent predictions (kept for backward compatibility).
        self.dict_userID = {}

    def fit(self, X):
        """Keep a reference to the ratings table ``X.df``."""
        self.df = X.df

    def predict(self, n):
        """Return a new ``{movieID: score}`` dict covering every movie in ``self.df``.

        ``n`` (the user id) is accepted for interface compatibility with the
        other predictors but does not influence the scores.

        A fresh dict is built on every call, so entries from an earlier fit
        never leak into the result and callers may modify what they receive
        without altering this predictor.
        """
        movie_ids = list(set(self.df.movieID))
        # randint's upper bound is exclusive, hence the +1.
        scores = np.random.randint(self.min_random, self.max_random + 1, len(movie_ids))
        predictions = dict(zip(movie_ids, scores.tolist()))
        self.dict_userID = dict(predictions)
        return predictions

In [118]:
# Fit the random predictor on the full ratings table and print the scores it
# assigned to a handful of movie ids.
md = MovieData('movies.dat')
uim = UserItemData('user_ratedmovies.dat') 
rp = RandomPredictor(1, 5) 
rp.fit(uim) 
pred = rp.predict(78) 
print(type(pred)) 
items = [1, 3, 20, 50, 100] 
for item in items: 
    print("Movie: {}, score: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Movie: Toy story, score: 2
Movie: Grumpy Old Men, score: 4
Movie: Money Train, score: 1
Movie: The Usual Suspects, score: 5
Movie: City Hall, score: 3


In [119]:
class Recommender():
    """Turns a predictor's ``{movieID: score}`` output into a top-n list.

    Parameters
    ----------
    rd : object
        Predictor exposing ``fit(X)``, ``predict(userID)`` and, when
        ``rec_seen=False`` is used, a ``df`` attribute with ``userID`` and
        ``movieID`` columns.
    """

    def __init__(self, rd):
        self.rd = rd
        # Snapshot of the most recent recommendation (kept for backward compatibility).
        self.dict_userID = {}

    def fit(self, X):
        """Forward the training data to the wrapped predictor."""
        self.rd.fit(X)

    def recommend(self, userID, n = 10, rec_seen = True):
        """Return the ``n`` highest-scored movies for ``userID``.

        Parameters
        ----------
        userID : user identifier passed to the predictor.
        n : int
            Maximum number of entries returned; fewer are returned when the
            predictor offers fewer candidates.
        rec_seen : bool
            When False, movies that ``userID`` has rows for in ``self.rd.df``
            are excluded.

        Returns
        -------
        dict
            ``{movieID: score}`` ordered from highest to lowest score. A new
            dict is returned on every call.
        """
        # Work on a copy so the predictor's own dict is never modified.
        candidates = dict(self.rd.predict(userID))
        if not rec_seen:
            ratings = self.rd.df
            seen = set(ratings.movieID[ratings.userID == userID])
            for movie in seen:
                # Some predictors do not score seen movies at all.
                candidates.pop(movie, None)
        # Ascending stable sort read backwards: among equal scores the entry
        # that appears later in the predictor's dict comes first.
        ascending = sorted(candidates.items(), key=lambda pair: pair[1])
        best = ascending[::-1][:max(n, 0)]
        self.dict_userID = dict(best)
        return dict(best)

In [122]:
# Wrap the random predictor in a Recommender and print five recommendations
# for user 78, with rec_seen=False.
md = MovieData('movies.dat') 
uim = UserItemData('user_ratedmovies.dat') 
rp = RandomPredictor(1, 5) 
rec = Recommender(rp) 
rec.fit(uim) 
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items.items(): 
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val)) 

Movie: The Spirit of '76, score: 5
Movie: Sin City, score: 5
Movie: Knockin' on Heaven's Door, score: 5
Movie: Profundo carmesí, score: 5
Movie: Dai-bosatsu tôge, score: 5


In [123]:
class AveragePredictor():
    """Scores each movie by its damped mean rating.

    The score of a movie is ``(vs + b * g_avg) / (n + b)`` where ``vs`` is the
    sum of its ratings, ``n`` the number of its ratings and ``g_avg`` the mean
    of all ratings in the table.

    Parameters
    ----------
    b : number
        Damping weight; ``0`` gives the plain per-movie mean.
    """

    def __init__(self, b):
        self.b = b
        self.df = pd.DataFrame()
        self.ratings = {}

    def fit(self, X):
        """Compute and store the score of every movie in ``X.df``.

        Returns the ``{movieID: score}`` dict (also kept in ``self.ratings``).
        An empty table yields an empty dict.
        """
        table = X.df
        self.df = table
        if len(table) == 0:
            self.ratings = {}
            return {}
        overall_mean = table.rating.sum() / len(table)
        # Aggregate only the rating column; other columns are irrelevant here.
        per_movie = table.groupby('movieID').rating.agg(['sum', 'size'])
        damped = (per_movie['sum'] + self.b * overall_mean) / (per_movie['size'] + self.b)
        self.ratings = dict(zip(damped.index.tolist(), damped.tolist()))
        return dict(self.ratings)

    def predict(self, n):
        """Return a copy of the fitted ``{movieID: score}`` dict.

        The scores do not depend on the user; ``n`` is accepted so the class
        offers the same ``fit``/``predict`` interface as the other predictors.
        """
        return dict(self.ratings)

In [124]:
# NOTE: `md` is not created in this cell; it must already exist in the session.
uim = UserItemData('user_ratedmovies.dat')
# b = 0: no damping term.
avg_pred1 = AveragePredictor(0)
rating1 = avg_pred1.fit(uim)
items1 = [53355, 61236, 64280, 7409]
for item in items1: 
    print("Movie: {}, score: {}".format(md.get_title(item), rating1[item]))

# b = 100: every score is pulled towards the overall mean rating.
avg_pred2 = AveragePredictor(100)
rating2 = avg_pred2.fit(uim)
items2 = [50, 1221, 6016, 58559, 1203]
for item in items2: 
    print("Movie: {}, score: {}".format(md.get_title(item), rating2[item]))

Movie: Sonnenallee, score: 5.0
Movie: Vals Im Bashir, score: 5.0
Movie: Britannia Hospital, score: 5.0
Movie: Shu dan long wei, score: 5.0
Movie: The Usual Suspects, score: 4.225944245560473
Movie: The Godfather: Part II, score: 4.146907937910189
Movie: Cidade de Deus, score: 4.116538340205236
Movie: The Dark Knight, score: 4.10413904093503
Movie: 12 Angry Men, score: 4.103639627096175


In [125]:
import operator

class ViewsPredictor():
    """Scores each movie by the number of rating rows it has."""

    def __init__(self):
        self.df = pd.DataFrame()
        self.views = {}

    def fit(self, X):
        """Count the rows per movie in ``X.df``.

        Returns ``{movieID: count}`` ordered from most to least rated (also
        kept in ``self.views``).
        """
        self.df = X.df
        counts = self.df.groupby('movieID').size()
        ordered = sorted(zip(counts.index.tolist(), counts.tolist()),
                         key=operator.itemgetter(1), reverse=True)
        self.views = dict(ordered)
        return dict(ordered)

    def predict(self, n):
        """Return a copy of the fitted ``{movieID: count}`` dict.

        The counts do not depend on the user; ``n`` is accepted so the class
        offers the same ``fit``/``predict`` interface as the other predictors.
        """
        return dict(self.views)

In [126]:
# NOTE: `md` is not created in this cell; it must already exist in the session.
uim = UserItemData('user_ratedmovies.dat')
vp = ViewsPredictor()
vp_dictionary = vp.fit(uim)
# fit() returns the counts sorted in descending order, so the first five keys
# are the five most-rated movies.
for x in list(vp_dictionary)[0:5]:
    print("Movie: {}, score: {}".format(md.get_title(x), vp_dictionary[x]))

Movie: The Matrix, score: 1670
Movie: The Lord of the Rings: The Fellowship of the Ring, score: 1576
Movie: Forrest Gump, score: 1568
Movie: Pulp Fiction, score: 1537
Movie: The Lord of the Rings: The Two Towers, score: 1528


In [133]:
from numpy.linalg import norm
import itertools
import operator


class ItemBasedPredictor():
    """Item-based collaborative filtering on cosine similarity of raw ratings.

    Parameters
    ----------
    min_values : int
        Two movies need strictly more than ``min_values`` common raters,
        otherwise their similarity is 0.
    threshold : number
        Similarities below ``threshold`` are replaced by 0.
    """

    def __init__(self, min_values=0, threshold=0):
        self.min_values = min_values
        self.threshold = threshold
        self.df = pd.DataFrame()

    def fit(self, X):
        """Keep a reference to the ratings table ``X.df``."""
        self.df = X.df

    def _ratings_for(self, movie):
        """Ratings of one movie as a Series indexed by userID (first row per user)."""
        rows = self.df[self.df.movieID == movie]
        per_user = pd.Series(rows.rating.values, index=rows.userID.values)
        return per_user[~per_user.index.duplicated()]

    def _ratings_by_movie(self):
        """Build ``{movieID: Series(rating, index=userID)}`` in one pass over the table."""
        lookup = {}
        for movie, rows in self.df.groupby('movieID'):
            per_user = pd.Series(rows.rating.values, index=rows.userID.values)
            lookup[movie] = per_user[~per_user.index.duplicated()]
        return lookup

    def _cosine(self, first, second):
        """Cosine similarity of two per-user rating Series over their common users."""
        common = first.index.intersection(second.index)
        # Too few common raters (including none at all): no reliable similarity.
        if len(common) <= self.min_values:
            return 0.0
        # Selecting by the same index pairs both vectors user by user,
        # independently of the row order of the table.
        left = first.loc[common].values.astype(float)
        right = second.loc[common].values.astype(float)
        scale = norm(left) * norm(right)
        if scale == 0:
            return 0.0
        cos_sim = float(np.dot(left, right) / scale)
        if cos_sim < self.threshold:
            return 0.0
        return cos_sim

    def similarity(self, p1, p2):
        """Return the similarity between movies ``p1`` and ``p2``.

        The value is the cosine of the two rating vectors restricted to users
        who rated both movies; it is 0 when the pair has too few common raters
        (see ``min_values``) or falls below ``threshold``.
        """
        return self._cosine(self._ratings_for(p1), self._ratings_for(p2))

    def predict(self, n):
        """Predict a score for every movie in the table for user ``n``.

        Each score is the similarity-weighted mean of the user's own ratings
        of the other movies. A movie with no positive total similarity to the
        user's movies gets score 0.

        Returns ``{movieID: score}``.
        """
        lookup = self._ratings_by_movie()
        own_rows = self.df[self.df.userID == n]
        own_ratings = {}
        for movie, rating in zip(own_rows.movieID, own_rows.rating):
            own_ratings.setdefault(movie, rating)  # first rating wins

        pred_dict = {}
        for i in list(set(self.df.movieID)):
            weighted = 0
            total_sim = 0
            for j, rating in own_ratings.items():
                if i != j:
                    sim = self._cosine(lookup[i], lookup[j])
                    weighted += sim * rating
                    total_sim += sim
            pred_dict[i] = weighted / total_sim if total_sim != 0 else 0
        return pred_dict

    def similarItems(self, item, n):
        """Return the ``n`` movies most similar to ``item`` as ``{movieID: similarity}``."""
        lookup = self._ratings_by_movie()
        anchor = self._ratings_for(item)
        save_dict = {}
        for other in list(set(self.df.movieID)):
            if other != item:
                save_dict[other] = self._cosine(anchor, lookup[other])
        ranked = sorted(save_dict.items(), key=operator.itemgetter(1), reverse=True)
        return dict(itertools.islice(ranked, n))

    def mostSimilar(self, n=20):
        """Return the ``n`` most similar ordered movie pairs.

        Keys are ``(movieA, movieB)`` tuples; both orientations of a pair are
        present, as the similarity is symmetric.
        """
        lookup = self._ratings_by_movie()
        movies_list = list(set(self.df.movieID))
        # The measure is symmetric, so each unordered pair is computed once.
        half = {}
        for pos, a in enumerate(movies_list):
            for b in movies_list[pos + 1:]:
                half[(a, b)] = self._cosine(lookup[a], lookup[b])
        save_dict = {}
        for i in movies_list:
            for j in movies_list:
                if i != j:
                    save_dict[(i, j)] = half[(i, j)] if (i, j) in half else half[(j, i)]
        ranked = sorted(save_dict.items(), key=operator.itemgetter(1), reverse=True)
        return dict(itertools.islice(ranked, n))
                    
        
        

In [135]:
# Item-based demo restricted to movies that pass the min_ratings=1000 filter.
md = MovieData('movies.dat')
uim = UserItemData('user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
# Recommender.fit forwards uim to rp, so rp can also be queried directly below.
rec.fit(uim)
print("Similarity between the movies 'Men in black'(1580) and 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Similarity between the movies 'Men in black'(1580) and 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Similarity between the movies 'Men in black'(1580) and 'Independence day'(780): ", rp.similarity(1580, 780))
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val))

Similarity between the movies 'Men in black'(1580) and 'Ghostbusters'(2716):  0.970548659511179
Similarity between the movies 'Men in black'(1580) and 'Schindler's List'(527):  0.9619614385295201
Similarity between the movies 'Men in black'(1580) and 'Independence day'(780):  0.9536050237785273
Predictions for 78: 
Movie: Sin City, score: 3.9694091991434592
Movie: The Usual Suspects, score: 3.9691526760075937
Movie: The Silence of the Lambs, score: 3.9686036137411818
Movie: Shichinin no samurai, score: 3.9685866520222435
Movie: Rain Man, score: 3.9675576234132275
Movie: The Incredibles, score: 3.9674499151153153
Movie: Batman, score: 3.967232985186039
Movie: Monsters, Inc., score: 3.9671265020773543
Movie: The Fifth Element, score: 3.9671131441074277
Movie: Batman Begins, score: 3.9670883403197696
Movie: Good Will Hunting, score: 3.9669952175521286
Movie: Toy story, score: 3.966982644503403
Movie: A Beautiful Mind, score: 3.9669740875201103
Movie: Die Hard, score: 3.9668620236772227
Mo

In [109]:
# Print the most similar movie pairs returned by ItemBasedPredictor.mostSimilar().
# NOTE: `md` is not created in this cell; it must already exist in the session.
uim = UserItemData('user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
rec_items = rp.mostSimilar()
# Keys of rec_items are (movieID, movieID) tuples.
for idmovie, val in rec_items.items():
    print("Movie1: {}, Movie2: {}, similarity: {}".format(md.get_title(idmovie[0]),md.get_title(idmovie[1]), val))


Movie1: The Lord of the Rings: The Two Towers, Movie2: The Lord of the Rings: The Return of the King, similarity: 0.9941530195371168
Movie1: The Lord of the Rings: The Return of the King, Movie2: The Lord of the Rings: The Two Towers, similarity: 0.9941530195371168
Movie1: The Lord of the Rings: The Two Towers, Movie2: The Lord of the Rings: The Fellowship of the Ring, similarity: 0.9934681655627514
Movie1: The Lord of the Rings: The Fellowship of the Ring, Movie2: The Lord of the Rings: The Two Towers, similarity: 0.9934681655627514
Movie1: The Lord of the Rings: The Fellowship of the Ring, Movie2: The Lord of the Rings: The Return of the King, similarity: 0.9927892914676263
Movie1: The Lord of the Rings: The Return of the King, Movie2: The Lord of the Rings: The Fellowship of the Ring, similarity: 0.9927892914676263
Movie1: Star Wars: Episode V - The Empire Strikes Back, Movie2: Star Wars, similarity: 0.9904424054653655
Movie1: Star Wars, Movie2: Star Wars: Episode V - The Empire Str

In [136]:
# NOTE: `rp` and `md` are not created in this cell; they must already exist in
# the session, with `rp` being a fitted ItemBasedPredictor.
rec_items = rp.similarItems(4993, 10)
print('Movies similar to "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items.items():
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val))

Movies similar to "The Lord of the Rings: The Fellowship of the Ring": 
Movie: The Lord of the Rings: The Two Towers, score: 0.9934681655627514
Movie: The Lord of the Rings: The Return of the King, score: 0.9927892914676263
Movie: Raiders of the Lost Ark, score: 0.9734510001832855
Movie: Indiana Jones and the Last Crusade, score: 0.9728850679878953
Movie: Star Wars: Episode V - The Empire Strikes Back, score: 0.9726021706937393
Movie: Star Wars, score: 0.9721491648232138
Movie: Batman Begins, score: 0.9721482282947644
Movie: The Incredibles, score: 0.9719212276327879
Movie: The Usual Suspects, score: 0.9718243304335739
Movie: Star Wars: Episode VI - Return of the Jedi, score: 0.9710562134338746


In [137]:
md = MovieData('movies.dat')
uim = UserItemData('user_ratedmovies.dat', min_ratings = 1000)

# Hand-written ratings of an additional user as (movieID, rating) pairs.
new_user_id = 71535
new_user_ratings = [
    (1, 4.0), (4886, 4.0), (4306, 5.0), (5349, 2.0), (32, 5.0),
    (47, 3.0), (50, 5.0), (296, 5.0), (318, 5.0), (5445, 3.0),
    (858, 5.0), (367, 5.0), (377, 4.0), (4993, 5.0), (457, 4.0),
    (480, 3.0), (2028, 4.0), (7153, 5.0), (1527, 3.0), (3578, 5.0),
]
# All rows share one timestamp. The original literals repeated the key
# 'date_year' (2021, then 44), which silently stored 44 as the year.
# NOTE(review): the trailing 44 is presumably the seconds field; confirm the
# column is named 'date_second' in user_ratedmovies.dat.
items = [{'userID': new_user_id, 'movieID': movie_id, 'rating': rating,
          'date_day': 27, 'date_month': 5, 'date_year': 2021,
          'date_hour': 16, 'date_minute': 22, 'date_second': 44}
         for movie_id, rating in new_user_ratings]
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
uim.df = pd.concat([uim.df, pd.DataFrame(items)], ignore_index=True)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
print("Predictions for 71535: ")
rec_items = rec.recommend(71535, n=5, rec_seen=False)
for idmovie, val in rec_items.items():
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val))

Predictions for 71535: 
Movie: Fargo, score: 4.202642547257465
Movie: Reservoir Dogs, score: 4.202600207301209
Movie: Wo hu cang long, score: 4.202534492030078
Movie: Monty Python and the Holy Grail, score: 4.20245748575363
Movie: Blade Runner, score: 4.202368604369937


In [138]:
class SlopeOnePredictor():
    """Weighted Slope One predictor.

    For an unseen movie ``j`` and each movie ``i`` the user rated, the average
    difference ``rating(j) - rating(i)`` over users who rated both is added to
    the user's rating of ``i``; these estimates are averaged, weighted by the
    number of such users.
    """

    def __init__(self):
        self.df = pd.DataFrame()

    def fit(self, X):
        """Keep a reference to the ratings table ``X.df``."""
        self.df = X.df

    def predict(self, n):
        """Predict scores for the movies user ``n`` has not rated.

        Returns ``{movieID: score}``. Movies the user already rated are not
        included, and neither are movies that share no raters with any of the
        user's movies (no deviation can be computed for them).
        """
        movies = list(set(self.df.movieID))

        # One pass over the table: ratings of every movie indexed by userID
        # (first row per user), so deviations become array subtractions.
        by_movie = {}
        for movie, rows in self.df.groupby('movieID'):
            per_user = pd.Series(rows.rating.values, index=rows.userID.values)
            by_movie[movie] = per_user[~per_user.index.duplicated()]

        own_rows = self.df[self.df.userID == n]
        own_ratings = {}
        for movie, rating in zip(own_rows.movieID, own_rows.rating):
            own_ratings.setdefault(movie, rating)  # first rating wins

        predicted_movies = {}
        for j in movies:
            if j in own_ratings:
                continue
            target = by_movie[j]
            weighted_sum = 0
            support = 0
            for i in movies:
                if i not in own_ratings:
                    continue
                other = by_movie[i]
                shared = target.index.intersection(other.index)
                count = len(shared)
                if count == 0:
                    # Nobody rated both movies: this pair carries no evidence.
                    continue
                dev = (target.loc[shared].values - other.loc[shared].values).sum() / count
                weighted_sum += (dev + own_ratings[i]) * count
                support += count
            if support > 0:
                predicted_movies[j] = float(weighted_sum / support)
        return predicted_movies


In [140]:
# Slope One demo on a date-restricted subset limited to movies that pass the
# min_ratings=300 filter.
md = MovieData('movies.dat') 
uim = UserItemData("user_ratedmovies.dat", from_date = '12.1.2007', to_date='16.2.2008', min_ratings=300)
rp = SlopeOnePredictor() 
rec = Recommender(rp) 
rec.fit(uim)

print("Predictions for 78: ") 
rec_items = rec.recommend(78, n=5, rec_seen=True) 
for idmovie, val in rec_items.items():
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Movie: The Shawshank Redemption, score: 4.633333333333334
Movie: Fight Club, score: 4.575388026607539
Movie: Pulp Fiction, score: 4.527088036117381
Movie: The Matrix, score: 4.4978991596638656
Movie: Memento, score: 4.41147132169576
