In [1]:
import numpy as np
from datetime import date
import pandas as pd
class UserItemData:
    def __init__(self,path,start_date=None,end_date=None,min_ratings=0):
        self.start_date = start_date
        self.end_date = end_date
        self.min_ratings = min_ratings
        self.data_csv = pd.read_csv(path,sep='\t')
        if(self.start_date is not None):
            self.data_csv = self.date_from()
        if(self.end_date is not None):
            self.data_csv = self.date_to()
            
        self.data_csv = self.data_csv.pivot(index='userID', columns='movieID', values='rating')    
        self.data_csv = self.data_csv.loc[:, (self.data_csv.notnull().sum(axis=0) > self.min_ratings)]
        self.count = 0
        for i,j in self.data_csv.iterrows():
            self.count+=j.count()
        
            
    def date_from(self):
        day_s, month_s, year_s = self.start_date.split(".")
        filtered = self.data_csv[(self.data_csv['date_year'] >= int(year_s))]
        filtered = filtered.drop(filtered[(filtered['date_year'] <= int(year_s)) & (filtered['date_month'] <= int(month_s)) & (filtered['date_day'] < int(day_s))].index)
        filtered = filtered.drop(filtered[(filtered['date_year'] <= int(year_s)) & (filtered['date_month'] < int(month_s))].index)
        return filtered
    
    def date_to(self):
        day, month, year = self.end_date.split(".")
        filtered = self.data_csv[self.data_csv['date_year'] <= int(year)]
        filtered = filtered.drop(filtered[(filtered['date_year'] >= int(year)) & (filtered['date_month'] >= int(month)) & (filtered['date_day'] >= int(day))].index)
        filtered = filtered.drop(filtered[(filtered['date_year'] >= int(year)) & (filtered['date_month'] > int(month))].index)
        return filtered
    
    def nratings(self):
        return self.count
    
    def add_user(self, d, user_id):
        if user_id not in self.data_csv.index.tolist():
            self.data_csv = self.data_csv.append(pd.Series(name=user_id, dtype = "float64"))
            for k in d.keys():
                self.data_csv.at[user_id,k] = d[k]   

    
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())


uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
72784


In [2]:
#druga
import pandas as pd
class MovieData:
    def __init__(self,path):
        self.data = pd.read_csv(path,sep='\t', encoding="latin1")
     
    def get_title(self,movieID):
        for index,value in self.data.iterrows():
            if(value['id']==movieID):
                return value['title']

md = MovieData('data/movies.dat')
print(md.get_title(1))  

Toy story


In [3]:
import random

class RandomPredictor():
    def __init__(self,min_ocena,max_ocena):
        self.min_ocena = min_ocena
        self.max_ocena = max_ocena
        
    def fit(self, X):
        self.uim = X
    
    def predict(self, user_id):
        d = {}
        for movie, rat in self.uim.data_csv.loc[user_id].iteritems():
            if rat:
                rat = random.randint(self.min_ocena, self.max_ocena)         
            d[movie] = rat
        return d 
        
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))
    

<class 'dict'>
Film: Toy story, ocena: 2
Film: Grumpy Old Men, ocena: 2
Film: Money Train, ocena: 1
Film: The Usual Suspects, ocena: 3
Film: City Hall, ocena: 3


In [4]:
#cetvrta
#nije fixed
import operator

class Recommender:
    def __init__(self,predictor):
        self.predictor = predictor
    
    def fit(self,X):
        self.uim = X
        self.predictor.fit(X)
        
    def recommend(self,userID,n=10,rec_seen=True):
        d = self.predictor.predict(userID)
        if rec_seen == False:
            seznam = [i for i in self.uim.data_csv.loc[userID][self.uim.data_csv.loc[userID].notnull()].index]
            for i in seznam:
                if i in d.keys():
                    del d[i]
        
        dic = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))
        final = []
        for index,key in enumerate(dic.keys()):
            if(index < n):
                final.append((key,dic[key]))
        return final
            
        
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')

rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Toy story, ocena: 5
Film: Waiting to Exhale, ocena: 5
Film: Sudden Death, ocena: 5
Film: Casino, ocena: 5
Film: Money Train, ocena: 5


In [5]:
class AveragePredictor():
    def __init__(self,b=0):
        if(b < 0):
            self.b = 0
        else:
            self.b = b
    
    def fit(self, X):
        self.uim = X
        self.d = {}
        g_avg = self.uim.data_csv.sum().sum() / self.uim.count
        #avg = (vs + b * g_avg) / (n + b)
        for i in self.uim.data_csv:
            vs = self.uim.data_csv[i].sum()
            n = self.uim.data_csv[i].count()
            avg = (vs + self.b * g_avg) / (n+self.b)
            self.d[i] = avg
            
    
    def predict(self,userID):
        return self.d
        
            
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')

rp = AveragePredictor(b=100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))        
    

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


In [6]:
class ViewsPredictor():
    def fit(self, X):
        self.uim = X
    def predict(self, userID):
        d = {}
        for i in self.uim.data_csv:
            d[i] = self.uim.data_csv[i][self.uim.data_csv[i].notnull()].count()
        return d

        
        
rp = ViewsPredictor()
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val)) 

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


In [7]:
class STDPredictor:
    def __init__(self, min_ratings):
        self.min_ratings = min_ratings
    
    def fit(self, X):
        self.uim = X
        
    def predict(self, userID):
        d = {}
        for i in uim.data_csv:
            if(uim.data_csv[i][uim.data_csv[i].notnull()].count() >= self.min_ratings):
                d[i] = np.std(uim.data_csv[i])
        return d


md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = STDPredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))         


Film: Plan 9 from Outer Space, ocena: 1.3386525190884058
Film: The Passion of the Christ, ocena: 1.2790640858537328
Film: The Texas Chainsaw Massacre, ocena: 1.2306963889088796
Film: Jackass Number Two, ocena: 1.2134235427973517
Film: White Chicks, ocena: 1.1841675574946435


In [8]:
from itertools import combinations
class ItemBasedPredictor:
    def __init__(self,min_values=0,threshold=0):
        self.min_values = 0
        self.threshold = 0
    def fit(self,X):
        self.uim = X
        self.all_sims = {}
        
        movies = [i for i in self.uim.data_csv]
        movies_comb = list(combinations(movies,2))
        
        for prvi, drugi in movies_comb:
            if not (drugi,prvi) in self.all_sims.keys():
                sim = self.similarity(prvi,drugi)
                self.all_sims[(prvi,drugi)] = sim
       
    
        
    def similarity(self, p1, p2):
        filtered = self.uim.data_csv[(self.uim.data_csv[p1].notnull()) & (self.uim.data_csv[p2].notnull())]
        p1_rat = np.array(filtered[p1])
        p2_rat = np.array(filtered[p2])
        mean_rat = np.array(filtered.mean(axis=1))
        gore = sum((p1_rat - mean_rat) * (p2_rat - mean_rat))
        prvi_sqr = sum((p1_rat - mean_rat) ** 2)
        drugi_sqr = sum((p2_rat - mean_rat)**2)
        dole = np.sqrt(prvi_sqr) * np.sqrt(drugi_sqr)
        
        st_rat = len(filtered.index)
        rez = gore/dole
        if(rez < 0  or st_rat < self.min_values):
            return 0
        
        return gore/dole
    
    def predict(self, userID):
        d = {}
        #all not rated movies
        movies = self.uim.data_csv.loc[userID][self.uim.data_csv.loc[userID].isnull()].index.tolist()
        #(sim izmadju dva filma * ocena tog usera za taj film drugi )
        for movie in movies:
            gore = 0
            dole = 0
            for index, ratings in self.uim.data_csv.iteritems():
                if(index != movie and index not in movies):
                    ocena = self.uim.data_csv.loc[userID][index]
                    par = tuple(sorted([movie,index]))
                    sim = self.all_sims[par]
                    gore += sim * ocena
                    dole += sim
            if(movie not in d.keys() and dole > 0):
                d[movie] = gore/dole
        return d
    
    def num_of_similar(self, number):
        dic = dict(sorted(self.all_sims.items(), key=operator.itemgetter(1),reverse=True))    
        final = []
        for index,key in enumerate(dic.keys()):
            if(index < number):
                final.append((key,dic[key]))
        return final
    
    def similarItems(self, item, n):
        most_similar = {}
        for par in self.all_sims.keys():
            if(item in par):
                if(par.index(item) == 0):
                    most_similar[par[1]] = self.all_sims[par]
                else:
                    most_similar[par[0]] = self.all_sims[par]
                    
        dic = dict(sorted(most_similar.items(), key=operator.itemgetter(1),reverse=True)) 
        final = []
        for index,key in enumerate(dic.keys()):
            if(index < n):
                final.append((key,dic[key]))
        return final
                
        
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
#print(uim.movies)
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))


Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.2339552317675661
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.42466125844687547


In [9]:
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.35573479031016
Film: The Usual Suspects, ocena: 4.3546817280678365
Film: The Silence of the Lambs, ocena: 4.335305303472517
Film: Sin City, ocena: 4.278687166899101
Film: Monsters, Inc., ocena: 4.2175811369435205
Film: The Incredibles, ocena: 4.2070985832817485
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792107348347
Film: Batman Begins, ocena: 4.146413806700199
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.07153524295855
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.0151424900648385
Film: Good Will Hunting, ocena: 4.0092808069228205
Film: The Lord of the Rings: The Two Towers, ocena: 3.9414763050955934
Film: Indiana Jones and the Last Crusade, ocena: 3.7969764963789236


In [10]:
top20 = rp.num_of_similar(20)
for (i, j), val in top20:
    print("Film1: {}, Film2: {}, podobnost: {}".format(md.get_title(i), md.get_title(j), val))

Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481411
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761887
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442487
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381033
Film1: Star Wars, Film2: Star Wars: Episode V - The Empire Strikes Back, podobnost: 0.7021321132220316
Film1: Ace Ventura: Pet Detective, Film2: The Mask, podobnost: 0.6616471778494041
Film1: Star Wars: Episode V - The Empire Strikes Back, Film2: Star Wars: Episode VI - Return of the Jedi, podobnost: 0.5992253753778951
Film1: Independence Day, Film2: Star Wars: Episode I - The Phantom Menace, podobnost: 0.5610426219249982
Film1: Ace Ventura: Pet Detective, Film2: Austin Powers: The Spy Who Shagged Me, podob

In [11]:
rec_items = rp.similarItems(4993, 20)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8231885401761887
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442487
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.23961943073496453
Film: Star Wars, ocena: 0.21965586527074088
Film: The Matrix, ocena: 0.2151555270688026
Film: Raiders of the Lost Ark, ocena: 0.19944276706345052
Film: The Usual Suspects, ocena: 0.18321188451910767
Film: Blade Runner, ocena: 0.16399681315410303
Film: Schindler's List, ocena: 0.16105905138148724
Film: Monty Python and the Holy Grail, ocena: 0.15780453798519137
Film: Memento, ocena: 0.1384561128951657
Film: The Shawshank Redemption, ocena: 0.13284849173017357
Film: Le fabuleux destin d'Amélie Poulain, ocena: 0.11443985183299189
Film: Fight Club, ocena: 0.1123358559988544
Film: Star Wars: Episode VI - Return of the Jedi, ocena: 0.11138731777631859
Film: The Sixth Sense, ocena: 0.101835916083476

In [12]:
my_ratings = {2:3.5, 50: 5.0, 69: 4.0,593: 5.0, 296: 4.0, 318: 4.0, 858: 5, 1721: 3.5, 2571: 4.0, 2959: 4.0, 3578: 4.0, 4306: 3.5, 4963: 4.0, 6539: 3.5, 5349: 3.5, 344:4.5, 523: 3.5, 574: 2.5, 768: 2.0, 1178: 3.5 }
for idmovie, val in my_ratings.items():
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Jumanji, ocena: 3.5
Film: The Usual Suspects, ocena: 5.0
Film: Friday, ocena: 4.0
Film: The Silence of the Lambs, ocena: 5.0
Film: Pulp Fiction, ocena: 4.0
Film: The Shawshank Redemption, ocena: 4.0
Film: The Godfather, ocena: 5
Film: Titanic, ocena: 3.5
Film: The Matrix, ocena: 4.0
Film: Fight Club, ocena: 4.0
Film: Gladiator, ocena: 4.0
Film: Shrek, ocena: 3.5
Film: Ocean's Eleven, ocena: 4.0
Film: Pirates of the Caribbean: The Curse of the Black Pearl, ocena: 3.5
Film: Spider-Man, ocena: 3.5
Film: Ace Ventura: Pet Detective, ocena: 4.5
Film: Ruby in Paradise, ocena: 3.5
Film: Spanking the Monkey, ocena: 2.5
Film: None, ocena: 2.0
Film: Paths of Glory, ocena: 3.5


In [13]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=950)
uim.add_user(my_ratings, 1)

rp = AveragePredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Average predictions: ")
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Average predictions: 
Film: One Flew Over the Cuckoo's Nest, ocena: 4.195020746887967
Film: Le fabuleux destin d'Amélie Poulain, ocena: 4.190413368513632
Film: Memento, ocena: 4.1840909090909095
Film: Schindler's List, ocena: 4.174786324786325
Film: Raiders of the Lost Ark, ocena: 4.137269938650307
Film: Monty Python and the Holy Grail, ocena: 4.127715030408341
Film: American Beauty, ocena: 4.107676630434782
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 4.104517611026034
Film: American History X, ocena: 4.103182751540041
Film: Eternal Sunshine of the Spotless Mind, ocena: 4.100249584026622


In [14]:
class SlopeOnePredictor:
    def fit(self, X):
        self.uim = X
        
        movies = [i for i in self.uim.data_csv]
        movies_comb = list(combinations(movies,2))
        
        self.all_deviations = {}
        for mov1, mov2 in movies_comb:
            if not (mov2, mov1) in self.all_deviations.keys():
                dev = self.deviation(mov1,mov2)
                self.all_deviations[(mov1,mov2)] = dev
                
    def deviation(self, p1, p2):
        filtered = self.uim.data_csv[(self.uim.data_csv[p1].notnull()) & (self.uim.data_csv[p2].notnull())]
        p1_rat = np.array(filtered[p1])
        p2_rat = np.array(filtered[p2])
        
        gore = sum(p1_rat - p2_rat)
        n = len(p1_rat)
        
        dev = gore / n
        
        return (dev,n)
    
    def predict(self, userID):
        d = {}
        #all not rated movies
        movies = self.uim.data_csv.loc[userID][self.uim.data_csv.loc[userID].isnull()].index.tolist()
        #((cena usera + dev)*n + ....)/(n1+n2+...)
        for movie in movies:
            dole = 0
            gore = 0
            for index, rating in self.uim.data_csv.iteritems():
                if(index != movie and index not in movies):
                    ocena = self.uim.data_csv.loc[userID][index]
                    par = tuple(sorted([movie,index]))
                    dev, n = self.all_deviations[par]
                    dole += n
                    gore += (ocena + dev) * n
                if( movie not in d.keys() and dole>0):
                    d[movie] = gore/dole
        return d
    



md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))


Predictions for 78: 
Film: Star Wars: Episode I - The Phantom Menace, ocena: 6.16971916971917
Film: Ace Ventura: Pet Detective, ocena: 6.097074468085107
Film: The Mask, ocena: 5.994148244473342
Film: Mrs. Doubtfire, ocena: 5.901639344262295
Film: Pretty Woman, ocena: 5.898470097357441
Film: Austin Powers: The Spy Who Shagged Me, ocena: 5.888342696629214
Film: Titanic, ocena: 5.829069767441861
Film: Speed, ocena: 5.805882352941176
Film: Mission: Impossible III, ocena: 5.783980582524272
Film: Batman, ocena: 5.538653366583541
Film: Men in Black, ocena: 5.536977491961415
Film: Spider-Man, ocena: 5.492271105826397
Film: The Lion King, ocena: 5.438307030129125
Film: Ocean's Eleven, ocena: 5.331495098039215
Film: The Fifth Element, ocena: 5.304948216340621
