# Implementation of reccomender system algorithms in Python

#### Hey there! This is coursework completed for Decision Support Systems elective, written by Hristijan Marinkovski. 
#### It entails different algorithms used by reccomender systems and their implementation using Python. We use the movielens dataset for testing, which can be found in the repository you found this code on.
#### This project got 100/100 points.

In [19]:
import pandas as pd
from datetime import date
import datetime
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity

# Reading number of ratings

In [5]:
class UserItemData():
    def __init__(self, path, start_date="1.1.1900", end_date="28.12.2100", min_ratings = 0):
        self.data = pd.read_table(path,encoding = "ISO-8859-1")
        
        
        startdatearray = start_date.split(".")
        enddatearray = end_date.split(".")
        self.startdate = datetime.datetime(int(startdatearray[2]),int(startdatearray[1]),int(startdatearray[0]))
        self.enddate = datetime.datetime(int(enddatearray[2]),int(enddatearray[1]),int(enddatearray[0]))
        
        self.data['date'] = pd.to_datetime([f'{y}-{m}-{d}' for y, m, d in zip(self.data.date_year, self.data.date_month, self.data.date_day)])
        self.data = self.data[(self.data.date >= self.startdate) & (self.data.date < self.enddate)]
        
        self.data = self.data.groupby("movieID").filter(lambda x: len(x) > min_ratings)
        
    def nratings(self):
        rows = len(self.data.index,)
        return rows



In [7]:
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat',min_ratings=1000)
print(uim.nratings())
#The path is local, please change it for testing. You can find the dataset in this repository :)


97681


# Reading movies

In [8]:
class MovieData():
    def __init__(self, path):
        self.data = pd.read_table(path,encoding = "ISO-8859-1")

    def get_title(self, movieID):
        return self.data[self.data.id==movieID].title.tolist()[0]


In [9]:
b = MovieData("C:/Users/user/Desktop/movielens/movies.dat")
print(b.get_title(1))


Toy story


# Random predictor


In [10]:
import random
class RandomPredictor():
    def __init__(self,min,max):
        self.min = min
        self.max = max
        
    def fit(self,UIM):
        self.data=UIM.data
        movies=self.data.movieID.unique()
        self.res = {}
        for movie in movies:
            rating = (random.randrange(self.min, self.max+1))
            self.res[movie] = rating
            
    def predict(self,userID):
        return self.res

In [11]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat')
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat') 
rp = RandomPredictor(1, 5) 
rp.fit(uim) 
pred = rp.predict(78)
print(type(pred)) 


items = [1, 3, 20, 50, 100] 
for item in items: 
    print("Movie: {}, score: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Movie: Toy story, score: 4
Movie: Grumpy Old Men, score: 2
Movie: Money Train, score: 2
Movie: The Usual Suspects, score: 2
Movie: City Hall, score: 4


# Reccomender with basic evaluator

In [14]:
class Recommender():
    def __init__(self,pred):
        self.pred = pred
    
    def fit(self,x):
        self.pred.fit(x)
        
    def recommend(self,userID,n=10,rec_seen=True):
       
        ratings = self.pred.predict(userID)
        ratings = sorted(ratings.items(), key=lambda x: x[1], reverse=True)#this sorts them
        if(rec_seen==False):
            seen = set(self.pred.data[(self.pred.data.userID == userID)].movieID.unique()) # movies that user has seen
            ratings = [t for t in ratings if t[0] not in seen] #we make a new dict with movies he hasnt seen
            
        return ratings[:n]
    
    def evaluate(self,accurate_data,predicted_data):
   
        hits = 0
        for acc in accurate_data:
            for pred in predicted_data:
                if acc[0]==pred[0] and acc[1]==pred[1]:
                    hits = hits +1 # now we have the number of hits

        maetop = 0
        for acc in accurate_data:
            for pred in predicted_data:
                if acc[0]==pred[0]:
                    maetop = maetop + abs(pred[1]-acc[1])
                    
        mae = maetop / len(accurate_data)

        rmsetop = 0
        for acc in accurate_data:
            for pred in predicted_data:
                if acc[0]==pred[0]:
                    rmsetop = maetop + (abs(pred[1]-acc[1])**2)
        
        rmse = math.sqrt(rmsetop/len(accurate_data))
       
        percision = hits/len(predicted_data) 
        recall = hits/len(accurate_data)
        f1 = (2 * percision * recall)/(percision + recall)
        
        return mae,rmse,percision,recall,f1
        

In [260]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat')
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = RandomPredictor(1,5)
rec = Recommender(rp)

acc = [(203,4),(111,5),(3512,3.5),(161,2),(1211,4),(621,3),(181,5),(1665,1)]
pred = [(203,4),(111,1),(3512,3.5),(161,4),(1211,3.5),(621,2),(181,5),(1665,2.5)]
rec.evaluate(acc,pred)


(1.125, 1.1858541225631423, 0.375, 0.375, 0.375)

# Average predictor

In [12]:
class AveragePredictor():
    def __init__(self,b):
        self.b = b
    
    def fit(self,UIM):
        self.data = UIM.data
        
        g_avg = self.data.rating.mean()
        
        movies=self.data.movieID.unique()
        moviecounts = self.data.movieID.value_counts()
        self.res = {}
        for movie in movies:
            n = (self.data.loc[self.data.movieID == movie].count()[1])
            vs = self.data[self.data.movieID==movie].rating.sum()
            rating = (vs + self.b * g_avg) / (n + self.b)
            self.res[movie] = rating
        
    def predict(self,userID):
        return self.res

        

In [15]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat') 
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat') 
rp = AveragePredictor(100) 
rec = Recommender(rp) 
rec.fit(uim) 
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items: 
   print("Movie: {}, score: {}".format(md.get_title(idmovie), val)) 


Movie: The Usual Suspects, score: 4.225944245560473
Movie: The Godfather: Part II, score: 4.146907937910189
Movie: Cidade de Deus, score: 4.116538340205236
Movie: The Dark Knight, score: 4.10413904093503
Movie: 12 Angry Men, score: 4.103639627096175


# Prediction by view count

In [16]:
class ViewsPredictor():
    def fit(self,UIM):
        self.data=UIM.data
        movies=self.data.movieID.unique()
        self.res = {}
        for movie in movies:
            rating = self.data.loc[self.data.movieID == movie].count()[1]
            self.res[movie] = rating
        
    def predict(self,userID):
        return self.res
        

In [17]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat') 
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat') 
rp = ViewsPredictor() 
rec = Recommender(rp) 
rec.fit(uim) 
rec_items = rec.recommend(78, n=5, rec_seen=False)

for idmovie, val in rec_items: 
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val)) 

Movie: The Lord of the Rings: The Fellowship of the Ring, score: 1576
Movie: The Lord of the Rings: The Two Towers, score: 1528
Movie: The Lord of the Rings: The Return of the King, score: 1457
Movie: The Silence of the Lambs, score: 1431
Movie: Shrek, score: 1404


# Item based predictor

### _This class and its algorithms are accurate but inneficient*_

In [40]:
class ItemBasedPredictor:
    def __init__ (self,min_values=0,threshhold=0):
        self.min_values=min_values
        self.threshhold=threshhold

        self.k=1
    
        
    def similarItems(self,item,n): 
        movies=self.data[self.data.movieID != item].movieID.unique() # movies without our item
        itemSims = {}
        for movie in movies:
            rating = self.sims[item,movie]
            itemSims[movie] = rating# this gives us similarities for our item and all other movies
            
        itemSims = sorted(itemSims.items(), key=lambda x: x[1], reverse=True)#this sorts them
        # now we just sort them and give top n
        
        return itemSims[:n]
        
        
    def fit(self,UID):
        self.data = UID.data # just the data
        self.avgratings = self.data.groupby(["userID"]).agg({"rating" : ["mean"]})
        self.movies=self.data.movieID.unique() ## every unique movie
        
        self.sims = {} ## this contains all similarities, just do self.sims[movie1,movie2] and you get the sim
        #calculate all similarities
        for movie1 in self.movies:
            for movie2 in self.movies:
                if (movie2,movie1) in self.sims: # if the already calculated the similarity between the movies
                     continue # we already have it
                    
                else: #we need to calculate it
                    sim = self.similarity(movie1,movie2)
                    self.sims[movie1,movie2] = sim
                    self.sims[movie2,movie1] = sim
    
    def predict(self,userID): 
        self.predictions = {}
        
        movies_for_user = self.data[self.data.userID==userID].movieID.unique()
        
        top_sum = 0
        bot_sum = 0
        top_val = 0
        bot_val = 0
        
        for movie1 in self.movies: 
            for movie2 in movies_for_user: #for each movie, we need sim with others
                
                if(movie1!=movie2):
                 
                
                    
                    rating_i = self.data[(self.data['userID'] == userID) & (self.data['movieID'] == movie2)].rating.values[0] ## what if the movie isnt rated...
                    top_val = self.sims[movie1,movie2] * rating_i
                    bot_val = self.sims[movie1,movie2]
                    
                    top_sum = top_sum + top_val
                    bot_sum = bot_sum + bot_val
                else:
                    continue
            
            self.predictions[movie1] = top_sum / bot_sum
            
        
        return self.predictions
    
    
    def similarity(self,p1,p2):
        
        set1 = set(self.data[self.data.movieID==p1].userID.to_numpy()) # users that rated p1
        set2 = set(self.data[self.data.movieID==p2].userID.to_numpy()) # users that rated p2
        set3 = set1.intersection(set2) # users that rated both
        
        if(len(set3)==0):
            return 0
        
        users_ratings=self.data[((self.data['movieID']==p1) | (self.data['movieID']==p2)) & (self.data['userID'].isin(set3))][["userID","rating","movieID"]]
        top_sum = 0
        bot_sum_1 = 0
        bot_sum_2 = 0
        
        
        
        for user in set3: # for each user that rated both movies, this is slow because im searching the data so much..matrices might be faster
            avg = self.avgratings.loc[user ,'rating'].values[0] #this is his average rating for movies in general
            rating_a = users_ratings[(users_ratings['userID'] == user) & (users_ratings['movieID'] == p1)].rating.values[0]#rating for movie a for this user
            rating_b = users_ratings[(users_ratings['userID'] == user) & (users_ratings['movieID'] == p2)].rating.values[0]#rating for movie b by this user
            
            top_sum = top_sum + (rating_a - avg)*(rating_b - avg)
            bot_sum_1 = bot_sum_1 + (rating_a - avg)**2
            bot_sum_2 = bot_sum_2 + (rating_b - avg)**2
            
        
        sim = (top_sum)/( math.sqrt(bot_sum_1) * math.sqrt(bot_sum_2) )
        
       
        if sim < self.threshhold or len(set3) < self.min_values:
            sim = 0.0
            
            
        return round(sim,12) 

In [39]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat')
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat', min_ratings=1400)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Similarity between the movies 'Men in black'(1580) and 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Similarity between the movies 'Men in black'(1580) and 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Similarity between the movies 'Men in black'(1580) and 'Independence day'(780): ", rp.similarity(1580, 780))

Similarity between the movies 'Men in black'(1580) and 'Ghostbusters'(2716):  0
Similarity between the movies 'Men in black'(1580) and 'Schindler's List'(527):  0
Similarity between the movies 'Men in black'(1580) and 'Independence day'(780):  0


# Finding most similar movies

In [22]:
class MostSimilarMovies:
    def __init__(self,pred,UIM):
        self.pred = pred
        self.data = UIM.data
        
    def list(self,n):
        #brute force all combinations of movies and list top 20
        movies=self.data.movieID.unique()
          
        res = {}
        
        for movie1 in movies:
            for movie2 in movies:
                if movie1 == movie2:
                    continue
                else:
                        
                    if (movie2,movie1) in res:
                        continue
                    
                    sim = self.pred.similarity(movie1,movie2)
                    
                    if(len(res) < n):
                        res[movie1,movie2] = sim
                    elif(sim > res[min(res, key=res.get)]):
                        del res[min(res, key=res.get)]
                        res[movie1,movie2] = sim
        res = sorted(res.items(), key=lambda x: x[1], reverse=True)              
        return res

In [23]:
pred = ItemBasedPredictor()
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat', min_ratings=1400)
pred.fit(uim)
msm = MostSimilarMovies(pred,uim)

topn = msm.list(20)# works
print(topn)

[((5952, 7153), 0.747020840907), ((4993, 5952), 0.713724352718), ((4993, 7153), 0.679739756693), ((296, 2959), 0.274200954372), ((480, 4306), 0.163197513395), ((296, 593), 0.159135896609), ((356, 480), 0.130649301302), ((2959, 2858), 0.124155927829), ((296, 2858), 0.118136176191), ((296, 318), 0.112291526434), ((2571, 2959), 0.110653355659), ((318, 2858), 0.102585508099), ((356, 4306), 0.069472173331), ((318, 593), 0.063795586248), ((318, 356), 0.053710973225), ((2858, 593), 0.045424475165), ((2959, 593), 0.017145449612), ((2571, 480), 0.0), ((2571, 2858), 0.0), ((2571, 4306), 0.0)]


# Slope one predictor

In [26]:
class SlopeOnePredictor():
        
    def fit(self,uim):
        self.data = uim.data#data
        self.movies = self.data.movieID.unique()# movies
        
    def predict(self,user):
        results = {}
        for item in self.movies:
            results[item] = self.predict1(item,user) ##### WHY
            
        return results
        
    def predict1(self,item,user):
        
        values = {}
        counts = {}
        
        top_sum = 0
        bot_sum = 0
        res ={}
        
        
        for movie1 in self.movies: # we will compare with each movie
            if movie1 == item: # if its the same, just skip it, no point comparing the same items
                continue         
            else:
                set1 = set(self.data[self.data.movieID==item].userID.to_numpy()) # users that rated our item
                set2 = set(self.data[self.data.movieID==movie1].userID.to_numpy()) # users that rated this movie
                set3 = set1.intersection(set2) # users that rated both
                if not user in (set2):
                    continue
                elif len(set3) ==0:
                    continue
                else:
                #get rating for each userID of item and movie
                    ratingsformov = self.data[(self.data['userID'].isin(set3)) & (self.data['movieID']==movie1)].rating.to_numpy()
                    ratingsforitem = self.data[(self.data['userID'].isin(set3)) & (self.data['movieID']==item)].rating.to_numpy()
                    result = np.mean(ratingsforitem - ratingsformov) 
                    alicerating = self.data[(self.data['userID']==user) & (self.data['movieID']==movie1)].rating.values[0]
                    result = alicerating + result # final result after adding users rating to mean of other users
                    count = len(set3)

            top_sum = top_sum + result*count
            bot_sum = bot_sum + count
        
        res = top_sum/bot_sum
        if(res>5):
            res = 5.0
        
        return res

In [29]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat') 
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat',min_ratings=1000) 
rp = SlopeOnePredictor() 
rec = Recommender(rp) 
rec.fit(uim)

print("Predictions for 78: ") 
rec_items = rec.recommend(78, n=15, rec_seen=False) 
for idmovie, val in rec_items:
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Movie: The Usual Suspects, score: 4.325079182263173
Movie: The Lord of the Rings: The Fellowship of the Ring, score: 4.155293229840448
Movie: The Lord of the Rings: The Return of the King, score: 4.153135076202185
Movie: The Silence of the Lambs, score: 4.127978169643881
Movie: Shichinin no samurai, score: 4.119790444913598
Movie: The Lord of the Rings: The Two Towers, score: 4.083325894849594
Movie: Indiana Jones and the Last Crusade, score: 3.9670398355464194
Movie: The Incredibles, score: 3.9664496674557546
Movie: Good Will Hunting, score: 3.963362387354114
Movie: Sin City, score: 3.942619137615212
Movie: Batman Begins, score: 3.9375326640077017
Movie: A Beautiful Mind, score: 3.9140940935239508
Movie: Rain Man, score: 3.9107819079644943
Movie: Monsters, Inc., score: 3.8819375978658006
Movie: Finding Nemo, score: 3.8807711131654794


# Some additional testing

In [28]:
md = MovieData('C:/Users/user/Desktop/movielens/movies.dat')
uim = UserItemData('C:/Users/user/Desktop/movielens/user_ratedmovies.dat', min_ratings=1400)
# add 20 ratings to uim for user 10101(me)
new_row = {'userID':10101, 'movieID':296, 'rating':5.0, 'date_day':1, 'date_month':1, 'date_year':2020, 'date_hour':1, 'date_minute':1, 'date_second':1, 'date':datetime.datetime(2020,1,1)}     
uim.data = uim.data.append(new_row, ignore_index=True)

new_row = {'userID':10101, 'movieID':2571, 'rating':5.0, 'date_day':1, 'date_month':1, 'date_year':2020, 'date_hour':1, 'date_minute':1, 'date_second':1, 'date':datetime.datetime(2020,1,1)}     
uim.data = uim.data.append(new_row, ignore_index=True)

new_row = {'userID':10101, 'movieID':4993, 'rating':5.0, 'date_day':1, 'date_month':1, 'date_year':2020, 'date_hour':1, 'date_minute':1, 'date_second':1, 'date':datetime.datetime(2020,1,1)}     
uim.data = uim.data.append(new_row, ignore_index=True)

rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Predictions for me: ")
rec_items = rec.recommend(10101, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Movie: {}, score: {}".format(md.get_title(idmovie), val))

Predictions for me: 
Movie: Fight Club, score: 5.0
Movie: The Shawshank Redemption, score: 5.0
Movie: American Beauty, score: 4.946802935010482
Movie: The Lord of the Rings: The Return of the King, score: 4.9148148148148145
Movie: The Silence of the Lambs, score: 4.892006352567496
Movie: The Lord of the Rings: The Two Towers, score: 4.859896856581533
Movie: Forrest Gump, score: 4.759610645638245
Movie: Shrek, score: 4.624180327868852
Movie: Jurassic Park, score: 4.262059973924381
