# OS Seminarska naloga - Tim Kuhar, 63200163

## Knjižnice

V tem koraku sem "importal" vse knjižnice, ki sem jih potreboval

In [128]:
import pandas as pd
import datetime as dt
import random
import math
from collections import defaultdict

## Branje ocen (6)

In [129]:
class UserItemData:

    def __init__(self, path: str, start_date: str = None, end_date: str = None, min_ratings: int = 0):
        self.path = path

        # formatiral sem dani datum, ki je v obliki dd.mm.yyyy v obliko YYYY-MM-DD zaradi kasnejših primerjav (pandas datetime)
        self.start_date = self._format_date(start_date) if start_date else None
        self.end_date = self._format_date(end_date) if end_date else None

        self.min_ratings = min_ratings

        self.ratings = self._read_data()

    def _format_date(self, date: str) -> str:
        date_parts = date.split(".")
        # tud formatiram iz dd.mm.yyyy v obliko YYYY-MM-DD
        formatted_date = "-".join([f"{part:0>2}" for part in date_parts[::-1]])
        return formatted_date

    def _read_data(self):
        # s pandas naložim podatke
        df = pd.read_csv(self.path, sep='\t')
        
        # izdelal sem novo polje date tako, da sem združil polja date_year date_month in date_day
        date_numbers = df.date_year * 10000 + df.date_month * 100 + df.date_day
        df["date"] = pd.to_datetime(date_numbers, format="%Y%m%d")

        # v primeru, da je podan start_date vzamem le zapise, ki so po datumu oz so datum start_date
        if self.start_date:
            df = df[(df['date'] >= self.start_date)]
        
        # v primeru, da je podan end_date vzamem le zapise, ki so pred datumom end_date
        if self.end_date:
            df = df[df['date'] < self.end_date]

        # v primeru, da je podan min_ratings vzamem le filme, ki imajo vsaj min_ratings ocen
        if self.min_ratings:
            df = df.groupby('movieID').filter(lambda x: len(x) >= self.min_ratings)
        
        return df

    def nratings(self):
        return len(self.ratings)



In [130]:
# test
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
73584


## Branje filmov (6) 

In [131]:
class MovieData:

    def __init__(self, path: str):
        self.path = path
        self.movies = self._load_data()

    def _load_data(self):
        # s pandas naložim podatke
        df = pd.read_csv(self.path, sep='\t', encoding='ISO-8859-1')
        return df

    def get_title(self, movie_id: int):
        movies = self.movies
        # iz zapisov o filmih vzamem le tisti zapis katerega id se ujema z movie_id in preberem naslov
        return movies[movies['id'] == movie_id]['title'].values[0]

In [132]:
# test
md = MovieData('data/movies.dat')
print(md.get_title(1))

Toy story


## Naključni prediktor (6) 

In [133]:
class RandomPredictor:

    def __init__(self, min_rating: int, max_rating: int):
        self.min_rating = min_rating
        self.max_rating = max_rating
        self.data = None

    def fit(self, X: UserItemData):
        self.data = X

    def predict(self, user_id: int):
        # naredim tabelo vseh id filmov (unikatno)
        movies_ids = self.data.ratings['movieID'].unique()
        # sprehodim se čez tabelo id-jev in za vsak id pripišem naključno oceno od min_rating do max_rating
        # vrnem objekt oblike {filmID: ocena}
        return {movie_id: random.randint(self.min_rating, self.max_rating) for movie_id in movies_ids}


In [134]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 5
Film: Grumpy Old Men, ocena: 4
Film: Money Train, ocena: 5
Film: The Usual Suspects, ocena: 3
Film: City Hall, ocena: 3


## Priporočanje (6) 

In [135]:
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor

    def fit(self, X: UserItemData):
        self.predictor.fit(X)

    def recommend(self, user_id: int, n: int = 10, rec_seen: bool = True):
        prediction = self.predictor.predict(user_id)
        # objekt oblike {filmID: ocena} spremenim v array, da ga lahko sortiram
        # sortiram array tako, da so filmi z najvišjo oceno na začetku
        sorted_prediction = sorted(prediction.items(), key=lambda x: x[1], reverse=True)

        # v primeru, da je rec_seen true uredim podatke še tako, da vzamem le filme, ki jih uporabnik še ni gledal
        if not rec_seen:
            ratings = self.predictor.data.ratings
            # iz ocen vzamem vse filme, ki jih je uporabnik že gledal
            seen_movies = ratings[ratings['userID'] == user_id]['movieID'].values
            # iz arraya sorted_prediction vzamem vse filme, ki se nahajajo v seen_movies
            sorted_prediction = [x for x in sorted_prediction if x[0] not in seen_movies]

        # vrnem prvih n filmov iz sorted_prediction
        return sorted_prediction[:n]


In [157]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Crow, ocena: 5
Film: Beverly Hills Cop III, ocena: 5
Film: The Fifth Element, ocena: 5
Film: Armageddon, ocena: 5
Film: Back to the Future Part II, ocena: 5


## Napovedovanje s povprečjem (6) 

In [158]:
class AveragePredictor:

    def __init__(self, b : int):
        if b < 0:
            raise ValueError("'b' must be equal to or greater than 0")
        self.b = b
        self.data = None
        self.predictions = None

    def fit(self, X : UserItemData):
        self.data = X
        ratings = X.ratings
        # naredim izračun podan v navodilih
        # sprehodim se čez vse filme in za vsak film naredim izračun
        '''  
            vs je vsota vseh ocen za ta film,
            n je število ocen, ki jih je ta film dobil,
            g_avg je povprečje čez vse filme,
            b je parameter formule za povprečje. Če je b=0, gre za navadno povprečje.
        '''
        # movies_ids = ratings["movieID"].unique()
        # vs = [ratings[ratings["movieID"] == movie_id]["rating"].sum() for movie_id in movies_ids]
        # b = self.b
        # g_avg = ratings["rating"].mean()
        # n = [len(ratings[ratings["movieID"] == movie_id]) for movie_id in movies_ids]
        # avg = (vs + b * g_avg) / (n + b)

        #                   movie_id  (|-----------------------vs---------------------------| + |--b--| * |--------g_avg---------|) / (|--------------------n-------------------|) + |--b--|)
        self.predictions = {movie_id: (ratings[ratings["movieID"] == movie_id]["rating"].sum() + self.b * ratings["rating"].mean()) / (len(ratings[ratings["movieID"] == movie_id]) + self.b) for movie_id in ratings["movieID"].unique()}

    def predict(self, user_id : int):
        return self.predictions

In [159]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(0)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Brother Minister: The Assassination of Malcolm X, ocena: 5.0
Film: Synthetic Pleasures, ocena: 5.0
Film: Adam & Steve, ocena: 5.0
Film: Gabbeh, ocena: 5.0
Film: Eve and the Fire Horse, ocena: 5.0


In [139]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(100)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val)) 

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


## Priporočanje najbolj gledanih filmov (6) 

In [140]:
class ViewsPredictor:

    def __init__(self):
        self.data = None
        self.predictions = None

    def fit(self, X: UserItemData):
        self.data = X
        ratings = X.ratings
        # naredim objekt oblike {filmID: štOcen}
        # grupiram filme po ID-jih in pogledam dolžino skupine
        self.predictions = {movie_id: len(group) for movie_id, group in ratings.groupby('movieID')}

    def predict(self, user_id: int):
        return self.predictions

In [141]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
vp = ViewsPredictor()
rec = Recommender(vp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


## Napovedovanje ocen s podobnostjo med produkti (6) 

In [142]:
class ItemBasedPredictor:
    def __init__(self, min_values : int = 0, threshold : int = 0):
        self.min_values = min_values
        self.threshold = threshold
        self.data = None
        self.predictions = None
        self.user_averages = None
        self.similarities = {}

    def fit(self, X : UserItemData):
        self.data = X
        ratings = X.ratings
        # izračunam povprečne ocene za vsakega uporabnika
        # naredim objekt oblike {userID: povprečnaOcena}
        self.user_averages = { x[0]: x[1]['rating'].mean() for x in ratings.groupby('userID')}
        # preoblikujem obliko podatkov v obliko:
        # vrstice: movieID
        # stolpci: userID
        # vrednosti: ocena uporabnika za film
        rating_matrix = ratings.pivot_table(index='movieID', columns='userID', values='rating')
        # normaliziram matriko tako, da od vsake ocene odštejem povprečje
        normalized_matrix = rating_matrix - rating_matrix.mean()
        # izračun podobnosti
        self.similarities = self._compute_similarities(normalized_matrix)
        
    def _compute_similarities(self, matrix):
        similarities = {}
        # naredim kombinacije filmov [movie_1, movie_2]
        for movie_1 in matrix.index:
            for movie_2 in matrix.index:
                # če je id movie_2 manjši od movie_1 nadaljujem, saj ne želim 2x računati vrednosti (podvojenih vrednosti)
                if movie_2 <= movie_1:
                    continue

                # Izračunam podobnosti po popravljeni kosinusni razdalji (po formuli)
                numerator = (matrix.loc[movie_1] * matrix.loc[movie_2]).sum()
                denominator_1 = math.sqrt((matrix.pow(2).loc[movie_1]).where(matrix.loc[movie_2].notnull()).sum())
                denominator_2 = math.sqrt((matrix.pow(2).loc[movie_2]).where(matrix.loc[movie_1].notnull()).sum())
                
                # če ni zadostnega števila podobnosti
                # če je podobnost manjša od threshold-a
                # če je podobnost negativno število
                # potem nastavim podobnost na 0
                if len(matrix.loc[movie_2].where(matrix.loc[movie_1].notnull())) < self.min_values:
                    similarity = 0
                elif numerator / (denominator_1 * denominator_2) < self.threshold or numerator / (denominator_1 * denominator_2) < 0:
                    similarity = 0
                else:
                    similarity = numerator / (denominator_1 * denominator_2)
                
                # shranim podobnost kot
                # kluč: tuple(film1ID, film2ID)
                # vrednost: podobnost
                similarities[(movie_1, movie_2)] = similarity
        return similarities

    def predict(self, user_id : int):
        sum1 = defaultdict(int)
        sum2 = defaultdict(int)

        # seznam že ocenjenih filmov s strani uporabnika
        already_rated_movie_ids = self.data.ratings[self.data.ratings['userID'] == user_id]['movieID'].unique()
        
        # sprehodim se čez podobnosti in preberem FILM1ID, FILM2ID, podobnost
        for (movie_1, movie_2), value in self.similarities.items():
            # v primeru da je film1 v že ocenjenih filmih naredim izračun
            if movie_1 in already_rated_movie_ids:
                # vsoti 1 prištejem oceno filma movie_1 pomnoženo z podobnostjo med filmoma
                sum1[movie_2] += self._get_rating(user_id, movie_1) * value
                # vsoti 2 prištejem podobnost med filmoma
                sum2[movie_2] += value
            # v primeru da je film2 v že ocenjenih filmih naredim izračun
            if movie_2 in already_rated_movie_ids:
                # vsoti 1 prištejem oceno filma movie_2 pomnoženo z podobnostjo med filmoma
                sum1[movie_1] += self._get_rating(user_id, movie_2) * value
                # vsoti 2 prištejem podobnost med filmoma
                sum2[movie_1] += value

        # naredim objekt oblike:
        # {filmID: vrednost}
        # vrendost je izračunana v funkciji _compute_prediction
        return {k: self._compute_prediction(v, sum2[k]) for k, v in sum1.items() if k not in already_rated_movie_ids}

    def _get_rating(self, user_id, movie_id):
        # vrnem oceno filma, ki ga je ocenil dani uporabnik
        return self.data.ratings[(self.data.ratings['userID'] == user_id) & (self.data.ratings['movieID'] == movie_id)].rating.values[0]

    def _compute_prediction(self, numerator, denominator):
        # vrednost izračunana po formuli
        # vsota od ocena filma krat vsota podobnosti
        # v primeru da je vsota podobnosti 0 vrnem 0
        return numerator / denominator if denominator != 0 else 0

    def similarity(self, movie_index_1 : int, movie_index_2 : int):
        # v prejšnjih izračunih je izračunan ona način, da je (film1ID, film2ID) film1ID manjši od film2ID, zato v danem primeru zamenjam ID-ja
        # za podobnost velja da je enaka neglede na to kako obrnemo filme
        # film 1 bo še vedno enako podoben filmu 2 kot je film 2 podoben filmu 1
        # zato lahko v tem primeru obrnem filma
        if movie_index_1 > movie_index_2:
            movie_index_1, movie_index_2 = movie_index_2, movie_index_1
        # preberem podobnost med danima filmoma
        return self.similarities[(movie_index_1, movie_index_2)] if (movie_index_1, movie_index_2) in self.similarities else 0

    def most_similar(self, n : int = 20):
        # uredim podobnosti po vrednosti in vrnem prvih n
        return sorted(self.similarities.items(), key=lambda x: x[1], reverse=True)[:n]

    def similar_items(self, movie_id, n):
        # vzamem tiste podobnosti, ki imajo za movie_1 ali movie_2, movie_id
        # nato jih sortiram tako, da so največje podobnosti na začetku
        # vrnem n podobnosti
        return sorted([(k[1], v) for k,v in self.similarities.items() if k[0] == movie_id] + [(k[0], v) for k,v in self.similarities.items() if k[1] == movie_id], key=lambda x: x[1], reverse=True)[:n]


In [143]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
ip = ItemBasedPredictor()
rec = Recommender(ip)
rec.fit(uim)

print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", ip.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", ip.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", ip.similarity(1580, 780))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.23395523176756633
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.4246612584468763


In [144]:
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.3557347903101595
Film: The Usual Suspects, ocena: 4.3546817280678365
Film: The Silence of the Lambs, ocena: 4.335305303472519
Film: Sin City, ocena: 4.2786871668991004
Film: Monsters, Inc., ocena: 4.2175811369435205
Film: The Incredibles, ocena: 4.207098583281748
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792107348348
Film: Batman Begins, ocena: 4.146413806700199
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.07153524295855
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.015142490064837
Film: Good Will Hunting, ocena: 4.009280806922821
Film: The Lord of the Rings: The Two Towers, ocena: 3.9414763050955943
Film: Indiana Jones and the Last Crusade, ocena: 3.7969764963789245


In [145]:
for movie, val in ip.most_similar():
    print(f"Film 1: {md.get_title(movie[0])}, Film 2:{md.get_title(movie[1])}, podobnost: {val}")

Film 1: The Lord of the Rings: The Two Towers, Film 2:The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481418
Film 1: The Lord of the Rings: The Fellowship of the Ring, Film 2:The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761888
Film 1: The Lord of the Rings: The Fellowship of the Ring, Film 2:The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442495
Film 1: Kill Bill: Vol. 2, Film 2:Kill Bill: Vol. 2, podobnost: 0.7372340224381029
Film 1: Star Wars, Film 2:Star Wars: Episode V - The Empire Strikes Back, podobnost: 0.7021321132220318
Film 1: Ace Ventura: Pet Detective, Film 2:The Mask, podobnost: 0.6616471778494046
Film 1: Star Wars: Episode V - The Empire Strikes Back, Film 2:Star Wars: Episode VI - Return of the Jedi, podobnost: 0.5992253753778948
Film 1: Independence Day, Film 2:Star Wars: Episode I - The Phantom Menace, podobnost: 0.5610426219249997
Film 1: Ace Ventura: Pet Detective, Film 2:Austin Powers: The Spy Who Shagged 

## Priporočanje glede na trenutno ogledano vsebino (7) 

In [146]:
# metoda prejme id filma, v danem primeru je to film 
# "The Lord of the Rings: The Fellowship of the Ring"
# izpišem prvih 10 najbolj podobnih filmov in vrednost izračuna podobnosti
rec_items = ip.similar_items(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8231885401761888
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442495
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.23961943073496453
Film: Star Wars, ocena: 0.2196558652707407
Film: The Matrix, ocena: 0.2151555270688023
Film: Raiders of the Lost Ark, ocena: 0.19944276706345015
Film: The Usual Suspects, ocena: 0.18321188451910753
Film: Blade Runner, ocena: 0.1639968131541027
Film: Schindler's List, ocena: 0.16105905138148702
Film: Monty Python and the Holy Grail, ocena: 0.1578045379851914


## Priporočilo zase (7)

Dodelil sem si id 1, in si v datoteko user_ratedmovies_new.dat dodal naslednje zapise:<br/>
1   4369    5   7   1   2023    15  0   0<br/>
1   46335   5   7   1   2023    15  5   0<br/>
1   4201    5   7   1   2023    15  10  0<br/>
1   1       3   20  10  2022    7   0   0<br/>
1   2       4   10  1   2021    8   0   0<br/>
1   63436   4.5 12  4   2020    6   8   2<br/>
1   62912   3.5 8   1   2015    5   6   7<br/>
1   59315   5   7   3   2018    6   3   4<br/>
1   56367   2.5 3   3   2012    5   6   7<br/>
1   54503   4   2   2   2020    2   2   2<br/>
1   51939   3   4   3   2021    3   4   5<br/>
1   48322   4.5 3   4   2022    2   3   4<br/>
1   45517   4.5 3   5   2015    2   3   4<br/>
1   45499   4   2   3   2020    5   9   5<br/>
1   4212    4   12  12  2022    5   4   3<br/>
1   3988    3   12  12  2022    12  12  12<br/>
1   3702    3.5 2   2   2009    34  34  34<br/>
1   2513    3.5 5   6   2017    45  34  34<br/>
1   592     4.5 2   3   2034    23  45  45<br/>



In [147]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies_new.dat', min_ratings=1000)
ip = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

In [148]:
print("Predictions for Kuhar(1): ")
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for Kuhar(1): 
Film: Pulp Fiction, ocena: 5
Film: The Matrix, ocena: 5
Film: The Sixth Sense, ocena: 5
Film: Forrest Gump, ocena: 5
Film: Schindler's List, ocena: 5
Film: Independence Day, ocena: 5
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 5
Film: Star Wars: Episode VI - Return of the Jedi, ocena: 5
Film: Memento, ocena: 5
Film: The Matrix Reloaded, ocena: 5


## Napovedovanje z metodo Slope one (7)

In [149]:
class SlopeOnePredictor:
    def __init__(self):
        self.data = None
        self.predictions = {}
        self.dev = {}
        self.weight = {}

    def fit(self, X: UserItemData):
        self.data = X
        ratings = X.ratings

        # tako kot že prej sem preoblikoval tabelo v matriko film, uporabnik, ocena
        ratings_pivot = ratings.pivot_table(index='movieID', columns='userID', values='rating')

        # kombinacija vseh filmov
        for movie_1 in ratings_pivot.index:
            for movie_2 in ratings_pivot.index:
                # če je film isti preskočim
                if movie_1 == movie_2:
                    continue
                # Izračunam utež
                self.weight[(movie_1,movie_2)] = ratings_pivot.loc[movie_1].where(ratings_pivot.loc[movie_2].notnull()).count()
                # Izračunam povprečno razliko med ocenama kombinacije filmov 
                # in številom uporabnikov, ki so ocenili oba filma (movie_1 in movie_2)
                self.dev[(movie_1,movie_2)] = (ratings_pivot.loc[movie_1].where(ratings_pivot.loc[movie_2].notnull()) - ratings_pivot.loc[movie_2].where(ratings_pivot.loc[movie_1].notnull())).mean()

    def predict(self, user_id: int):
        rating_sums = defaultdict(int)
        weight_sums = defaultdict(int)
        
        # pridobim seznam vseh filmov, ki jih je ocenil dani uporabnik
        movie_ids = self.data.ratings[self.data.ratings['userID'] == user_id]['movieID'].unique()

        # iteriram čez podatke, ki sem jih zračunal prej (povprečna razlika ocen parov filmov)
        # dobim film1, film2 in vrendost
        for (movie_1, movie_2), v in self.dev.items():
            # če se film2 nahaja med filmi, ki jih je ocenil uporabnik
            # potem nadaljujem z napovedovanjem ocene filma 1
            if movie_2 in movie_ids:
                # vsoti za film movie_1 prištejem:
                # oceno filma2 (ki ga je ocenil uporabnik) + povprečno razliko ocen para
                # pomnožim z utežjo
                rating_sums[movie_1] += (self.data.ratings[(self.data.ratings['userID'] == user_id) & (self.data.ratings['movieID'] == movie_2)].rating.values[0] + v) * self.weight[(movie_1,movie_2)]
                # vsoti uteži za movie_1 prištejem utež
                weight_sums[movie_1] += self.weight[(movie_1,movie_2)]

        # vrnem objekt oblike:
        # {filmID: vrednost}
        # vrednost: vsota deljena z vsoto uteži
        return { k: (v / weight_sums[k]) for k, v in rating_sums.items() }


In [150]:

#test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
sop = SlopeOnePredictor()
rec = Recommender(sop)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))



Predictions for 78: 
Film: The Usual Suspects, ocena: 4.325079182263173
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.155293229840448
Film: The Lord of the Rings: The Return of the King, ocena: 4.153135076202185
Film: The Silence of the Lambs, ocena: 4.127978169643881
Film: Shichinin no samurai, ocena: 4.119790444913598
Film: The Lord of the Rings: The Two Towers, ocena: 4.083325894849594
Film: Indiana Jones and the Last Crusade, ocena: 3.9670398355464194
Film: The Incredibles, ocena: 3.9664496674557546
Film: Good Will Hunting, ocena: 3.963362387354114
Film: Sin City, ocena: 3.942619137615212
Film: Batman Begins, ocena: 3.9375326640077017
Film: A Beautiful Mind, ocena: 3.9140940935239508
Film: Rain Man, ocena: 3.9107819079644943
Film: Monsters, Inc., ocena: 3.8819375978658006
Film: Finding Nemo, ocena: 3.8807711131654794


## Metoda evaluate(self, test_data, n) (8) 

In [151]:
# iz razreda Recommender naredim nov razred Recommender
# torej razredu dodam novo metodo evaluate
class Recommender(Recommender):
    def evaluate(self, test_data : UserItemData, n : int = 10):
        count = 0
        users = test_data.ratings.userID.unique()
        precision_sum = 0
        recall_sum = 0
        rmse_sum = 0
        mae_sum = 0
        
        # a = 0
        for i in users:
            # print(str(a) + "/" + str(len(users)) + " - " + str(i))
            # a += 1
            
            # za vsakega uporabnika naredim n napovedi
            rec_items = dict(self.recommend(i, n=n, rec_seen=False))
            # iz danih testnih podatkov vzamem podatke, ki se nanašajo na trenutnega uporabnika
            # preoblikujem obliko za kasnejše raučananje
            test_items = test_data.ratings[test_data.ratings['userID'] == i][['movieID', 'rating']].pivot_table(index='movieID', values='rating')
            
            # izračunam koliko filmov si je uporabnik pogledal (izmed napovedanih ocen)
            watched_predictions = len(set(rec_items.keys()).intersection(set(test_items.index)))
            # izračunam koliko filmov si še ni pogledal
            new_movies = len(set(test_items.index) - set(self.predictor.data.ratings[self.predictor.data.ratings['userID'] == i]['movieID'].unique()))
            
            # izračunam natančnost
            precision = watched_predictions / n
            # izračunam recall
            recall = watched_predictions / new_movies
            
            # sproti računam vsote natnčonsti in recall-a za končni izpis
            precision_sum += precision
            recall_sum += recall
            
            # izračnuam napake za vsak gledani film
            for id in set(rec_items.keys()).intersection(set(test_items.index)):
                rmse_sum += (test_items.loc[id][0] - rec_items[id]) ** 2
                mae_sum += abs(test_items.loc[id][0] - rec_items[id])
                count += 1

        # končni izračuni po formulah
        rmse = (rmse_sum / count) ** 0.5
        mae = mae_sum / count
        precision = precision_sum / len(users)
        recall = recall_sum / len(users)
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        return (rmse, mae, precision, recall, f1)


In [152]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
sop = SlopeOnePredictor()
rec = Recommender(sop)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print(mse, mae, precision, recall, f)

0.8593687090470131 0.6321782795132675 0.04849690539345704 0.08567838257795103 0.061935941811173796


In [153]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
ap = AveragePredictor(100)
rec = Recommender(ap)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print(mse, mae, precision, recall, f)

0.8407421191080349 0.6251855149612695 0.24160035366931948 0.19573212227310183 0.21626086589487079


In [154]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
vp = ViewsPredictor()
rec = Recommender(vp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print(mse, mae, precision, recall, f)

1168.0991006535573 1165.1587428774928 0.2482758620689658 0.19909433298718388 0.22098171805671937


In [155]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = RandomPredictor(1,5)
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print(mse, mae, precision, recall, f)

1.4235648373447818 1.1166475754104621 0.23156498673740075 0.18696818774820906 0.20689058146676728


In [156]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
ibp = ItemBasedPredictor()
rec = Recommender(ibp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print(mse, mae, precision, recall, f)

1.002706421945923 0.6956271888135395 0.04885057471264362 0.08737780605136004 0.06266617893863602


Najbolje deluje razred AveragePredictor, saj ima njvišje ocene/točke pri recall, f1 in precision. SlopeOnePredictor pa je zelo blizu za njim. Najslabši pa je bil RandomPredictor kot sem tudi pričakoval, saj si izmisli naključne predikcije.