# OS Seminarska naloga - Tim Kuhar, 63200163

## Knjižnice

In [211]:
import pandas as pd
import datetime as dt
import numpy as np
import random
import math
from collections import defaultdict
import itertools
from sklearn.metrics.pairwise import cosine_similarity

## Branje ocen (6)

In [212]:
class UserItemData:

    def __init__(self, path : str, start_date : str | None = None, end_date : str | None = None, min_ratings : int = 0):
        self.path = path

        # Formatiranje datuma iz formata DD.MM.YYYY v format YYYY-MM-DD
        self.start_date = "-".join([(x if len(x) > 1 else f"0{x}") for x in start_date.split(".")[::-1]]) if start_date else None
        self.end_date = "-".join([(x if len(x) > 1 else f"0{x}") for x in end_date.split(".")[::-1]]) if end_date else None

        self.min_ratings = min_ratings

        self.ratings = self._read_data()

    def _read_data(self):
        df = pd.read_csv(self.path, sep='\t')
        
        # Izdelava stolpca date iz stolpcev date_year, date_month in date_day
        date_numbers = df.date_year * 10000 + df.date_month * 100 + df.date_day
        df["date"] = pd.to_datetime(date_numbers, format="%Y%m%d")

        if self.start_date:
            df = df[(df['date'] >= self.start_date)]
        
        if self.end_date:
            df = df[df['date'] < self.end_date]

        if self.min_ratings:
            df = df.groupby('movieID').filter(lambda x: len(x) >= self.min_ratings)
        
        return df

    def nratings(self):
        return len(self.ratings)


In [213]:
# test
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
73584


## Branje filmov (6) 

In [214]:
class MovieData:

    def __init__(self, path : str):
        self.path = path
        self.movies = self._read_data()

    def _read_data(self):
        df = pd.read_csv(self.path, sep='\t', encoding='ISO-8859-1')
        return df

    def get_title(self, movieID : int):
        movies = self.movies
        return movies[movies.id == movieID].title.values[0]

In [215]:
# test
md = MovieData('data/movies.dat')
print(md.get_title(1))

Toy story


## Naključni prediktor (6) 

In [216]:
class RandomPredictor:

    def __init__(self, min : int, max : int):
        self.min = min
        self.max = max
        self.data = None

    def fit(self, X : UserItemData):
        self.data = X

    def predict(self, user_id : int):
        # 65133 je število vseh filmov, tega podatka nismo dobili v prediktor zato sem ga moral preveriti
        return {(i + 1): random.randint(self.min, self.max) for i in range(65133)}

In [217]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 4
Film: Grumpy Old Men, ocena: 4
Film: Money Train, ocena: 2
Film: The Usual Suspects, ocena: 3
Film: City Hall, ocena: 5


## Priporočanje (6) 

In [218]:
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor

    def fit(self, X : UserItemData):
        self.predictor.fit(X)

    def recommend(self, user_id : int, n : int = 10, rec_seen : bool = True):
        prediction = self.predictor.predict(user_id)
        prediction = sorted(prediction.items(), key=lambda x: x[1], reverse=True)

        if not rec_seen:
            ratings = self.predictor.data.ratings
            seen_movies = ratings[ratings.userID == user_id].movieID.values
            prediction = [x for x in prediction if x[1] not in seen_movies]

        return prediction[:n]
            

In [219]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Father of the Bride Part II, ocena: 5
Film: Sabrina, ocena: 5
Film: Tom and Huck, ocena: 5
Film: Sudden Death, ocena: 5
Film: Casino, ocena: 5


## Napovedovanje s povprečjem (6) 

In [220]:
class AveragePredictor:

    def __init__(self, b : int):
        if b < 0:
            raise ValueError("'b' must be equal to or greater than 0")
        self.b = b
        self.data = None
        self.predictions = None

    def fit(self, X : UserItemData):
        self.data = X
        ratings = X.ratings
        '''  
            vs je vsota vseh ocen za ta film,
            n je število ocen, ki jih je ta film dobil,
            g_avg je povprečje čez vse filme,
            b je parameter formule za povprečje. Če je b=0, gre za navadno povprečje.
        '''
        # movies_ids = ratings["movieID"].unique()
        # vs = [ratings[ratings["movieID"] == movie_id]["rating"].sum() for movie_id in movies_ids]
        # b = self.b
        # g_avg = ratings["rating"].mean()
        # n = [len(ratings[ratings["movieID"] == movie_id]) for movie_id in movies_ids]
        # avg = (vs + b * g_avg) / (n + b)

        #                   movie_id  (|-----------------------vs---------------------------| + |--b--| * |--------g_avg---------|) / (|--------------------n-------------------|) + |--b--|)
        self.predictions = {movie_id: (ratings[ratings["movieID"] == movie_id]["rating"].sum() + self.b * ratings["rating"].mean()) / (len(ratings[ratings["movieID"] == movie_id]) + self.b) for movie_id in ratings["movieID"].unique()}

    def predict(self, user_id : int):
        return self.predictions

In [221]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(0)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Brother Minister: The Assassination of Malcolm X, ocena: 5.0
Film: Synthetic Pleasures, ocena: 5.0
Film: Adam & Steve, ocena: 5.0
Film: Gabbeh, ocena: 5.0
Film: Eve and the Fire Horse, ocena: 5.0


In [222]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(100)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val)) 

Film: The Shawshank Redemption, ocena: 4.305187875177615
Film: The Godfather, ocena: 4.262394259034606
Film: The Usual Suspects, ocena: 4.225944245560473
Film: Fight Club, ocena: 4.199670479562388
Film: Pulp Fiction, ocena: 4.189550712063961


## Priporočanje najbolj gledanih filmov (6) 

In [223]:
class ViewsPredictor:

    def __init__(self):
        self.data = None
        self.predictions = None

    def fit(self, X : UserItemData):
        self.data = X
        ratings = X.ratings
        self.predictions = { x[0]: len(x[1]) for x in ratings.groupby('movieID')}

    def predict(self, user_id : int):
        return self.predictions

In [224]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
vp = ViewsPredictor()
rec = Recommender(vp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Matrix, ocena: 1670
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: Forrest Gump, ocena: 1568
Film: Pulp Fiction, ocena: 1537
Film: The Lord of the Rings: The Two Towers, ocena: 1528


## Napovedovanje ocen s podobnostjo med produkti (6) 

In [225]:
class ItemBasedPredictor:
    def __init__(self, min_values : int = 0, threshold : int = 0):
        self.min_values = min_values
        self.threshold = threshold
        self.data = None
        self.predictions = None
        self.user_averages = None
        self.similarities = {}

    def fit(self, X : UserItemData):
        self.data = X
        ratings = X.ratings
        self.user_averages = { x[0]: x[1]['rating'].mean() for x in ratings.groupby('userID')}
        rating_matrix = ratings.pivot_table(index='movieID', columns='userID', values='rating')
        normalized_matrix = rating_matrix - rating_matrix.mean()
        self.similarities = self._compute_similarities(normalized_matrix)
        
    def _compute_similarities(self, matrix):
        similarities = {}
        for movie_1 in matrix.index:
            for movie_2 in matrix.index:
                if movie_2 <= movie_1:
                    continue
                numerator = (matrix.loc[movie_1] * matrix.loc[movie_2]).sum()
                denominator_1 = math.sqrt((matrix.pow(2).loc[movie_1]).where(matrix.loc[movie_2].notnull()).sum())
                denominator_2 = math.sqrt((matrix.pow(2).loc[movie_2]).where(matrix.loc[movie_1].notnull()).sum())
                similarity = numerator / (denominator_1 * denominator_2)
                if similarity < self.threshold or similarity < 0:
                    similarity = 0
                if len(matrix.loc[movie_2].where(matrix.loc[movie_1].notnull())) < self.min_values:
                    similarity = 0
                similarities[(movie_1, movie_2)] = similarity
        return similarities

    def predict(self, user_id : int):
        sum1 = defaultdict(int)
        sum2 = defaultdict(int)
        already_rated_movie_ids = self.data.ratings[self.data.ratings['userID'] == user_id]['movieID'].unique()
        for (movie_1, movie_2), value in self.similarities.items():
            if movie_1 in already_rated_movie_ids:
                sum1[movie_2] += self._get_rating(user_id, movie_1) * value
                sum2[movie_2] += value
            if movie_2 in already_rated_movie_ids:
                sum1[movie_1] += self._get_rating(user_id, movie_2) * value
                sum2[movie_1] += value
        return {k: self._compute_prediction(v, sum2[k]) for k, v in sum1.items() if k not in already_rated_movie_ids}

    def _get_rating(self, user_id, movie_id):
        return self.data.ratings[(self.data.ratings['userID'] == user_id) & (self.data.ratings['movieID'] == movie_id)].rating.values[0]

    def _compute_prediction(self, numerator, denominator):
        return numerator / denominator if denominator != 0 else 0

    def similarity(self, movie_index_1 : int, movie_index_2 : int):
        if movie_index_1 > movie_index_2:
            movie_index_1, movie_index_2 = movie_index_2, movie_index_1
        return self.similarities[(movie_index_1, movie_index_2)] if (movie_index_1, movie_index_2) in self.similarities else 0

    def most_similar(self, n : int = 20):
        return sorted(self.similarities.items(), key=lambda x: x[1], reverse=True)[:n]

    def similar_items(self, movie_id, n):
        return sorted([(k[1], v) for k,v in self.similarities.items() if k[0] == movie_id] + [(k[0], v) for k,v in self.similarities.items() if k[1] == movie_id], key=lambda x: x[1], reverse=True)[:n]


In [226]:
# test
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
ip = ItemBasedPredictor()
rec = Recommender(ip)
rec.fit(uim)

print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", ip.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", ip.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", ip.similarity(1580, 780))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.23395523176756633
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.4246612584468763


In [227]:
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.3557347903101595
Film: The Usual Suspects, ocena: 4.3546817280678365
Film: The Silence of the Lambs, ocena: 4.335305303472519
Film: Sin City, ocena: 4.2786871668991004
Film: Monsters, Inc., ocena: 4.2175811369435205
Film: The Incredibles, ocena: 4.207098583281748
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792107348348
Film: Batman Begins, ocena: 4.146413806700199
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.07153524295855
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.015142490064837
Film: Good Will Hunting, ocena: 4.009280806922821
Film: The Lord of the Rings: The Two Towers, ocena: 3.9414763050955943
Film: Indiana Jones and the Last Crusade, ocena: 3.7969764963789245


In [228]:
for movie, val in ip.most_similar():
    print(f"Film 1: {md.get_title(movie[0])}, Film 2:{md.get_title(movie[1])}, podobnost: {val}")

Film 1: The Lord of the Rings: The Two Towers, Film 2:The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481418
Film 1: The Lord of the Rings: The Fellowship of the Ring, Film 2:The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761888
Film 1: The Lord of the Rings: The Fellowship of the Ring, Film 2:The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442495
Film 1: Kill Bill: Vol. 2, Film 2:Kill Bill: Vol. 2, podobnost: 0.7372340224381029
Film 1: Star Wars, Film 2:Star Wars: Episode V - The Empire Strikes Back, podobnost: 0.7021321132220318
Film 1: Ace Ventura: Pet Detective, Film 2:The Mask, podobnost: 0.6616471778494046
Film 1: Star Wars: Episode V - The Empire Strikes Back, Film 2:Star Wars: Episode VI - Return of the Jedi, podobnost: 0.5992253753778948
Film 1: Independence Day, Film 2:Star Wars: Episode I - The Phantom Menace, podobnost: 0.5610426219249997
Film 1: Ace Ventura: Pet Detective, Film 2:Austin Powers: The Spy Who Shagged 

## Priporočanje glede na trenutno ogledano vsebino (7) 

In [229]:
rec_items = ip.similar_items(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8231885401761888
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442495
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.23961943073496453
Film: Star Wars, ocena: 0.2196558652707407
Film: The Matrix, ocena: 0.2151555270688023
Film: Raiders of the Lost Ark, ocena: 0.19944276706345015
Film: The Usual Suspects, ocena: 0.18321188451910753
Film: Blade Runner, ocena: 0.1639968131541027
Film: Schindler's List, ocena: 0.16105905138148702
Film: Monty Python and the Holy Grail, ocena: 0.1578045379851914


## Priporočilo zase (7)

In [230]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies_new.dat', min_ratings=1000)
ip = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

In [231]:
print("Predictions for Kuhar(1): ")
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for Kuhar(1): 
Film: Waiting to Exhale, ocena: 5
Film: Heat, ocena: 5
Film: The American President, ocena: 5
Film: Four Rooms, ocena: 5
Film: Get Shorty, ocena: 5
Film: Powder, ocena: 5
Film: Othello, ocena: 5
Film: La cité des enfants perdus, ocena: 5
Film: Yao a yao yao dao waipo qiao, ocena: 5
Film: It Takes Two, ocena: 5
