# HW 6 - Recommender systems

## Input data

### Movies

The dataset for movies includes the following columns:

-   **movieId**: ID for the movie
-   **title**: title of the movie
-   **genres**: list of genres separated by "|"

### Ratings

The Ratings dataset has four columns:

-   **userId**: user ID
-   **movieId**: rated movie ID
-   **rating**: rated on a 5-star scale with half-star increments ranging from 0.5 to 5.0 stars.
-   **timestamp**: timestamps represent the number of seconds since midnight Coordinated Universal Time (UTC) on January 1, 1970.


In [1]:
!pip3 install -q pandas

In [98]:
import pandas as pd

moviesDf = pd.read_csv("../ml-latest-small/movies.csv")
ratingsDf = pd.read_csv("../ml-latest-small/testing-ratings.csv")

print("Movies")
display(moviesDf)

print("Ratings")
display(ratingsDf)

Movies


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


Ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,6,4.0,964982224
2,1,50,5.0,964982931
3,1,101,5.0,964980868
4,1,151,5.0,964984041
...,...,...,...,...
50413,610,162350,3.5,1493849971
50414,610,163981,3.5,1493850155
50415,610,166528,4.0,1493879365
50416,610,168248,5.0,1493850091


## Content based recommender system

This function suggests movies to users based on their preferred genres.


### Finds unique genres from the collection


In [99]:
# finds unique genres from the collection
uniqueGenres = set()

for genres in moviesDf["genres"]:
    uniqueGenres = uniqueGenres.union(set(genres.split("|")))

# uniqueGenres.remove("(no genres listed)")
uniqueGenres = [genre for genre in uniqueGenres]
print(f"unique genres: {uniqueGenres}")

unique genres: ['Adventure', 'Animation', 'Fantasy', 'Children', 'Romance', 'Mystery', 'Horror', 'Documentary', 'Crime', 'Sci-Fi', 'Comedy', 'Film-Noir', 'Western', '(no genres listed)', 'IMAX', 'Drama', 'Musical', 'Action', 'Thriller', 'War']


### Create new datasets


In [100]:
# movie like a set of genres

data = []

for index, movie in moviesDf.iterrows():
    genres = set(movie['genres'].split("|"))
    data.append([movie["movieId"]] +
                [1 if genre in genres else 0 for genre in uniqueGenres])

movieGenresDf = pd.DataFrame(data, columns=["movieId"] + uniqueGenres)
movieGenresDf.set_index("movieId", inplace=True)
display(movieGenresDf)

Unnamed: 0_level_0,Adventure,Animation,Fantasy,Children,Romance,Mystery,Horror,Documentary,Crime,Sci-Fi,Comedy,Film-Noir,Western,(no genres listed),IMAX,Drama,Musical,Action,Thriller,War
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
193583,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
193587,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [101]:
# userprofile consists set of genres instead of movies

moviesRatingsDf = pd.merge(moviesDf, ratingsDf, on="movieId")
moviesRatingsDf.set_index("movieId", inplace=True)

data = {}  # dictionary, keys are user ids, values are dictionaries with keys of genres and values 0, increased by 1 for every movie of this genre rated by >= 2.5

for index, movieRating in moviesRatingsDf.iterrows():
    genres = set(movieRating["genres"].split("|"))
    if movieRating["userId"] in data:
        if movieRating["rating"] >= 2.5:
            for genre in genres:
                data[movieRating["userId"]][genre] += 1
    else:
        data[movieRating["userId"]] = {genre: 0 for genre in uniqueGenres}
        for genre in genres:
            data[movieRating["userId"]][genre] += 1

data = [[user] + [data[user][key] for key in data[user]] for user in data]
userGenresDf = pd.DataFrame(data, columns=["userId"] + uniqueGenres)
userGenresDf.set_index("userId", inplace=True)
display(userGenresDf)

Unnamed: 0_level_0,Adventure,Animation,Fantasy,Children,Romance,Mystery,Horror,Documentary,Crime,Sci-Fi,Comedy,Film-Noir,Western,(no genres listed),IMAX,Drama,Musical,Action,Thriller,War
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,44,17,26,22,14,9,6,0,24,14,36,0,5,0,0,35,12,46,24,12
5,3,3,3,4,7,0,1,0,4,1,5,0,0,0,3,14,2,3,3,1
7,18,4,8,5,5,7,3,0,10,16,16,2,0,0,1,20,1,22,19,5
15,26,10,6,9,6,8,10,0,7,25,14,0,1,0,8,27,4,21,20,1
18,65,12,27,15,24,21,6,4,62,49,75,5,8,0,21,106,6,104,80,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,11,2,8,4,0,1,1,0,0,4,2,0,0,0,5,5,0,9,3,0
459,8,4,2,4,0,1,0,0,0,4,1,0,1,0,2,5,0,6,3,0
184,15,15,10,2,5,8,1,4,6,19,9,0,0,1,6,20,0,27,10,3
306,18,17,10,17,14,2,8,0,0,4,37,0,0,0,6,4,3,9,6,0


### Compute cosine similarity


In [6]:
!pip3 install -q scikit-learn

In [102]:
from sklearn.metrics.pairwise import cosine_similarity

# compute cosine similarity
contentBasedRecommendations = pd.DataFrame(cosine_similarity(
    movieGenresDf, userGenresDf), columns=userGenresDf.index, index=movieGenresDf.index)
display(contentBasedRecommendations)

userId,1,5,7,15,18,19,46,50,64,68,...,148,236,163,360,535,106,459,184,306,556
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.648589,0.422507,0.465564,0.483475,0.398853,0.645145,0.345949,0.541386,0.510788,0.494514,...,0.851600,0.473365,0.195180,0.220564,0.452839,0.630298,0.611632,0.464308,0.832403,0.638802
2,0.531269,0.303030,0.365339,0.393703,0.284001,0.388428,0.238197,0.312678,0.325675,0.317334,...,0.636501,0.166667,0.000000,0.000000,0.000000,0.693161,0.581820,0.317340,0.488467,0.798087
3,0.353624,0.445362,0.303109,0.235213,0.321822,0.617527,0.328196,0.473056,0.524133,0.550847,...,0.425210,0.612372,0.694365,0.348743,0.795557,0.073821,0.050899,0.201528,0.678014,0.065164
4,0.490846,0.787879,0.483190,0.451318,0.544113,0.646134,0.446619,0.729581,0.737479,0.729618,...,0.385758,0.611111,0.818923,0.474579,0.779484,0.210962,0.249351,0.399613,0.597015,0.133014
5,0.360072,0.262432,0.326599,0.232849,0.344792,0.659838,0.309426,0.509716,0.498820,0.471737,...,0.367484,0.769800,0.436436,0.493197,0.900070,0.104399,0.071982,0.183216,0.695641,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.625125,0.367405,0.510310,0.424117,0.501098,0.611321,0.438354,0.568121,0.524460,0.544228,...,0.734968,0.529238,0.327327,0.328798,0.562544,0.548096,0.467880,0.620899,0.686241,0.483814
193583,0.456198,0.333333,0.329983,0.288076,0.302580,0.552762,0.267971,0.484344,0.433336,0.406037,...,0.655789,0.555556,0.251976,0.284747,0.584613,0.361649,0.290910,0.399613,0.694709,0.372441
193585,0.350070,0.734809,0.408248,0.449065,0.487306,0.245822,0.309426,0.594669,0.536115,0.484720,...,0.066815,0.192450,0.436436,0.328798,0.225018,0.260998,0.359908,0.407147,0.075204,0.138233
193587,0.445566,0.222681,0.375278,0.364580,0.377085,0.265308,0.328196,0.322880,0.267011,0.332038,...,0.543324,0.068041,0.154303,0.116248,0.159111,0.406017,0.508987,0.604583,0.345654,0.260654


In [35]:
from sklearn.preprocessing import MinMaxScaler

# set seen movies to 0
for userId in contentBasedRecommendations.columns:
    seenMovies = ratingsDf[ratingsDf["userId"] == userId]["movieId"]
    contentBasedRecommendations[userId][seenMovies] = 0

# normalization
scaler = MinMaxScaler()
scaledRecommendations = scaler.fit_transform(
    contentBasedRecommendations.values)
contentBasedRecommendations = pd.DataFrame(
    scaledRecommendations, index=contentBasedRecommendations.index, columns=contentBasedRecommendations.columns)

display(contentBasedRecommendations)

userId,17,21,27,31,32,33,40,43,44,45,...,598,499,236,306,360,163,320,535,184,291
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.409091,0.521739,0.421053,0.947745,0.090909,0.505964,0.298142,0.206406,0.510000,0.725211
2,0.477107,0.449788,0.771389,0.546401,0.205953,0.128713,0.515174,0.631883,0.433013,0.422912,...,0.234726,0.505172,0.135894,0.492591,0.058682,0.081650,0.192450,0.000000,0.374388,0.780203
3,0.206235,0.492003,0.454882,0.708566,0.622193,0.691193,0.587443,0.608060,0.574524,0.697882,...,0.718699,0.206235,0.665743,0.720066,0.143740,0.800000,0.157135,0.652714,0.221359,0.076444
4,0.505172,0.518458,0.628539,0.771389,0.865004,1.000000,0.905996,0.797376,0.577350,0.792403,...,0.938905,0.392911,0.747418,0.730942,0.469453,0.898146,0.256600,0.977054,0.438938,0.187249
5,0.194441,0.505494,0.371135,0.723713,0.523192,0.565918,0.584615,0.000000,0.625000,0.794190,...,0.508197,0.194441,0.706127,0.770626,0.101639,0.989949,0.222222,0.461538,0.290689,0.108108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.559017,0.698771,0.630929,0.779383,0.499411,0.360129,0.584615,0.638431,0.718750,0.744071,...,0.457378,0.486102,0.529595,0.853193,0.254099,0.565685,0.555556,0.307692,0.648460,0.567568
193583,0.336781,0.453221,0.485690,0.674966,0.384446,0.376238,0.532939,0.556659,0.505181,0.632142,...,0.352089,0.280651,0.475630,0.826282,0.117363,0.571548,0.192450,0.266469,0.000000,0.405706
193585,0.583322,0.202198,0.445362,0.334021,0.618318,0.754557,0.738462,0.521168,0.187500,0.385529,...,0.609837,0.388881,0.353063,0.247701,0.609837,0.424264,0.222222,0.769231,0.447214,0.216216
193587,0.481216,0.529850,0.472377,0.393648,0.252240,0.084883,0.261086,0.313243,0.486136,0.376202,...,0.215610,0.412471,0.166436,0.544915,0.215610,0.100000,0.628539,0.108786,0.000000,0.420442


### Recommender function


In [103]:
def recommendTopKMoviesUsingContentBased(userId, k=10, noOutput=False):
    recommendedMoviesIds = contentBasedRecommendations[userId].sort_values(
        ascending=False)
    
    if not noOutput:
        print("Seen movies (sorted from the best rated)")
        display(moviesRatingsDf[moviesRatingsDf["userId"] == userId].sort_values(
            by=["rating"], ascending=False).drop(columns=["userId", "timestamp"]))
        print("Content based recommendations (from most similar)")
        display(pd.merge(recommendedMoviesIds.head(k), moviesDf, on="movieId"))
        # save output to file
        file = open("../results/testing-contentBased.txt", "w")
        [file.write(f"{id}\n") for id in recommendedMoviesIds.head(k).index.tolist()]
    return pd.DataFrame(recommendedMoviesIds, index=contentBasedRecommendations.index).rename(columns={userId: "order"})


recommendTopKMoviesUsingContentBased(userId=2, k=10);

Seen movies (sorted from the best rated)


Unnamed: 0_level_0,title,genres,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
80906,Inside Job (2010),Documentary,5.0
89774,Warrior (2011),Drama,5.0
106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,5.0
131724,The Jinx: The Life and Deaths of Robert Durst ...,Documentary,5.0
1704,Good Will Hunting (1997),Drama|Romance,4.5
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.5
68157,Inglourious Basterds (2009),Action|Drama|War,4.5
6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,4.0
46970,Talladega Nights: The Ballad of Ricky Bobby (2...,Action|Comedy,4.0
74458,Shutter Island (2010),Drama|Mystery|Thriller,4.0


Content based recommendations (from most similar)


Unnamed: 0,movieId,2,title,genres
0,27802,0.880471,Infernal Affairs 2 (Mou gaan dou II) (2003),Action|Crime|Drama|Thriller
1,31420,0.880471,Assault on Precinct 13 (2005),Action|Crime|Drama|Thriller
2,78,0.880471,"Crossing Guard, The (1995)",Action|Crime|Drama|Thriller
3,2540,0.880471,"Corruptor, The (1999)",Action|Crime|Drama|Thriller
4,1589,0.880471,Cop Land (1997),Action|Crime|Drama|Thriller
5,91842,0.880471,Contraband (2012),Action|Crime|Drama|Thriller
6,3265,0.880471,Hard-Boiled (Lat sau san taam) (1992),Action|Crime|Drama|Thriller
7,69131,0.880471,Killshot (2008),Action|Crime|Drama|Thriller
8,46335,0.880471,"Fast and the Furious: Tokyo Drift, The (Fast a...",Action|Crime|Drama|Thriller
9,37733,0.880471,"History of Violence, A (2005)",Action|Crime|Drama|Thriller


## Collaborative filtering recommender system

This function recommends movies to users by comparing their similarity to other users.


In [104]:
# create user movie rating
userMovieRatingDf = ratingsDf.pivot(
    index="userId", columns="movieId", values="rating").fillna(0)
display(userMovieRatingDf)

movieId,1,2,3,4,5,6,7,8,9,10,...,189713,190209,190215,190221,191005,193567,193573,193581,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# compute similarity between all users
similarities = pd.DataFrame(cosine_similarity(
    userMovieRatingDf), index=userMovieRatingDf.index, columns=userMovieRatingDf.index)
display(similarities)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.000000,0.035108,0.153215,0.048605,0.046552,0.090406,0.044083,0.032917,0.013974,...,0.042581,0.092500,0.117004,0.045072,0.061635,0.073954,0.149407,0.149930,0.080640,0.059186
2,0.000000,1.000000,0.000000,0.007258,0.031673,0.000000,0.000000,0.000000,0.000000,0.055130,...,0.154658,0.000000,0.010306,0.000000,0.000000,0.034030,0.024459,0.000000,0.000000,0.045275
3,0.035108,0.000000,1.000000,0.000000,0.009839,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.005355,0.000000,0.002134,0.000000,0.021063,0.001561,0.004559,0.002539,0.000000,0.019009
4,0.153215,0.007258,0.000000,1.000000,0.060905,0.023916,0.027524,0.027545,0.025367,0.001417,...,0.057592,0.088178,0.153252,0.031013,0.038030,0.111295,0.110059,0.067399,0.007398,0.064884
5,0.048605,0.031673,0.009839,0.060905,1.000000,0.110891,0.076008,0.213692,0.000000,0.021642,...,0.074131,0.202523,0.070624,0.144358,0.035562,0.025042,0.098520,0.059167,0.036320,0.042291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.073954,0.034030,0.001561,0.111295,0.025042,0.051626,0.120055,0.025167,0.042818,0.039147,...,0.097730,0.040975,0.152238,0.034898,0.054416,1.000000,0.116183,0.127992,0.070442,0.094507
607,0.149407,0.024459,0.004559,0.110059,0.098520,0.077374,0.074350,0.027848,0.000000,0.002388,...,0.053199,0.071318,0.148259,0.063753,0.041979,0.116183,1.000000,0.138665,0.044877,0.061289
608,0.149930,0.000000,0.002539,0.067399,0.059167,0.080477,0.159488,0.080147,0.022858,0.008911,...,0.061484,0.091215,0.128158,0.050941,0.040540,0.127992,0.138665,1.000000,0.053121,0.160638
609,0.080640,0.000000,0.000000,0.007398,0.036320,0.031694,0.045593,0.048670,0.000000,0.035052,...,0.000000,0.125462,0.035015,0.073976,0.080223,0.070442,0.044877,0.053121,1.000000,0.014573


In [106]:
# set similarity to himself to 0 (recommendation of self won't help)
for i in similarities.index:
    similarities[i][i] = 0

# normalization (task requirement)
scaledRecommendations = scaler.fit_transform(similarities.values)
similarities = pd.DataFrame(similarities, index=similarities.index, columns=similarities.columns)

display(similarities)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.000000,0.035108,0.153215,0.048605,0.046552,0.090406,0.044083,0.032917,0.013974,...,0.042581,0.092500,0.117004,0.045072,0.061635,0.073954,0.149407,0.149930,0.080640,0.059186
2,0.000000,0.000000,0.000000,0.007258,0.031673,0.000000,0.000000,0.000000,0.000000,0.055130,...,0.154658,0.000000,0.010306,0.000000,0.000000,0.034030,0.024459,0.000000,0.000000,0.045275
3,0.035108,0.000000,0.000000,0.000000,0.009839,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.005355,0.000000,0.002134,0.000000,0.021063,0.001561,0.004559,0.002539,0.000000,0.019009
4,0.153215,0.007258,0.000000,0.000000,0.060905,0.023916,0.027524,0.027545,0.025367,0.001417,...,0.057592,0.088178,0.153252,0.031013,0.038030,0.111295,0.110059,0.067399,0.007398,0.064884
5,0.048605,0.031673,0.009839,0.060905,0.000000,0.110891,0.076008,0.213692,0.000000,0.021642,...,0.074131,0.202523,0.070624,0.144358,0.035562,0.025042,0.098520,0.059167,0.036320,0.042291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.073954,0.034030,0.001561,0.111295,0.025042,0.051626,0.120055,0.025167,0.042818,0.039147,...,0.097730,0.040975,0.152238,0.034898,0.054416,0.000000,0.116183,0.127992,0.070442,0.094507
607,0.149407,0.024459,0.004559,0.110059,0.098520,0.077374,0.074350,0.027848,0.000000,0.002388,...,0.053199,0.071318,0.148259,0.063753,0.041979,0.116183,0.000000,0.138665,0.044877,0.061289
608,0.149930,0.000000,0.002539,0.067399,0.059167,0.080477,0.159488,0.080147,0.022858,0.008911,...,0.061484,0.091215,0.128158,0.050941,0.040540,0.127992,0.138665,0.000000,0.053121,0.160638
609,0.080640,0.000000,0.000000,0.007398,0.036320,0.031694,0.045593,0.048670,0.000000,0.035052,...,0.000000,0.125462,0.035015,0.073976,0.080223,0.070442,0.044877,0.053121,0.000000,0.014573


In [107]:
# 1 = watched, 0 = not watched (the lowest rating is 0.5, so 0 indeed mean not watched), transpose in order to index columns by users
watched = (userMovieRatingDf != 0).astype(int).T
display(watched)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,1,0,1,0,0,0,...,1,0,1,0,0,1,0,1,0,1
2,0,0,0,0,0,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Recommender function

In [108]:
import numpy as np

def recommendTopKMoviesUsingCollaborativeFiltering(userId, k=10, noOutput=False):
    mostSimilarUsers = similarities[userId]
    weightedRating = watched * mostSimilarUsers # watch/not watched weighed by user similarity
    scaled = scaler.fit_transform(weightedRating.replace(0, np.NaN).mean(axis=1).fillna(0).values.reshape(-1, 1))
    recommendedMoviesIds = pd.DataFrame(scaled, index=weightedRating.index).sort_values(by=0, ascending=False)
    
    if not noOutput:
        print("Seen movies (sorted from the best rated)")
        display(moviesRatingsDf[moviesRatingsDf["userId"] == userId].sort_values(
            by=["rating"], ascending=False).drop(columns=["userId", "timestamp"]))
        print("Collaborative filtering recommendations (from most similar)")
        display(pd.merge(recommendedMoviesIds.head(k), moviesDf, on="movieId"))
        # save result ids to file
        file = open("../results/testing-collaborativeFiltering.txt", "w")
        [file.write(f"{id}\n") for id in recommendedMoviesIds.head(k).index.tolist()]
    return recommendedMoviesIds.rename(columns={0: "order"})


recommendTopKMoviesUsingCollaborativeFiltering(userId=2, k=10);

Seen movies (sorted from the best rated)


Unnamed: 0_level_0,title,genres,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
80906,Inside Job (2010),Documentary,5.0
89774,Warrior (2011),Drama,5.0
106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,5.0
131724,The Jinx: The Life and Deaths of Robert Durst ...,Documentary,5.0
1704,Good Will Hunting (1997),Drama|Romance,4.5
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.5
68157,Inglourious Basterds (2009),Action|Drama|War,4.5
6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,4.0
46970,Talladega Nights: The Ballad of Ricky Bobby (2...,Action|Comedy,4.0
74458,Shutter Island (2010),Drama|Mystery|Thriller,4.0


Collaborative filtering recommendations (from most similar)


Unnamed: 0,movieId,0,title,genres
0,53127,1.0,Bug (2007),Drama|Horror|Thriller
1,131739,0.982749,Batman vs. Robin (2015),Action|Adventure|Animation
2,74946,0.85463,She's Out of My League (2010),Comedy
3,193609,0.848592,Andrew Dice Clay: Dice Rules (1991),Comedy
4,50445,0.813537,"Hitcher, The (2007)",Action|Horror|Thriller
5,52579,0.813537,"Vie en Rose, La (Môme, La) (2007)",Drama|Musical
6,60857,0.813537,"Tracey Fragments, The (2007)",Drama
7,55555,0.813537,"Edge of Heaven, The (Auf der anderen Seite) (2...",Drama
8,6223,0.813537,Spun (2001),Comedy|Crime|Drama
9,501,0.813537,Naked (1993),Drama


## Hybrid recommender system

In [111]:
def recommendTopKHybrid(userId, k=10, contentBaseRate=0.7, collaborativeFilteringRate=0.3):
    contentBasedRecommendedMovieIds = recommendTopKMoviesUsingContentBased(
        userId, k, noOutput=True) * contentBaseRate
    collaborativeFilteringMovieIds = recommendTopKMoviesUsingCollaborativeFiltering(
        userId, k, noOutput=True) * collaborativeFilteringRate
    orders = pd.merge(collaborativeFilteringMovieIds,
                      contentBasedRecommendedMovieIds, on="movieId")
    orders.loc[:, "sum"] = orders.sum(numeric_only=True, axis=1)
    orders.sort_values(by="sum", ascending=False, inplace=True)
    orders.drop(columns=["order_x", "order_y"], inplace=True)

    print("Seen movies (sorted from the best rated)")
    display(moviesRatingsDf[moviesRatingsDf["userId"] == userId].sort_values(
            by=["rating"], ascending=False).drop(columns=["userId", "timestamp"]))

    print("Hybrid recommendations (from most similar)")
    display(pd.merge(orders.head(k), moviesDf, on="movieId"))
    #save results to file
    file = open(f"../results/testing-hybrid-{contentBaseRate}-{collaborativeFilteringRate}.txt", "w")
    [file.write(f"{id}\n") for id in orders.head(k).index.tolist()]


recommendTopKHybrid(userId=2)

Seen movies (sorted from the best rated)


Unnamed: 0_level_0,title,genres,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
80906,Inside Job (2010),Documentary,5.0
89774,Warrior (2011),Drama,5.0
106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,5.0
131724,The Jinx: The Life and Deaths of Robert Durst ...,Documentary,5.0
1704,Good Will Hunting (1997),Drama|Romance,4.5
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.5
68157,Inglourious Basterds (2009),Action|Drama|War,4.5
6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,4.0
46970,Talladega Nights: The Ballad of Ricky Bobby (2...,Action|Comedy,4.0
74458,Shutter Island (2010),Drama|Mystery|Thriller,4.0


Hybrid recommendations (from most similar)


Unnamed: 0,movieId,sum,title,genres
0,96861,0.835717,Taken 2 (2012),Action|Crime|Drama|Thriller
1,184931,0.819192,Death Wish (2018),Action|Crime|Drama|Thriller
2,150548,0.803355,Sherlock: The Abominable Bride (2016),Action|Crime|Drama|Mystery|Thriller
3,55721,0.779786,Elite Squad (Tropa de Elite) (2007),Action|Crime|Drama|Thriller
4,115680,0.778314,Time Lapse (2014),Crime|Drama|Sci-Fi|Thriller
5,501,0.765571,Naked (1993),Drama
6,60857,0.765571,"Tracey Fragments, The (2007)",Drama
7,55555,0.765571,"Edge of Heaven, The (Auf der anderen Seite) (2...",Drama
8,20,0.755565,Money Train (1995),Action|Comedy|Crime|Drama|Thriller
9,83369,0.754609,"Way Back, The (2010)",Drama


## Statistics

In [122]:
import os

def recall(testingData, trainingData):
    return len(set(testingData).intersection(set(trainingData))) / len (trainingData)

def precision(testingData, trainingData):
    return len(set(testingData).intersection(set(trainingData))) / len(testingData)

def fMeasure(recall, precision):
    return 0 if precision + recall == 0 else 2 * (recall * precision) / (recall + precision)

def computeStatistics():
    for fileName in os.listdir("../results/"):
        if fileName.startswith("testing"):
            trainingFile = "training" + fileName[7:]
            file = open(f"../results/{trainingFile}", "r")
            trainingData = [int(x) for x in file.read().split("\n") if x != ""]
            file = open(f"../results/{fileName}", "r")
            testingData = [int(x) for x in file.read().split("\n") if x != ""]
            rec = recall(testingData, trainingData)
            prec = precision(testingData, trainingData)
            fmes = fMeasure(rec, prec)
            print(f"{fileName[8:]}: recall = {rec}, precision = {prec}, f-measure = {fmes}")
            
            
    
computeStatistics()

collaborativeFiltering.txt: recall = 0.0, precision = 0.0, f-measure = 0
hybrid-0.3-0.7.txt: recall = 0.0, precision = 0.0, f-measure = 0
hybrid-0.7-0.3.txt: recall = 0.1, precision = 0.1, f-measure = 0.10000000000000002
contentBased.txt: recall = 0.0, precision = 0.0, f-measure = 0
hybrid-0.5-0.5.txt: recall = 0.0, precision = 0.0, f-measure = 0
