### DDW - Recommender Systems

In [48]:
import pandas as pd
import numpy as np
import math

In [20]:
def load_data(file="small-dataset"):
    """Read a headerless ratings CSV from the ``data`` directory.

    Parameters
    ----------
    file : str
        File name without the ``.csv`` extension, resolved as ``data/<file>.csv``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the first CSV column, with every ``0`` cell replaced
        by ``NaN`` (zeros are treated as "no rating").
    """
    path = f"data/{file}.csv"
    raw = pd.read_csv(path, header=None)
    indexed = raw.set_index(0)
    # A zero stands for a missing rating, so turn it into NaN.
    return indexed.replace(0, np.nan)

In [54]:
# Load the default small ratings matrix (rows are users, columns are items,
# NaN marks a missing rating) and print it for inspection.
data = load_data()
print(data)

         1    2    3  4  5    6
0                              
us     7.0  6.0  7.0  4  5  4.0
as     6.0  7.0  NaN  4  3  4.0
uas    NaN  3.0  3.0  1  1  NaN
usda   1.0  2.0  2.0  3  3  4.0
ufsds  1.0  NaN  1.0  2  3  3.0


In [160]:
def cosine_sim(v1, v2):
    """Cosine similarity of two rating vectors over their co-rated items.

    Parameters
    ----------
    v1, v2 : pandas.Series
        Rating vectors; ``NaN`` marks a missing rating.

    Returns
    -------
    float
        Dot product divided by the product of the vector norms, where both the
        dot product and the norms only take into account labels that are
        non-missing in both vectors.
    """
    shared = np.intersect1d(v1.dropna().index, v2.dropna().index)
    # The element-wise product is NaN wherever either rating is missing and
    # the NaN-skipping sum ignores those entries.
    dot = (v1 * v2).sum()
    norm1 = math.sqrt((v1 * v1).loc[shared].sum())
    norm2 = math.sqrt((v2 * v2).loc[shared].sum())
    return dot / (norm1 * norm2)

In [69]:
# Cosine similarity between the fifth and the third row of the ratings matrix.
cosine_sim(data.iloc[4], data.iloc[2])

0.6446583712203042

In [159]:
def pearson_sim(v1, v2):
    """Pearson-style similarity of two rating vectors over co-rated items.

    Each vector is centred on the mean of *all* of its own non-missing
    ratings (not only the co-rated ones); the correlation is then evaluated
    on the labels that are non-missing in both vectors.

    Parameters
    ----------
    v1, v2 : pandas.Series
        Rating vectors; ``NaN`` marks a missing rating.

    Returns
    -------
    float
        Sum of products of the centred co-rated ratings divided by the
        product of the square roots of their summed squares.
    """
    rated1 = v1.dropna()
    rated2 = v2.dropna()
    shared = np.intersect1d(rated1.index, rated2.index)

    # Deviations from each vector's own mean rating, restricted to shared items.
    dev1 = v1.loc[shared] - rated1.mean()
    dev2 = v2.loc[shared] - rated2.mean()

    numerator = (dev1 * dev2).sum()
    denominator = math.sqrt((dev1 * dev1).sum()) * math.sqrt((dev2 * dev2).sum())
    return numerator / denominator

In [73]:
# Pearson similarity between the first and the third row of the ratings matrix.
pearson_sim(data.iloc[0], data.iloc[2])

0.8944271909999159

In [95]:
def kNN(data, ix, k=2, fn=cosine_sim):
    """Return the labels of the ``k`` rows most similar to row ``ix``.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings matrix with one row per user.
    ix : hashable
        Index label of the target row; it is never part of the result.
    k : int
        Maximum number of neighbours to return.
    fn : callable
        Similarity function called as ``fn(row, target_row)`` and returning a
        scalar; larger means more similar.

    Returns
    -------
    numpy.ndarray
        Up to ``k`` index labels ordered from most to least similar. Rows whose
        similarity is undefined (``NaN``) are skipped, so fewer than ``k``
        labels may be returned.
    """
    vec = data.loc[ix]
    sims = data.apply(lambda v: fn(v, vec), axis=1)
    # Remove the target row *before* taking the top k. Taking k+1 and removing
    # the target afterwards returns k+1 neighbours whenever the target is not
    # among the first k+1 entries (ties, or an undefined self-similarity).
    # NaN similarities carry no information and are not valid neighbours.
    ranked = sims.drop(ix).dropna().sort_values(ascending=False)
    return ranked.head(k).index.values

In [97]:
# Nearest neighbours of user 'uas' with the default k and similarity function.
print(kNN(data, 'uas'))

['as' 'us']


In [129]:
def recommendations(data, ix, k=2, top=2, fn=cosine_sim):
    """Recommend items that row ``ix`` has not rated yet.

    The predicted score of a candidate item is the mean of the ratings given
    to it by the nearest neighbours of ``ix`` (missing ratings are ignored).

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings matrix, one row per user and one column per item, with ``NaN``
        for missing ratings.
    ix : hashable
        Index label of the user to recommend for.
    k : int
        Number of neighbours passed to ``kNN``.
    top : int
        Maximum number of items to return.
    fn : callable
        Similarity function forwarded to ``kNN``.

    Returns
    -------
    numpy.ndarray
        Up to ``top`` column labels ordered by descending predicted score.
        Items that none of the neighbours rated have no prediction and are
        never returned.
    """
    neighbours = kNN(data, ix, k, fn)
    # Candidates are exactly the columns the target user has left unrated.
    unrated = data.columns[data.loc[ix].isna().to_numpy()]
    predicted = data.loc[neighbours, unrated].mean(axis=0)
    # An item no neighbour rated yields NaN; drop it so it cannot be
    # "recommended" just because there were fewer than `top` scored items.
    predicted = predicted.dropna()
    return predicted.sort_values(ascending=False).head(top).index.values

In [110]:
# Recommended items for user 'uas', chosen among the items that user has not rated.
print(recommendations(data, 'uas'))

[1 6]


### Part two

In [117]:
# Long-format ratings table; the pivot below relies on its userId, movieId and rating columns.
ml = pd.read_csv("data/ml-latest-small/ratings.csv")

# Pivot so that users are in the rows, movies in the columns and ratings in the cells
ml_data = ml.pivot(index="userId", columns="movieId", values="rating")

In [177]:
# User for whom recommendations are generated.
userId = 1

import warnings
# NOTE(review): this silences *all* warnings for the rest of the session;
# consider narrowing it to the specific warning raised by the similarity code.
warnings.filterwarnings("ignore")
rec = recommendations(ml_data, userId, k=30, top=10)
print(rec)

# Ids of the movies the user has already rated. These must be the values of the
# `movieId` column; the row labels of the filtered frame (`.index`) are just
# positions in the ratings table and have nothing to do with movie ids.
history = ml.loc[ml['userId'] == userId, 'movieId'].values
ml_movies = pd.read_csv("data/ml-latest-small/movies.csv")
# Show the metadata of the recommended movies, excluding anything already rated.
ml_movies[~(ml_movies["movieId"].isin(history)) & ml_movies["movieId"].isin(rec)]

[6620 1575 1734  309 1730 1719  327 5812 1658 1620]


Unnamed: 0,movieId,title,genres
277,309,"Red Firecracker, Green Firecracker (Pao Da Shu...",Drama
292,327,Tank Girl (1995),Action|Comedy|Sci-Fi
1252,1575,Gabbeh (1996),Drama
1290,1620,Kiss the Girls (1997),Crime|Drama|Mystery|Thriller
1319,1658,"Life Less Ordinary, A (1997)",Romance|Thriller
1358,1719,"Sweet Hereafter, The (1997)",Drama
1365,1730,Kundun (1997),Drama
1369,1734,My Life in Pink (Ma vie en rose) (1997),Comedy|Drama
4323,5812,Far from Heaven (2002),Drama|Romance
4730,6620,American Splendor (2003),Comedy|Drama
