In [1]:
import numpy as np
import pandas as pd
import surprise

import math
from datetime import datetime

In [2]:
from surprise import SVD
from surprise import accuracy
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split
from surprise import Dataset
from surprise import Reader


## 목표
1. Matrix Factorization 을 numpy 만으로 구현할 것
2. 영화 시청 이력을 기반으로 영화 평점 예측 모델 구현
3. 데이터 셋은 아래 것을 사용할 것: https://grouplens.org/datasets/movielens/
4. 학습 데이터:  timestamp >= 1104505203 ~ timestamp <= 1230735592
5. 평가 데이터: timestamp >= 1230735600 이후
6. 최종 산출물 평가데이터에 대한 아래 결과와 함께 구현 코드: $<userId>,<movieId>,<predicted rating>,<timestamp>$
7. 구현의 성능 향승을 위한 동시성 사용방안에 대해 고민할 것

## practice

In [3]:
# data load
df_rating = pd.DataFrame(pd.read_csv("../data/ratings.csv"))

In [4]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
del df_rating['timestamp']

In [6]:
df_rating

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [7]:
# 평가를 최소 20번이상 한 유저들
us_rating = df_rating.groupby('userId')['rating'].count().reset_index().sort_values('rating',ascending=False)
us_rating.tail()

Unnamed: 0,userId,rating
12093,12094,20
119538,119539,20
156758,156759,20
12083,12084,20
36206,36207,20


In [8]:
#평가를 최소 1번 받은 영화들
mv_rating = df_rating.groupby('movieId')['rating'].count().reset_index().sort_values('rating',ascending=False)
mv_rating.tail()

Unnamed: 0,movieId,rating
41343,167308,1
41350,167326,1
41354,167336,1
41355,167338,1
59046,209171,1


In [9]:
# 50번 이하의 평가를 받은 영화, 50번 이하의 영화를 본 유저를 제외
min_movie = 50 
min_user = 50

filter_user = df_rating['userId'].value_counts() > min_user
filter_user = filter_user[filter_user].index.tolist()

filter_movie = df_rating['movieId'].value_counts() > min_user
filter_movie = filter_movie[filter_movie].index.tolist()

df_new = df_rating[(df_rating['movieId'].isin(filter_movie))&(df_rating['userId'].isin(filter_user))]
df_new     

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [10]:
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df_new[['userId','movieId','rating']],reader )

In [13]:
data

<surprise.dataset.DatasetAutoFolds at 0x12963fa90>

In [12]:
print("Using SVD")
algo = SVD()
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)


Using SVD


{'test_rmse': array([0.77222478, 0.77202028, 0.77198672]),
 'fit_time': (722.4528722763062, 797.2867441177368, 791.428240776062),
 'test_time': (131.72352623939514, 134.94076299667358, 93.6947569847107)}

In [13]:
trainset,testset = train_test_split(data,test_size=0.25)
algo = SVD()
predictions = algo.fit(trainset).test(testset)

In [20]:
pd.DataFrame(predictions)

Unnamed: 0,uid,iid,r_ui,est,details
0,9687,4340,2.0,2.364986,{'was_impossible': False}
1,803,7785,4.5,3.717608,{'was_impossible': False}
2,11727,91630,2.0,3.206693,{'was_impossible': False}
3,59614,89492,4.0,3.965279,{'was_impossible': False}
4,55828,91597,4.0,3.984255,{'was_impossible': False}
...,...,...,...,...,...
5671041,117842,593,4.0,3.944036,{'was_impossible': False}
5671042,144823,2329,5.0,3.464501,{'was_impossible': False}
5671043,115500,2080,1.5,2.420303,{'was_impossible': False}
5671044,129597,32840,4.0,4.232667,{'was_impossible': False}


In [23]:
accuracy.rmse(predictions)

RMSE: 0.7659


0.7659225704345772

## 구현

In [80]:
df_rating = pd.DataFrame(pd.read_csv("../data/ratings.csv"))

In [81]:
df_all = df_rating.copy()
df_train = df_all[df_all["timestamp"] >= 1104505203]
df_train = df_train[df_train["timestamp"] <= 1230735592]
df_test = df_all[df_all["timestamp"] >= 1230735600]

In [83]:
df_train_test = pd.concat([df_train,df_test])
del df_train_test['timestamp']

In [84]:
df_train_test

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [85]:
df_test

Unnamed: 0,userId,movieId,rating,timestamp
254,3,1,4.0,1439472215
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
258,3,111,4.0,1484753849
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [86]:
timestamps = pd.DataFrame(df_test['timestamp'])

In [87]:
test_size_per = len(df_test)/len(df_train_test)
test_size_per

0.7137099658861119

In [88]:
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df_train_test[['userId','movieId','rating']],reader )
trainset,testset = train_test_split(data,test_size = test_size_per)
algo = SVD()
predictions = algo.fit(trainset).test(testset)

In [110]:
testset

[(27092, 5902, 4.0),
 (136901, 53996, 1.0),
 (74808, 1350, 3.5),
 (121142, 1968, 1.5),
 (22799, 4744, 2.0),
 (11602, 52528, 2.5),
 (74674, 7046, 4.0),
 (117117, 3948, 4.0),
 (94539, 71530, 5.0),
 (91169, 8638, 4.5),
 (123865, 53125, 3.5),
 (67217, 5902, 3.5),
 (139523, 120795, 3.0),
 (34771, 2541, 2.5),
 (121940, 1721, 4.5),
 (122974, 34437, 2.0),
 (56658, 8361, 1.0),
 (120340, 1682, 3.5),
 (133192, 2712, 2.5),
 (151449, 4085, 4.0),
 (146446, 51694, 4.0),
 (117336, 3706, 4.5),
 (127400, 318, 5.0),
 (96801, 7205, 4.0),
 (137614, 208, 3.5),
 (31, 2918, 3.0),
 (155992, 176419, 4.5),
 (34423, 4728, 4.0),
 (109834, 47810, 2.5),
 (64972, 162606, 5.0),
 (75753, 541, 4.5),
 (153348, 3070, 3.0),
 (2687, 4014, 5.0),
 (51678, 2357, 5.0),
 (33595, 315, 2.0),
 (140703, 86190, 4.0),
 (92767, 72011, 3.0),
 (103372, 2916, 3.5),
 (92141, 1041, 3.0),
 (161380, 40732, 4.0),
 (116857, 96610, 4.0),
 (133486, 3623, 3.5),
 (75589, 2918, 5.0),
 (74658, 74458, 3.0),
 (22415, 377, 2.5),
 (84968, 6711, 4.0),
 (8

In [76]:
df_predict = pd.DataFrame(predictions)

del df_predict['details']

columns_name = {'uid': 'userId', 'iid': 'movieId', "r_ui" : 'rating', "est" : "predicted rating"}
df_predict = df_predict.rename(columns = columns_name , inplace = False)

# df_predict = df_predict.sort_values(["userId", "movieId"], ascending = (True, True))


In [89]:
df_predict

Unnamed: 0,userId,movieId,rating,predicted rating
2088467,1,306,3.5,4.081533
8644291,1,307,5.0,4.269365
138047,1,665,5.0,4.119085
10838926,1,899,3.5,3.933253
6360343,1,1088,4.0,2.885010
...,...,...,...,...
5647231,162541,45517,4.5,2.781632
10708436,162541,50872,4.5,3.527612
5163580,162541,55768,2.5,2.574992
2568752,162541,56176,2.0,1.906118


In [42]:
accuracy.rmse(predictions)

RMSE: 0.8203


0.8202585240048046

In [None]:
# train_test_split

In [94]:
df_train

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
24997287,162521,56174,4.0,1219357742
24997298,162521,59315,4.5,1219339433
24997301,162521,59615,3.5,1219339443
24997302,162521,59725,4.0,1219339392


In [None]:
test_time = df_test['timestamp'].copy # timestamp 따로 저장 
del df_test['timestamp']

In [125]:
reader = Reader(rating_scale=(0,5))
train_data = Dataset.load_from_df(df_train[['userId','movieId','rating']],reader )
trainset_2 = train_data.build_full_trainset()

test_data = Dataset.load_from_df(df_test[['userId','movieId','rating']],reader )
testset_2 = test_data.build_full_trainset()
testset_2 = testset_2.build_testset()

In [126]:
algo = SVD()

predictions = algo.fit(trainset_2).test(testset_2)

In [127]:
accuracy.rmse(predictions)

RMSE: 0.9972


0.9972418416769231

### 구현 성능향상을 위해서 고민
1. 적게 영화를 본 유저 제거
2. 적은 유저가 본 영화 데이터 제거 
3. SVD말고 

### matrix factorization
$\hat r_{ui} = q_i^Tp_u$
* reduced = U * s @ VT 로 분해 = $q_i^Tp_u$

### Learning algorithms
1. SGD(Stochastic gradient descent)
    * prediction error($e_{ui} = r_{ui} - q_i^Tp_u$)를 최소화 하는 parmeter를 gradient descent로 계산
    * $q_i \leftarrow q_i + \gamma \centerdot(e_{ui}\centerdot p_u - \lambda \centerdot q_i)$
    * $p_u \leftarrow p_u + \gamma \centerdot(e_{ui}\centerdot q_i - \lambda \centerdot p_u)$  
    
