## Matrix-Factorization Library 활용

    - !pip install matrix-factorization

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

from tqdm import tqdm_notebook

from matrix_factorization import BaselineModel, KernelMF, train_update_test_split

### 데이터 불러오기 및 분리

In [4]:
ratings_df = pd.read_csv('./data/ml-latest-small/ratings.csv',
                         encoding = 'utf-8')


display(ratings_df.head())
print(ratings_df.shape)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


(100836, 4)


In [17]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2,
                                     random_state = 42)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


### column명 변경 (matrix_factorization 형식에 맞게)

    - user_id, item_id, rating, timestamp

In [18]:
train_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [19]:
new_train_df = train_df.copy()

new_train_df = new_train_df.rename(columns={'userId' : 'user_id', 'movieId' : 'item_id'})
new_train_df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp'], dtype='object')

    - train / valid / test로 나눔

In [20]:
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update
) = train_update_test_split(new_train_df, frac_new_users=0.2)

In [21]:
matrix_factor = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.004)
matrix_factor.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.9803219998373767
Epoch  2 / 20  -  train_rmse: 0.9549115056034922
Epoch  3 / 20  -  train_rmse: 0.9380967189695844
Epoch  4 / 20  -  train_rmse: 0.9253829754423145
Epoch  5 / 20  -  train_rmse: 0.9151220439319362
Epoch  6 / 20  -  train_rmse: 0.9065047601645694
Epoch  7 / 20  -  train_rmse: 0.8990557823418011
Epoch  8 / 20  -  train_rmse: 0.8924475431733833
Epoch  9 / 20  -  train_rmse: 0.8864921709390449
Epoch  10 / 20  -  train_rmse: 0.8810491799650102
Epoch  11 / 20  -  train_rmse: 0.8760153705558816
Epoch  12 / 20  -  train_rmse: 0.8712910539667581
Epoch  13 / 20  -  train_rmse: 0.8668629568795232
Epoch  14 / 20  -  train_rmse: 0.8626464979098405
Epoch  15 / 20  -  train_rmse: 0.8586306692289715
Epoch  16 / 20  -  train_rmse: 0.8547681134366045
Epoch  17 / 20  -  train_rmse: 0.8510568587543147
Epoch  18 / 20  -  train_rmse: 0.8474489083157907
Epoch  19 / 20  -  train_rmse: 0.84395417633182
Epoch  20 / 20  -  train_rmse: 0.8405530148435252


### Update model with new Users

In [22]:
matrix_factor.update_users(
    X_train_update, y_train_update, lr = 0.001, n_epochs=20, verbose=1
)

# 업데이트했더니 RMSE 올라감!! 새로운 데이터셋에대해서 반응이 어떤지 확인할 수 있음

Epoch  1 / 20  -  train_rmse: 1.014129807371265
Epoch  2 / 20  -  train_rmse: 0.9979462073046832
Epoch  3 / 20  -  train_rmse: 0.9855822159118733
Epoch  4 / 20  -  train_rmse: 0.9758608939845543
Epoch  5 / 20  -  train_rmse: 0.9680509248386042
Epoch  6 / 20  -  train_rmse: 0.9616378342532522
Epoch  7 / 20  -  train_rmse: 0.9562761849371407
Epoch  8 / 20  -  train_rmse: 0.9516757128794358
Epoch  9 / 20  -  train_rmse: 0.9476499167101242
Epoch  10 / 20  -  train_rmse: 0.9440782661519322
Epoch  11 / 20  -  train_rmse: 0.9408869317670773
Epoch  12 / 20  -  train_rmse: 0.9379883378623604
Epoch  13 / 20  -  train_rmse: 0.9353276802064893
Epoch  14 / 20  -  train_rmse: 0.9328662846103541
Epoch  15 / 20  -  train_rmse: 0.9305835423961721
Epoch  16 / 20  -  train_rmse: 0.9284379794792987
Epoch  17 / 20  -  train_rmse: 0.926416471213219
Epoch  18 / 20  -  train_rmse: 0.9245020218106285
Epoch  19 / 20  -  train_rmse: 0.9226855841088466
Epoch  20 / 20  -  train_rmse: 0.9209463122343954


In [23]:
pred = matrix_factor.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=True)

print(f'Test RMSE : {rmse: .4f}')

Test RMSE :  0.8750


### 추천 리스트 (user에게 어떤 아이템을 추천해줄지)

In [25]:
user = 200
item_known = X_train_initial.query("user_id == @user")['item_id']
matrix_factor.recommend(user=user, items_known=item_known)

Unnamed: 0,user_id,item_id,rating_pred
459,200,1221,4.494299
505,200,858,4.477896
412,200,48516,4.426765
669,200,750,4.422649
55,200,50,4.410492
334,200,2028,4.395701
1213,200,48780,4.376283
204,200,58559,4.366483
344,200,7361,4.349286
842,200,912,4.338221


## SGD

In [26]:
baseline_model = BaselineModel(method='sgd', n_epochs=20, reg=0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train_initial, y_train_initial)

pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)

print(f'Test RMSE : {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.8935270937860513
Epoch  2 / 20  -  train_rmse: 0.8685745977212562
Epoch  3 / 20  -  train_rmse: 0.8550698310657813
Epoch  4 / 20  -  train_rmse: 0.845197191229175
Epoch  5 / 20  -  train_rmse: 0.837742917234955
Epoch  6 / 20  -  train_rmse: 0.8321000567829994
Epoch  7 / 20  -  train_rmse: 0.8271071878731127
Epoch  8 / 20  -  train_rmse: 0.8227701386744684
Epoch  9 / 20  -  train_rmse: 0.8191290612063753
Epoch  10 / 20  -  train_rmse: 0.816456238712737
Epoch  11 / 20  -  train_rmse: 0.8133613464581888
Epoch  12 / 20  -  train_rmse: 0.8107645546293066
Epoch  13 / 20  -  train_rmse: 0.8084592976584181
Epoch  14 / 20  -  train_rmse: 0.8064189538525462
Epoch  15 / 20  -  train_rmse: 0.8046775190238704
Epoch  16 / 20  -  train_rmse: 0.8025746965368589
Epoch  17 / 20  -  train_rmse: 0.8010371527631797
Epoch  18 / 20  -  train_rmse: 0.7994074043579835
Epoch  19 / 20  -  train_rmse: 0.7980364999012409
Epoch  20 / 20  -  train_rmse: 0.7971053504175353
Test RMSE : 

In [29]:
%%time
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)

pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)

print(f'Test RMSE : {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9975141742005232
Epoch  2 / 20  -  train_rmse: 0.9818273433807932
Epoch  3 / 20  -  train_rmse: 0.9701305440774957
Epoch  4 / 20  -  train_rmse: 0.961188270419657
Epoch  5 / 20  -  train_rmse: 0.9542565111069525
Epoch  6 / 20  -  train_rmse: 0.9487113419633955
Epoch  7 / 20  -  train_rmse: 0.944234695217546
Epoch  8 / 20  -  train_rmse: 0.9405402128855591
Epoch  9 / 20  -  train_rmse: 0.9374235215169545
Epoch  10 / 20  -  train_rmse: 0.9347878570950781
Epoch  11 / 20  -  train_rmse: 0.9325023632468711
Epoch  12 / 20  -  train_rmse: 0.9305259785965706
Epoch  13 / 20  -  train_rmse: 0.928782055956956
Epoch  14 / 20  -  train_rmse: 0.9272358418418716
Epoch  15 / 20  -  train_rmse: 0.925860424253245
Epoch  16 / 20  -  train_rmse: 0.924622736947889
Epoch  17 / 20  -  train_rmse: 0.9234967124082814
Epoch  18 / 20  -  train_rmse: 0.9224784910176426
Epoch  19 / 20  -  train_rmse: 0.9215459445238481
Epoch  20 / 20  -  train_rmse: 0.9206904663385214
Test RMSE : 0.

### ALS

In [30]:
%%time
baseline_model = BaselineModel(method='als', n_epochs=20, reg=0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train_initial, y_train_initial)

pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)

print(f'Test RMSE : {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.779109904541514
Epoch  2 / 20  -  train_rmse: 0.7608860425740508
Epoch  3 / 20  -  train_rmse: 0.759436747699867
Epoch  4 / 20  -  train_rmse: 0.7592883597926661
Epoch  5 / 20  -  train_rmse: 0.7592686827731504
Epoch  6 / 20  -  train_rmse: 0.759265162089296
Epoch  7 / 20  -  train_rmse: 0.759264336809037
Epoch  8 / 20  -  train_rmse: 0.7592641004071806
Epoch  9 / 20  -  train_rmse: 0.7592640215533498
Epoch  10 / 20  -  train_rmse: 0.7592639917105367
Epoch  11 / 20  -  train_rmse: 0.7592639791938116
Epoch  12 / 20  -  train_rmse: 0.7592639735273531
Epoch  13 / 20  -  train_rmse: 0.7592639708229058
Epoch  14 / 20  -  train_rmse: 0.7592639694828268
Epoch  15 / 20  -  train_rmse: 0.7592639687970673
Epoch  16 / 20  -  train_rmse: 0.7592639684326994
Epoch  17 / 20  -  train_rmse: 0.7592639682283938
Epoch  18 / 20  -  train_rmse: 0.7592639681044094
Epoch  19 / 20  -  train_rmse: 0.759263968020909
Epoch  20 / 20  -  train_rmse: 0.759263967957894
Test RMSE : 1.0