# Import

In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler

# Data Load

In [3]:
history_df = pd.read_csv('../data/history_data.csv',encoding='utf-8')

## 중복 데이터 제거 ##
history_df = history_df[['profile_id', 'log_time', 'album_id']].drop_duplicates(subset=['profile_id', 'album_id', 'log_time']).sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)
history_df['rating']=1
history_df.head(3)

Unnamed: 0,profile_id,log_time,album_id,rating
0,3,20220301115719,15,1
1,3,20220301115809,16,1
2,3,20220301115958,17,1


# 전체 데이터를 이용하여 table 생성

In [4]:
n_users = history_df.profile_id.nunique()
n_items = history_df.album_id.nunique()
print(n_users,n_items)

8311 20695


In [5]:
ratings_matrix_df = history_df.pivot_table('rating',index='profile_id',columns='album_id')
ratings_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,


In [6]:
ratings_total_matrix_df = pd.DataFrame(np.zeros(ratings_matrix_df.values.shape),index=ratings_matrix_df.index,columns=ratings_matrix_df.columns)
ratings_total_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
total_data = history_df.values

for row in tqdm(range(total_data.shape[0])):
    row_data =total_data[row] # row_data = profile_id ,log_time, album_id, rating
    ratings_total_matrix_df.loc[row_data[0],row_data[2]]+=1

ratings_total_matrix_df.head()

  0%|          | 0/899252 [00:00<?, ?it/s]

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 유저별로 최대 1 최소 0 으로 Minmaxscaler 적용

In [8]:
## MinMaxScaler 적용을 위해 transpose() 후 다시 transpose() 적용 ##
scaler = MinMaxScaler()
ratings_minmax_matrix_t = scaler.fit_transform(ratings_total_matrix_df.transpose())
ratings_minmax_matrix_df_t = pd.DataFrame(ratings_minmax_matrix_t,index=ratings_total_matrix_df.columns,columns=ratings_total_matrix_df.index)
ratings_minmax_matrix_df = ratings_minmax_matrix_df_t.transpose()
ratings_minmax_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 잠재요인 협업 필터링 적용

In [9]:
R = ratings_minmax_matrix_df.values
num_users,num_items = R.shape
non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] >0 ]

In [11]:
# K가 커질수록 valid 에서 성능 향상
# valid 테스트를 통해 하이퍼파라미터를 찾음
## 연산시간 약 50분 소요 ##
K = 128000
steps=5
learning_rate=0.01
r_lambda=0.01

num_users,num_items = R.shape

np.random.seed(1212)
P = np.random.normal(scale=1./(K),size=(num_users,K))
Q = np.random.normal(scale=1./(K),size=(num_items,K))


for step in tqdm(range(steps)):
    for i,j,r in non_zeros:
        eij = r - np.dot(P[i,:],Q[j,:].T)
        P[i,:] = P[i,:] + learning_rate*(eij*Q[j,:] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate*(eij*P[i,:] - r_lambda*Q[j,:])
    print('now step :',step)

  0%|          | 0/5 [00:00<?, ?it/s]

now step : 0
now step : 1
now step : 2
now step : 3
now step : 4


In [12]:
# 연산시간 약 1분 소요 
full_pred = np.dot(P,Q.T)
full_pred.shape

(8311, 20695)

In [13]:
actual_pred_matrix_df = pd.DataFrame(full_pred,index=ratings_minmax_matrix_df.index,columns=ratings_minmax_matrix_df.columns)
actual_pred_matrix_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,8.223037e-08,3.189552e-08,-3.741311e-10,9.286890e-09,-7.420128e-10,3.044453e-08,3.460468e-08,-2.384493e-09,9.868861e-09,3.229751e-08,...,2.535052e-08,9.794784e-09,-3.380018e-08,-5.541014e-08,-1.621176e-08,-2.389521e-08,-8.884075e-09,-6.002862e-08,5.656532e-08,1.610136e-08
5,2.647762e-07,8.096929e-08,5.850891e-08,3.291090e-08,3.200129e-08,5.611503e-08,4.570351e-08,-9.539941e-10,-4.396822e-08,8.780706e-09,...,1.758989e-08,4.269701e-09,1.129913e-08,1.286029e-08,-2.899299e-08,-1.451422e-09,4.364741e-08,5.022864e-09,-3.555852e-08,-4.587623e-08
7,3.630466e-08,-1.339965e-08,3.049472e-09,-2.351918e-08,7.984295e-09,-1.676890e-12,-1.613547e-08,-9.192577e-09,-6.152580e-09,1.931152e-09,...,-2.934813e-09,3.287377e-09,3.593958e-08,3.413032e-09,-5.149684e-09,1.774774e-08,1.586385e-08,3.369650e-08,-2.946359e-08,7.266186e-09
12,2.001866e-08,2.771220e-08,6.212890e-09,-1.843817e-08,-6.571303e-09,-9.066055e-09,1.888557e-08,-4.626061e-08,-1.548385e-08,3.166774e-09,...,-4.111964e-08,5.279388e-09,-1.656156e-08,-4.124346e-08,1.182524e-08,6.891286e-10,4.042438e-09,1.420906e-08,7.029099e-09,1.548079e-09
16,1.230353e-07,6.690177e-08,3.756683e-08,-6.846887e-09,-2.967036e-08,7.048844e-09,-1.978781e-08,-1.691320e-08,1.125735e-08,2.289033e-08,...,1.876711e-08,-1.514827e-08,3.316312e-08,-1.206795e-08,1.035655e-08,1.183413e-08,3.688246e-08,7.169435e-09,3.129369e-08,-2.431112e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,1.770644e-08,2.985017e-08,4.831539e-08,1.484308e-08,2.097373e-08,3.695876e-08,1.100571e-09,5.556366e-09,-1.074828e-08,8.580381e-09,...,-2.935949e-08,-3.775956e-08,-2.353238e-08,-4.002520e-08,-3.042060e-08,-1.879547e-08,-3.221916e-09,-3.089235e-08,-7.602118e-09,2.976390e-08
33023,1.464400e-07,2.747258e-08,2.009198e-08,2.462670e-08,4.584194e-08,9.531902e-09,7.773435e-09,1.749540e-08,-5.661900e-09,1.324252e-08,...,-1.811240e-08,-6.551332e-09,-1.550260e-08,2.120734e-08,1.423157e-08,3.651338e-08,2.127653e-08,-1.008468e-08,-2.689165e-08,-5.363067e-09
33026,6.703679e-09,2.837892e-08,2.917351e-08,-2.998327e-08,3.583004e-09,-1.380978e-08,-3.315408e-08,1.852425e-08,2.027111e-08,2.598594e-08,...,-8.574087e-10,1.353319e-08,-5.178774e-08,2.743445e-09,1.526329e-09,2.119603e-08,-2.024172e-08,9.492069e-09,-8.870482e-09,-2.775016e-08
33027,3.839621e-08,2.030003e-08,4.409504e-08,3.784000e-10,-8.444710e-09,-5.265513e-09,-1.872653e-08,-3.351951e-08,1.124223e-08,3.426094e-08,...,-1.751140e-08,-1.974831e-08,-1.072856e-08,-3.621214e-08,6.051103e-08,-2.614535e-08,-1.554522e-08,-1.743055e-09,1.798775e-08,2.637721e-09


# 생성된 예측행렬을 ensemble을 시켜주기 위하여 유저별로 최대 1 최소 0 으로 Minmaxscaler 적용

In [14]:
scaler = MinMaxScaler()
actual_pred_matrix_t = scaler.fit_transform(actual_pred_matrix_df.transpose())
actual_pred_matrix_df_t = pd.DataFrame(actual_pred_matrix_t,index=ratings_total_matrix_df.columns,columns=ratings_total_matrix_df.index)
actual_pred_matrix_df = actual_pred_matrix_df_t.transpose()
actual_pred_matrix_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.128423,0.089486,0.064523,0.071996,0.064238,0.088363,0.091581,0.062968,0.072446,0.089797,...,0.084423,0.072389,0.038666,0.021949,0.052271,0.046328,0.057940,0.018376,0.108569,0.077268
5,0.168163,0.078971,0.068072,0.055651,0.055209,0.066911,0.061858,0.039218,0.018346,0.043942,...,0.048216,0.041753,0.045164,0.045921,0.025612,0.038977,0.060861,0.042118,0.022426,0.017420
7,0.156024,0.098218,0.117348,0.086449,0.123087,0.113800,0.095036,0.103111,0.106646,0.116048,...,0.110388,0.117625,0.155600,0.117771,0.107812,0.134442,0.132251,0.152991,0.079535,0.122252
12,0.131307,0.140654,0.114536,0.084589,0.099005,0.095974,0.129931,0.050789,0.088178,0.110835,...,0.057034,0.113401,0.086868,0.056884,0.121354,0.107825,0.111899,0.124250,0.115527,0.108869
16,0.108562,0.079456,0.064245,0.041216,0.029382,0.048421,0.034506,0.035997,0.050603,0.056635,...,0.054497,0.036912,0.061962,0.038509,0.050136,0.050902,0.063890,0.048484,0.060992,0.032161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,0.116217,0.129889,0.150677,0.112994,0.119896,0.137892,0.097522,0.102538,0.084182,0.105943,...,0.063230,0.053773,0.069790,0.051222,0.062035,0.075123,0.092656,0.061504,0.087724,0.129792
33023,0.144895,0.069906,0.065254,0.068112,0.081485,0.058598,0.057489,0.063617,0.049020,0.060936,...,0.041172,0.048460,0.042817,0.065957,0.061560,0.075605,0.066001,0.046233,0.035639,0.049209
33026,0.109040,0.134082,0.135000,0.066654,0.105434,0.085340,0.062991,0.122696,0.124714,0.131317,...,0.100304,0.116930,0.041463,0.104464,0.103058,0.125783,0.077909,0.112261,0.091046,0.069234
33027,0.119937,0.102130,0.125545,0.082526,0.073844,0.076973,0.063727,0.049170,0.093217,0.115868,...,0.064922,0.062721,0.071597,0.046520,0.141699,0.056426,0.066857,0.080439,0.099855,0.084750


# 예측행렬 저장

In [15]:
actual_pred_matrix_df.to_csv('./save_matrix_csv/total_latent.csv',header=False,index=False)