# Import

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler

# Data Load

In [2]:
## Load the viewing history, keep the three key columns, drop duplicated rows, ##
## order by (profile_id, log_time) and attach a constant implicit rating of 1. ##
history_df = (
    pd.read_csv('../data/history_data.csv', encoding='utf-8')
    .loc[:, ['profile_id', 'log_time', 'album_id']]
    .drop_duplicates(subset=['profile_id', 'album_id', 'log_time'])
    .sort_values(by=['profile_id', 'log_time'])
    .reset_index(drop=True)
    .assign(rating=1)
)
history_df.head(3)

Unnamed: 0,profile_id,log_time,album_id,rating
0,3,20220301115719,15,1
1,3,20220301115809,16,1
2,3,20220301115958,17,1


# Train / Valid split

In [3]:
## user 별 전체 데이터중 80% train / 20% valid 사용 ## 

In [4]:
### Per-user number of rows, plus the number of rows (count * 0.8) assigned to train ###
count_df = history_df.groupby('profile_id')[['album_id']].count()
count_df['train_count'] = count_df['album_id'] * 0.8
count_df

Unnamed: 0_level_0,album_id,train_count
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,21,16.8
5,543,434.4
7,2,1.6
12,7,5.6
16,3,2.4
...,...,...
33022,2,1.6
33023,12,9.6
33026,1,0.8
33027,15,12.0


In [5]:
### Order every row by log_time (globally, not per user) so that each user's ###
### rows are encountered oldest-first when the train / valid split walks the frame ###
history_df = history_df.sort_values(by='log_time')
history_df

Unnamed: 0,profile_id,log_time,album_id,rating
798337,25844,20220301000418,18024,1
798338,25844,20220301000531,1881,1
185888,4783,20220301000656,201,1
798339,25844,20220301000668,4608,1
101611,2794,20220301000805,2641,1
...,...,...,...,...
250052,6435,20220430235415,2467,1
64322,2086,20220430235656,2184,1
313534,8440,20220430235710,348,1
292774,7703,20220430235855,188,1


In [6]:
### Chronological per-user split: the earliest rows of each user go to train ###
### until that user's quota (count_df['train_count'] = 0.8 * row count) is reached; ###
### every later row of that user goes to valid. ###
# history_df is iterated in its current row order, so it must already be ordered
# by log_time for "earliest" to mean oldest.

history_matrix = history_df.values

# 0-based position of each row inside its own user's history (in current row order).
seen_so_far = history_df.groupby('profile_id').cumcount().to_numpy()
# Train quota of the user owning each row, looked up in bulk instead of once per row.
train_quota = history_df['profile_id'].map(count_df['train_count']).to_numpy()

# Same rule as a row-by-row counter: a row is train while fewer than `train_quota`
# rows of that user were already placed in train. Because the counter only stops
# growing once the quota is hit, this is exactly "position < quota".
is_train = seen_so_far < train_quota

# Rows keep the column order of history_df: profile_id, log_time, album_id, rating.
train_data = history_matrix[is_train]
valid_data = history_matrix[~is_train]

  0%|          | 0/899252 [00:00<?, ?it/s]

In [7]:
### Wrap the train rows in a DataFrame that reuses the column labels of history_df ###
train_data_df = pd.DataFrame(data=train_data, columns=history_df.columns)
train_data_df

Unnamed: 0,profile_id,log_time,album_id,rating
0,25844,20220301000418,18024,1
1,25844,20220301000531,1881,1
2,4783,20220301000656,201,1
3,25844,20220301000668,4608,1
4,2794,20220301000805,2641,1
...,...,...,...,...
722707,7562,20220430233135,114,1
722708,18994,20220430233509,4053,1
722709,18994,20220430233524,818,1
722710,18994,20220430233542,818,1


In [8]:
### Wrap the valid rows in a DataFrame that reuses the column labels of history_df ###
valid_data_df = pd.DataFrame(data=valid_data, columns=history_df.columns)
valid_data_df

Unnamed: 0,profile_id,log_time,album_id,rating
0,6967,20220301101930,1465,1
1,6967,20220301102276,1747,1
2,6967,20220301102572,6529,1
3,6967,20220301102663,6530,1
4,6967,20220301102871,6531,1
...,...,...,...,...
176535,5597,20220430235403,2519,1
176536,6435,20220430235415,2467,1
176537,2086,20220430235656,2184,1
176538,8440,20220430235710,348,1


# 전체 데이터를 통하여 table 생성

In [9]:
## If the pivot table were built from the train split only, every ##
## (profile_id - album_id) that appears solely in the valid split would be missing ##
## and the matrix shape would differ, so the table is built from the full data. ##

### Example ###
train_n_users, train_n_items = (
    train_data_df[id_column].nunique() for id_column in ('profile_id', 'album_id')
)
print(train_n_users,train_n_items)

n_users, n_items = (
    history_df[id_column].nunique() for id_column in ('profile_id', 'album_id')
)
print(n_users,n_items)
#############

8311 19657
8311 20695


In [10]:
## (8311,19657) 과 (8311,20695)로 차이 발생 --> 정확한 평가 어려움 ##
## 따라서 전체 데이터를 이용하여 table 생성

In [11]:
# Number of distinct users / items in the full history (train + valid).
n_users = history_df['profile_id'].nunique()
n_items = history_df['album_id'].nunique()
print(n_users,n_items)

8311 20695


In [12]:
## 데이터가 있는 행과 열을 기준으로 table 형성을 위하여 rating 기준으로 dataframe 생성 ##

In [13]:
# User x item table over the full history; cells without any interaction are NaN.
ratings_matrix_df = pd.pivot_table(
    history_df,
    values='rating',
    index='profile_id',
    columns='album_id',
)
ratings_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# All-zero float table with the same user index and item columns as ratings_matrix_df.
ratings_total_matrix_df = pd.DataFrame(
    0.0,
    index=ratings_matrix_df.index,
    columns=ratings_matrix_df.columns,
)
ratings_total_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
train_data = train_data_df.values  # columns: profile_id, log_time, album_id, rating

# Translate every train row's ids to positional (row, column) offsets of the
# table in one shot instead of doing one scalar .loc lookup per interaction.
user_pos = ratings_total_matrix_df.index.get_indexer(train_data_df['profile_id'])
item_pos = ratings_total_matrix_df.columns.get_indexer(train_data_df['album_id'])

# get_indexer marks unknown labels with -1; fail loudly rather than letting -1
# silently count against the last row / column.
if (user_pos < 0).any() or (item_pos < 0).any():
    raise KeyError('train_data_df contains a profile_id / album_id that is missing from ratings_total_matrix_df')

# Count interactions per (user, item). np.add.at accumulates repeated index
# pairs, which plain fancy-index assignment would not. Starting from a fresh
# zero array keeps this cell idempotent: re-running it does not double the counts.
view_counts = np.zeros(ratings_total_matrix_df.shape)
np.add.at(view_counts, (user_pos, item_pos), 1)

ratings_total_matrix_df = pd.DataFrame(
    view_counts,
    index=ratings_total_matrix_df.index,
    columns=ratings_total_matrix_df.columns,
)

ratings_total_matrix_df.head()

  0%|          | 0/722712 [00:00<?, ?it/s]

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 유저별로 최대 1 최소 0 으로 Minmaxscaler 적용

In [17]:
## MinMaxScaler rescales column by column, so the table is transposed to put users ##
## in the columns, scaled, and transposed back to the user x item layout. ##
scaler = MinMaxScaler()
ratings_minmax_matrix_t = scaler.fit_transform(ratings_total_matrix_df.T)
ratings_minmax_matrix_df_t = pd.DataFrame(
    data=ratings_minmax_matrix_t,
    index=ratings_total_matrix_df.columns,   # items are the rows while transposed
    columns=ratings_total_matrix_df.index,   # users are the columns while transposed
)
ratings_minmax_matrix_df = ratings_minmax_matrix_df_t.T
ratings_minmax_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# (profile_id와 index) 와 (album_id와 columns) 변환을 위한 dictionary 생성

In [18]:
# Lookup tables between real ids (profile_id / album_id) and the positional
# row / column numbers of ratings_minmax_matrix_df.
index_list = list(ratings_minmax_matrix_df.index)
column_list = list(ratings_minmax_matrix_df.columns)

print('index 의 전체 길이 :',len(index_list),'columns 의 전체 길이 :',len(column_list))

real_to_fake_user = {real: fake for fake, real in enumerate(index_list)}  # profile_id -> index
fake_to_real_user = dict(enumerate(index_list))                           # index -> profile_id

real_to_fake_album = {real: fake for fake, real in enumerate(column_list)}  # album_id -> column
fake_to_real_album = dict(enumerate(column_list))                           # column -> album_id

# Demonstrate the round trip on the last user / album of the table. The ids are
# taken from the data instead of being hard-coded, so the cell also runs on a
# different dataset (and on an empty table, where there is nothing to show).
if index_list and column_list:
    last_user_idx = len(index_list) - 1
    last_profile_id = index_list[last_user_idx]
    last_album_idx = len(column_list) - 1
    last_album_id = column_list[last_album_idx]

    print(f'profile_id {last_profile_id}의 index 번호 :', real_to_fake_user[last_profile_id],
          f'index 번호 {last_user_idx}의 profile_id :', fake_to_real_user[last_user_idx])
    print(f'album_id {last_album_id}의 column 번호 :', real_to_fake_album[last_album_id],
          f'column 번호 {last_album_idx}의 album_id :', fake_to_real_album[last_album_idx])

index 의 전체 길이 : 8311 columns 의 전체 길이 : 20695
profile_id 33032의 index 번호 : 8310 index 번호 8310의 profile_id : 33032
album_id 25916의 column 번호 : 20694 column 번호 20694의 album_id : 25916


# 성능 평가를 위한 함수

In [24]:
def recallk(actual, predicted, k = 25):
    """ Recall@k between the items a user really consumed and a ranked prediction.
    Args:
        actual : iterable of item ids the user actually interacted with (duplicates are ignored)
        predicted : ranked sequence of predicted item ids, best first
        k : number of top predictions to look at (ex : k=5 uses only the first 5 predictions)
    Returns: 
        recall_k : len(actual & predicted[:k]) / min(k, len(actual)).
                   0.0 when there is nothing to recall (empty `actual` or k <= 0),
                   instead of raising ZeroDivisionError.
    """     
    set_actual = set(actual)
    # The denominator is capped at k so a perfect top-k list scores 1.0 even when
    # the user consumed more than k items.
    denominator = min(k, len(set_actual))
    if denominator <= 0:
        return 0.0
    hits = len(set_actual & set(predicted[:k]))
    return hits / denominator

def unique(sequence):
    """Return the elements of `sequence` without duplicates, in first-occurrence order."""
    already_seen = set()
    deduplicated = []
    for element in sequence:
        if element not in already_seen:
            already_seen.add(element)
            deduplicated.append(element)
    return deduplicated

def ndcgk(actual, predicted, k = 25):
    """ NDCG@k with binary relevance between consumed items and a ranked prediction.
    Args:
        actual : iterable of item ids the user actually interacted with (duplicates are ignored)
        predicted : ranked sequence of predicted item ids, best first
        k : number of top predictions to look at
    Returns:
        ndcg_k : DCG of predicted[:k] divided by the ideal DCG for min(k, len(actual)) hits.
                 0.0 when the ideal DCG is empty (empty `actual` or k <= 0),
                 instead of raising ZeroDivisionError.
    """
    set_actual = set(actual)
    n_ideal_hits = min(k, len(set_actual))
    if n_ideal_hits <= 0:
        return 0.0

    # Natural log is used for the discount; the base cancels out in dcg / idcg.
    idcg = sum(1.0 / np.log(rank + 2) for rank in range(n_ideal_hits))

    # Order-preserving de-duplication of the top-k list, so an item predicted
    # several times is credited once; ranks are positions in the de-duplicated list.
    ranked_items = list(dict.fromkeys(predicted[:k]))
    dcg = sum(
        1.0 / np.log(rank + 2)
        for rank, item in enumerate(ranked_items)
        if item in set_actual
    )
    return dcg / idcg

def evaluation(gt, pred):
    """ Score ranked predictions against ground truth with Recall@25 and NDCG@25.
    Args:
        gt : ground-truth DataFrame; its `profile_id` and `album_id` columns are used
        pred : prediction DataFrame with `profile_id` and `predicted_list` columns
    Returns: 
        rets : dict with the mean recall, the mean ndcg (both over the rows of `pred`)
               and the combined metric score = 0.75*recall + 0.25*ndcg
            ex) {'recall': 0.123024, 'ndcg': 0.056809, 'score': 0.106470}
    """    
    # One row per user holding the array of distinct albums that user consumed.
    actual_per_user = (
        gt.groupby('profile_id')['album_id']
        .unique()
        .rename('actual_list')
        .reset_index()
    )

    # Left join: every row of `pred` is kept and receives its user's actual_list.
    scored = pred.merge(actual_per_user, how='left', on='profile_id')

    def _row_recall(row):
        return recallk(row.actual_list, row.predicted_list)

    def _row_ndcg(row):
        return ndcgk(row.actual_list, row.predicted_list)

    scored['Recall@25'] = scored.apply(_row_recall, axis=1)
    scored['NDCG@25'] = scored.apply(_row_ndcg, axis=1)

    recall = scored['Recall@25'].mean()
    ndcg = scored['NDCG@25'].mean()

    return {"recall": recall,
            "ndcg": ndcg,
            "score": 0.75*recall + 0.25*ndcg}

## Evaluate a DataFrame of predicted scores and print the resulting score ##
def df_to_score(actual_df,predict_df):
    """Rank each user's row of `predict_df`, evaluate the top 25 and print score / recall.

    Args:
        actual_df : ground-truth DataFrame (ex : valid_data_df); users are taken from its `profile_id` column
        predict_df : score table looked up by profile_id via `.loc`, with item ids as column labels
    Returns:
        None; the score and recall are printed.
    """
    query_user_ids = actual_df['profile_id'].unique()

    # For each user: labels of the 25 highest-scored columns, best first.
    pred_list = [
        list(predict_df.loc[user_id, :].sort_values(ascending=False).index.values[:25])
        for user_id in query_user_ids
    ]

    pred = pd.DataFrame()
    pred['profile_id'] = query_user_ids
    pred['predicted_list'] = pred_list

    rets = evaluation(actual_df, pred)
    print('score :',rets['score'],'recall :',rets['recall'])

# 잠재요인 협업 필터링 적용

In [21]:
R = ratings_minmax_matrix_df.values
num_users,num_items = R.shape

# (user_index, item_index, rating) for every strictly positive cell of R.
# np.nonzero scans in row-major order, i.e. the same (i, j) order a nested
# "for i ... for j ..." loop would produce, but without visiting every cell of
# the dense matrix from Python.
user_idx, item_idx = np.nonzero(R > 0)
non_zeros = list(zip(user_idx.tolist(), item_idx.tolist(), R[user_idx, item_idx]))

In [22]:
# SGD matrix factorization: learn P (users x K) and Q (items x K) so that
# P @ Q.T approximates the positive entries of R, and report the validation
# score after every pass over the data.
#
# Validation performance improved as K grew.
# The hyperparameters below were chosen through validation runs.
## Takes roughly 20 minutes to run ##
K = 128000          # number of latent factors
steps=5             # number of passes over non_zeros
learning_rate=0.01  # SGD step size
r_lambda=0.01       # L2 regularization strength

num_users,num_items = R.shape

# Fixed seed so that the random initialisation of P and Q is reproducible.
np.random.seed(1212)
# NOTE(review): P and Q are dense (num_users, K) / (num_items, K) arrays; with
# K = 128000 they are very large -- confirm the available memory before running.
P = np.random.normal(scale=1./(K),size=(num_users,K))
Q = np.random.normal(scale=1./(K),size=(num_items,K))


for step in tqdm(range(steps)):
    # One pass: update the factors once per observed (user i, item j, rating r).
    for i,j,r in non_zeros:
        # Prediction error for this cell.
        eij = r - np.dot(P[i,:],Q[j,:].T)
        # Gradient step with L2 regularization on the user factors ...
        P[i,:] = P[i,:] + learning_rate*(eij*Q[j,:] - r_lambda*P[i,:])
        # ... then on the item factors. Note that this line reads the P[i,:] that
        # was just updated above, not its value from before the step.
        Q[j,:] = Q[j,:] + learning_rate*(eij*P[i,:] - r_lambda*Q[j,:])

    # step % 1 is always 0, so the validation score is computed after every pass.
    if (step %1)==0:
        print('iteration step :',step )
        # Full predicted score matrix (users x items).
        pre = np.dot(P,Q.T)
        pred = pd.DataFrame()
        query_user_ids = valid_data_df['profile_id'].unique()
        pred_list=[]
        for user_id in query_user_ids:
            # Row number of this user in the score matrix (reuses the name `i`).
            i = real_to_fake_user[user_id]
            # Column numbers of the 25 highest scores, best first.
            pred_u_idx = np.argsort(pre[i])[:-26:-1]
            # Column numbers -> real album_id values.
            pred_u_idx = [fake_to_real_album[item] for item in pred_u_idx]
            pred_list.append(pred_u_idx)
            
        pred['profile_id'] = query_user_ids
        pred['predicted_list'] = pred_list
        rets = evaluation(valid_data_df, pred)
        print('score :',rets['score'],'recall :',rets['recall'])

  0%|          | 0/5 [00:00<?, ?it/s]

iteration step : 0
score : 0.15580458722649296 recall : 0.15739837725331532
iteration step : 1
score : 0.2293599978566509 recall : 0.23742060789390992
iteration step : 2
score : 0.26611037004402743 recall : 0.27863545742152235
iteration step : 3
score : 0.2900067822021094 recall : 0.30618097427725727
iteration step : 4
score : 0.2963095390132179 recall : 0.31403483469091753


In [23]:
# 약 1분 소요 #
full_pred = np.dot(P,Q.T)
full_pred.shape

(8311, 20695)

In [25]:
# Label the predicted scores with the same profile_id rows / album_id columns as the rating table.
pred_df = pd.DataFrame(
    data=full_pred,
    index=ratings_minmax_matrix_df.index,
    columns=ratings_minmax_matrix_df.columns,
)
pred_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.287659e-07,4.127323e-08,4.316050e-09,1.603521e-08,3.895990e-09,4.019800e-08,4.297357e-08,1.274637e-10,1.055987e-08,3.892453e-08,...,2.487452e-08,9.974310e-09,-3.487126e-08,-5.500743e-08,-1.316652e-08,-2.664366e-08,-1.148986e-08,-6.278178e-08,5.686299e-08,1.734185e-08
5,2.294626e-07,6.952564e-08,4.344790e-08,2.817402e-08,2.551308e-08,4.421875e-08,4.510818e-08,-4.165661e-10,-4.021887e-08,-3.543156e-10,...,1.610196e-08,4.423719e-09,1.277007e-08,1.114034e-08,-3.037938e-08,-1.630756e-09,4.509370e-08,7.704257e-09,-3.658291e-08,-4.684600e-08
7,3.558738e-08,-1.001810e-08,1.668402e-09,-2.328711e-08,8.488591e-09,3.236950e-09,-1.477896e-08,-9.426430e-09,-5.842372e-09,5.018174e-09,...,-3.088912e-09,3.279648e-09,3.587857e-08,3.366277e-09,-5.163427e-09,1.771303e-08,1.587086e-08,3.377212e-08,-2.950526e-08,7.286611e-09
12,1.258077e-08,3.054672e-08,6.148699e-09,-1.965197e-08,-4.962971e-09,-1.590313e-08,1.668039e-08,-4.629538e-08,-1.522525e-08,2.364918e-09,...,-4.036442e-08,5.688497e-09,-1.676012e-08,-4.118977e-08,1.065106e-08,1.837664e-09,3.489025e-09,1.508614e-08,7.502245e-09,1.862654e-09
16,1.063131e-07,6.319872e-08,3.537020e-08,-8.309374e-09,-3.377238e-08,4.012181e-09,-2.467189e-08,-1.693449e-08,1.110623e-08,2.399084e-08,...,1.903332e-08,-1.530952e-08,3.321926e-08,-1.201016e-08,1.081768e-08,1.223313e-08,3.636201e-08,6.796316e-09,3.152310e-08,-2.412251e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,9.137250e-09,2.943904e-08,5.145882e-08,1.463967e-08,1.864033e-08,3.584880e-08,5.527221e-10,5.278930e-09,-1.057841e-08,6.860704e-09,...,-2.929156e-08,-3.777176e-08,-2.365864e-08,-4.004539e-08,-3.040211e-08,-1.894065e-08,-3.219114e-09,-3.098849e-08,-7.697838e-09,2.984918e-08
33023,2.065409e-07,3.854208e-08,2.557218e-08,2.722311e-08,5.199599e-08,1.500472e-08,9.278882e-09,1.750100e-08,-4.696028e-09,1.506324e-08,...,-1.564541e-08,-7.234860e-09,-1.311406e-08,2.234706e-08,1.626534e-08,3.875538e-08,2.166288e-08,-1.235808e-08,-2.850699e-08,-5.431228e-09
33026,5.770144e-09,2.760988e-08,2.719964e-08,-2.869297e-08,3.064651e-09,-1.583944e-08,-3.519183e-08,1.859471e-08,1.998915e-08,2.685041e-08,...,-8.982538e-10,1.352967e-08,-5.177990e-08,2.740728e-09,1.522371e-09,2.116690e-08,-2.026155e-08,9.488591e-09,-8.809709e-09,-2.773594e-08
33027,3.365569e-08,1.924231e-08,4.146812e-08,-8.505515e-10,-8.067222e-09,-6.751126e-09,-2.063744e-08,-3.305740e-08,1.236317e-08,3.218411e-08,...,-1.711530e-08,-2.016684e-08,-1.119473e-08,-3.620785e-08,6.072888e-08,-2.662107e-08,-1.578597e-08,-1.381497e-09,1.884280e-08,2.850954e-09


# Valid 데이터에 대한 성능 평가

In [26]:
# Print the validation score / recall of the final prediction table.
df_to_score(actual_df=valid_data_df, predict_df=pred_df)

score : 0.2963095390132179 recall : 0.31403483469091753
