# Import

In [4]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [5]:
history_df = pd.read_csv('../data/history_data.csv',encoding='utf-8')

## 중복 데이터 제거 ##
history_df = history_df[['profile_id', 'log_time', 'album_id']].drop_duplicates(subset=['profile_id', 'album_id', 'log_time']).sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)
history_df['rating']=1
history_df.head(3)

Unnamed: 0,profile_id,log_time,album_id,rating
0,3,20220301115719,15,1
1,3,20220301115809,16,1
2,3,20220301115958,17,1


# Train / Valid split

In [6]:
## user 별 전체 데이터중 80% train / 20% valid 사용 ## 

In [7]:
### user 별 전체 데이터*0.8 에 해당해는 데이터 개수 행 추가 ###
count_df = history_df.groupby(['profile_id']).count()
count_df = count_df[['album_id']]
count_df['train_count'] = count_df[['album_id']].apply(lambda x : x*(0.8))
count_df

Unnamed: 0_level_0,album_id,train_count
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,21,16.8
5,543,434.4
7,2,1.6
12,7,5.6
16,3,2.4
...,...,...
33022,2,1.6
33023,12,9.6
33026,1,0.8
33027,15,12.0


In [8]:
### 정확한 valid 분리를 위하여 각 user 별 시간순으로 정렬 ###
history_df = history_df.sort_values(['log_time'])
history_df

Unnamed: 0,profile_id,log_time,album_id,rating
798337,25844,20220301000418,18024,1
798338,25844,20220301000531,1881,1
185888,4783,20220301000656,201,1
798339,25844,20220301000668,4608,1
101611,2794,20220301000805,2641,1
...,...,...,...,...
250052,6435,20220430235415,2467,1
64322,2086,20220430235656,2184,1
313534,8440,20220430235710,348,1
292774,7703,20220430235855,188,1


In [11]:
### user별 시간순으로 정렬된 history_df를 이용하여 먼저본 80% 의 데이터를 train에 넣어주고 나중에 본 20%를 valid 에 넣어주기 ###

count_dict = defaultdict(int)

history_matrix = history_df.values
train_data=[]
valid_data=[]
for row in tqdm(history_matrix):
    profile_id = row[0]
    if count_dict[profile_id]<count_df.loc[profile_id,'train_count']:
        count_dict[profile_id]+=1
        train_data.append(row)
    else:
        valid_data.append(row)

  0%|          | 0/899252 [00:00<?, ?it/s]

In [15]:
### train_data 를 DataFrame 변환 ###
train_data_df = pd.DataFrame(train_data,columns=history_df.columns)
train_data_df

Unnamed: 0,profile_id,log_time,album_id,rating
0,25844,20220301000418,18024,1
1,25844,20220301000531,1881,1
2,4783,20220301000656,201,1
3,25844,20220301000668,4608,1
4,2794,20220301000805,2641,1
...,...,...,...,...
722707,7562,20220430233135,114,1
722708,18994,20220430233509,4053,1
722709,18994,20220430233524,818,1
722710,18994,20220430233542,818,1


In [16]:
### valid_data 를 DataFrame 변환 ###
valid_data_df = pd.DataFrame(valid_data,columns=history_df.columns)
valid_data_df

Unnamed: 0,profile_id,log_time,album_id,rating
0,6967,20220301101930,1465,1
1,6967,20220301102276,1747,1
2,6967,20220301102572,6529,1
3,6967,20220301102663,6530,1
4,6967,20220301102871,6531,1
...,...,...,...,...
176535,5597,20220430235403,2519,1
176536,6435,20220430235415,2467,1
176537,2086,20220430235656,2184,1
176538,8440,20220430235710,348,1


# 전체 데이터를 통하여 table 생성

In [19]:
## Train 과 Valid로 나눈 데이터중 Train 데이터를 이용하여 pivot_table을 생성하면 ##
## tarin에는 없고 valid에만 있는 (profile_id - album_id)가 존재하여 ##
## 행렬 크기가 달라지므로 전체 데이터를 이용하여 pivot_table 생성 ##

### 예시 ###
train_n_users = train_data_df.profile_id.nunique()
train_n_items = train_data_df.album_id.nunique()
print(train_n_users,train_n_items)

n_users = history_df.profile_id.nunique()
n_items = history_df.album_id.nunique()
print(n_users,n_items)
#############

8311 19657
8311 20695


In [20]:
## (8311,19657) 과 (8311,20695)로 차이 발생 --> 정확한 평가 어려움 ##
## 따라서 전체 데이터를 이용하여 table 생성

In [21]:
n_users = history_df.profile_id.nunique()
n_items = history_df.album_id.nunique()
print(n_users,n_items)

8311 20695


In [24]:
## 데이터가 있는 행과 열을 기준으로 table 형성을 위하여 rating 기준으로 dataframe 생성 ##

In [25]:
ratings_matrix_df = history_df.pivot_table('rating',index='profile_id',columns='album_id')
ratings_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,


In [26]:
## ratings_matrix_df 와 동일한 크기의 zero DataFrame 생성 ##

In [27]:
ratings_total_matrix_df = pd.DataFrame(np.zeros(ratings_matrix_df.values.shape),index=ratings_matrix_df.index,columns=ratings_matrix_df.columns)
ratings_total_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
## zero DataFrame을 train 데이터만을 이용하여 (profile_id , album_id) 쌍의 개수를 세주기 ##

In [29]:
train_data = train_data_df.values

for row in tqdm(range(train_data.shape[0])): 
    row_data =train_data[row] # row_data = profile_id ,log_time, album_id, rating
    ratings_total_matrix_df.loc[row_data[0],row_data[2]]+=1

ratings_total_matrix_df.head()

  0%|          | 0/722712 [00:00<?, ?it/s]

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 유저별로 최대 1 최소 0 으로 Minmaxscaler 적용

In [30]:
## MinMaxScaler 적용을 위해 transpose() 후 다시 transpose() 적용 ##
scaler = MinMaxScaler()
ratings_minmax_matrix_t = scaler.fit_transform(ratings_total_matrix_df.transpose())
ratings_minmax_matrix_df_t = pd.DataFrame(ratings_minmax_matrix_t,index=ratings_total_matrix_df.columns,columns=ratings_total_matrix_df.index)
ratings_minmax_matrix_df = ratings_minmax_matrix_df_t.transpose()
ratings_minmax_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 아이템 기반 최근접 이웃 협업 필터링 적용

In [32]:
## 아이템 기반 최근접 이웃 협업 필터링 적용을 위한 album 행 별로 cosine 유사도 구하기 ##
cos_sim = cosine_similarity(ratings_minmax_matrix_df.values.T,ratings_minmax_matrix_df.values.T)
cos_sim_df = pd.DataFrame(cos_sim,index = ratings_minmax_matrix_df.columns,columns=ratings_minmax_matrix_df.columns)

In [33]:
cos_sim_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.141787,0.111016,0.060306,0.022496,0.044933,0.043651,0.008757,0.0,0.02462,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.141787,1.0,0.451736,0.048898,0.015265,0.009614,0.010933,0.00016,0.0,0.00951,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.111016,0.451736,1.0,0.047067,0.008209,0.001323,0.002125,0.0,0.0,0.008258,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.060306,0.048898,0.047067,1.0,0.005359,0.006285,0.003364,0.014597,0.0,0.005207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.022496,0.015265,0.008209,0.005359,1.0,0.008156,0.003678,0.0,0.0,0.000268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
## 코사인 유사도가 비슷한 상위 n개의 album에 각 (user_id,album_id) 값과 코사인 유사도를 곱해주고 합한뒤 전체 코사인 유사도의 합으로 나누어주기 ##
## n은 하이퍼 파라미터 --> 25정도일경우 가장 높은 score를 보임 ##
## 연산시간 약 15분 소요 ##
ratings_minmax_matrix=ratings_minmax_matrix_df.values

def find_pred_n_sim(n_sim=20):
    pred = np.zeros(ratings_minmax_matrix.shape)
    
    for col in tqdm(range(ratings_minmax_matrix.shape[1])):
        cos_sim_ind = np.argsort(cos_sim[col])[:-n_sim-1:-1]
        for row in range(ratings_minmax_matrix.shape[0]):
            pred[row,col]=ratings_minmax_matrix[row,cos_sim_ind].dot(cos_sim[cos_sim_ind,col])/np.sum(cos_sim[cos_sim_ind,col])   
    
    return pred

pre = find_pred_n_sim(25)

  0%|          | 0/20695 [00:00<?, ?it/s]

  pred[row,col]=ratings_minmax_matrix[row,cos_sim_ind].dot(cos_sim[cos_sim_ind,col])/np.sum(cos_sim[cos_sim_ind,col])


In [35]:
## 예측 행렬의 DataFrame 생성 ##
pred_df = pd.DataFrame(pre,index=ratings_minmax_matrix_df.index,columns=ratings_minmax_matrix_df.columns)
pred_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.047505,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
5,0.091352,0.004802,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.055665,...,0.0,0.0,0.0,0.0,0.0,0.000197,0.000197,0.000197,0.000197,
7,0.030674,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
12,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
16,0.031385,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
33023,0.042148,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
33026,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,
33027,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,


# 성능 평가를 위한 함수

In [41]:
def recallk(actual, predicted, k = 25):
    """ label과 prediction 사이의 recall 평가 함수 
    Args:
        actual : 실제로 본 상품 리스트
        pred : 예측한 상품 리스트
        k : 상위 몇개의 데이터를 볼지 (ex : k=5 상위 5개의 상품만 봄)
    Returns: 
        recall_k : recall@k 
    """     
    set_actual = set(actual)
    recall_k = len(set_actual & set(predicted[:k])) / min(k, len(set_actual))
    return recall_k

def unique(sequence):
    # preserves order
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

def ndcgk(actual, predicted, k = 25):

    set_actual = set(actual)
    idcg = sum([1.0 / np.log(i + 2) for i in range(min(k, len(set_actual)))])
    dcg = 0.0
    unique_predicted = unique(predicted[:k])
    for i, r in enumerate(unique_predicted):
        if r in set_actual:
            dcg += 1.0 / np.log(i + 2)
    ndcg_k = dcg / idcg
    return ndcg_k

def evaluation(gt, pred):
    """ label과 prediction 사이의 recall, coverage, competition metric 평가 함수 
    Args:
        gt : 데이터 프레임 형태의 정답 데이터 
        pred : 데이터 프레임 형태의 예측 데이터 
    Returns: 
        rets : recall, ndcg, coverage, competition metric 결과 
            ex) {'recall': 0.123024, 'ndcg': 056809, 'coverage': 0.017455, 'score': 0.106470}
    """    
    gt = gt.groupby('profile_id')['album_id'].unique().to_frame().reset_index()
    gt.columns = ['profile_id', 'actual_list']

    evaluated_data = pd.merge(pred, gt, how = 'left', on = 'profile_id')
    evaluated_data['Recall@25'] = evaluated_data.apply(lambda x: recallk(x.actual_list, x.predicted_list), axis=1)
    evaluated_data['NDCG@25'] = evaluated_data.apply(lambda x: ndcgk(x.actual_list, x.predicted_list), axis=1)

    recall = evaluated_data['Recall@25'].mean()
    ndcg = evaluated_data['NDCG@25'] .mean()

    score = 0.75*recall + 0.25*ndcg
    rets = {"recall" :recall, 
            "ndcg" :ndcg,  
            "score" :score}
    return rets

## 예측 DataFrame을 통해 score 평가하기 ## 
def df_to_score(actual_df,predict_df):
    # actual = valid_data_df
    pred = pd.DataFrame()
    query_user_ids = actual_df['profile_id'].unique()
    pred_list=[]
    for user_id in query_user_ids:
        items = predict_df.loc[user_id,:].sort_values(ascending=False).index.values[:25]
        pred_list.append(list(items))

    pred['profile_id'] = query_user_ids
    pred['predicted_list'] = pred_list
    rets = evaluation(actual_df, pred)

    print('score :',rets['score'],'recall :',rets['recall'])

# Valid 데이터에 대한 성능 평가

In [42]:
df_to_score(valid_data_df,pred_df)

score : 0.3368245744288646 recall : 0.36500059168032895
