In [372]:
import sys
sys.path.append("../../src")

import numpy as np
import pandas as pd

from utils import ndcg_calculator

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [373]:
test_answer_week = pd.read_parquet("../../data/test_answer_week.parquet")
test_answer_month = pd.read_parquet("../../data/test_answer_month.parquet")

df_train_week = pd.read_parquet("../../data/train_week.parquet")
df_train_month = pd.read_parquet("../../data/train_month.parquet")

sample_sumbission_week = pd.read_parquet("../../data/sample_sumbission_week.parquet")
sample_sumbission_month = pd.read_parquet("../../data/sample_sumbission_month.parquet")

# apriori 순서 반영
df_train_week.sort_values(by='log_dt', inplace=True)
df_train_month.sort_values(by='log_dt', inplace=True)

In [374]:
df_train_week.drop_duplicates(['profile_id','album_id'], inplace=True)
df_train_month.drop_duplicates(['profile_id','album_id'], inplace=True)

In [375]:
n = 25

# MP

In [376]:
MP_list_week = list(df_train_week.album_id.value_counts().head(n).index)
MP_list_month = list(df_train_month.album_id.value_counts().head(n).index)

# Apriori

In [377]:
def Apriori_encoder(df_train:pd.DataFrame())->pd.DataFrame():
    # each user's album_id to list
    transactions = [transaction[1]['album_id'].tolist() for transaction in \
                         list(df_train.groupby('profile_id'))]
    
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    
    orders_1hot = pd.DataFrame(te_ary, columns=te.columns_)
    '''
    output -> DataFrame(True&False)
    '''
    return orders_1hot

In [378]:
%%time
orders_1hot_week = Apriori_encoder(df_train_week)
orders_1hot_month = Apriori_encoder(df_train_month)

CPU times: user 647 ms, sys: 29 ms, total: 676 ms
Wall time: 675 ms


In [379]:
def Apriori_train(orders_1hot:pd.DataFrame(), min_support_num:int, min_threshold_num:int)->pd.DataFrame():

    is_ap = apriori(orders_1hot, min_support=min_support_num, use_colnames=True)
    
    # matrix의 default가 confidence
    rules_confidence_item = association_rules(is_ap, min_threshold= min_threshold_num)

    #lift 기준으로 sort
    rules_confidence_item = rules_confidence_item.sort_values(['lift'], ascending = False)
    return rules_confidence_item

In [380]:
%%time
rules_confidence_item_week = Apriori_train(orders_1hot_week, 0.1, 0.8)
rules_confidence_item_month = Apriori_train(orders_1hot_month, 0.1, 0.8)

CPU times: user 243 ms, sys: 9.78 ms, total: 253 ms
Wall time: 252 ms


In [381]:
rules_confidence_item_week

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,(17),"(16, 15)",0.118034,0.146953,0.106119,0.899054,6.117969,0.088773,8.450497
13,(18),"(16, 15)",0.119772,0.146953,0.10103,0.843523,5.740091,0.083429,5.451592
16,(19),"(16, 15)",0.123867,0.146953,0.100658,0.812625,5.529832,0.082455,4.552625
8,"(16, 17)",(15),0.114062,0.168673,0.106119,0.930359,5.515749,0.08688,11.937333
14,"(16, 19)",(15),0.108849,0.168673,0.100658,0.924743,5.482456,0.082298,11.04657
11,"(16, 18)",(15),0.109966,0.168673,0.10103,0.918736,5.44684,0.082482,10.229938
1,(17),(15),0.118034,0.168673,0.107857,0.913775,5.417428,0.087947,9.641363
2,(18),(15),0.119772,0.168673,0.106119,0.88601,5.252822,0.085917,7.293003
17,"(16, 38)",(15),0.125605,0.168673,0.107608,0.856719,5.079167,0.086422,5.802088
3,(19),(15),0.123867,0.168673,0.104381,0.842685,4.995965,0.083488,5.284485


In [382]:
rules_confidence_item_month

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,"(124, 125)",(65),0.131261,0.181038,0.109665,0.835473,4.61489,0.085901,4.97766
0,(15),(16),0.144433,0.187012,0.122377,0.847296,4.530708,0.095367,5.323944
2,"(65, 125)",(124),0.129269,0.248736,0.109665,0.848341,3.410603,0.077511,4.953644
1,(65),(124),0.181038,0.248736,0.147496,0.814721,3.275439,0.102465,4.054765


# prediction

In [383]:
def Apriori_pred(df_train:pd.DataFrame(), rules_confidence_item:pd.DataFrame())->pd:
    unique_user_transaction_list = df_train.groupby('profile_id')['album_id'].unique().reset_index()

    # user transaction 목록을 가져와 lift 순으로 consequents 담기
    rules_confidence_list = []
    for album_list in unique_user_transaction_list['album_id']:
        user_consequents_list = []
        for idx, antecedents in enumerate(rules_confidence_item['antecedents']):
            # issubset -> antecedents 하위집합이면 True
            if antecedents.issubset(album_list):
                user_consequents_list += rules_confidence_item['consequents'][idx]
        # 순서 유지 중복 제거
        rules_confidence_list.append(list(dict.fromkeys(user_consequents_list)))
    # apriori pred 담기    
    unique_user_transaction_list['album_id'] = rules_confidence_list
    
    return unique_user_transaction_list

In [384]:
# week Apriori pred@25
apriori_pred_week = Apriori_pred(df_train_week, rules_confidence_item_week)
apriori_sumbission_week = sample_sumbission_week.copy().drop(columns='album_id')
apriori_sumbission_week = apriori_sumbission_week.merge(apriori_pred_week, on='profile_id')
# week Apriori + week MP pred@25
apriori_mp_sumbission_week = apriori_sumbission_week.copy()
apriori_mp_sumbission_week['album_id'] = apriori_sumbission_week['album_id'].apply(lambda x: (x + MP_list_week)[:n])

# month Apriori pred@25
apriori_pred_month = Apriori_pred(df_train_month, rules_confidence_item_month)
apriori_sumbission_month = sample_sumbission_month.copy().drop(columns='album_id')
apriori_sumbission_month = apriori_sumbission_month.merge(apriori_pred_month, on='profile_id')
# month Apriori + month MP pred@25
apriori_mp_sumbission_month = apriori_sumbission_month.copy()
apriori_mp_sumbission_month['album_id'] = apriori_sumbission_month['album_id'].apply(lambda x: (x + MP_list_month)[:n])

In [385]:
apriori_mp_sumbission_week

Unnamed: 0,profile_id,album_id
0,5,"[16, 15, 124, 65, 124, 16, 125, 241, 65, 15, 3..."
1,20,"[124, 16, 125, 241, 65, 15, 38, 339, 190, 347,..."
2,22,"[16, 15, 124, 124, 16, 125, 241, 65, 15, 38, 3..."
3,24,"[124, 16, 125, 241, 65, 15, 38, 339, 190, 347,..."
4,31,"[124, 16, 125, 241, 65, 15, 38, 339, 190, 347,..."
...,...,...
2182,32965,"[124, 16, 125, 241, 65, 15, 38, 339, 190, 347,..."
2183,32978,"[124, 16, 125, 241, 65, 15, 38, 339, 190, 347,..."
2184,32979,"[16, 15, 124, 16, 125, 241, 65, 15, 38, 339, 1..."
2185,32998,"[124, 16, 125, 241, 65, 15, 38, 339, 190, 347,..."


# Evaluation

In [386]:
%%time
apriori_week_ndcg = ndcg_calculator(test_answer_week, apriori_sumbission_week, n)
apriori_mp_week_ndcg = ndcg_calculator(test_answer_week, apriori_mp_sumbission_week, n)

apriori_month_ndcg = ndcg_calculator(test_answer_month, apriori_sumbission_month, n)
apriori_mp_month_ndcg = ndcg_calculator(test_answer_month, apriori_mp_sumbission_month, n)

CPU times: user 559 ms, sys: 2.31 ms, total: 561 ms
Wall time: 561 ms


In [387]:
print("Week performance")
print(f"nDCG(apriori): {apriori_week_ndcg:.4f}")
print(f"nDCG(apriori + mp): {apriori_mp_week_ndcg:.4f} \n")

print("Month performance")
print(f"nDCG(apriori): {apriori_month_ndcg:.4f}")
print(f"nDCG(apriori + mp): {apriori_mp_month_ndcg:.4f}")

Week performance
nDCG(apriori): 0.0158
nDCG(apriori + mp): 0.0412 

Month performance
nDCG(apriori): 0.0099
nDCG(apriori + mp): 0.0586
