In [1722]:
import sys
sys.path.append("../../src")

import numpy as np
import pandas as pd

import lightgbm

from utils import ndcg_calculator

import datetime as dt
from tqdm import tqdm

# dataload

In [1723]:
def dataload(path:str='../../data/'):

    test_answer_week = pd.read_parquet(path + "test_answer_week.parquet")
    test_answer_month = pd.read_parquet(path + "test_answer_month.parquet")

    df_train_week = pd.read_parquet(path + "train_week.parquet")
    df_train_month = pd.read_parquet(path + "train_month.parquet")

    sample_sumbission_week = pd.read_parquet(path + "sample_sumbission_week.parquet")
    sample_sumbission_month = pd.read_parquet(path + "sample_sumbission_month.parquet")

    df_train_week.sort_values(by='log_dt', inplace=True)
    df_train_month.sort_values(by='log_dt', inplace=True)
    
    return test_answer_week, test_answer_month, df_train_week, df_train_month, sample_sumbission_week, sample_sumbission_month

In [1724]:
path='../../data/'

test_answer_week, test_answer_month, \
df_train_week, df_train_month, \
sample_sumbission_week, sample_sumbission_month = dataload(path)

In [1725]:
# 빠르게 month 테스트
# df_train_week = df_train_month.copy()
# sample_sumbission_week = sample_sumbission_month.copy()
# test_answer_week = test_answer_month.copy()

# preprocess

In [1726]:
n = 25

# 1.Feature engineering

In [1727]:
# week & day feature engineering
def week_day_feature(df_train:pd.DataFrame())->pd.DataFrame():
    df_train['week'] = df_train['log_date'].apply(lambda x: x.isocalendar()[1])
    df_train['day'] = df_train['log_date'].apply(lambda x: x.isocalendar()[2])
    week_min = df_train.week.min()
    df_train['week'] = df_train['week'].apply(lambda x: x-week_min)
    
    return  df_train

# album_cnt & album_rank feature engineering
def album_cnt_rank_feature(df_train:pd.DataFrame())->pd.DataFrame():
    album_cnt = df_train.album_id.value_counts().reset_index().rename(columns={'index':'album_id','album_id':'album_cnt'})
    album_cnt['rank'] = album_cnt['album_cnt'].rank(method='first', ascending=False)
    df_train = df_train.merge(album_cnt, on='album_id')
    
    return df_train

def feature_engineering(df_train:pd.DataFrame())->pd.DataFrame():
    df_train = week_day_feature(df_train)
    df_train = album_cnt_rank_feature(df_train)
    
    return df_train

In [1728]:
df_train_week = feature_engineering(df_train_week)
df_train_month = feature_engineering(df_train_month)

# 2.basic candidate(MP@300 , latest each user@5)

In [1729]:
# # MP@300 & MP_percent feature made
# def MP_candidate(df_train:pd.DataFrame(), cand=300)->pd.DataFrame():
#     MP_df = df_train.album_id.value_counts().reset_index()
#     MP_df.rename(columns={'index':'album_id','album_id':'item_cnt'}, inplace=True)

#     # MP@300
#     '''
#     MP 중복 시청 포함 
#     '''
#     MP_cand = MP_df[:cand][['album_id']]
#     return MP_cand


# # latest 1 day each user@5
# def latest_candidate(df_train:pd.DataFrame())->pd.DataFrame():
#     seven_days = df_train['log_date'].max()- dt.timedelta(days=1)
#     latest_history = df_train[df_train['log_date']>=seven_days]
#     latest_history = latest_history.groupby(['album_id']).count()['profile_id'].reset_index().rename(columns={'profile_id':'latest_cnt'})
    
#     # history count more than least 2 
#     latest_cand = latest_history[latest_history['latest_cnt']>=2][['album_id']].drop_duplicates()
    
#     return latest_cand


# def candidate(df_train:pd.DataFrame(), cand=300)->pd.DataFrame():
    
#     MP_cand = MP_candidate(df_train, cand)
#     latest_cand = latest_candidate(df_train)
#     cand = pd.concat([MP_cand,latest_cand])
#     cand.drop_duplicates('album_id', inplace=True)
#     cand['rating'] = 1
    
#     return cand, MP_cand

In [1730]:
# cand_week, MP_cand_week = candidate(df_train_week,300)
# cand_month, MP_cand_month = candidate(df_train_month,300)

In [1731]:
# a = df_train_week.groupby(['profile_id','album_id'])['ss_id'].count().sort_valu
# a
# # .sort_values(by=['profile_id','ss_id'])

In [1732]:
# 라벨 방법1
last_week = df_train_week['week'].max()

label_df = df_train_week.query(f'week=={last_week}').copy()
df_train = df_train_week.query(f"week < {last_week}").copy()

In [1733]:
# 라벨 방법2
label_df_preprocess = df_train_week.drop_duplicates(subset=['profile_id','album_id'])
label_df_preprocess = label_df_preprocess.sort_values(by='log_date', ascending=False)
label_df_list = []
for user_id in label_df_preprocess.profile_id.unique():
    label_df_list.append(label_df_preprocess[label_df_preprocess['profile_id']==user_id].head(30))
    
label_df = pd.concat(label_df_list)
df_train = df_train_week.copy()

In [1734]:
personal_train = df_train.drop_duplicates(subset=['profile_id','album_id','ss_id'])
df_train = df_train.drop_duplicates(subset=['profile_id','album_id'])

In [1735]:
label_df = label_df[['profile_id','album_id']]
label_df.drop_duplicates(subset=['profile_id','album_id'],inplace=True)
label_df['rating'] = 1

In [1736]:
# 라벨 유저 한정1
# customers = label_df.profile_id.unique()
# 전체 유저 한정
customers = df_train_week.profile_id.unique()

In [1737]:
last_week

7

# general MP
- 마지막 1주, 2주의 MP를 각 유저마다 넣는다.

In [1738]:
df_train.week.unique()

array([0, 1, 2, 4, 5, 3, 6, 7])

In [1739]:
print('중복제거 후 데이터 수:', len(df_train_week))

중복제거 후 데이터 수: 945518


In [1740]:
# 마지막 6,5주 각각 MP를 10개 뽑음
last_week_ver1 = 6
last_week_ver2 = 5

MP_latest_ver1_df = df_train.query(f"week == {last_week_ver1}")

In [1741]:
MP_df = MP_latest_ver1_df.groupby('album_id')['profile_id'].count().sort_values(ascending=False)
MP_df = MP_df.reset_index()
MP_df.columns = ['album_id','counts']
MP_candidate_df = MP_df[:10].copy()
MP_candidate_df['join_col'] = 1

In [1742]:
# df_train_week 전체 유저 대상으로 후보군을 뽑을 것임
customer_df = df_train_week[df_train_week['profile_id'].isin(customers)][['profile_id']]
customer_df['join_col'] = 1
popular_articles_cand_ver1 = customer_df.copy()
popular_articles_cand_ver1 = popular_articles_cand_ver1.merge(MP_candidate_df, on="join_col")

popular_articles_cand_ver1.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [1743]:
MP_latest_ver2_df = df_train.query(f"week == {last_week_ver2}")

In [1744]:
MP_df = MP_latest_ver2_df.groupby('album_id')['profile_id'].count().sort_values(ascending=False)
MP_df = MP_df.reset_index()
MP_df.columns = ['album_id','counts']
MP_candidate_df = MP_df[:10].copy()
MP_candidate_df['join_col'] = 1

In [1745]:
customer_df = df_train_week[df_train_week['profile_id'].isin(customers)][['profile_id']]
customer_df['join_col'] = 1
popular_articles_cand_ver2 = customer_df.copy()
popular_articles_cand_ver2 = popular_articles_cand_ver2.merge(MP_candidate_df, on="join_col")

popular_articles_cand_ver2.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [1746]:
popular_articles_cand = pd.concat([popular_articles_cand_ver1, popular_articles_cand_ver2])
popular_articles_cand = popular_articles_cand.groupby(['profile_id','album_id'])['counts'].sum().reset_index()

In [1747]:
popular_articles_cand

Unnamed: 0,profile_id,album_id,counts
0,3,16,288
1,3,38,132
2,3,52,156
3,3,125,271
4,3,190,183
...,...,...,...
96679,33019,329,171
96680,33019,339,463
96681,33019,347,389
96682,33019,987,302


In [1748]:
# popular_articles_cand = popular_articles_cand[popular_articles_cand['profile_id'].isin(customers)]

# personal_MP

In [1749]:
personal_MP_df = personal_train.groupby(['profile_id','album_id'])[['ss_id']].count().reset_index()
personal_MP_df.columns = ['profile_id','album_id','personal_counts']

In [1750]:
# 서로 다른날 5회 이상 시청한 앨범만
personal_MP = personal_MP_df[personal_MP_df['personal_counts'] >= 5]
personal_MP = personal_MP.sort_values(by=['profile_id','personal_counts'],ascending=False)

In [1751]:
# 상위 5개만 pick
head_df_list = []
# 전체 유저 대상으로 뽑기
for user_id in tqdm(customer_df.profile_id.unique()):
    personal_MP_user_len = len(personal_MP[personal_MP['profile_id']==user_id].head())
    random_choice_list = personal_MP.album_id.unique()
    if personal_MP_user_len <5:
        # 5개 아이템이 없는 경우 랜덤으로 없는 개수만 만큼 choice
        user_df = personal_MP[personal_MP['profile_id']==user_id]
        df = pd.DataFrame()
        random_choices = np.random.choice(random_choice_list, size=(5-personal_MP_user_len))
        df['profile_id'] = [user_id for _ in range(5-personal_MP_user_len)]
        df['album_id'] = random_choices
        df = pd.concat([user_df, df])
        head_df_list.append(df)
    else:
        head_df_list.append(personal_MP[personal_MP['profile_id']==user_id].head())
        
personal_MP_candidate = pd.concat(head_df_list)

100%|██████████████████████████████████████| 8057/8057 [00:06<00:00, 1290.19it/s]


In [1752]:
# personal_MP_candidate = personal_MP_candidate[personal_MP_candidate['profile_id'].isin(customers)]

# MP_user_genre

In [1753]:
# week, day, album_cnt, rank 컬럼 candidate 붙여야 함
# df_train

In [1754]:
meta_df = pd.read_csv(path+'meta_data.csv')
meta_df = meta_df[['album_id','genre_mid','run_time','cast_1','cast_2','cast_3']]

  meta_df = pd.read_csv(path+'meta_data.csv')


In [1755]:
df_train_meta = df_train.merge(meta_df, on='album_id')

In [1756]:
user_genre_df = df_train_meta.groupby(['profile_id','genre_mid']).count()['ss_id'].reset_index()
user_genre_df.columns = ['profile_id','genre_mid','genre_cnt']
user_genre_df = user_genre_df.groupby(['profile_id','genre_mid']).sum().reset_index().sort_values(by=['profile_id','genre_cnt'],ascending=False)

# 장르 선호도 피처 만들기
## 100이상 시청한 사람들만 percent
user_total_watch_dict = user_genre_df.groupby('profile_id')['genre_cnt'].sum()\
                        [user_genre_df.groupby('profile_id')['genre_cnt'].sum()>=100].to_dict()
# 전체 시청 피처 만들기
user_genre_df['user_genre_cnt'] = user_genre_df['profile_id'].apply(lambda x: user_total_watch_dict.get(x, None))
user_genre_df['user_genre_percent'] = user_genre_df['genre_cnt']/user_genre_df['user_genre_cnt']
user_genre_df.drop(columns=['user_genre_cnt'],inplace=True)
user_genre_df.dropna(subset=['user_genre_percent'],axis=0,inplace=True)

In [1757]:
genre_top_items = {}
genre_count = df_train_meta['genre_mid'].value_counts()
for genre in genre_count.index:
    genre_top_items[genre] = list(df_train_meta[df_train_meta['genre_mid']==genre]['album_id'].value_counts().head(10).index)

In [1758]:
df_list = []
for user_id in customer_df.profile_id.unique():
    user_genres = user_genre_df[user_genre_df['profile_id']== user_id].head(2)['genre_mid']
    
    df = pd.DataFrame()
    if len(user_genres) == 0:
        
        df['album_id'] = genre_top_items['노래율동']
        df['album_id'] = genre_top_items['TV만화']
        
    elif len(user_genres) == 1:
        genre_list_1 = genre_top_items[user_genres.values[0]]
        genre_list_2 = genre_top_items['노래율동']
        df['album_id'] = list(dict.fromkeys(np.append(genre_list_1,genre_list_2)))
        
    elif len(user_genres) == 2:
        genre_list_1 = genre_top_items.get(user_genres.values[0],[])
        genre_list_2 = genre_top_items.get(user_genres.values[1],[])
        df['album_id'] = list(dict.fromkeys(np.append(genre_list_1,genre_list_2)))

    df['profile_id'] = user_id
    df_list.append(df)

In [1759]:
genre_candidate = pd.concat(df_list, ignore_index=True)
genre_candidate = genre_candidate[['profile_id','album_id']]

In [1760]:
# genre_candidate = genre_candidate[genre_candidate['profile_id'].isin(customers)]

# candidate merge

In [1761]:
candidate_1 = popular_articles_cand[['profile_id','album_id']]
candidate_2 = personal_MP_candidate[['profile_id','album_id']]
candidate_3 = genre_candidate[['profile_id','album_id']]

cand = pd.concat([candidate_1, candidate_2, candidate_3])
cand.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [1762]:
candidate = pd.merge(cand, popular_articles_cand[['album_id','counts']].drop_duplicates(), how='left', on='album_id')
# personal_MP_df 달라짐
candidate = pd.merge(candidate,personal_MP_df, how='left', on=['profile_id','album_id'])

In [1763]:
print('candidate 데이터 수:',len(candidate), 'cand 데이터 수:',len(cand))

candidate 데이터 수: 217677 cand 데이터 수: 217677


# model preprocess

In [1764]:
profile_df = pd.read_csv(path+'profile_data.csv')

In [1765]:
# candidate_add_features = pd.merge(candidate,profile_df, how='left', on='profile_id')
# candidate_add_features = pd.merge(candidate_add_features,meta_df.drop_duplicates('album_id'), how='left', on='album_id')

In [1766]:
# columns = ['sex','pr_interest_keyword_cd_1','pr_interest_keyword_cd_2','pr_interest_keyword_cd_3','ch_interest_keyword_cd_1','ch_interest_keyword_cd_2','ch_interest_keyword_cd_3',\
# 'genre_mid','cast_1','cast_2','cast_3']
# from sklearn.preprocessing import LabelEncoder
# for col in columns:
#     LE = LabelEncoder()
#     candidate_add_features[col] = LE.fit_transform(candidate_add_features[col])

In [1767]:
candidate_add_features = candidate.copy()

In [1768]:
train_df = pd.merge(candidate_add_features, label_df, how='left', on=['profile_id','album_id'])
train_df['rating'] = train_df['rating'].fillna(0)

In [1769]:
# cf_df = train_df.sample(frac=0.5, random_state=42).reset_index(drop=True)

In [1770]:
# def label_preprocess(df_train:pd.DataFrame(), cand:pd.DataFrame()):
#     merge_train_week = df_train.drop_duplicates(subset=['profile_id','album_id'])

#     train_df = pd.merge(merge_train_week, cand, how='left', on='album_id')
#     drop_list = ['ss_id','act_target_dtl','payment','continuous_play','short_trailer','log_dt','log_date']
#     train_df.drop(columns=drop_list,inplace=True)
#     train_df.fillna(0, inplace=True)
    
#     return train_df

def lgbm_preprocess(train_df:pd.DataFrame()):
    X_train = train_df.drop(columns=['rating'])
    y_train = train_df['rating']
    
    train_group = train_df.groupby('profile_id')['profile_id'].count().to_numpy()
    return X_train, y_train, train_group

def preprocess(train:pd.DataFrame()):
#     train_df = label_preprocess(train, cand)
    X_train, y_train, train_group = lgbm_preprocess(train_df)
    
    return X_train, y_train, train_group

In [1771]:
train_week = preprocess(train_df)
# train_month = preprocess(df_train_month, cand_month)

X_train_week, y_train_week, train_group_week = train_week
# X_train_month, y_train_month, train_group_month = train_month

In [1772]:
y_train_week.value_counts()

0.0    205208
1.0     12469
Name: rating, dtype: int64

# model

In [1773]:
def train(X_train:pd.DataFrame(), y_train:pd.Series(), train_group:np.array, model_params:dict):
    model = lightgbm.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        boosting_type="dart",
        num_leaves= 20,
        learning_rate=0.005,
        n_estimators= model_params['n_estimators'],
        importance_type='gain',
        verbose= model_params['verbose'],
        random_state= model_params['random_state']
    )
    
    model.fit(
    X=X_train,
    y=y_train,
    group=train_group,
    )
    
    feature_importances_df = pd.DataFrame(dict(zip(X_train.columns, model.feature_importances_)), \
                                          index=['feature_importances']).T
    
    
    return model, feature_importances_df

  def train(X_train:pd.DataFrame(), y_train:pd.Series(), train_group:np.array, model_params:dict):


In [1774]:
X_train_week.drop(columns='personal_counts',inplace=True)

In [1775]:
model_params = {
    'n_estimators':5,
    'verbose':2,
    'random_state':42,
    'eval_at':25
}

model_week, feature_importances_df_week = train(X_train_week, y_train_week, train_group_week, model_params)
# model_month, feature_importances_df_month = train(X_train_month, y_train_month, train_group_month, model_params)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000218
[LightGBM] [Debug] init for col-wise cost 0.000026 seconds, init for row-wise cost 0.001529 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 524
[LightGBM] [Info] Number of data points in the train set: 217677, number of used features: 3
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 20 and depth = 9


In [1776]:
feature_importances_df_week.style.bar(color='lightgreen', subset='feature_importances')

Unnamed: 0,feature_importances
profile_id,717.511404
album_id,9645.274308
counts,1876.312012


# Evaluation

In [1777]:
def valid_evaluation(
            model, 
            X_train:pd.DataFrame(), 
            sample_sumbission:pd.DataFrame(), 
            n:int, 
            test_answer
            )->pd.DataFrame():
    
    random_item_list = X_train.album_id.unique()
    
    pred = model.predict(X_train)
    X_train['pred'] = pred
    
#     MP_list = MP_cand.album_id.values
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sample_sumbission['album_id'] = sample_sumbission['profile_id'].apply(lambda x: lgbm_user_items_dict.get(x, np.random.choice(random_item_list, size=25)))
    
#     # cold start user file MP_list top25
#     sample_sumbission_cold = sample_sumbission.copy()
#     sample_sumbission_cold['album_id'] = sample_sumbission_cold['album_id']\
#                                             .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:25])
    
    print('lgbm ndcg:', ndcg_calculator(sample_sumbission, test_answer, n))
#     print('lgbm ndcg cold_user to MP:', ndcg_calculator(sample_sumbission_cold, test_answer, n))
    
    return X_train, sample_sumbission

In [1778]:
print('week performance')
valid_evaluation_list_week = valid_evaluation(model_week, X_train_week, sample_sumbission_week, n, test_answer_week)
# print('month performance')
# valid_evaluation_list_month = valid_evaluation(model_month, X_train_month, sample_sumbission_month, n, MP_cand_month, test_answer_month)

week performance
lgbm ndcg: 0.059991747132555305


In [1720]:
(X_train, sample_sumbission_week) = valid_evaluation_list_week
# (X_train, sample_sumbission_month, sample_sumbission_cold_month) = valid_evaluation_list_month

In [1721]:
display(sample_sumbission_week_)

Unnamed: 0,profile_id,album_id
0,5,"[264, 16, 73, 38, 15, 18, 230, 136, 241, 130, ..."
1,20,"[426, 442, 471, 478, 474, 416, 339, 241, 329, ..."
2,22,"[264, 987, 16, 227, 291, 832, 302, 38, 15, 606..."
3,24,"[16, 1021, 52, 38, 241, 416, 417, 419, 606, 16..."
4,31,"[1021, 241, 52, 2054, 416, 329, 419, 417, 190,..."
...,...,...
2182,32965,"[16, 38, 52, 416, 58, 62, 419, 417, 1021, 606,..."
2183,32978,"[52, 38, 673, 16, 416, 417, 419, 606, 1021, 62..."
2184,32979,"[52, 40, 38, 16, 264, 1021, 1880, 416, 58, 62,..."
2185,32998,"[16, 38, 52, 416, 67, 62, 58, 417, 419, 606, 1..."


In [845]:
pred_list = []
for i_list in sample_sumbission_week.album_id:
    for i in i_list:
        pred_list.append(i)

In [920]:
len(set(pred_list)), len(set(gt_list))

(3414, 3833)

In [911]:
gt_list = []
for i_list in test_answer_week.album_id:
    for i in i_list:
        gt_list.append(i)

In [921]:
len(set(pred_list)- set(gt_list)), len(set(gt_list) - set(pred_list))

(1416, 1835)

In [63]:
def evaluation(
            X_train:pd.DataFrame(), 
            sumbission:pd.DataFrame(), 
            n:int, 
            MP_cand:pd.DataFrame()
            )->pd.DataFrame():
    
    MP_list = MP_cand.album_id.values
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sumbission['predicted_list'] = sumbission['profile_id']\
                                            .apply(lambda x: lgbm_user_items_dict.get(x, []))
    
    # cold start user file MP_list top25
    sumbission_cold = sumbission.copy()
    sumbission_cold['predicted_list'] = sumbission['predicted_list']\
                                            .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:25])
    
    return sumbission, sumbission_cold

In [65]:
submission = pd.read_csv(path + 'sample_submission.csv')
sumbission, sumbission_cold = evaluation(X_train_week, submission, n, MP_cand_week)

In [66]:
# 제출 조건 충족 확인
assert submission.profile_id.nunique() == sumbission_cold.profile_id.nunique()
for pred_list in sumbission_cold.predicted_list:
    assert len(pred_list) == 25

In [67]:
sample_sumbission_cold_week

Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,"[1869, 16, 15, 19, 124, 17, 18, 38, 241, 125, ..."
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246, 16, 15, 19, 124, ..."


In [None]:
# sample_sumbission_cold_week.to_csv('lgbm_basic_submission.csv', index=False)