In [291]:
import sys
sys.path.append("../../src")

import numpy as np
import pandas as pd

import lightgbm

from utils import ndcg_calculator

import datetime as dt
from tqdm import tqdm

# dataload

In [196]:
def dataload(path:str='../../data/'):

    test_answer_week = pd.read_parquet(path + "test_answer_week.parquet")
    test_answer_month = pd.read_parquet(path + "test_answer_month.parquet")

    df_train_week = pd.read_parquet(path + "train_week.parquet")
    df_train_month = pd.read_parquet(path + "train_month.parquet")

    sample_sumbission_week = pd.read_parquet(path + "sample_sumbission_week.parquet")
    sample_sumbission_month = pd.read_parquet(path + "sample_sumbission_month.parquet")

    df_train_week.sort_values(by='log_dt', inplace=True)
    df_train_month.sort_values(by='log_dt', inplace=True)
    
    return test_answer_week, test_answer_month, df_train_week, df_train_month, sample_sumbission_week, sample_sumbission_month

In [197]:
path='../../data/'

test_answer_week, test_answer_month, \
df_train_week, df_train_month, \
sample_sumbission_week, sample_sumbission_month = dataload(path)

# preprocess

In [198]:
n = 25

# 1.Feature engineering

In [199]:
# week & day feature engineering
def week_day_feature(df_train:pd.DataFrame())->pd.DataFrame():
    df_train['week'] = df_train['log_date'].apply(lambda x: x.isocalendar()[1])
    df_train['day'] = df_train['log_date'].apply(lambda x: x.isocalendar()[2])
    week_min = df_train.week.min()
    df_train['week'] = df_train['week'].apply(lambda x: x-week_min)
    
    return  df_train

# album_cnt & album_rank feature engineering
def album_cnt_rank_feature(df_train:pd.DataFrame())->pd.DataFrame():
    album_cnt = df_train.album_id.value_counts().reset_index().rename(columns={'index':'album_id','album_id':'album_cnt'})
    album_cnt['rank'] = album_cnt['album_cnt'].rank(method='first', ascending=False)
    df_train = df_train.merge(album_cnt, on='album_id')
    
    return df_train

def feature_engineering(df_train:pd.DataFrame())->pd.DataFrame():
    df_train = week_day_feature(df_train)
    df_train = album_cnt_rank_feature(df_train)
    
    return df_train

In [200]:
df_train_week = feature_engineering(df_train_week)
df_train_month = feature_engineering(df_train_month)

# 2.basic candidate(MP@300 , latest each user@5)

In [201]:
# # MP@300 & MP_percent feature made
# def MP_candidate(df_train:pd.DataFrame(), cand=300)->pd.DataFrame():
#     MP_df = df_train.album_id.value_counts().reset_index()
#     MP_df.rename(columns={'index':'album_id','album_id':'item_cnt'}, inplace=True)

#     # MP@300
#     '''
#     MP 중복 시청 포함 
#     '''
#     MP_cand = MP_df[:cand][['album_id']]
#     return MP_cand


# # latest 1 day each user@5
# def latest_candidate(df_train:pd.DataFrame())->pd.DataFrame():
#     seven_days = df_train['log_date'].max()- dt.timedelta(days=1)
#     latest_history = df_train[df_train['log_date']>=seven_days]
#     latest_history = latest_history.groupby(['album_id']).count()['profile_id'].reset_index().rename(columns={'profile_id':'latest_cnt'})
    
#     # history count more than least 2 
#     latest_cand = latest_history[latest_history['latest_cnt']>=2][['album_id']].drop_duplicates()
    
#     return latest_cand


# def candidate(df_train:pd.DataFrame(), cand=300)->pd.DataFrame():
    
#     MP_cand = MP_candidate(df_train, cand)
#     latest_cand = latest_candidate(df_train)
#     cand = pd.concat([MP_cand,latest_cand])
#     cand.drop_duplicates('album_id', inplace=True)
#     cand['rating'] = 1
    
#     return cand, MP_cand

In [202]:
# cand_week, MP_cand_week = candidate(df_train_week,300)
# cand_month, MP_cand_month = candidate(df_train_month,300)

In [203]:
# a = df_train_week.groupby(['profile_id','album_id'])['ss_id'].count().sort_valu
# a
# # .sort_values(by=['profile_id','ss_id'])

In [204]:
customers = df_train_week.profile_id.unique()

In [205]:
last_week = df_train_week['week'].max()

label_df = df_train_week.query(f'week=={last_week}').copy()
df_train = df_train_week.query(f"week < {last_week}").copy()

In [206]:
personal_train = df_train.drop_duplicates(subset=['profile_id','album_id','ss_id'])
df_train = df_train.drop_duplicates(subset=['profile_id','album_id'])

In [207]:
label_df = label_df[['profile_id','album_id']]
label_df['rating'] = 1

In [208]:
last_week

7

# general MP
- 마지막 1주, 2주의 MP를 각 유저마다 넣는다.

In [209]:
df_train.week.unique()

array([0, 1, 2, 4, 5, 3, 6])

In [210]:
print('중복제거 후 데이터 수:', len(df_train_week))

중복제거 후 데이터 수: 945518


In [211]:
# 마지막 6,5주 각각 MP를 10개 뽑음
last_week_ver1 = 6
last_week_ver2 = 5

MP_latest_ver1_df = df_train.query(f"week == {last_week_ver1}")

In [212]:
MP_df = MP_latest_ver1_df.groupby('album_id')['profile_id'].count().sort_values(ascending=False)
MP_df = MP_df.reset_index()
MP_df.columns = ['album_id','counts']
MP_candidate_df = MP_df[:10].copy()
MP_candidate_df['join_col'] = 1

In [213]:
# df_train_week 전체 유저 대상으로 후보군을 뽑을 것임
customer_df = df_train_week[df_train_week['profile_id'].isin(customers)][['profile_id']]
customer_df['join_col'] = 1
popular_articles_cand_ver1 = customer_df.copy()
popular_articles_cand_ver1 = popular_articles_cand_ver1.merge(MP_candidate_df, on="join_col")

popular_articles_cand_ver1.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [214]:
MP_latest_ver2_df = df_train.query(f"week == {last_week_ver2}")

In [215]:
MP_df = MP_latest_ver2_df.groupby('album_id')['profile_id'].count().sort_values(ascending=False)
MP_df = MP_df.reset_index()
MP_df.columns = ['album_id','counts']
MP_candidate_df = MP_df[:10].copy()
MP_candidate_df['join_col'] = 1

In [216]:
customer_df = df_train_week[df_train_week['profile_id'].isin(customers)][['profile_id']]
customer_df['join_col'] = 1
popular_articles_cand_ver2 = customer_df.copy()
popular_articles_cand_ver2 = popular_articles_cand_ver2.merge(MP_candidate_df, on="join_col")

popular_articles_cand_ver2.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [217]:
popular_articles_cand = pd.concat([popular_articles_cand_ver1, popular_articles_cand_ver2])
popular_articles_cand = popular_articles_cand.groupby(['profile_id','album_id'])['counts'].sum().reset_index()

# personal_MP

In [376]:
personal_MP = personal_train.groupby(['profile_id','album_id'])[['ss_id']].count().reset_index()
personal_MP.columns = ['profile_id','album_id','personal_counts']

In [377]:
# 서로 다른날 5회 이상 시청한 앨범만
personal_MP = personal_MP[personal_MP['personal_counts'] >= 5]
personal_MP = personal_MP.sort_values(by=['profile_id','personal_counts'],ascending=False)

In [378]:
# 상위 5개만 pick
head_df_list = []
# 전체 유저 대상으로 뽑기
for user_id in tqdm(customer_df.profile_id.unique()):
    personal_MP_user_len = len(personal_MP[personal_MP['profile_id']==user_id].head())
    random_choice_list = personal_MP.album_id.unique()
    if personal_MP_user_len <5:
        # 5개 아이템이 없는 경우 랜덤으로 없는 개수만 만큼 choice
        user_df = personal_MP[personal_MP['profile_id']==user_id]
        df = pd.DataFrame()
        random_choices = np.random.choice(random_choice_list, size=(5-personal_MP_user_len))
        df['profile_id'] = [user_id for _ in range(5-personal_MP_user_len)]
        df['album_id'] = random_choices
        df = pd.concat([user_df, df])
        head_df_list.append(df)
    else:
        head_df_list.append(personal_MP[personal_MP['profile_id']==user_id].head())
        
personal_MP_candidate = pd.concat(head_df_list)

100%|██████████████████████████████████████| 8057/8057 [00:06<00:00, 1315.52it/s]


In [379]:
personal_MP_candidate

Unnamed: 0,profile_id,album_id,personal_counts
0,25844,2303,
1,25844,4399,
2,25844,6888,
3,25844,7442,
4,25844,6243,
...,...,...,...
0,28368,4695,
1,28368,5396,
2,28368,2765,
3,28368,106,


In [380]:
popular_articles_cand

Unnamed: 0,profile_id,album_id,counts
0,3,16,288
1,3,38,132
2,3,52,156
3,3,125,271
4,3,190,183
...,...,...,...
96679,33019,329,171
96680,33019,339,463
96681,33019,347,389
96682,33019,987,302


# MP_user_genre

In [357]:
# week, day, album_cnt, rank 컬럼 candidate 붙여야 함
# df_train

In [363]:
meta_df = pd.read_csv(path+'meta_data.csv')
meta_df = meta_df[['album_id','genre_mid','run_time','cast_1','cast_2','cast_3']]

  meta_df = pd.read_csv(path+'meta_data.csv')


In [397]:
df_train_meta = df_train.merge(meta_df, on='album_id')

In [462]:
user_genre_df = df_train_meta.groupby(['profile_id','genre_mid']).count()['ss_id'].reset_index()
user_genre_df.columns = ['profile_id','genre_mid','genre_cnt']
user_genre_df = user_genre_df.groupby(['profile_id','genre_mid']).sum().reset_index().sort_values(by=['profile_id','genre_cnt'],ascending=False)

# 장르 선호도 피처 만들기
## 100이상 시청한 사람들만 percent
user_total_watch_dict = user_genre_df.groupby('profile_id')['genre_cnt'].sum()\
                        [user_genre_df.groupby('profile_id')['genre_cnt'].sum()>=100].to_dict()
# 전체 시청 피처 만들기
user_genre_df['user_genre_cnt'] = user_genre_df['profile_id'].apply(lambda x: user_total_watch_dict.get(x, None))
user_genre_df['user_genre_percent'] = user_genre_df['genre_cnt']/user_genre_df['user_genre_cnt']
user_genre_df.drop(columns=['user_genre_cnt'],inplace=True)
user_genre_df.dropna(subset=['user_genre_percent'],axis=0,inplace=True)

In [481]:
genre_top_items = {}
genre_count = df_train_meta['genre_mid'].value_counts()
# genre_count = genre_count[genre_count>=500]
for genre in genre_count.index:
    genre_top_items[genre] = list(df_train_meta[df_train_meta['genre_mid']==genre]['album_id'].value_counts().head(10).index)

In [488]:
df = pd.DataFrame()

df['album_id'] = genre_top_items['노래율동']

In [490]:
df

Unnamed: 0,album_id
0,16
1,38
2,18
3,230
4,981
5,15
6,56
7,136
8,185
9,783


In [511]:
df_list = []
for user_id in customer_df.profile_id.unique():
    user_genres = user_genre_df[user_genre_df['profile_id']== user_id].head(2)['genre_mid']
    
    df = pd.DataFrame()
    if len(user_genres) == 0:
        
        df['album_id'] = genre_top_items['노래율동']
        df['album_id'] = genre_top_items['TV만화']
        
    elif len(user_genres) == 1:
        df['album_id'] = genre_top_items.get(user_genres.values[0],None)
        df['album_id'] = genre_top_items.get('노래율동',None)  
        
    elif len(user_genres) == 2:
        genre_list_1 = genre_top_items.get(user_genres.values[0],None)
        genre_list_2 = genre_top_items.get(user_genres.values[1],None)
        df['album_id'] = list(dict.fromkeys(np.append(genre_list_1,genre_list_2)))

    df['profile_id'] = user_id
    df_list.append(df)

# model preprocess

In [97]:
def label_preprocess(df_train:pd.DataFrame(), cand:pd.DataFrame()):
    merge_train_week = df_train.drop_duplicates(subset=['profile_id','album_id'])

    train_df = pd.merge(merge_train_week, cand, how='left', on='album_id')
    drop_list = ['ss_id','act_target_dtl','payment','continuous_play','short_trailer','log_dt','log_date']
    train_df.drop(columns=drop_list,inplace=True)
    train_df.fillna(0, inplace=True)
    
    return train_df

def lgbm_preprocess(train_df:pd.DataFrame()):
    X_train = train_df.drop(columns=['rating'])
    y_train = train_df['rating']
    
    train_group = train_df.groupby('profile_id')['profile_id'].count().to_numpy()
    return X_train, y_train, train_group

def preprocess(train:pd.DataFrame(), cand:pd.DataFrame()):
    train_df = label_preprocess(train, cand)
    X_train, y_train, train_group = lgbm_preprocess(train_df)
    
    return X_train, y_train, train_group

In [53]:
train_week = preprocess(df_train_week, cand_week)
train_month = preprocess(df_train_month, cand_month)

X_train_week, y_train_week, train_group_week = train_week
X_train_month, y_train_month, train_group_month = train_month

In [54]:
y_train_week.value_counts(), y_train_month.value_counts()

(1.0    263078
 0.0     97270
 Name: rating, dtype: int64,
 1.0    158741
 0.0     62559
 Name: rating, dtype: int64)

In [55]:
X_train_week

Unnamed: 0,profile_id,album_id,week,day,album_cnt,rank
0,25844,18024,0,2,9,9198.0
1,17666,18024,1,7,9,9198.0
2,15730,18024,2,5,9,9198.0
3,17424,18024,4,1,9,9198.0
4,8391,18024,4,6,9,9198.0
...,...,...,...,...,...,...
360343,759,7568,7,6,1,17682.0
360344,6718,19102,7,6,1,17689.0
360345,6720,8106,7,6,1,17686.0
360346,6720,8107,7,6,1,17683.0


# model

In [56]:
def train(X_train:pd.DataFrame(), y_train:pd.Series(), train_group:np.array, model_params:dict):
    model = lightgbm.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        boosting_type="dart",
        n_estimators= model_params['n_estimators'],
        importance_type='gain',
        verbose= model_params['verbose'],
        random_state= model_params['random_state']
    )
    
    model.fit(
    X=X_train,
    y=y_train,
    group=train_group,
    )
    
    feature_importances_df = pd.DataFrame(dict(zip(X_train.columns, model.feature_importances_)), \
                                          index=['feature_importances']).T
    
    
    return model, feature_importances_df

  def train(X_train:pd.DataFrame(), y_train:pd.Series(), train_group:np.array, model_params:dict):


In [57]:
model_params = {
    'n_estimators':100,
    'verbose':1,
    'random_state':42,
    'eval_at':25
}

model_week, feature_importances_df_week = train(X_train_week, y_train_week, train_group_week, model_params)
model_month, feature_importances_df_month = train(X_train_month, y_train_month, train_group_month, model_params)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1028
[LightGBM] [Info] Number of data points in the train set: 360348, number of used features: 6
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 221300, number of used features: 6


In [58]:
feature_importances_df_week.style.bar(color='lightgreen', subset='feature_importances')

Unnamed: 0,feature_importances
profile_id,620.967171
album_id,12815.929749
week,9123.443953
day,3177.949855
album_cnt,21429.868576
rank,43636.550086


# Evaluation

In [59]:
def valid_evaluation(
            model, 
            X_train:pd.DataFrame(), 
            sample_sumbission:pd.DataFrame(), 
            n:int, 
            MP_cand:pd.DataFrame(),
            test_answer
            )->pd.DataFrame():
    
    
    
    pred = model.predict(X_train)
    X_train['pred'] = pred
    
    MP_list = MP_cand.album_id.values
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sample_sumbission['album_id'] = sample_sumbission['profile_id'].apply(lambda x: lgbm_user_items_dict[x])
    
    # cold start user file MP_list top25
    sample_sumbission_cold = sample_sumbission.copy()
    sample_sumbission_cold['album_id'] = sample_sumbission_cold['album_id']\
                                            .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:25])
    
    print('lgbm ndcg:', ndcg_calculator(sample_sumbission, test_answer, n))
    print('lgbm ndcg cold_user to MP:', ndcg_calculator(sample_sumbission_cold, test_answer, n))
    
    return X_train, sample_sumbission, sample_sumbission_cold

In [60]:
print('week performance')
valid_evaluation_list_week = valid_evaluation(model_week, X_train_week, sample_sumbission_week, n, MP_cand_week, test_answer_week)
print('month performance')
valid_evaluation_list_month = valid_evaluation(model_month, X_train_month, sample_sumbission_month, n, MP_cand_month, test_answer_month)

week performance
lgbm ndcg: 0.15769713915388472
lgbm ndcg cold_user to MP: 0.15769713915388472
month performance
lgbm ndcg: 0.26233002556930307
lgbm ndcg cold_user to MP: 0.26233002556930307


In [61]:
(X_train, sample_sumbission_week, sample_sumbission_cold_week) = valid_evaluation_list_week
(X_train, sample_sumbission_month, sample_sumbission_cold_month) = valid_evaluation_list_month

In [62]:
display(sample_sumbission_week, sample_sumbission_cold_week)

Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,[1869]
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246]"


Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,"[1869, 16, 15, 19, 124, 17, 18, 38, 241, 125, ..."
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246, 16, 15, 19, 124, ..."


In [63]:
def evaluation(
            X_train:pd.DataFrame(), 
            sumbission:pd.DataFrame(), 
            n:int, 
            MP_cand:pd.DataFrame()
            )->pd.DataFrame():
    
    MP_list = MP_cand.album_id.values
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sumbission['predicted_list'] = sumbission['profile_id']\
                                            .apply(lambda x: lgbm_user_items_dict.get(x, []))
    
    # cold start user file MP_list top25
    sumbission_cold = sumbission.copy()
    sumbission_cold['predicted_list'] = sumbission['predicted_list']\
                                            .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:25])
    
    return sumbission, sumbission_cold

In [65]:
submission = pd.read_csv(path + 'sample_submission.csv')
sumbission, sumbission_cold = evaluation(X_train_week, submission, n, MP_cand_week)

In [66]:
# 제출 조건 충족 확인
assert submission.profile_id.nunique() == sumbission_cold.profile_id.nunique()
for pred_list in sumbission_cold.predicted_list:
    assert len(pred_list) == 25

In [67]:
sample_sumbission_cold_week

Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,"[1869, 16, 15, 19, 124, 17, 18, 38, 241, 125, ..."
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246, 16, 15, 19, 124, ..."


In [None]:
# sample_sumbission_cold_week.to_csv('lgbm_basic_submission.csv', index=False)