In [1]:
import sys
sys.path.append("../../src")

import numpy as np
import pandas as pd

import lightgbm

from utils import ndcg_calculator

import datetime as dt

# dataload

In [2]:
def dataload(path:str='../../data/'):

    test_answer_week = pd.read_parquet(path + "test_answer_week.parquet")
    test_answer_month = pd.read_parquet(path + "test_answer_month.parquet")

    df_train_week = pd.read_parquet(path + "train_week.parquet")
    df_train_month = pd.read_parquet(path + "train_month.parquet")

    sample_sumbission_week = pd.read_parquet(path + "sample_sumbission_week.parquet")
    sample_sumbission_month = pd.read_parquet(path + "sample_sumbission_month.parquet")

    df_train_week.sort_values(by='log_dt', inplace=True)
    df_train_month.sort_values(by='log_dt', inplace=True)
    
    return test_answer_week, test_answer_month, df_train_week, df_train_month, sample_sumbission_week, sample_sumbission_month

In [3]:
path='../../data/'

test_answer_week, test_answer_month, \
df_train_week, df_train_month, \
sample_sumbission_week, sample_sumbission_month = dataload(path)

# preprocess

In [4]:
n = 25

# 1.Feature engineering

In [5]:
# week & day feature engineering
def week_day_feature(df_train:pd.DataFrame())->pd.DataFrame():
    df_train['week'] = df_train['log_date'].apply(lambda x: x.isocalendar()[1])
    df_train['day'] = df_train['log_date'].apply(lambda x: x.isocalendar()[2])
    week_min = df_train.week.min()
    df_train['week'] = df_train['week'].apply(lambda x: x-week_min)
    
    return  df_train

# album_cnt & album_rank feature engineering
def album_cnt_rank_feature(df_train:pd.DataFrame())->pd.DataFrame():
    album_cnt = df_train.album_id.value_counts().reset_index().rename(columns={'index':'album_id','album_id':'album_cnt'})
    album_cnt['rank'] = album_cnt['album_cnt'].rank(method='first', ascending=False)
    df_train = df_train.merge(album_cnt, on='album_id')
    
    return df_train

def feature_engineering(df_train:pd.DataFrame())->pd.DataFrame():
    df_train = week_day_feature(df_train)
    df_train = album_cnt_rank_feature(df_train)
    
    return df_train

In [6]:
df_train_week = feature_engineering(df_train_week)
df_train_month = feature_engineering(df_train_month)

# 2.basic candidate(MP@300 , latest each user@5)

In [7]:
# MP@300 & MP_percent feature made
def MP_candidate(df_train:pd.DataFrame(), cand=300)->pd.DataFrame():
    MP_df = df_train.album_id.value_counts().reset_index()
    MP_df.rename(columns={'index':'album_id','album_id':'item_cnt'}, inplace=True)

    # MP@300
    '''
    MP 중복 시청 포함 
    '''
    MP_cand = MP_df[:cand][['album_id']]
    return MP_cand


# latest 1 day each user@5
def latest_candidate(df_train:pd.DataFrame())->pd.DataFrame():
    seven_days = df_train['log_date'].max()- dt.timedelta(days=1)
    latest_history = df_train[df_train['log_date']>=seven_days]
    latest_history = latest_history.groupby(['album_id']).count()['profile_id'].reset_index().rename(columns={'profile_id':'latest_cnt'})
    
    # history count more than least 2 
    latest_cand = latest_history[latest_history['latest_cnt']>=2][['album_id']].drop_duplicates()
    
    return latest_cand


def candidate(df_train:pd.DataFrame(), cand=300)->pd.DataFrame():
    
    MP_cand = MP_candidate(df_train, cand)
    latest_cand = latest_candidate(df_train)
    cand = pd.concat([MP_cand,latest_cand])
    cand.drop_duplicates('album_id', inplace=True)
    cand['rating'] = 1
    
    return cand, MP_cand

In [8]:
cand_week, MP_cand_week = candidate(df_train_week,300)
cand_month, MP_cand_month = candidate(df_train_month,300)

In [9]:
def label_preprocess(df_train:pd.DataFrame(), cand:pd.DataFrame()):
    merge_train_week = df_train.drop_duplicates(subset=['profile_id','album_id'])

    train_df = pd.merge(merge_train_week, cand, how='left', on='album_id')
    drop_list = ['ss_id','act_target_dtl','payment','continuous_play','short_trailer','log_dt','log_date']
    train_df.drop(columns=drop_list,inplace=True)
    train_df.fillna(0, inplace=True)
    
    return train_df

def lgbm_preprocess(train_df:pd.DataFrame()):
    X_train = train_df.drop(columns=['rating'])
    y_train = train_df['rating']
    
    qids_train = train_df.groupby('profile_id')['profile_id'].count().to_numpy()
    return X_train, y_train, qids_train

def preprocess(train:pd.DataFrame(), cand:pd.DataFrame()):
    train_df = label_preprocess(train, cand)
    X_train, y_train, qids_train = lgbm_preprocess(train_df)
    
    return X_train, y_train, qids_train

In [10]:
train_week = preprocess(df_train_week, cand_week)
train_month = preprocess(df_train_month, cand_month)

X_train_week, y_train_week, qids_train_week = train_week
X_train_month, y_train_month, qids_train_month = train_month

In [11]:
y_train_week.value_counts(), y_train_month.value_counts()

(1.0    263078
 0.0     97270
 Name: rating, dtype: int64,
 1.0    158741
 0.0     62559
 Name: rating, dtype: int64)

In [12]:
X_train_week

Unnamed: 0,profile_id,album_id,week,day,album_cnt,rank
0,25844,18024,0,2,9,9198.0
1,17666,18024,1,7,9,9198.0
2,15730,18024,2,5,9,9198.0
3,17424,18024,4,1,9,9198.0
4,8391,18024,4,6,9,9198.0
...,...,...,...,...,...,...
360343,759,7568,7,6,1,17682.0
360344,6718,19102,7,6,1,17689.0
360345,6720,8106,7,6,1,17686.0
360346,6720,8107,7,6,1,17683.0


# model

In [13]:
def train(X_train:pd.DataFrame(), y_train:pd.Series(), qids_train:np.array, model_params:dict):
    model = lightgbm.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        boosting_type="dart",
        n_estimators= model_params['n_estimators'],
        importance_type='gain',
        verbose= model_params['verbose'],
        random_state= model_params['random_state']
    )
    
    model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    )
    
    feature_importances_df = pd.DataFrame(dict(zip(X_train.columns, model.feature_importances_)), \
                                          index=['feature_importances']).T
    
    return model, feature_importances_df

  def train(X_train:pd.DataFrame(), y_train:pd.Series(), qids_train:np.array, model_params:dict):


In [14]:
model_params = {
    'n_estimators':100,
    'verbose':1,
    'random_state':42,
    'eval_at':25
}

model_week, feature_importances_df_week = train(X_train_week, y_train_week, qids_train_week, model_params)
model_month, feature_importances_df_month = train(X_train_month, y_train_month, qids_train_month, model_params)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1028
[LightGBM] [Info] Number of data points in the train set: 360348, number of used features: 6
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 221300, number of used features: 6


In [15]:
feature_importances_df_week.style.bar(color='lightgreen', subset='feature_importances')

Unnamed: 0,feature_importances
profile_id,620.967171
album_id,12815.929749
week,9123.443953
day,3177.949855
album_cnt,21429.868576
rank,43636.550086


# Evaluation

In [16]:
def valid_evaluation(
            model, 
            X_train:pd.DataFrame(), 
            sample_sumbission:pd.DataFrame(), 
            n:int, 
            MP_cand:pd.DataFrame(),
            test_answer
            )->pd.DataFrame():
    
    
    
    pred = model.predict(X_train)
    X_train['pred'] = pred
    
    MP_list = MP_cand.album_id.values
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sample_sumbission['album_id'] = sample_sumbission['profile_id'].apply(lambda x: lgbm_user_items_dict[x])
    
    # cold start user file MP_list top25
    sample_sumbission_cold = sample_sumbission.copy()
    sample_sumbission_cold['album_id'] = sample_sumbission_cold['album_id']\
                                            .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:25])
    
    print('lgbm ndcg:', ndcg_calculator(sample_sumbission, test_answer, n))
    print('lgbm ndcg cold_user to MP:', ndcg_calculator(sample_sumbission_cold, test_answer, n))
    
    return X_train, sample_sumbission, sample_sumbission_cold

In [17]:
print('week performance')
valid_evaluation_list_week = valid_evaluation(model_week, X_train_week, sample_sumbission_week, n, MP_cand_week, test_answer_week)
print('month performance')
valid_evaluation_list_month = valid_evaluation(model_month, X_train_month, sample_sumbission_month, n, MP_cand_month, test_answer_month)

week performance
lgbm ndcg: 0.15769713915388472
lgbm ndcg cold_user to MP: 0.15769713915388472
month performance
lgbm ndcg: 0.26233002556930307
lgbm ndcg cold_user to MP: 0.26233002556930307


In [18]:
(X_train, sample_sumbission_week, sample_sumbission_cold_week) = valid_evaluation_list_week
(X_train, sample_sumbission_month, sample_sumbission_cold_month) = valid_evaluation_list_month

In [19]:
display(sample_sumbission_week, sample_sumbission_cold_week)

Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,[1869]
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246]"


Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,"[1869, 16, 15, 19, 124, 17, 18, 38, 241, 125, ..."
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246, 16, 15, 19, 124, ..."


In [20]:
def evaluation(
            model, 
            X_train:pd.DataFrame(), 
            sumbission:pd.DataFrame(), 
            n:int, 
            MP_cand:pd.DataFrame()
            )->pd.DataFrame():
    
    MP_list = MP_cand.album_id.values
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sumbission['predicted_list'] = sumbission['profile_id']\
                                            .apply(lambda x: lgbm_user_items_dict.get(x, []))
    
    # cold start user file MP_list top25
    sumbission_cold = sumbission.copy()
    sumbission_cold['predicted_list'] = sumbission['predicted_list']\
                                            .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:25])
    
    return sumbission, sumbission_cold

In [21]:
submission = pd.read_csv(path + 'sample_submission.csv')
sumbission, sumbission_cold = evaluation(model_week, X_train_week, submission, n, MP_cand_week)

In [22]:
# 제출 조건 충족 확인
assert submission.profile_id.nunique() == sumbission_cold.profile_id.nunique()
for pred_list in sumbission_cold.predicted_list:
    assert len(pred_list) == 25

In [23]:
sample_sumbission_cold_week

Unnamed: 0,profile_id,album_id
0,5,"[224, 136, 157, 159, 41, 227, 158, 229, 226, 2..."
1,20,"[432, 416, 494, 33, 493, 505, 52, 491, 492, 47..."
2,22,"[224, 737, 740, 407, 606, 856, 227, 229, 888, ..."
3,24,"[1869, 16, 15, 19, 124, 17, 18, 38, 241, 125, ..."
4,31,"[1961, 1967, 1966, 977, 1942, 1926, 1943, 1962..."
...,...,...
2182,32965,"[407, 593, 16, 67, 1709, 1394, 714, 715, 13690..."
2183,32978,"[525, 1725, 52, 1006, 1024, 1023, 1007, 3429, ..."
2184,32979,"[733, 987, 264, 330, 1402, 1880, 16, 17, 38, 1..."
2185,32998,"[416, 417, 7105, 2021, 4246, 16, 15, 19, 124, ..."


In [24]:
# sample_sumbission_cold_week.to_csv('lgbm_basic_submission.csv', index=False)