In [1]:
import sys
sys.path.append("../../src")

import numpy as np
import pandas as pd
from datetime import timedelta
import math
import lightgbm

# from utils import ndcg_calculator
# from LGBM_Rank import LGBMRank
# from dataload import dataload, day_week_feature_engineering, train_label_split
from tqdm import tqdm

In [2]:
def _ndcg_calculator(gt, rec, idcg):
    dcg = 0.0
    for i, r in enumerate(rec):
        if r in gt:
            dcg += 1.0 / np.log(i + 2)
    return dcg / idcg

def ndcg_calculator(answer, submission, n):
    idcg = sum((1.0 / np.log(i + 1) for i in range(1, n + 1)))

    assert (answer.profile_id != submission.profile_id).sum() == 0

    ndcg_list = []
    for (_, row_answer), (_, row_submit) in zip(answer.iterrows(), submission.iterrows()):
        ndcg_list.append(_ndcg_calculator(row_answer.album_id, row_submit.album_id, idcg))

    ndcg_score = sum(ndcg_list) / len(answer)
    return ndcg_score

class LGBMRank():
    def __init__(self, 
                train_df:pd.DataFrame(), 
                model_params:dict={'n_estimators':5},
                path:str='../../data/',
                mode:str='week',
                n=25):

        self.train_df = train_df
        self.path = path
        self.model_params = model_params
        self.sample_sumbission_week = pd.read_parquet(path + 'sample_sumbission_week.parquet')
        self.sample_sumbission_month = pd.read_parquet(path + 'sample_sumbission_month.parquet')
        self.test_answer_week = pd.read_parquet(path + 'test_answer_week.parquet')
        self.test_answer_month = pd.read_parquet(path + 'test_answer_month.parquet')
        self.mode = mode

        self.n = n
        self.X_train, self.y_train, self.train_group = self.lgbm_preprocess(self.train_df)




    def lgbm_preprocess(self, train_df:pd.DataFrame()):
        X_train = train_df.drop(columns=['rating'])
        y_train = train_df['rating']
        
        train_group = train_df.groupby('profile_id')['profile_id'].count().to_numpy()

        self.item_idx = X_train["album_id"].copy()
        self.user_idx = X_train["profile_id"].copy()
        del X_train["album_id"], X_train["profile_id"]

        return X_train, y_train, train_group




    def train(self):

        model_params = self.model_params

        model = lightgbm.LGBMRanker(
            objective="lambdarank",
            metric="ndcg",
            boosting_type="dart",
            num_leaves= 20,
            learning_rate=0.005,
            n_estimators= model_params['n_estimators'],
            importance_type='gain',
            verbose= -1,
            random_state= 42
        )
        
        model.fit(
        X=self.X_train,
        y=self.y_train,
        group=self.train_group,
        )
        
        feature_importances_df = pd.DataFrame(dict(zip(self.X_train.columns, model.feature_importances_)), \
                                            index=['feature_importances']).T
        

        return model, feature_importances_df




    def valid_evaluation(self)->pd.DataFrame():
        
        X_train = self.X_train
        n = self.n
        
        model, feature_importances_df = self.train()
        print(feature_importances_df)

        pred = model.predict(X_train)
        X_train['pred'] = pred

        item_idx = self.item_idx 
        user_idx = self.user_idx
        X_train["album_id"] = item_idx
        X_train["profile_id"] = user_idx

        if self.mode == 'week':
            sample_sumbission = self.sample_sumbission_week
            test_answer = self.test_answer_week
            print('week performance')
        else:
            sample_sumbission = self.sample_sumbission_month
            test_answer = self.test_answer_month
            print('month performance')
        
        # each user pred 25 items
        lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(25)
        lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
        sample_sumbission['album_id'] = sample_sumbission['profile_id'].apply(lambda x: lgbm_user_items_dict.get(x, []))

        print('lgbm ndcg:', ndcg_calculator(sample_sumbission, test_answer, n))
        
        return X_train, sample_sumbission
    
def dataload(path:str='../../data/'):

    test_answer_week = pd.read_parquet(path + "test_answer_week.parquet")
    test_answer_month = pd.read_parquet(path + "test_answer_month.parquet")

    df_train_week = pd.read_parquet(path + "train_week.parquet")
    df_train_month = pd.read_parquet(path + "train_month.parquet")

    sample_sumbission_week = pd.read_parquet(path + "sample_sumbission_week.parquet")
    sample_sumbission_month = pd.read_parquet(path + "sample_sumbission_month.parquet")

    df_train_week.sort_values(by='log_dt', inplace=True)
    df_train_month.sort_values(by='log_dt', inplace=True)
    
    return test_answer_week, test_answer_month, df_train_week, df_train_month, sample_sumbission_week, sample_sumbission_month




# week & day feature engineering
def day_feature(df_train:pd.DataFrame())->pd.Series():
    dates = df_train.log_date
    unique_dates = df_train.log_date.unique()
    unique_dates = np.sort(unique_dates)
    number_range = np.arange(len(unique_dates))
    date_number_dict = dict(zip(unique_dates, number_range))

    all_day_numbers = dates.map(date_number_dict)
    all_day_numbers = all_day_numbers.astype("int16")
    print('log date min:', dates.min(), 'log date max:', dates.max())
    print('min day:', all_day_numbers.min(), 'max day:', all_day_numbers.max())

    return all_day_numbers

def week_feature(df_train:pd.DataFrame())->pd.Series:
    pd_dates = df_train.log_date
    unique_dates = pd.Series(df_train.log_date.unique())
    numbered_days = unique_dates - unique_dates.min() + timedelta(1)
    numbered_days = numbered_days.dt.days
    extra_days = numbered_days.max() % 7
    numbered_days -= extra_days
    day_weeks = (numbered_days / 7).apply(lambda x: math.ceil(x))
    day_weeks_map = pd.DataFrame({"day_weeks": day_weeks, "unique_dates": unique_dates})\
                                                        .set_index("unique_dates")["day_weeks"]
    all_day_weeks = pd_dates.map(day_weeks_map)
    all_day_weeks = all_day_weeks.astype("int8")
    print('min week:', all_day_weeks.min(), 'max week:', all_day_weeks.max())

    return all_day_weeks

def day_week_feature_engineering(df_train:pd.DataFrame())->pd.DataFrame():
    all_day_numbers = day_feature(df_train)
    all_day_weeks = week_feature(df_train)
    
    df_train['day'] = all_day_numbers
    df_train['week'] = all_day_weeks

    return df_train


# train, label split
def train_label_split(df_train:pd.DataFrame())->pd.DataFrame():
    last_week = df_train['week'].max()
    print('split last week:', last_week)

    label_df = df_train.query(f'week=={last_week}')[['profile_id','album_id']]
    label_df.drop_duplicates(subset=['profile_id','album_id'],inplace=True)
    label_df['rating'] = 1

    df_train = df_train.query(f"week <= {last_week}")
    
    return df_train, label_df



# dataload

In [3]:
path='/kaggle/input/lg-train-test/'

test_answer_week, test_answer_month, \
df_train_week, df_train_month, \
sample_sumbission_week, sample_sumbission_month = dataload(path)

In [4]:
# # 전체 데이터 이용시
# df_history = pd.read_csv("/kaggle/input/lgground/history_data.csv")

# ## 날짜 전처리
# df_history = df_history.assign(log_dt = pd.to_datetime(df_history.log_time//100, format="%Y%m%d%H%M"))
# df_history = df_history.assign(log_date = df_history.log_dt.dt.floor("D"))
# df_history = df_history.drop("log_time", axis=1)

# df_train_week = df_history.copy()

# # month 이용
# # df_train_week = df_train_month.copy()

# Feature engineering

## History
+Add [album_viewcount] : History 내 Album id 에 대한 Frequency Encoding   
Short trailer - one hot   
Continuous play - one hot   

In [119]:
df_train_week.drop_duplicates()
# 945518 > 812471
df_train_week.info()

# History의 경우, 일단은 바로바로 모델성능을 실험할 week 데이터셋에 진행 
# categorical은 한번에 처리 가능하게 함수화 (full에도 바로 반영 가능하도록)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 945518 entries, 899160 to 127159
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   profile_id                   945518 non-null  int64         
 1   ss_id                        945518 non-null  int64         
 2   act_target_dtl               945518 non-null  object        
 3   album_id                     945518 non-null  int64         
 4   payment                      68144 non-null   float64       
 5   continuous_play              945518 non-null  category      
 6   short_trailer                945518 non-null  category      
 7   log_dt                       945518 non-null  datetime64[ns]
 8   log_date                     945518 non-null  datetime64[ns]
 9   album_viewcount_freq_encode  945518 non-null  float64       
 10  album_viewcount_freq         945518 non-null  float64       
dtypes: category(2), datet

In [28]:
####### History 

def history_feature_engineering(df):
    
    ####### Short trailer & Continuous play - categorical
    cat_features = ['short_trailer','continuous_play']
    for i in enumerate (cat_features) :
        col = i[1]
        df[col] = df[col].astype('category')
        
    ####### album_viewcount - Frequency    
    album_viewcount_df = df.groupby("album_id").size()/len(df)
    df.loc[:, "album_viewcount_freq"] = df["album_id"].map(album_viewcount_df)
    return df

In [29]:
history_feature_engineering(df_train_week)

Unnamed: 0,profile_id,ss_id,act_target_dtl,album_id,payment,continuous_play,short_trailer,log_dt,log_date,album_viewcount_freq_encode,album_viewcount_freq
899160,25844,20220301000456,MKID003,18024,,N,N,2022-03-01 00:04:00,2022-03-01,0.000010,0.000010
899162,25844,20220301000456,MKID003,1881,,N,N,2022-03-01 00:05:00,2022-03-01,0.001322,0.001322
899161,25844,20220301000456,MKID003,1881,,N,N,2022-03-01 00:05:00,2022-03-01,0.001322,0.001322
899163,25844,20220301000456,MKID003,4608,,N,N,2022-03-01 00:06:00,2022-03-01,0.000635,0.000635
899164,25844,20220301000456,MKID003,4608,,N,N,2022-03-01 00:06:00,2022-03-01,0.000635,0.000635
...,...,...,...,...,...,...,...,...,...,...,...
58990,1847,20220423220455,MKID003,472,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000315,0.000315
724845,19290,20220423235128,MKID003,2285,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000579,0.000579
724844,19290,20220423235128,MKID003,2285,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000579,0.000579
677911,17755,20220423235282,MKID003,315,,N,N,2022-04-23 23:59:00,2022-04-23,0.000151,0.000151


In [None]:
## float64 는 최대한 피해야 하긴 하는데.... memory reduce 단계에서 다시 고려
# 아니다 memory reduce도 데이터셋마다 고려해야 할지도. (history제외한 나머지는 함께 고려 가능(모아 붙이는 형태니까)

## Watch
Continuous play - categorical

In [15]:
watch = pd.read_csv("/kaggle/input/lgground/watch_e_data.csv")
watch.info()
# We only use [total_time] and [continuous_play] columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892794 entries, 0 to 892793
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   profile_id       892794 non-null  int64 
 1   ss_id            892794 non-null  int64 
 2   log_time         892794 non-null  int64 
 3   act_target_dtl   892794 non-null  object
 4   album_id         892794 non-null  int64 
 5   watch_time       892794 non-null  int64 
 6   total_time       892794 non-null  int64 
 7   continuous_play  892794 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 54.5+ MB


In [32]:
def watch_feature_engineering(watch):
    watch['continuous_play'] = watch['continuous_play'].astype('category')
    watch_feature = watch[['profile_id','album_id','watch_time','total_time']]
    return watch_feature

In [33]:
watch_feature_engineering(watch)

In [None]:
# watch에서 사용할 column만 따로 저장 (그런데 이어붙이러면 key가 필요한데.)
# [total_time] 은 무조건 album id 기준으로 merge 가능
# [continuous_play] 는 album+column id 기준으로 붙여볼 수 있을지도. (결측 예상)

## buy & Search

buy :::  +Add [Paid] column : 아이템별 유료항목 여부 - one hot [유료:1 / 무료 :0]    
Search ::: +Add [Searched] column : 아이템별 검색시청 여부 - One hot [검색됨:1 / 안됨:0]

In [36]:
buy = pd.read_csv("/kaggle/input/lgground/buy_data.csv")
search = pd.read_csv("/kaggle/input/lgground/search_data.csv")
# We only use [viewing log] from these datasets

In [117]:
def paid_feature(df,buy):
    buy_album = buy['profile_id'].unique().tolist()   # nunique 17개 뿐

    history_pay = df[['album_id','payment']].copy()  # history의 pay nunique 7168
    history_buy_album = history_pay.dropna().drop_duplicates().album_id.unique().tolist()

    paid_album = pd.DataFrame(list(set(buy_album + history_buy_album))).rename(columns={0:'album_id'})  # nunique 7181

    paid_df = pd.concat([paid_album.assign(label=True),df.assign(label=False)]).rename(columns={'label':'Paid'})
    paid_df['Paid'] = paid_df['Paid'].astype('category')
    return paid_df

In [113]:
##### searched 
# train data를 넣으면 알아서 concat하고 라벨링도 되게끔. 여기에 카테고리화까지. 
# info로 정상동작 확인 완료
# 1977개 

def searched_feature(df,search):
    search_album = pd.DataFrame(search['album_id'].unique().tolist()).rename(columns={0:'album_id'})
    searched_df = pd.concat([search_album.assign(label=True),df.assign(label=False)]).rename(columns={'label':'Searched'})
    searched_df['Searched'] = searched_df['Searched'].astype('category')
    return searched_df

In [118]:
paid_feature(df_train_week,buy)

Unnamed: 0,album_id,Paid,profile_id,ss_id,act_target_dtl,payment,continuous_play,short_trailer,log_dt,log_date,album_viewcount_freq_encode,album_viewcount_freq
0,6,True,,,,,,,NaT,NaT,,
1,33,True,,,,,,,NaT,NaT,,
2,59,True,,,,,,,NaT,NaT,,
3,61,True,,,,,,,NaT,NaT,,
4,74,True,,,,,,,NaT,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...
58990,472,False,1847.0,2.022042e+13,MKID003,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000315,0.000315
724845,2285,False,19290.0,2.022042e+13,MKID003,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000579,0.000579
724844,2285,False,19290.0,2.022042e+13,MKID003,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000579,0.000579
677911,315,False,17755.0,2.022042e+13,MKID003,,N,N,2022-04-23 23:59:00,2022-04-23,0.000151,0.000151


In [114]:
searched_feature(df_train_week,search)

Unnamed: 0,album_id,Searched,profile_id,ss_id,act_target_dtl,payment,continuous_play,short_trailer,log_dt,log_date,album_viewcount_freq_encode,album_viewcount_freq
0,2141,True,,,,,,,NaT,NaT,,
1,512,True,,,,,,,NaT,NaT,,
2,2142,True,,,,,,,NaT,NaT,,
3,2143,True,,,,,,,NaT,NaT,,
4,2150,True,,,,,,,NaT,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...
58990,472,False,1847.0,2.022042e+13,MKID003,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000315,0.000315
724845,2285,False,19290.0,2.022042e+13,MKID003,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000579,0.000579
724844,2285,False,19290.0,2.022042e+13,MKID003,,Y,N,2022-04-23 23:59:00,2022-04-23,0.000579,0.000579
677911,315,False,17755.0,2.022042e+13,MKID003,,N,N,2022-04-23 23:59:00,2022-04-23,0.000151,0.000151


## meta

Cast - one hot   
Genre - (dummies) / <고민>   
Country - one (결측 - unknown)   
Run time - numerical   
Tag - 고민    

meta_plus 사용 고민 

In [125]:
meta = pd.read_csv("/kaggle/input/lgground/meta_data.csv")
meta.info()

# we use  / sub_title / genre_large / genre_mid / run_time / country

############################### meta의 run_time과 watch의 total_time의 차이
# 결측치가 많은 고려 대상
# genre_small / cast 들

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42602 entries, 0 to 42601
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   album_id     42602 non-null  int64  
 1   title        42602 non-null  object 
 2   sub_title    42602 non-null  object 
 3   genre_large  42602 non-null  object 
 4   genre_mid    42602 non-null  object 
 5   genre_small  13419 non-null  object 
 6   country      33734 non-null  object 
 7   run_time     42602 non-null  int64  
 8   onair_date   5344 non-null   float64
 9   cast_1       27603 non-null  object 
 10  cast_2       22048 non-null  object 
 11  cast_3       16463 non-null  object 
 12  cast_4       12485 non-null  object 
 13  cast_5       6382 non-null   object 
 14  cast_6       2609 non-null   object 
 15  cast_7       762 non-null    object 
dtypes: float64(1), int64(2), object(13)
memory usage: 5.2+ MB


In [156]:
########## sub title은 카테고리로 넣을 만 하다!
# meta.title.nunique()  #36185
# meta.sub_title.nunique()  #2373

2373

In [163]:
# meta['sub_title']  # 같은 앨범 id에 다른 sub_title 

########### 고민 더 해봐야 함 

0          꼬마버스 타요1
1          꼬마버스 타요1
2          꼬마버스 타요1
3          꼬마버스 타요1
4          꼬마버스 타요1
            ...    
42597    로티프렌즈 미술놀이
42598    로티프렌즈 미술놀이
42599          4-5세
42600          아이맘콕
42601    베이비 타요 동요2
Name: sub_title, Length: 42602, dtype: category
Categories (2373, object): ['100분! 뽀요 인기 메들리', '100분! 뽀요 인기 메들리2', '10월 세계 여러나라', '11월 지구와 우주', ..., '히어로 써클', '힙합동요 쪼이송 공룡나라1', '힙합동요 쪼이송 동물퀴즈송 배우기1', '힙합동요 쪼이송 동물퀴즈송1']

In [158]:
# meta.country.unique()   ## 그래 이정도는 categorical로 넣을 만 하다!
### 한국 / 미국 /  컨텐츠가 너무 개수 적은 것은 합쳐서 카테고리화

array(['한국', '저지', nan, '미국', '영국', '일본', '중국', '프랑스', '오스트리아', '독일',
       '이탈리아', '크로아티아', '스위스', '캐나다', '아르헨티나', '우크라이나', '핀란드', '이스라엘',
       '벨기에', '벨라루스', '네덜란드'], dtype=object)

In [160]:
def meta_feature_engineering(meta):
    
    ####### sub_title & genre_large & genre_mid - categorical
    cat_features = ['sub_title','genre_large','genre_mid','country']
    for i in enumerate (cat_features) :
        col = i[1]
        meta[col] = meta[col].astype('category')
    return meta

In [161]:
meta_feature_engineering(meta)

Unnamed: 0,album_id,title,sub_title,genre_large,genre_mid,genre_small,country,run_time,onair_date,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7
0,749,어둠이 무서워요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
1,750,우리는 친구,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
2,2131,타요의 첫 운행,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
3,2625,길 잃은 타요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
4,2594,새내기 꼬마 버스의 하루,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42597,39873,로티프렌즈와 색칠놀이! - 그리피 ＆ 사탕 바구니,로티프렌즈 미술놀이,키즈,놀이교실,,한국,477,,,,,,,,
42598,39874,로티프렌즈와 색칠놀이! - 베블리 ＆ 꽃,로티프렌즈 미술놀이,키즈,놀이교실,,한국,466,,,,,,,,
42599,4779,손가락을 빨게 돼요,4-5세,키즈,책,,한국,293,,,,,,,,
42600,4779,손가락을 빨게 돼요,아이맘콕,키즈,책,,한국,293,,,,,,,,


## Profile 


In [129]:
profile = pd.read_csv("/kaggle/input/lgground/profile_data.csv")
# we use  / sex / age / pr_interest_keyword_cd_1 / ch_interest_keyword_cd_1  as categorical feature
# age binning 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8311 entries, 0 to 8310
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   profile_id                8311 non-null   int64 
 1   sex                       8311 non-null   object
 2   age                       8311 non-null   int64 
 3   pr_interest_keyword_cd_1  8311 non-null   object
 4   pr_interest_keyword_cd_2  6778 non-null   object
 5   pr_interest_keyword_cd_3  6231 non-null   object
 6   ch_interest_keyword_cd_1  8311 non-null   object
 7   ch_interest_keyword_cd_2  6618 non-null   object
 8   ch_interest_keyword_cd_3  6029 non-null   object
dtypes: int64(2), object(7)
memory usage: 584.5+ KB


In [None]:
###### pr_interest_keyword_cd 경우에는 실제플랫폼 특성을 확인 요망 ... 좀 더 고민

In [150]:
def profile_feature_engineering(profile):
    #######  sex / age / pr_interest_keyword_cd_1 / ch_interest_keyword_cd_1 - categorical
    cat_features = ['sex','age','pr_interest_keyword_cd_1','ch_interest_keyword_cd_1']
    for i in enumerate(cat_features) :
        col = i[1]
        profile[col] = profile[col].astype('category')
        
    ####### age binning    
    bins = [0,2,5,7,10,13] 
    group_names = ['영아','유아','초등준비','초등저학년','초등고학년'] #한솔교육 제품군 참조
    profile['age_bin'] = pd.cut(profile['age'],bins,labels=group_names)
    profile['age_bin'] = profile['age_bin'].astype('category')    
    
    return profile

In [151]:
profile_feature_engineering(profile)

Unnamed: 0,profile_id,sex,age,pr_interest_keyword_cd_1,pr_interest_keyword_cd_2,pr_interest_keyword_cd_3,ch_interest_keyword_cd_1,ch_interest_keyword_cd_2,ch_interest_keyword_cd_3,age_bin
0,3,F,5,P02,P04,P07,K01,K03,K04,유아
1,5,M,5,P07,P08,P06,K05,K08,K09,유아
2,7,F,9,P05,P03,,K06,K04,,초등저학년
3,12,M,6,P03,P06,P02,K09,K07,K03,초등준비
4,16,F,12,P03,P06,P01,K01,K06,K04,초등고학년
...,...,...,...,...,...,...,...,...,...,...
8306,33022,M,1,P04,,,K04,K08,,영아
8307,33023,M,5,P06,P03,P07,K08,K04,K05,유아
8308,33026,F,8,P01,P03,P08,K05,K09,K06,초등저학년
8309,33027,F,4,P04,P05,P06,K03,K01,K05,유아


In [152]:
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8311 entries, 0 to 8310
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   profile_id                8311 non-null   int64   
 1   sex                       8311 non-null   category
 2   age                       8311 non-null   category
 3   pr_interest_keyword_cd_1  8311 non-null   category
 4   pr_interest_keyword_cd_2  6778 non-null   object  
 5   pr_interest_keyword_cd_3  6231 non-null   object  
 6   ch_interest_keyword_cd_1  8311 non-null   category
 7   ch_interest_keyword_cd_2  6618 non-null   object  
 8   ch_interest_keyword_cd_3  6029 non-null   object  
 9   age_bin                   8311 non-null   category
dtypes: category(5), int64(1), object(4)
memory usage: 367.0+ KB


In [None]:
############ 테스트해보기 (numeric or categorical?)

In [None]:
############ memory reduce + 자동화 

# day_week feature engineering
- to split label, train data

In [None]:
df_train_week = day_week_feature_engineering(df_train_week)

# label & train split

In [None]:
# Label, train data split
df_train, label_df = train_label_split(df_train_week)

In [None]:
n = 25

# prerpocess

In [None]:
personal_train = df_train.drop_duplicates(subset=['profile_id','album_id','ss_id'])
df_train = df_train.drop_duplicates(subset=['profile_id','album_id'])

In [None]:
# 라벨 유저 한정1
# customers = label_df.profileQ_id.unique()
# 전체 유저 한정
customers = df_train.profile_id.unique()

# MP

In [None]:
MP_df = df_train.album_id.value_counts().head(50).reset_index()
MP_df.columns = ['album_id','total_counts']
MP_df['join_col'] = 1

customer_df = df_train_week[['profile_id']].drop_duplicates()
customer_df['join_col'] = 1

In [None]:
MP_cand = customer_df.merge(MP_df, on='join_col').drop_duplicates(subset=['profile_id','album_id'])[['profile_id','album_id','total_counts']]

# general MP
- 마지막 1주, 2주의 MP를 각 유저마다 넣는다.

In [None]:
last_week_list = np.sort(df_train.week.unique())

In [None]:
print('중복제거 후 데이터 수:', len(df_train_week))

In [None]:
# 마지막 6,5주 각각 MP를 10개 뽑음
last_week_ver1 = last_week_list[-1]
last_week_ver2 = last_week_list[-2]

MP_latest_ver1_df = df_train.query(f"week == {last_week_ver1}")

In [None]:
MP_df = MP_latest_ver1_df.groupby('album_id')['profile_id'].count().sort_values(ascending=False)
MP_df = MP_df.reset_index()
MP_df.columns = ['album_id','counts']
MP_candidate_df = MP_df[:10].copy()
MP_candidate_df['join_col'] = 1

In [None]:
# df_train_week 전체 유저 대상으로 후보군을 뽑을 것임
customer_df = df_train_week[df_train_week['profile_id'].isin(customers)][['profile_id']]
customer_df['join_col'] = 1
popular_articles_cand_ver1 = customer_df.copy()
popular_articles_cand_ver1 = popular_articles_cand_ver1.merge(MP_candidate_df, on="join_col")

popular_articles_cand_ver1.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [None]:
MP_latest_ver2_df = df_train.query(f"week == {last_week_ver2}")

In [None]:
MP_df = MP_latest_ver2_df.groupby('album_id')['profile_id'].count().sort_values(ascending=False)
MP_df = MP_df.reset_index()
MP_df.columns = ['album_id','general_counts']
MP_candidate_df = MP_df[:10].copy()
MP_candidate_df['join_col'] = 1

In [None]:
customer_df = df_train_week[df_train_week['profile_id'].isin(customers)][['profile_id']]
customer_df['join_col'] = 1
popular_articles_cand_ver2 = customer_df.copy()
popular_articles_cand_ver2 = popular_articles_cand_ver2.merge(MP_candidate_df, on="join_col")

popular_articles_cand_ver2.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [None]:
popular_articles_cand = pd.concat([popular_articles_cand_ver1, popular_articles_cand_ver2])
popular_articles_cand = popular_articles_cand.groupby(['profile_id','album_id'])['general_counts'].sum().reset_index()

# personal_MP

In [None]:
personal_MP_df = personal_train.groupby(['profile_id','album_id'])[['ss_id']].count().reset_index()
personal_MP_df.columns = ['profile_id','album_id','personal_counts']

In [None]:
# 서로 다른날 5회 이상 시청한 앨범만
personal_MP = personal_MP_df[personal_MP_df['personal_counts'] >= 5]
personal_MP = personal_MP.sort_values(by=['profile_id','personal_counts'],ascending=False)

In [None]:
# 상위 5개만 pick
head_df_list = []
# 전체 유저 대상으로 뽑기
for user_id in tqdm(customer_df.profile_id.unique()):
    personal_MP_user_len = len(personal_MP[personal_MP['profile_id']==user_id].head())
    random_choice_list = personal_MP.album_id.unique()
    if personal_MP_user_len <5:
        # 5개 아이템이 없는 경우 랜덤으로 없는 개수만 만큼 choice
        user_df = personal_MP[personal_MP['profile_id']==user_id]
        df = pd.DataFrame()
        random_choices = np.random.choice(random_choice_list, size=(5-personal_MP_user_len))
        df['profile_id'] = [user_id for _ in range(5-personal_MP_user_len)]
        df['album_id'] = random_choices
        df = pd.concat([user_df, df])
        head_df_list.append(df)
    else:
        head_df_list.append(personal_MP[personal_MP['profile_id']==user_id].head())
        
personal_MP_candidate = pd.concat(head_df_list)

# MP_user_genre

In [None]:
# week, day, album_cnt, rank 컬럼 candidate 붙여야 함
# df_train

In [None]:
meta_df = pd.read_csv(path+'meta_data.csv')
meta_df = meta_df[['album_id','genre_mid','run_time','cast_1','cast_2','cast_3']]

In [None]:
df_train_meta = df_train.merge(meta_df, on='album_id')

In [None]:
user_genre_df = df_train_meta.groupby(['profile_id','genre_mid']).count()['ss_id'].reset_index()
user_genre_df.columns = ['profile_id','genre_mid','genre_cnt']
user_genre_df = user_genre_df.groupby(['profile_id','genre_mid']).sum().reset_index().sort_values(by=['profile_id','genre_cnt'],ascending=False)

# 장르 선호도 피처 만들기
## 100이상 시청한 사람들만 percent
user_total_watch_dict = user_genre_df.groupby('profile_id')['genre_cnt'].sum()\
                        [user_genre_df.groupby('profile_id')['genre_cnt'].sum()>=100].to_dict()
# 전체 시청 피처 만들기
user_genre_df['user_genre_cnt'] = user_genre_df['profile_id'].apply(lambda x: user_total_watch_dict.get(x, None))
user_genre_df['user_genre_percent'] = user_genre_df['genre_cnt']/user_genre_df['user_genre_cnt']
user_genre_df.drop(columns=['user_genre_cnt'],inplace=True)
user_genre_df.dropna(subset=['user_genre_percent'],axis=0,inplace=True)

In [None]:
genre_top_items = {}
genre_count = df_train_meta['genre_mid'].value_counts()
for genre in genre_count.index:
    genre_top_items[genre] = list(df_train_meta[df_train_meta['genre_mid']==genre]['album_id'].value_counts().head(10).index)

In [None]:
df_list = []
for user_id in customer_df.profile_id.unique():
    user_genres = user_genre_df[user_genre_df['profile_id']== user_id].head(2)['genre_mid']
    
    df = pd.DataFrame()
    if len(user_genres) == 0:
        
        df['album_id'] = genre_top_items['노래율동']
        df['album_id'] = genre_top_items['TV만화']
        
    elif len(user_genres) == 1:
        genre_list_1 = genre_top_items[user_genres.values[0]]
        genre_list_2 = genre_top_items['노래율동']
        df['album_id'] = list(dict.fromkeys(np.append(genre_list_1,genre_list_2)))
        
    elif len(user_genres) == 2:
        genre_list_1 = genre_top_items.get(user_genres.values[0],[])
        genre_list_2 = genre_top_items.get(user_genres.values[1],[])
        df['album_id'] = list(dict.fromkeys(np.append(genre_list_1,genre_list_2)))

    df['profile_id'] = user_id
    df_list.append(df)

In [None]:
genre_candidate = pd.concat(df_list, ignore_index=True)
genre_candidate = genre_candidate[['profile_id','album_id']]

# ALS MF candidate

In [None]:
from ALS_MF import MF
model = MF(df_train_week,label_df)
als_candidate, item_factors_feature, user_factors_feature = model.mf_train()

# apriori candidaate

In [None]:
from apriori import apriori_train, apriori_candidate

In [None]:
rules_confidence_item_week = apriori_train(df_train_week, 0.1, 0.8)
apriori_candidate, apriori_feature = apriori_candidate(df_train_week, rules_confidence_item_week)

In [None]:
apri_list = []
for i_list in apriori_candidate.album_id:
    apri_list.append(str(i_list))

In [None]:
pd.Series(apri_list).value_counts()

# candidate merge

In [None]:
popular_articles_cand.album_id.nunique(), personal_MP_candidate.album_id.nunique(), genre_candidate.album_id.nunique()

In [None]:
# MP_cand.drop(columns=['counts'],inplace=True)

In [None]:
candidate_0 = MP_cand
candidate_1 = popular_articles_cand[['profile_id','album_id']]
candidate_2 = personal_MP_candidate[['profile_id','album_id']]
candidate_3 = genre_candidate[['profile_id','album_id']]
candidate_4 = als_candidate.copy()
candidate_5 = apriori_candidate.copy()

cand = pd.concat([candidate_0, candidate_1, candidate_2, candidate_3, candidate_4, candidate_5])
# cand = candidate_0.copy()
cand.drop_duplicates(subset=['profile_id','album_id'],inplace=True)

In [None]:
candidate = pd.merge(cand, popular_articles_cand[['album_id','general_counts']].drop_duplicates(), how='left', on='album_id')
# personal_MP_df 달라짐
candidate = pd.merge(candidate,personal_MP_df, how='left', on=['profile_id','album_id'])
# candidate = pd.merge(candidate, apriori_feature, how="left", on=["profile_id","album_id"])

In [None]:
print('candidate 데이터 수:',len(candidate), 'cand 데이터 수:',len(cand))
print('candidate nunique: ', candidate.album_id.nunique())

# model preprocess

In [None]:
profile_df = pd.read_csv(path+'profile_data.csv')

In [None]:
candidate_add_features = pd.merge(candidate,profile_df, how='left', on='profile_id')
candidate_add_features = pd.merge(candidate_add_features,meta_df.drop_duplicates('album_id'), how='left', on='album_id')

In [None]:
# columns = ['sex','pr_interest_keyword_cd_1','pr_interest_keyword_cd_2','pr_interest_keyword_cd_3','ch_interest_keyword_cd_1','ch_interest_keyword_cd_2','ch_interest_keyword_cd_3',\
# 'genre_mid','cast_1','cast_2','cast_3']
# from sklearn.preprocessing import LabelEncoder
# for col in columns:
#     LE = LabelEncoder()
#     candidate_add_features[col] = LE.fit_transform(candidate_add_features[col])

In [None]:
candidate_add_features = candidate.copy()
# candidate_add_features = candidate_add_features.drop(columns=['personal_counts','pr_interest_keyword_cd_1',\
#                                                               'pr_interest_keyword_cd_2','pr_interest_keyword_cd_3'\
#                                                               ,'ch_interest_keyword_cd_1','ch_interest_keyword_cd_2'\
#                                                               ,'ch_interest_keyword_cd_3',\
#                                                                 'genre_mid','cast_1','cast_2','cast_3'])

In [None]:
candidate_add_features = pd.merge(candidate_add_features, item_factors_feature, how="left", on="album_id")
candidate_add_features = pd.merge(candidate_add_features, user_factors_feature, how="left", on="profile_id")

In [None]:
train_df = pd.merge(candidate_add_features, label_df, how='left', on=['profile_id','album_id'])
train_df['rating'] = train_df['rating'].fillna(0)
# train_df = train_df.drop(columns="personal_counts")

# model

In [None]:
train_df.rating.value_counts()

In [None]:
lgbmrank = LGBMRank(train_df, mode='week', model_params={'n_estimators':5})
X_train, sample_sumbission = lgbmrank.valid_evaluation()

In [None]:
lgbmrank = LGBMRank(train_df, mode='week', model_params={'n_estimators':5})
X_train, sample_sumbission = lgbmrank.valid_evaluation()

In [None]:
# cold user MP로 채우기 
MP_list = MP_cand.album_id.unique()

sample_sumbission_cold = sample_sumbission.copy()
sample_sumbission_cold['album_id'] = sample_sumbission_cold['album_id']\
                                        .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:n])

In [None]:
ndcg_calculator(sample_sumbission_cold, test_answer_week, n)

In [None]:
set_list = {}
for list_i in sample_sumbission.album_id:
    set_list[str(list_i)] = 0
len(set_list)

# Evaluation

### experiment
- num_leaves= 20, learning_rate=0.005, n_estimators:5
- 어느 cand 중요한 지 판단
- popular_articles_cand :(10,10) , personal_MP_candidate:(5), genre_candidate: max(10,10)

### week
- week ndcg score : 0.05611031122402796
- popular_articles_cand week: ndcg 0.028022766659891125 (total unique item: 12)
- personal_MP_candidate week: ndcg 0.02896244104871668  (total unique item: 4303)
- genre_candidate week:       ndcg 0.01579135161487282  (total unique item: 157)
- (popular_articles_cand, personal_MP_candidate) week: ndcg 0.05206722084683086  (total unique item: 4303)
- (popular_articles_cand, genre_candidate) week: 0.03809584406788012 (total unique item: 161)
- (personal_MP_candidate, genre_candidate) week: 0.041085241338965385 (total unique item: 4338)

### month
- month ndcg score : 0.08036130090004782
- popular_articles_cand month: ndcg 0.05945057771810242 (total unique item: 15)
- personal_MP_candidate month: ndcg 0.028477345633295483  (total unique item: 2826)
- genre_candidate       month: ndcg 0.017431233023063257  (total unique item: 150)
- (popular_articles_cand, personal_MP_candidate) month: ndcg 0.05206722084683086  (total unique item: 2826)
- (popular_articles_cand, genre_candidate) month: 0.0680887232688415 (total unique item: 155)
- (personal_MP_candidate, genre_candidate) month: 0.041085241338965385 (total unique item: 2872)

In [None]:
def evaluation(
            X_train:pd.DataFrame(), 
            sumbission:pd.DataFrame(), 
            n:int, 
            MP_cand:pd.DataFrame()
            )->pd.DataFrame():
    
    MP_list = MP_cand.album_id.unique()
    
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by='pred', ascending=False).groupby('profile_id').head(n)
    lgbm_user_items_dict = lgbm_sub_df.groupby('profile_id')['album_id'].unique().to_dict()
    sumbission['predicted_list'] = sumbission['profile_id']\
                                            .apply(lambda x: lgbm_user_items_dict.get(x, []))
    
    # cold start user file MP_list top25
    sumbission_cold = sumbission.copy()
    sumbission_cold['predicted_list'] = sumbission_cold['predicted_list']\
                                            .apply(lambda x: list(dict.fromkeys(np.append(x, MP_list)))[:n])
    
    return sumbission, sumbission_cold

In [None]:
submission = pd.read_csv(path + 'sample_submission.csv')
sumbission_pred, sumbission_cold = evaluation(X_train, submission, n, MP_cand)

In [None]:
set_list = {}
for list_i in sumbission_cold.predicted_list:
    set_list[str(list_i)] = 0

In [None]:
print('같은 추천을 받은 유저 수:', 8311 - len(set_list))

In [None]:
sumbission_cold

In [None]:
# 제출 조건 충족 확인
assert submission.profile_id.nunique() == sumbission_cold.profile_id.nunique()
for pred_list in sumbission_cold.predicted_list:
    assert len(pred_list) == 25

In [None]:
sumbission_cold

In [None]:
# sumbission_cold.to_csv('lgbm_candidate_submission_ver5_alsvector.csv', index=False)