In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from sklearn.feature_selection import RFE

import gc
import random
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

import platform
your_os = platform.system()
if your_os == 'Linux':
    rc('font', family='NanumGothic')
elif your_os == 'Windows':
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
rc('axes', unicode_minus=False)

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
D_code = pd.read_csv('data/속성_D_코드.csv')
H_code = pd.read_csv('data/속성_H_코드.csv')
L_code = pd.read_csv('data/속성_L_코드.csv')

In [4]:
train.shape, test.shape

((501951, 35), (46404, 34))

In [5]:
D_code.shape, H_code.shape, L_code.shape

((1114, 6), (294, 2), (2025, 5))

In [6]:
train.iloc[:5, :19]

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263
3,3,False,False,False,True,False,False,2,0,2,5,703,705,704,3,1,1,72,227
4,4,True,True,True,False,False,False,1,3,4,5,275,370,369,4,1,1,214,210


In [7]:
train.iloc[:5, 19:]

Unnamed: 0,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,59,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,96,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,56,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0
3,2,1,3,5,1,1,2,1608,275,5,3,74,827967,572323,2020-01-13 18:09:34,0
4,209,1,1,10,2,1,2,1608,275,1,4,74,831614,573899,2020-03-09 20:39:22,0


## EDA

여러가지 속성이 있지만 D,L,H 속성은 종류가 너무 많아서 따로 빼둔 듯

### D_code

In [8]:
D_code = D_code.iloc[:, :5]

In [9]:
# D_code 대분류 11종류
D_code['속성 D 대분류코드'].unique()

array([   1,  216,  377,  482,  522,  618,  744,  864,  926, 1235, 1258],
      dtype=int64)

In [10]:
D_code.groupby('속성 D 대분류코드')['속성 D 중분류코드'].nunique()

속성 D 대분류코드
1       18
216     19
377     12
482      7
522      8
618     13
744     11
864      6
926     37
1235     5
1258     1
Name: 속성 D 중분류코드, dtype: int64

In [11]:
D_code

Unnamed: 0,속성 D 코드,속성 D 세분류코드,속성 D 소분류코드,속성 D 중분류코드,속성 D 대분류코드
0,4,4,2,3,1
1,5,5,2,3,1
2,7,7,2,6,1
3,8,8,2,6,1
4,9,8,2,6,1
...,...,...,...,...,...
1109,1254,1254,1235,1254,1235
1110,1255,1254,1235,1254,1235
1111,1256,1254,1235,1254,1235
1112,1257,1254,1235,1254,1235


### L_code

In [12]:
# L_code 대분류 22종류
L_code['속성 L 대분류코드'].unique()

array([2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025],
      dtype=int64)

In [13]:
L_code.groupby('속성 L 대분류코드')['속성 L 중분류코드'].nunique()

속성 L 대분류코드
2004     4
2005     5
2006    26
2007     2
2008     5
2009     3
2010     4
2011     5
2012     3
2013     7
2014     4
2015     2
2016     5
2017     4
2018     2
2019     2
2020     3
2021     3
2022     4
2023     3
2024     2
2025     1
Name: 속성 L 중분류코드, dtype: int64

In [14]:
L_code.head()

Unnamed: 0,속성 L 코드,속성 L 세분류코드,속성 L 소분류코드,속성 L 중분류코드,속성 L 대분류코드
0,1,1,1,1,2004
1,2,2,2,1,2004
2,3,3,2,1,2004
3,4,3,2,1,2004
4,5,5,2,1,2004


### H_code

In [15]:
H_code.head()

Unnamed: 0,속성 H 코드,속성 H 상위코드
0,2,1
1,4,3
2,5,3
3,6,3
4,7,3


In [16]:
H_code['속성 H 상위코드'].unique()

array([  1,   3,  30,  48,  58,  71,  78,  85,  92,  94, 149, 169, 188,
       208, 226, 250, 277, 302, 308, 312, 314], dtype=int64)

### train

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501951 entries, 0 to 501950
Data columns (total 35 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   id                      501951 non-null  int64 
 1   d_l_match_yn            501951 non-null  bool  
 2   d_m_match_yn            501951 non-null  bool  
 3   d_s_match_yn            501951 non-null  bool  
 4   h_l_match_yn            501951 non-null  bool  
 5   h_m_match_yn            501951 non-null  bool  
 6   h_s_match_yn            501951 non-null  bool  
 7   person_attribute_a      501951 non-null  int64 
 8   person_attribute_a_1    501951 non-null  int64 
 9   person_attribute_b      501951 non-null  int64 
 10  person_prefer_c         501951 non-null  int64 
 11  person_prefer_d_1       501951 non-null  int64 
 12  person_prefer_d_2       501951 non-null  int64 
 13  person_prefer_d_3       501951 non-null  int64 
 14  person_prefer_e         501951 non-n

In [18]:
# 비율 반반 -> 너무 깔끔해~~~
train['target'].value_counts()

0    251106
1    250845
Name: target, dtype: int64

In [19]:
# person_rn, contents_rn 중복가능
train['person_rn'].nunique(), train['contents_rn'].nunique()

(300177, 283359)

## feature preprocessing & engineering

아이디어
1. 컨텐츠 열람 일시 관련 시계열 피처 생성
2. 회원속성(선호속성) + 컨텐츠속성의 조합 -> 조합을 피쳐로 생성 후 군집분석 or 임베딩
3. 조윤호 교수님 코드 최대한으로 활용해보자

### 데이터 - 속성코드 매칭

In [8]:
def add_code(df, d_code, h_code, l_code):
    df = df.copy()   

    # D Code
    df['person_prefer_d_1_n'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_1_s'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_1_m'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_1_l'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['person_prefer_d_2_n'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_2_s'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_2_m'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_2_l'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['person_prefer_d_3_n'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_3_s'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_3_m'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_3_l'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['contents_attribute_d_n'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['contents_attribute_d_s'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['contents_attribute_d_m'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['contents_attribute_d_l'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    # H Code
    df['person_prefer_h_1_u'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 상위코드'])
    df['person_prefer_h_2_u'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 상위코드'])
    df['person_prefer_h_3_u'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 상위코드'])
    df['contents_attribute_h_u'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 상위코드'])

    # L Code
    df['contents_attribute_l_n'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 세분류코드'])
    df['contents_attribute_l_s'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 소분류코드'])
    df['contents_attribute_l_m'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 중분류코드'])
    df['contents_attribute_l_l'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 대분류코드'])
    return df

In [9]:
d_code = pd.read_csv('data/속성_D_코드.csv', index_col=0).iloc[:, :4].T.to_dict()
h_code = pd.read_csv('data/속성_H_코드.csv', index_col=0).T.to_dict()
l_code = pd.read_csv('data/속성_L_코드.csv', index_col=0).T.to_dict()

In [10]:
train = add_code(train, d_code, h_code, l_code)
test = add_code(test, d_code, h_code, l_code)

In [11]:
train.shape, test.shape

((501951, 59), (46404, 58))

### contents_open_dt 관련

In [12]:
train['contents_open_dt'] = pd.to_datetime(train['contents_open_dt'])
test['contents_open_dt'] = pd.to_datetime(test['contents_open_dt'])

In [13]:
train['contents_open_dt'].min(), train['contents_open_dt'].max()

(Timestamp('2020-01-01 00:01:03'), Timestamp('2020-11-30 23:59:56'))

In [14]:
test['contents_open_dt'].min(), test['contents_open_dt'].max()

(Timestamp('2020-12-01 00:00:07'), Timestamp('2020-12-31 23:59:08'))

In [15]:
#train['contents_open_year'] = train['contents_open_dt'].dt.year
train['contents_open_month'] = train['contents_open_dt'].dt.month
train['contents_open_day'] = train['contents_open_dt'].dt.day
train['contents_open_dow'] = train['contents_open_dt'].dt.dayofweek
train['contents_open_hour'] = train['contents_open_dt'].dt.hour

#test['contents_open_year'] = test['contents_open_dt'].dt.year
test['contents_open_month'] = test['contents_open_dt'].dt.month
test['contents_open_day'] = test['contents_open_dt'].dt.day
test['contents_open_dow'] = test['contents_open_dt'].dt.dayofweek
test['contents_open_hour'] = test['contents_open_dt'].dt.hour

In [16]:
train['contents_open_season'] = train['contents_open_month'].apply(lambda x : 'spring' if x in [3, 4, 5]
                                                                  else 'summer' if x in [6, 7, 8]
                                                                  else 'fall' if x in [9, 10, 11]
                                                                  else 'winter')

test['contents_open_season'] = test['contents_open_month'].apply(lambda x : 'spring' if x in [3, 4, 5]
                                                                  else 'summer' if x in [6, 7, 8]
                                                                  else 'fall' if x in [9, 10, 11]
                                                                  else 'winter')

In [17]:
# 보통 회사 식사시간 기준
train['contents_open_ts'] = train['contents_open_hour'].apply(lambda x : '아침' if (x >= 7) and (x <= 11)
                                                                  else '점심' if (x > 11) and (x <= 13)
                                                                  else '오후' if (x > 13) and (x <= 19)
                                                                  else '휴식')

test['contents_open_ts'] = test['contents_open_hour'].apply(lambda x : '아침' if (x >= 7) and (x <= 11)
                                                                  else '점심' if (x > 11) and (x <= 13)
                                                                  else '오후' if (x > 13) and (x <= 19)
                                                                  else '휴식')

In [18]:
train['contents_open_mgroup'] = pd.cut(train['contents_open_day'], bins = [1, 11, 21, 32], 
                              right = False, labels = ['Cho', 'Jung', 'Mal'])

test['contents_open_mgroup'] = pd.cut(test['contents_open_day'], bins = [1, 11, 21, 32], 
                              right = False, labels = ['Cho', 'Jung', 'Mal'])

In [19]:
train['weekend'] = train['contents_open_dow'].apply(lambda x : 1 if x > 4 else 0)
test['weekend'] = test['contents_open_dow'].apply(lambda x : 1 if x > 4 else 0)

In [20]:
# 10:30 ~ 20:00
train['contents_open_open'] = train['contents_open_hour'].apply(lambda x : 1 if (x >= 10) and (x <= 20) else 0)
test['contents_open_open'] = test['contents_open_hour'].apply(lambda x : 1 if (x >= 10) and (x <= 20) else 0)

### 회원속성

In [21]:
# 고객 취향 - 컨텐츠 속성 True 개수
true_sum = []
for i in range (0, train.shape[0], 1):
    true_all = train.iloc[i:i+1, 1:7].sum().sum()
    true_sum.append(true_all)

true_sum_te = []
for i in range (0, test.shape[0], 1):
    true_all = test.iloc[i:i+1, 1:7].sum().sum()
    true_sum_te.append(true_all)

true_df = pd.DataFrame(true_sum)
true_df.columns = ['true값']

true_df_te = pd.DataFrame(true_sum_te)
true_df_te.columns = ['true값']

train = pd.concat([train, true_df], axis=1)
test = pd.concat([test, true_df_te], axis=1)

KeyboardInterrupt: 

In [None]:
train2 = train.sort_values(by='contents_open_dt').reset_index(drop=True)
test2 = test.sort_values(by='contents_open_dt').reset_index(drop=True)

In [None]:
train2['누적_컨텐츠접촉'] = train2.groupby('person_rn')['person_rn'].cumcount() + 1
test2['누적_컨텐츠접촉'] = test2.groupby('person_rn')['person_rn'].cumcount() + 1

In [None]:
train['누적_컨텐츠접속'] = train2.sort_values(by='id').reset_index()['누적_컨텐츠접촉']
test['누적_컨텐츠접속'] = test2.sort_values(by='id').reset_index()['누적_컨텐츠접촉']

### 회원속성, 컨텐츠속성 조합 fasttext embedding

In [None]:
train['회원특성'] = train['person_attribute_a'].astype(str) + '_' + train['person_attribute_a_1'].astype(str) + '_' + train['person_attribute_b'].astype(str) + \
'_' + train['person_prefer_c'].astype(str) + '_' + train['person_prefer_d_1'].astype(str) + '_' + train['person_prefer_d_2'].astype(str) + '_' + train['person_prefer_d_3'].astype(str) + \
'_' + train['person_prefer_e'].astype(str) + '_' + train['person_prefer_f'].astype(str) + '_' + train['person_prefer_g'].astype(str) + '_' + train['person_prefer_h_1'].astype(str) + \
'_' + train['person_prefer_h_2'].astype(str) + '_' + train['person_prefer_h_3'].astype(str)

train['컨텐츠특성'] = train['contents_attribute_i'].astype(str) + '_' + train['contents_attribute_a'].astype(str) + '_' + train['contents_attribute_j_1'].astype(str) + \
'_' + train['contents_attribute_j'].astype(str) + '_' + train['contents_attribute_c'].astype(str) + '_' + train['contents_attribute_k'].astype(str) + '_' + train['contents_attribute_l'].astype(str) + \
'_' + train['contents_attribute_d'].astype(str) + '_' + train['contents_attribute_m'].astype(str)+ '_' + train['contents_attribute_e'].astype(str) + '_' + train['contents_attribute_h'].astype(str)

In [None]:
test['회원특성'] = test['person_attribute_a'].astype(str) + '_' + test['person_attribute_a_1'].astype(str) + '_' + test['person_attribute_b'].astype(str) + \
'_' + test['person_prefer_c'].astype(str) + '_' + test['person_prefer_d_1'].astype(str) + '_' + test['person_prefer_d_2'].astype(str) + '_' + test['person_prefer_d_3'].astype(str) + \
'_' + test['person_prefer_e'].astype(str) + '_' + test['person_prefer_f'].astype(str) + '_' + test['person_prefer_g'].astype(str) + '_' + test['person_prefer_h_1'].astype(str) + \
'_' + test['person_prefer_h_2'].astype(str) + '_' + test['person_prefer_h_3'].astype(str)

test['컨텐츠특성'] = test['contents_attribute_i'].astype(str) + '_' + test['contents_attribute_a'].astype(str) + '_' + test['contents_attribute_j_1'].astype(str) + \
'_' + test['contents_attribute_j'].astype(str) + '_' + test['contents_attribute_c'].astype(str) + '_' + test['contents_attribute_k'].astype(str) + '_' + test['contents_attribute_l'].astype(str) + \
'_' + test['contents_attribute_d'].astype(str) + '_' + test['contents_attribute_m'].astype(str)+ '_' + test['contents_attribute_e'].astype(str) + '_' + test['contents_attribute_h'].astype(str)

In [33]:
train['회원_컨텐츠_조합'] = train['회원특성'] + '+' + train['컨텐츠특성']
test['회원_컨텐츠_조합'] = test['회원특성'] + '+' + test['컨텐츠특성']

In [34]:
import random

train_data = list(train['회원_컨텐츠_조합'].unique())
test_data = list(test['회원_컨텐츠_조합'].unique())

In [39]:
train_data[0]

'1_4_3_5_275_370_369_8_1_1_4_95_59+3_3_10_2_1_2_1608_275_1_4_139'

In [35]:
len(train_data), len(test_data)

(500937, 46389)

In [38]:
from gensim.models.fasttext import FastText

In [40]:
model = FastText(sentences=train_data, size=30, window=5, min_count=1, sg=1)

In [41]:
model.train(sentences=train_data, epochs=5, total_examples=model.corpus_count, 
            total_words=model.corpus_total_words)

In [42]:
train_mean_vector = []
for words in tqdm(train_data):
    tmp = np.zeros(30)
    cnt = 0
    for word in words:
        try:
            tmp += model.wv[word]
            cnt += 1
        except:
            pass
    tmp /= cnt
    train_mean_vector.append(tmp)
train_mean_vector = np.array(train_mean_vector)

100%|████████████████████████████████████████████████████████████████████████| 500937/500937 [02:00<00:00, 4149.50it/s]


In [43]:
test_mean_vector = []
for words in tqdm(test_data):
    tmp = np.zeros(30)
    cnt = 0
    for word in words:
        try:
            tmp += model.wv[word]
            cnt += 1
        except:
            pass
    tmp /= cnt
    test_mean_vector.append(tmp)
test_mean_vector = np.array(test_mean_vector)

100%|██████████████████████████████████████████████████████████████████████████| 46389/46389 [00:11<00:00, 4192.63it/s]


In [44]:
train_mean_vector = pd.DataFrame(train_mean_vector)
train_mean_vector.columns = 'combination + ' + train_mean_vector.columns.astype(str)

test_mean_vector = pd.DataFrame(test_mean_vector)
test_mean_vector.columns = 'combination + ' + test_mean_vector.columns.astype(str)

In [47]:
train_mean_vector['회원_컨텐츠_조합'] = train_data
test_mean_vector['회원_컨텐츠_조합'] = test_data

In [50]:
train = train.merge(train_mean_vector, on='회원_컨텐츠_조합', how='left')
test = test.merge(test_mean_vector, on='회원_컨텐츠_조합', how='left')

### 회원속성, 컨텐츠속성 조합 kmeans

In [89]:
train.columns

Index(['id', 'd_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_rn', 'contents_rn', 'contents_open_dt',
       'target', 'person_prefer_d_1_n', 'person_prefer_d_1_s',
       'person_prefer_d_1_m', 'person_prefer_d_1_l', 'person_prefer_d_2_n',
       'person_prefer_d_2_s', 'person_prefer_d_2_m', 'person_prefer_d_2_l',
       'person_prefer_

In [90]:
per_features = ["d_l_match_yn", "d_m_match_yn", "d_s_match_yn", "h_l_match_yn", "h_m_match_yn",
               "h_s_match_yn", "person_attribute_a", "person_attribute_a_1", "person_attribute_b", "person_prefer_c",
               "person_prefer_e", "contents_open_hour", "person_prefer_d_1", "person_prefer_d_2", "person_prefer_d_3",
               "person_prefer_h_1", "person_prefer_h_2", "person_prefer_h_3", 'true값', 'contents_open_dow']

con_features = ["contents_attribute_i", "contents_attribute_a", "contents_attribute_j_1", "contents_attribute_j",
               "contents_attribute_c", "contents_attribute_k", "contents_attribute_m", "contents_attribute_e",
               "contents_open_hour", "contents_attribute_l", "contents_attribute_d", "contents_attribute_h"]

In [91]:
len(per_features), len(con_features)

(20, 12)

In [92]:
from sklearn.cluster import KMeans

class KMeansFeaturizer:
    """ 숫자 데이터를 k-평균 클러스터 멤버십으로 변환.

    이 변환기는 입력 데이터에 k-평균을 수행해 각 데이터 포인트를 가장 가까운 클러스터의 id로 변환한다.
    만약 목표 변수가 주어지면 유사한 데이터 포인트와 함께 grouping되고,
    분류 경계에 따르는 클러스터를 생성하기 위해 스케일링되고, k-평균 입력에 포함된다.
    """

    def __init__(self, k = 100, target_scale = 5.0, random_state = None):
        self.k = k
        self.target_scale = target_scale
        self.random_state = random_state

    def fit(self, X, y = None):
        """ 입력 데이터에 k-평균을 수행하고 중심점을 찾는다.
        """
        if y is None: # 목표 변수가 없으면 단순한 k-평균 수행
            km_model = KMeans(n_clusters = self.k, n_init = 20, random_state = self.random_state)
            km_model.fit(X)
            
            self.inertia_ = km_model.inertia_
            self.km_model = km_model
            self.cluster_centers_ = km_model.cluster_centers_
            return self

        # 목표 변수가 있으면, 적절한 스케일링을 적용하고, 이를 k-평균에 대한 입력 데이터에 포함시킨다.
        data_with_target = np.hstack((X, y[:, np.newaxis] * self.target_scale))
        # 데이터와 타겟에 대해 사전 학습할 k-평균 모델 구축
        km_model_pretrain = KMeans(n_clusters = self.k, n_init = 20, random_state = self.random_state)
        km_model_pretrain.fit(data_with_target)

        # k평균을 두번째로 실행해 목표 변수 없이 원시 공간에서 클러스터를 얻는다. 사전 학습을 통해 얻은 중심점을 활용해 초기화한다.
        # 반복을 통해 클러스터 할당과 중심점 계산을 다시 수행한다.

        km_model = KMeans(n_clusters = self.k, init = km_model_pretrain.cluster_centers_[:,:data_with_target.shape[1]-1], n_init = 1, max_iter = 1)

        km_model.fit(X)
        
        self.inertia_ = km_model.inertia_
        self.km_model = km_model
        self.cluster_centers_ = km_model.cluster_centers_
        return self

    def transform(self, X, y = None):
        """ 각 입력 데이터 포인트에 대해 가장 가까운 클러스터 ID 산출
        """
        clusters = self.km_model.predict(X)
        return clusters[:, np.newaxis]

    def fit_transform(self, X, y = None):
        self.fit(X, y)
        return self.transform(X, y)

In [93]:
per_train = train.loc[:, per_features].copy()
per_test = test.loc[:, per_features].copy()

con_train = train.loc[:, con_features].copy()
con_test = test.loc[:, con_features].copy()

In [94]:
km = KMeansFeaturizer(k=100, random_state=42)

In [95]:
ktr_per = km.fit_transform(per_train, train["target"])
kte_per = km.transform(per_test)

KeyboardInterrupt: 

In [None]:
ktr_con = km.fit_transform(con_train, train["target"])
kte_con = km.transform(con_test)

In [None]:
train['per_km'] = ktr_per.astype(str)
test['per_km'] = kte_per.astype(str)

In [None]:
train['con_km'] = ktr_con.astype(str)
test['con_km'] = kte_con.astype(str)

### 시간 sin/cos encoding

In [25]:
train['sin_time'] = np.sin(2*np.pi*train['contents_open_hour'] / 24)
train['cos_time'] = np.cos(2*np.pi*train['contents_open_hour'] / 24)

test['sin_time'] = np.sin(2*np.pi*train['contents_open_hour'] / 24)
test['cos_time'] = np.cos(2*np.pi*train['contents_open_hour'] / 24)

### 또 뭘해야하지

In [39]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501951 entries, 0 to 501950
Data columns (total 72 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   id                      501951 non-null  int64         
 1   d_l_match_yn            501951 non-null  bool          
 2   d_m_match_yn            501951 non-null  bool          
 3   d_s_match_yn            501951 non-null  bool          
 4   h_l_match_yn            501951 non-null  bool          
 5   h_m_match_yn            501951 non-null  bool          
 6   h_s_match_yn            501951 non-null  bool          
 7   person_attribute_a      501951 non-null  int64         
 8   person_attribute_a_1    501951 non-null  int64         
 9   person_attribute_b      501951 non-null  int64         
 10  person_prefer_c         501951 non-null  int64         
 11  person_prefer_d_1       501951 non-null  int64         
 12  person_prefer_d_2       501951

## dataset setting

In [27]:
train.columns

Index(['id', 'd_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_rn', 'contents_rn', 'contents_open_dt',
       'target', 'person_prefer_d_1_n', 'person_prefer_d_1_s',
       'person_prefer_d_1_m', 'person_prefer_d_1_l', 'person_prefer_d_2_n',
       'person_prefer_d_2_s', 'person_prefer_d_2_m', 'person_prefer_d_2_l',
       'person_prefer_

In [98]:
ftr = train.drop(['id', 'contents_open_dt', 'person_rn', 'contents_rn', 'target', 'person_prefer_f', 'person_prefer_g'], axis=1)
ftr_te = test.drop(['id', 'contents_open_dt', 'person_rn', 'contents_rn', 'person_prefer_f', 'person_prefer_g'], axis=1)

In [99]:
#ftr = train[features]
#ftr_te = test[features]
target = train['target']

In [100]:
ftr.shape, ftr_te.shape

((501951, 65), (46404, 65))

In [101]:
from sklearn.preprocessing import LabelEncoder
lst = ['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn', 'h_m_match_yn', 'h_s_match_yn', 'contents_open_ts', 'contents_open_mgroup', 'contents_open_season']

for i in lst:
    le = LabelEncoder()
    ftr[i] = le.fit_transform(ftr[i])
    ftr_te[i] = le.transform(ftr_te[i])

In [None]:
plt.figure(figsize=(20, 10))
sns.heatmap(ftr.corr(), annot=True, cmap="YlGnBu")
plt.show()

In [102]:
col_name = ftr.columns

In [103]:
cat_features = ['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h',
       'person_prefer_d_1_n', 'person_prefer_d_1_s',
       'person_prefer_d_1_m', 'person_prefer_d_1_l', 'person_prefer_d_2_n',
       'person_prefer_d_2_s', 'person_prefer_d_2_m', 'person_prefer_d_2_l',
       'person_prefer_d_3_n', 'person_prefer_d_3_s', 'person_prefer_d_3_m',
       'person_prefer_d_3_l', 'contents_attribute_d_n',
       'contents_attribute_d_s', 'contents_attribute_d_m',
       'contents_attribute_d_l', 'person_prefer_h_1_u', 'person_prefer_h_2_u',
       'person_prefer_h_3_u', 'contents_attribute_h_u',
       'contents_attribute_l_n', 'contents_attribute_l_s',
       'contents_attribute_l_m', 'contents_attribute_l_l',
       'contents_open_month', 'contents_open_day', 'contents_open_dow',
       'contents_open_hour', 'contents_open_season', 'contents_open_ts',
       'contents_open_mgroup', 'weekend', 'contents_open_open']

In [104]:
num_features = list(set(ftr.columns) - set(cat_features))

### Scaling - Gauss rank

In [105]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted

class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.
    Parameters
        ----------
        epsilon : float, optional, default 1e-4
            A small amount added to the lower bound or subtracted
            from the upper bound. This value prevents infinite number
            from occurring when applying the inverse error function.
        copy : boolean, optional, default True
            If False, try to avoid a copy and do inplace scaling instead.
            This is not guaranteed to always work inplace; e.g. if the data is
            not a NumPy array, a copy may still be returned.
        n_jobs : int or None, optional, default None
            Number of jobs to run in parallel.
            ``None`` means 1 and ``-1`` means using all processors.
        interp_kind : str or int, optional, default 'linear'
           Specifies the kind of interpolation as a string
            ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
            'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
            refer to a spline interpolation of zeroth, first, second or third
            order; 'previous' and 'next' simply return the previous or next value
            of the point) or as an integer specifying the order of the spline
            interpolator to use.
        interp_copy : bool, optional, default False
            If True, the interpolation function makes internal copies of x and y.
            If False, references to `x` and `y` are used.
        Attributes
        ----------
        interp_func_ : list
            The interpolation function for each feature in the training set.
        """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored
        """
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        x = self.drop_duplicates(x)
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        return erfinv(self.interp_func_[i](x))

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

In [106]:
scaler = GaussRankScaler()
ftr[num_features] = pd.DataFrame(scaler.fit_transform(ftr[num_features]))
ftr_te[num_features] = pd.DataFrame(scaler.transform(ftr_te[num_features]))

ftr[num_features].columns = num_features
ftr_te[num_features].columns = num_features

In [107]:
ftr.head()

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,contents_open_hour,contents_open_season,contents_open_ts,contents_open_mgroup,weekend,contents_open_open,true값,누적_컨텐츠접속,sin_time,cos_time
0,1,1,1,0,0,0,1,4,3,5,...,12,3,2,1,0,1,0.000177,-2.751064,0.179345,-2.751064
1,0,0,0,1,1,0,1,3,4,1,...,17,2,1,1,0,1,-0.304408,-2.751064,-0.905952,-0.127121
2,0,0,0,1,0,0,2,0,3,5,...,20,2,3,0,0,1,-0.683882,-2.751064,-0.732672,0.304797
3,0,0,0,1,0,0,2,0,2,5,...,18,3,1,1,0,1,-0.683882,-2.751064,-2.751064,-0.042053
4,1,1,1,0,0,0,1,3,4,5,...,20,1,3,0,0,1,0.000177,-2.751064,-0.732672,0.304797


## modeling

In [108]:
from lightgbm import LGBMClassifier
from xgboost import XGBRFClassifier
from ngboost import NGBRegressor
from catboost import CatBoostClassifier, Pool

In [109]:
kf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

In [113]:
cat_features = ['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h',
       'person_prefer_d_1_n', 'person_prefer_d_1_s',
       'person_prefer_d_1_m', 'person_prefer_d_1_l', 'person_prefer_d_2_n',
       'person_prefer_d_2_s', 'person_prefer_d_2_m', 'person_prefer_d_2_l',
       'person_prefer_d_3_n', 'person_prefer_d_3_s', 'person_prefer_d_3_m',
       'person_prefer_d_3_l', 'contents_attribute_d_n',
       'contents_attribute_d_s', 'contents_attribute_d_m',
       'contents_attribute_d_l', 'person_prefer_h_1_u', 'person_prefer_h_2_u',
       'person_prefer_h_3_u', 'contents_attribute_h_u',
       'contents_attribute_l_n', 'contents_attribute_l_s',
       'contents_attribute_l_m', 'contents_attribute_l_l',
       'contents_open_month', 'contents_open_day', 'contents_open_dow',
       'contents_open_hour', 'contents_open_season', 'contents_open_ts',
       'contents_open_mgroup', 'weekend', 'contents_open_open']

In [114]:
ftr.shape, target.shape, ftr_te.shape

((501951, 65), (501951,), (46404, 65))

### catboost

In [115]:
cb_pred = np.zeros((ftr_te.shape[0]))
f1_list = []
for tr_idx, val_idx in kf.split(ftr, target):
    tr_x, val_x = ftr.iloc[tr_idx], ftr.iloc[val_idx]
    tr_y, val_y = target.iloc[tr_idx], target.iloc[val_idx]
    train_data = Pool(data=tr_x, label=tr_y, cat_features=cat_features)
    val_data = Pool(data=val_x, label=val_y, cat_features=cat_features)
    cb = CatBoostClassifier(iterations=1000, learning_rate=0.01, eval_metric='F1',
                            silent=True, loss_function='Logloss')
    cb.fit(train_data, eval_set=val_data, early_stopping_rounds=1000, use_best_model=True, verbose=2000)
    best = cb.best_iteration_
    pred = cb.predict(val_x, ntree_end=best)
    f1 = f1_score(val_y, pred)
    f1_list.append(f1)
    print(f'FOLD f1 score = {f1}')
    sub_pred = cb.predict(ftr_te, ntree_end=best) / 5
    cb_pred += sub_pred
print(f'\n{cb.__class__.__name__} f1_score = {np.mean(f1_list)}')

0:	learn: 0.5957368	test: 0.5933305	best: 0.5933305 (0)	total: 2.39s	remaining: 39m 52s
999:	learn: 0.6621415	test: 0.6805126	best: 0.6805425 (996)	total: 44m 11s	remaining: 0us

bestTest = 0.6805424627
bestIteration = 996

Shrink model to first 997 iterations.
FOLD f1 score = 0.6804864617653372
0:	learn: 0.6377765	test: 0.6367190	best: 0.6367190 (0)	total: 2.05s	remaining: 34m 4s
999:	learn: 0.6615944	test: 0.6844396	best: 0.6844571 (992)	total: 43m 56s	remaining: 0us

bestTest = 0.6844571196
bestIteration = 992

Shrink model to first 993 iterations.
FOLD f1 score = 0.6844507819516847
0:	learn: 0.6073858	test: 0.6056943	best: 0.6056943 (0)	total: 1.89s	remaining: 31m 27s
999:	learn: 0.6618486	test: 0.6820727	best: 0.6821675 (990)	total: 45m 2s	remaining: 0us

bestTest = 0.6821675097
bestIteration = 990

Shrink model to first 991 iterations.
FOLD f1 score = 0.6820866325104797
0:	learn: 0.6134435	test: 0.6236598	best: 0.6236598 (0)	total: 2.14s	remaining: 35m 41s
999:	learn: 0.6610256	t

### lightgbm

In [88]:
ftr[cat_features] = ftr[cat_features].astype('category')
ftr_te[cat_features] = ftr_te[cat_features].astype('category')

In [89]:
lgb = LGBMClassifier(random_state=0, max_depth=16, n_estimators=20000, learning_rate=0.01)

In [90]:
lgb_pred = np.zeros((ftr_te.shape[0]))
f1_list = []
for tr_idx, val_idx in kf.split(ftr.values, target):
    tr_x, val_x = ftr.values[tr_idx], ftr.values[val_idx]
    tr_y, val_y = target[tr_idx], target[val_idx]
    lgb.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (val_x, val_y)], eval_metric='F1', early_stopping_rounds=2000, 
             verbose=2000)
    pred = lgb.predict(val_x)
    f1 = f1_score(val_y, pred)
    f1_list.append(f1)
    print(f'FOLD f1 = {f1}')
    sub_pred = lgb.predict(ftr_te) / 5
    lgb_pred += sub_pred
print(f'\n{lgb.__class__.__name__} F1_score = {np.mean(f1_list)}')

Training until validation scores don't improve for 2000 rounds
[2000]	training's binary_logloss: 0.635044	valid_1's binary_logloss: 0.644523
[4000]	training's binary_logloss: 0.621798	valid_1's binary_logloss: 0.641082
[6000]	training's binary_logloss: 0.610609	valid_1's binary_logloss: 0.639355
[8000]	training's binary_logloss: 0.6006	valid_1's binary_logloss: 0.638293
[10000]	training's binary_logloss: 0.591113	valid_1's binary_logloss: 0.63744
[12000]	training's binary_logloss: 0.582166	valid_1's binary_logloss: 0.636837
[14000]	training's binary_logloss: 0.573687	valid_1's binary_logloss: 0.636495
[16000]	training's binary_logloss: 0.565466	valid_1's binary_logloss: 0.636082
[18000]	training's binary_logloss: 0.557446	valid_1's binary_logloss: 0.635708
[20000]	training's binary_logloss: 0.549936	valid_1's binary_logloss: 0.635482
Did not meet early stopping. Best iteration is:
[20000]	training's binary_logloss: 0.549936	valid_1's binary_logloss: 0.635482
FOLD f1 = 0.646797051071469

## submission

In [116]:
submission = pd.read_csv('data/sample_submission.csv')

In [117]:
submission['target'] = cb_pred

In [118]:
submission['target'] = submission['target'].apply(lambda x : 1 if x > 0.5 else 0)

In [119]:
submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [120]:
submission['target'].value_counts()
# baseline 23762
# 22706
# 30458
# 24127

1    24127
0    22277
Name: target, dtype: int64

In [121]:
submission.to_csv('submission/jp_1220_noscale_cb.csv', index=False)