In [7]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from sklearn.feature_selection import RFE

import gc
import random
import re
from typing import Dict, List, Optional, Tuple

from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

import platform
# Korean font setup: choose the matplotlib font family based on the host OS.
your_os = platform.system()
if your_os == 'Windows':
    # Read the family name from the Malgun Gothic font file itself.
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os in ('Linux', 'Darwin'):
    rc('font', family={'Linux': 'NanumGothic', 'Darwin': 'AppleGothic'}[your_os])
# Any other platform keeps matplotlib's default font.
# Render the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
rc('axes', unicode_minus=False)

In [8]:
# Directory holding the input CSV files. It is used as a plain string prefix
# in f-strings, so the trailing slash is required.
DATA_PATH = "data/"
# Directory the submission CSV is written to (also used as a string prefix).
SUBMIT_PATH = "submission/"
# Random seed shared by the KFold splitter and the CatBoost models.
SEED = 42

In [9]:
# Load the training and test sets.
train = pd.read_csv(f'{DATA_PATH}train.csv')
test = pd.read_csv(f'{DATA_PATH}test.csv')

# Attribute code lookup tables (D / H / L). The last column of the D table is dropped.
d_code = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv').iloc[:, :-1]
h_code = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv')
l_code = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

# Show (rows, columns) of both frames.
train.shape, test.shape

((501951, 35), (46404, 34))

## preprocessing & engineering

### 순서형 변수 처리
단순 category 형태가 아닌 ordinal 처리

In [10]:
# Cast the ordinal attributes of the training frame to ordered categoricals.
# No explicit categories are given, so pandas infers them from the column values.
for ordinal_col in ('person_attribute_a_1', 'person_attribute_b',
                    'person_prefer_e', 'contents_attribute_e'):
    train[ordinal_col] = train[ordinal_col].astype(pd.CategoricalDtype(ordered=True))

In [11]:
# Cast the ordinal attributes of the test frame to ordered categoricals.
# No explicit categories are given, so pandas infers them from the column values.
for ordinal_col in ('person_attribute_a_1', 'person_attribute_b',
                    'person_prefer_e', 'contents_attribute_e'):
    test[ordinal_col] = test[ordinal_col].astype(pd.CategoricalDtype(ordered=True))

### 데이터 - 속성코드 매칭
컨텐츠 특징 나열

In [12]:
def add_code(df, d_code, h_code, l_code):
    """Return a copy of ``df`` with the hierarchy codes of its D/H/L attributes appended.

    Parameters
    ----------
    df : DataFrame
        Must contain ``person_prefer_d_1..3``, ``contents_attribute_d``,
        ``person_prefer_h_1..3``, ``contents_attribute_h`` and
        ``contents_attribute_l``.
    d_code, h_code, l_code : mapping
        ``{code: {field name: value}}`` lookups. D and L records carry the
        four classification-level fields, H records carry the parent code.

    Returns
    -------
    DataFrame
        The copy with 16 D columns (``*_n``, ``*_s``, ``*_m``, ``*_l``),
        4 H columns (``*_u``) and 4 L columns appended, in that order.

    Raises
    ------
    KeyError
        If a code found in ``df`` is missing from the matching lookup.
    """
    out = df.copy()

    # (column suffix, record field) pairs for the four D classification levels.
    d_levels = (
        ('n', '속성 D 세분류코드'),
        ('s', '속성 D 소분류코드'),
        ('m', '속성 D 중분류코드'),
        ('l', '속성 D 대분류코드'),
    )
    d_sources = ('person_prefer_d_1', 'person_prefer_d_2',
                 'person_prefer_d_3', 'contents_attribute_d')
    for src in d_sources:
        for suffix, field in d_levels:
            out[f'{src}_{suffix}'] = out[src].apply(lambda code: d_code[code][field])

    # H codes only expose their parent ("upper") code.
    h_sources = ('person_prefer_h_1', 'person_prefer_h_2',
                 'person_prefer_h_3', 'contents_attribute_h')
    for src in h_sources:
        out[f'{src}_u'] = out[src].apply(lambda code: h_code[code]['속성 H 상위코드'])

    # (column suffix, record field) pairs for the four L classification levels.
    l_levels = (
        ('n', '속성 L 세분류코드'),
        ('s', '속성 L 소분류코드'),
        ('m', '속성 L 중분류코드'),
        ('l', '속성 L 대분류코드'),
    )
    for suffix, field in l_levels:
        out[f'contents_attribute_l_{suffix}'] = (
            out['contents_attribute_l'].apply(lambda code: l_code[code][field])
        )
    return out

In [13]:
# Build {code: {column name: value}} lookup dicts for add_code. The first CSV
# column becomes the key; for the D table only the first four remaining
# columns are kept.
code_d = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv', index_col=0).iloc[:, :4].T.to_dict()
code_h = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv', index_col=0).T.to_dict()
code_l = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv', index_col=0).T.to_dict()

In [14]:
# Append the D/H/L hierarchy code columns to both frames (add_code returns a copy).
train = add_code(train, code_d, code_h, code_l)
test = add_code(test, code_d, code_h, code_l)

In [15]:
# Shapes after add_code, which appends 24 columns (16 D + 4 H + 4 L) to each frame.
train.shape, test.shape

((501951, 59), (46404, 58))

In [16]:
# Preview the first five rows, columns 0-19.
train.iloc[:5, :20]

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95,59
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101,96
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263,56
3,3,False,False,False,True,False,False,2,0,2,5,703,705,704,3,1,1,72,227,2
4,4,True,True,True,False,False,False,1,3,4,5,275,370,369,4,1,1,214,210,209


In [17]:
# Preview the first five rows, columns 20-39.
train.iloc[:5, 20:40]

Unnamed: 0,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target,person_prefer_d_1_n,person_prefer_d_1_s,person_prefer_d_1_m,person_prefer_d_1_l,person_prefer_d_2_n
0,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1,275,274,274,216,369
1,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0,114,56,109,1,175
2,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0,464,450,463,377,175
3,1,3,5,1,1,2,1608,275,5,3,74,827967,572323,2020-01-13 18:09:34,0,703,690,703,618,703
4,1,1,10,2,1,2,1608,275,1,4,74,831614,573899,2020-03-09 20:39:22,0,275,274,274,216,369


In [18]:
# Preview the first five rows, columns 40 onward (the columns appended by add_code).
train.iloc[:5, 40:60]

Unnamed: 0,person_prefer_d_2_s,person_prefer_d_2_m,person_prefer_d_2_l,person_prefer_d_3_n,person_prefer_d_3_s,person_prefer_d_3_m,person_prefer_d_3_l,contents_attribute_d_n,contents_attribute_d_s,contents_attribute_d_m,contents_attribute_d_l,person_prefer_h_1_u,person_prefer_h_2_u,person_prefer_h_3_u,contents_attribute_h_u,contents_attribute_l_n,contents_attribute_l_s,contents_attribute_l_m,contents_attribute_l_l
0,297,368,216,369,297,368,216,275,274,274,216,3,94,58,94,1607,1606,1605,2016
1,56,152,1,175,56,152,1,275,274,274,216,94,94,94,94,1607,1606,1605,2016
2,56,152,1,452,450,451,377,92,56,91,1,48,250,48,48,1599,1595,1572,2016
3,690,703,618,703,690,703,618,275,274,274,216,71,226,1,71,1607,1606,1605,2016
4,297,368,216,369,297,368,216,275,274,274,216,208,208,208,71,1607,1606,1605,2016


In [19]:
# Print dtypes and non-null counts of every column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501951 entries, 0 to 501950
Data columns (total 59 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   id                      501951 non-null  int64   
 1   d_l_match_yn            501951 non-null  bool    
 2   d_m_match_yn            501951 non-null  bool    
 3   d_s_match_yn            501951 non-null  bool    
 4   h_l_match_yn            501951 non-null  bool    
 5   h_m_match_yn            501951 non-null  bool    
 6   h_s_match_yn            501951 non-null  bool    
 7   person_attribute_a      501951 non-null  int64   
 8   person_attribute_a_1    501951 non-null  category
 9   person_attribute_b      501951 non-null  category
 10  person_prefer_c         501951 non-null  int64   
 11  person_prefer_d_1       501951 non-null  int64   
 12  person_prefer_d_2       501951 non-null  int64   
 13  person_prefer_d_3       501951 non-null  int64   
 14  pers

### contents_open_dt 관련 피처
연월일시간 중에 어떤 요소가 중요한가

In [20]:
# Parse the content open timestamp of both frames so the .dt accessor can be used.
for frame in (train, test):
    frame['contents_open_dt'] = pd.to_datetime(frame['contents_open_dt'])

In [21]:
# Derive calendar features from the content open timestamp, identically for
# the training and test frames.
for frame in (train, test):
    opened = frame['contents_open_dt'].dt
    # frame['contents_open_year'] = opened.year  # year feature is left disabled
    frame['contents_open_month'] = opened.month
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
    # `isocalendar().week` yields the same ISO week number as a nullable
    # UInt32, so cast it back to a plain int64 column.
    # NOTE(review): the cast assumes contents_open_dt has no missing values.
    frame['contents_open_week'] = opened.isocalendar().week.astype('int64')
    frame['contents_open_day'] = opened.day
    frame['contents_open_dow'] = opened.dayofweek
    frame['contents_open_hour'] = opened.hour

### 회원속성

In [23]:
# Time-ordered copies of both frames (oldest content first) with a fresh
# 0..n-1 index; used to compute per-person running counts.
train2 = train.sort_values(by='contents_open_dt').reset_index(drop=True)
test2 = test.sort_values(by='contents_open_dt').reset_index(drop=True)

In [24]:
# Running number (starting at 1) of rows seen so far for each person_rn,
# counted in the time order established by the sort above.
for ordered_frame in (train2, test2):
    ordered_frame['누적_컨텐츠접촉'] = ordered_frame.groupby('person_rn').cumcount() + 1

In [25]:
# Copy the running count back onto the original frames by matching on `id`.
# Matching on the key is correct whatever the row order of train/test; a
# positional copy is only correct when they are already sorted by id.
# NOTE(review): requires `id` to be unique within each frame.
train['누적_컨텐츠접촉'] = train['id'].map(train2.set_index('id')['누적_컨텐츠접촉'])
test['누적_컨텐츠접촉'] = test['id'].map(test2.set_index('id')['누적_컨텐츠접촉'])

### 회원속성 - 컨텐츠속성 일치 여부 

In [26]:
# Give the lookup tables short ASCII column names. The first column is the
# code itself; merge_codes later prefixes the remaining names with the name of
# the column being joined.
# NOTE(review): the _d/_s/_m/_l suffixes presumably follow the CSV column
# order (detail, small, middle, large classification) — confirm against the files.
d_code.columns = ["attribute_d", "attribute_d_d", "attribute_d_s", "attribute_d_m", "attribute_d_l"]
h_code.columns = ["attribute_h", "attribute_h_p"]
l_code.columns = ["attribute_l", "attribute_l_d", "attribute_l_s", "attribute_l_m", "attribute_l_l"]

In [27]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the key and renamed to
    ``col``; every other column is renamed to ``f"{col}_{original name}"`` so
    the same lookup table can be joined several times without name clashes.

    Parameters
    ----------
    df : DataFrame containing the key column ``col``.
    df_code : lookup table whose first column holds the codes.
    col : name of the column in ``df`` to join on.

    Returns
    -------
    A new DataFrame with the rows of ``df`` in their original order plus the
    prefixed lookup columns (missing where a code has no match). Neither input
    is modified. Codes in ``df_code`` should be unique: a duplicated code
    would duplicate the matching rows of ``df``.
    """
    lookup = df_code.copy()
    # Assign a fresh column list rather than writing into ``columns.values``:
    # an Index is meant to be immutable and in-place writes to its backing
    # array are not supported.
    lookup.columns = [col] + [f"{col}_{name}" for name in df_code.columns[1:]]
    return pd.merge(df, lookup, how="left", on=col)

In [28]:
def preprocess_data(
                    df:pd.DataFrame, is_train:bool = True, cols_merge:Optional[List[Tuple[str, pd.DataFrame]]] = None,
                    cols_equi:Optional[List[Tuple[str, str]]] = None,
                    cols_drop:Optional[List[str]] = None
                    )->Tuple[pd.DataFrame, Optional[np.ndarray]]:
    """Turn a raw frame into a model-ready feature frame (and target array).

    Steps, in order: split off ``target`` (training only), left-join the code
    lookup tables, cast boolean columns to 0/1, add 0/1 equality features,
    drop the unused columns.

    Parameters
    ----------
    df : input frame; it is copied, never modified.
    is_train : when True, ``df`` must contain ``target``; it is returned
        separately and removed from the features.
    cols_merge : ``(column, lookup table)`` pairs passed to ``merge_codes``.
        Defaults to no merges.
    cols_equi : ``(col1, col2)`` pairs; each adds a ``f"{col1}_{col2}"`` column
        that is 1 where the two columns hold the same value. Defaults to none.
    cols_drop : columns removed at the end. Defaults to
        ``["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.

    Returns
    -------
    ``(features, target)`` where ``target`` is a NumPy array when ``is_train``
    is True and ``None`` otherwise.

    Raises
    ------
    KeyError
        If ``target`` (training), a compared column or a column listed in
        ``cols_drop`` is missing.
    """
    # None sentinels instead of mutable default arguments.
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]

    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    for col1, col2 in cols_equi:
        left, right = df[col1], df[col2]
        # Two categorical columns can only be compared with == when their
        # category sets are identical; otherwise pandas raises TypeError.
        # Comparing the underlying values gives the same answer when the
        # categories match and still works when they do not.
        if isinstance(left.dtype, pd.CategoricalDtype) or isinstance(right.dtype, pd.CategoricalDtype):
            left, right = left.astype(object), right.astype(object)
        df[f"{col1}_{col2}"] = (left == right).astype(int)

    df = df.drop(columns=list(cols_drop))
    return (df, y_data)

In [29]:
# (column, lookup table) pairs: each column is joined with its small / middle /
# large classification codes from the matching attribute code table.
# NOTE(review): add_code has already appended the same hierarchy codes under
# the *_n/_s/_m/_l/_u names, so these merges appear to duplicate that
# information — confirm whether both sets are intended.
cols_merge = [
              ("person_prefer_d_1" , d_code),
              ("person_prefer_d_2" , d_code),
              ("person_prefer_d_3" , d_code),
              ("contents_attribute_d" , d_code),
              ("person_prefer_h_1" , h_code),
              ("person_prefer_h_2" , h_code),
              ("person_prefer_h_3" , h_code),
              ("contents_attribute_h" , h_code),
              ("contents_attribute_l" , l_code),
]

# Column pairs compared for equality (person attribute vs. content attribute).
# The long names are the "<merged column>_<lookup column>" columns created by
# merge_codes.
# NOTE(review): there are no person_prefer_d_1 pairs here; presumably the
# existing d_*_match_yn flags cover them — confirm.
cols_equi = [

    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),

    ("person_prefer_d_2_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l", "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l", "contents_attribute_d_attribute_d_l"),

    ("person_prefer_h_1_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p", "contents_attribute_h_attribute_h_p"),

]

# Columns that are not used for training.
cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt", "contents_rn", 'person_rn']

In [31]:
# Build the feature matrices. The test frame has no target, so its second
# return value (None) is discarded.
x_train, y_train = preprocess_data(train, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_test, _ = preprocess_data(test, is_train=False, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
# Train and test must end up with the same number of feature columns.
x_train.shape, y_train.shape, x_test.shape

((501951, 93), (501951,), (46404, 93))

### 범주형 칼럼 리스트

In [32]:
# Heuristic: every feature with more than two distinct values in the training
# data is handed to CatBoost as a categorical feature.
cat_features = x_train.columns[x_train.nunique() > 2].tolist()

In [33]:
# The cumulative contact count is a numeric feature, so keep it out of the
# categorical list. A list comprehension preserves the column order and keeps
# the list type that CatBoost documents for `cat_features`.
cat_features = [col for col in cat_features if col != '누적_컨텐츠접촉']

In [8]:
# Preview only the first rows; rendering the full 500k-row frame is not useful.
x_train.head()

NameError: name 'x_train' is not defined

### 학습 파라미터

In [35]:
# When True, the training loop stops after the first fold (single hold-out
# split instead of full cross-validation).
is_holdout = False
# Number of cross-validation folds.
n_splits = 5
# Upper bound on boosting iterations per model.
iterations = 3000
# Early-stopping patience (passed as early_stopping_rounds).
patience = 50

# Seeded, shuffled splitter. Because random_state is a fixed integer, calling
# cv.split again later yields the same folds, which the validation cell relies on.
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

### 학습

In [36]:
# Train one CatBoost model per CV fold, keeping each model and its best
# validation F1 (as reported by CatBoost at its default decision threshold).
scores = []
models = []

for tri, vai in cv.split(x_train):
    print("="*50)

    model = CatBoostClassifier(iterations=iterations,
                               random_state=SEED,
                               #task_type="GPU",
                               eval_metric="F1",
                               # CatBoost documents a list/array here; sorting
                               # also makes the order deterministic.
                               cat_features=sorted(cat_features),
                               one_hot_max_size=4)
    # Early stopping monitors F1 on the held-out fold.
    model.fit(x_train.iloc[tri], y_train[tri],
              eval_set=[(x_train.iloc[vai], y_train[vai])],
              early_stopping_rounds=patience,
              verbose=100)

    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        # Hold-out mode: a single train/validation split is enough.
        break



Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.086395
0:	learn: 0.6271045	test: 0.6342993	best: 0.6342993 (0)	total: 3.51s	remaining: 2h 55m 19s
100:	learn: 0.6646329	test: 0.6842006	best: 0.6845411 (96)	total: 8m 22s	remaining: 4h 10s
200:	learn: 0.6716887	test: 0.6860758	best: 0.6860758 (200)	total: 16m 49s	remaining: 3h 54m 20s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6860763862
bestIteration = 201

Shrink model to first 202 iterations.
Learning rate set to 0.086395
0:	learn: 0.6249856	test: 0.6300820	best: 0.6300820 (0)	total: 3.09s	remaining: 2h 34m 26s
100:	learn: 0.6646489	test: 0.6856481	best: 0.6865826 (96)	total: 11m 49s	remaining: 5h 42m 45s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6875319318
bestIteration = 146

Shrink model to first 147 iterations.
Learning rate set to 0.086395
0:	learn: 0.6228982	test: 0.6300020	best: 0.6300020 (0)	total: 4.08s	remaining: 3h 24m 8s
100:	learn: 0.6640610	test: 0.6799150	best: 0.6813235 (96)	total: 9m 32s	remai

### cv 결과 확인

In [37]:
# Best validation F1 of each fold as recorded during training, and their mean.
print(scores)
print(np.mean(scores))

[0.6860763862259377, 0.6875319318031298, 0.6813235027612716, 0.6925713858713419, 0.6781437972583108]
0.6851294007839984


### threshold 정의

In [4]:
# Probability cut-off: predictions with P(class 1) >= threshold are labelled 1.
threshold = 0.4

### threshold값 변경에 따른 검증점수 확인 및 추론

In [39]:
# Re-score every fold with the custom threshold and collect each model's test
# probabilities for the ensemble.
pred_list = []
scores = []
# cv.split is seeded, so fold k here is the same split model k was validated
# on. zip stops at the number of trained models, so this also works when
# is_holdout=True left only one model in `models`.
for model, (tri, vai) in zip(models, cv.split(x_train)):
    val_proba = model.predict_proba(x_train.iloc[vai])[:, 1]
    val_pred = np.where(val_proba >= threshold, 1, 0)
    scores.append(f1_score(y_train[vai], val_pred))
    # Keep raw probabilities; thresholding happens after averaging.
    pred_list.append(model.predict_proba(x_test)[:, 1])
print(scores)
print(np.mean(scores))

[0.7061393037465403, 0.7076019314239894, 0.7024781124795965, 0.7090073896544837, 0.7004102555281287]
0.7051273985665477


### 산술평균 앙상블

In [40]:
# Arithmetic-mean ensemble: average the per-model probabilities for each test
# row, then label rows at or above the threshold as 1 and the rest as 0.
pred = (np.mean(pred_list, axis=0) >= threshold).astype(int)

### submission

In [41]:
# Fill the submission template with the ensemble predictions.
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
# Assigned by position.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv — confirm by comparing the id columns.
sample_submission['target'] = pred
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [42]:
# Sanity check: class balance of the predicted labels.
sample_submission['target'].value_counts()

1    29254
0    17150
Name: target, dtype: int64

In [43]:
# Create the output directory first so the write does not fail on a fresh checkout.
os.makedirs(SUBMIT_PATH, exist_ok=True)
sample_submission.to_csv(f"{SUBMIT_PATH}jp_1223_5.csv", index=False)