In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from sklearn.feature_selection import RFE

import gc
import random
import re
from typing import List ,Dict, Tuple

from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

import platform
your_os = platform.system()
if your_os == 'Linux':
    rc('font', family='NanumGothic')
elif your_os == 'Windows':
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
rc('axes', unicode_minus=False)

In [2]:
DATA_PATH = "data/"
SUBMIT_PATH = "submission/"
SEED = 42

In [3]:
train = pd.read_csv(f'{DATA_PATH}train.csv')
test = pd.read_csv(f'{DATA_PATH}test.csv')

d_code = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv').iloc[:, :-1]
h_code = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv')
l_code = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

train.shape, test.shape

((501951, 35), (46404, 34))

## preprocessing & engineering

### 순서형 변수 처리
단순 category 형태가 아닌 ordinal 처리

In [4]:
train['person_attribute_a_1'] = train['person_attribute_a_1'].astype(pd.CategoricalDtype(ordered=True))
train['person_attribute_b'] = train['person_attribute_b'].astype(pd.CategoricalDtype(ordered=True))
train['person_prefer_e'] = train['person_prefer_e'].astype(pd.CategoricalDtype(ordered=True))
train['contents_attribute_e'] = train['contents_attribute_e'].astype(pd.CategoricalDtype(ordered=True))

In [5]:
test['person_attribute_a_1'] = test['person_attribute_a_1'].astype(pd.CategoricalDtype(ordered=True))
test['person_attribute_b'] = test['person_attribute_b'].astype(pd.CategoricalDtype(ordered=True))
test['person_prefer_e'] = test['person_prefer_e'].astype(pd.CategoricalDtype(ordered=True))
test['contents_attribute_e'] = test['contents_attribute_e'].astype(pd.CategoricalDtype(ordered=True))

In [6]:
train.iloc[:, :20].head()

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95,59
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101,96
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263,56
3,3,False,False,False,True,False,False,2,0,2,5,703,705,704,3,1,1,72,227,2
4,4,True,True,True,False,False,False,1,3,4,5,275,370,369,4,1,1,214,210,209


In [7]:
train.iloc[:, 20:].head()

Unnamed: 0,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0
3,1,3,5,1,1,2,1608,275,5,3,74,827967,572323,2020-01-13 18:09:34,0
4,1,1,10,2,1,2,1608,275,1,4,74,831614,573899,2020-03-09 20:39:22,0


### 하위속성 연결

In [8]:
train['person_attribute_A'] = train['person_attribute_a'].astype(str) + '_' + train['person_attribute_a_1'].astype(str)
train['contents_attribute_J'] = train['contents_attribute_j'].astype(str) + '_' + train['contents_attribute_j_1'].astype(str)

In [9]:
test['person_attribute_A'] = test['person_attribute_a'].astype(str) + '_' + test['person_attribute_a_1'].astype(str)
test['contents_attribute_J'] = test['contents_attribute_j'].astype(str) + '_' + test['contents_attribute_j_1'].astype(str)

### 속성 여러 개인 것들 연결

In [None]:
train['person_prefer_D'] = train['person_prefer_d_1'].astype(str) + '_' + train['person_prefer_d_2'].astype(str) + '_' + train['person_prefer_d_3'].astype(str)
train['person_prefer_H'] = train['person_prefer_h_1'].astype(str) + '_' + train['person_prefer_h_2'].astype(str) + '_' + train['person_prefer_h_3'].astype(str)

In [None]:
test['person_prefer_D'] = test['person_prefer_d_1'].astype(str) + '_' + test['person_prefer_d_2'].astype(str) + '_' + test['person_prefer_d_3'].astype(str)
test['person_prefer_H'] = test['person_prefer_h_1'].astype(str) + '_' + test['person_prefer_h_2'].astype(str) + '_' + test['person_prefer_h_3'].astype(str)

### person_attribute 연결

In [None]:
train['person_attribute'] = train['person_attribute_a'].astype(str) + '_' + train['person_attribute_a_1'].astype(str) + '_' + train['person_attribute_b'].astype(str)
test['person_attribute'] = test['person_attribute_a'].astype(str) + '_' + test['person_attribute_a_1'].astype(str) + '_' + test['person_attribute_b'].astype(str)

### 속성코드 연결
회원속성 - 컨텐츠속성 일치 여부 

In [12]:
d_code.columns = ["attribute_d", "attribute_d_d", "attribute_d_s", "attribute_d_m", "attribute_d_l"]
h_code.columns = ["attribute_h", "attribute_h_p"]
l_code.columns = ["attribute_l", "attribute_l_d", "attribute_l_s", "attribute_l_m", "attribute_l_l"]

In [13]:
def merge_codes(df:pd.DataFrame, 
                df_code:pd.DataFrame,
                col:str)->pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df, df_code, how="left", on=col)

In [14]:
def preprocess_data(
                    df:pd.DataFrame, 
                    is_train:bool = True, 
                    cols_merge:List[Tuple[str, pd.DataFrame]] = [], 
                    cols_equi:List[Tuple[str, str]] = [] ,
                    cols_drop:List[str] = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]
                    )->Tuple[pd.DataFrame, np.ndarray]:
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

    df = df.drop(columns=cols_drop)
    return (df, y_data)

In [15]:
# 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
cols_merge = [
              ("person_prefer_d_1", d_code),
              ("person_prefer_d_2", d_code),
              ("person_prefer_d_3", d_code),
              ("contents_attribute_d", d_code),
              ("person_prefer_h_1", h_code),
              ("person_prefer_h_2", h_code),
              ("person_prefer_h_3", h_code),
              ("contents_attribute_h", h_code),
              ("contents_attribute_l", l_code),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),

    ("person_prefer_d_2_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l", "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l", "contents_attribute_d_attribute_d_l"),

    ("person_prefer_h_1_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p", "contents_attribute_h_attribute_h_p"),

]

# 학습에 필요없는 컬럼 리스트
cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt", "contents_rn", 'person_rn']

In [16]:
x_train, y_train = preprocess_data(train, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_test, _ = preprocess_data(test, is_train=False, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_train.shape, y_train.shape, x_test.shape

((501951, 67), (501951,), (46404, 67))

### 속성 D, H

In [17]:
x_train['선호속성D분류1'] = x_train['person_prefer_d_1_attribute_d_l'].astype(str) + '_' + x_train['person_prefer_d_1_attribute_d_m'].astype(str) + '_' + x_train['person_prefer_d_1_attribute_d_s'].astype(str)
x_train['선호속성D분류2'] = x_train['person_prefer_d_2_attribute_d_l'].astype(str) + '_' + x_train['person_prefer_d_2_attribute_d_m'].astype(str) + '_' + x_train['person_prefer_d_2_attribute_d_s'].astype(str)
x_train['선호속성D분류3'] = x_train['person_prefer_d_3_attribute_d_l'].astype(str) + '_' + x_train['person_prefer_d_3_attribute_d_m'].astype(str) + '_' + x_train['person_prefer_d_3_attribute_d_s'].astype(str)

x_test['선호속성D분류1'] = x_test['person_prefer_d_1_attribute_d_l'].astype(str) + '_' + x_test['person_prefer_d_1_attribute_d_m'].astype(str) + '_' + x_test['person_prefer_d_1_attribute_d_s'].astype(str)
x_test['선호속성D분류2'] = x_test['person_prefer_d_2_attribute_d_l'].astype(str) + '_' + x_test['person_prefer_d_2_attribute_d_m'].astype(str) + '_' + x_test['person_prefer_d_2_attribute_d_s'].astype(str)
x_test['선호속성D분류3'] = x_test['person_prefer_d_3_attribute_d_l'].astype(str) + '_' + x_test['person_prefer_d_3_attribute_d_m'].astype(str) + '_' + x_test['person_prefer_d_3_attribute_d_s'].astype(str)

In [18]:
x_train['선호D속성_컨텐츠D속성_일치점수'] = 3*(x_train['d_l_match_yn'] + x_train['d_m_match_yn'] + x_train['d_s_match_yn']) \
+ 2*(x_train['person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m'] + x_train['person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l']) \
+ 1*(x_train['person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m'] + x_train['person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l'])

x_test['선호D속성_컨텐츠D속성_일치점수'] = 3*(x_test['d_l_match_yn'] + x_test['d_m_match_yn'] + x_test['d_s_match_yn']) \
+ 2*(x_test['person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m'] + x_test['person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l']) \
+ 1*(x_test['person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m'] + x_test['person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l'])

In [19]:
x_train['선호H속성_컨텐츠H속성_일치점수'] = 2*(x_train['h_l_match_yn'] + x_train['h_m_match_yn'] + x_train['h_s_match_yn']) \
+ 1*(x_train['person_prefer_h_1_attribute_h_p_contents_attribute_h_attribute_h_p'] + x_train['person_prefer_h_2_attribute_h_p_contents_attribute_h_attribute_h_p'] + x_train['person_prefer_h_3_attribute_h_p_contents_attribute_h_attribute_h_p'])

x_test['선호H속성_컨텐츠H속성_일치점수'] = 2*(x_test['h_l_match_yn'] + x_test['h_m_match_yn'] + x_test['h_s_match_yn']) \
+ 1*(x_test['person_prefer_h_1_attribute_h_p_contents_attribute_h_attribute_h_p'] + x_test['person_prefer_h_2_attribute_h_p_contents_attribute_h_attribute_h_p'] + x_test['person_prefer_h_3_attribute_h_p_contents_attribute_h_attribute_h_p'])

### embedding 할 수 있는 것들 찾아보기

### 범주형 칼럼 리스트

In [20]:
cat_features = x_train.columns[x_train.nunique() >= 2].tolist()

In [21]:
cat_features = list(set(cat_features) - set(['선호D속성_컨텐츠D속성_일치점수', '선호H속성_컨텐츠H속성_일치점수']))

In [22]:
len(cat_features)

70

### 학습 파라미터

In [23]:
is_holdout = False
n_splits = 10
iterations = 10000
patience = 100

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

### 학습

In [24]:
scores = []
models = []


models = []
for tri, vai in cv.split(x_train):
    print("="*50)
    preds = []

    model = CatBoostClassifier(iterations=iterations, 
                               random_state=SEED,
                               #task_type="GPU",
                               eval_metric="F1",
                               cat_features=cat_features,
                               one_hot_max_size=4)
    
    model.fit(x_train.iloc[tri], y_train[tri],
              eval_set=[(x_train.iloc[vai], y_train[vai])],
              early_stopping_rounds=patience, verbose=100)
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        break    

Learning rate set to 0.052683
0:	learn: 0.6230869	test: 0.6254085	best: 0.6254085 (0)	total: 2.62s	remaining: 7h 15m 58s
100:	learn: 0.6591116	test: 0.6867159	best: 0.6869707 (96)	total: 5m 5s	remaining: 8h 18m 45s
200:	learn: 0.6653748	test: 0.6908752	best: 0.6909253 (194)	total: 9m 45s	remaining: 7h 55m 33s
300:	learn: 0.6688739	test: 0.6929561	best: 0.6931330 (299)	total: 14m 39s	remaining: 7h 52m 30s
400:	learn: 0.6714251	test: 0.6938538	best: 0.6939539 (372)	total: 19m 54s	remaining: 7h 56m 23s
500:	learn: 0.6737887	test: 0.6951908	best: 0.6953474 (498)	total: 25m 16s	remaining: 7h 59m 8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.695347355
bestIteration = 498

Shrink model to first 499 iterations.
Learning rate set to 0.052683
0:	learn: 0.6012924	test: 0.6069882	best: 0.6069882 (0)	total: 2.53s	remaining: 7h 1m 58s
100:	learn: 0.6595166	test: 0.6902488	best: 0.6903276 (99)	total: 5m 18s	remaining: 8h 39m 54s
200:	learn: 0.6653870	test: 0.6972104	best: 0.

KeyboardInterrupt: 

### cv 결과 확인

In [25]:
print(scores)
print(np.mean(scores))

[0.6953473550031867, 0.7027558324987946, 0.6985513739545998, 0.6983449412117128, 0.6983091741908729]
0.6986617353718334


### threshold 정의

In [26]:
threshold = 0.4

### threshold값 변경에 따른 검증점수 확인 및 추론

In [27]:
pred_list = []
scores = []
for i,(tri, vai) in enumerate( cv.split(x_train) ):
    pred = models[i].predict_proba(x_train.iloc[vai])[:, 1]
    pred = np.where(pred >= threshold , 1, 0)
    score = f1_score(y_train[vai],pred)
    scores.append(score)
    pred = models[i].predict_proba(x_test)[:, 1]
    pred_list.append(pred)
print(scores)
print(np.mean(scores))

IndexError: list index out of range

### 산술평균 앙상블

In [40]:
pred = np.mean(pred_list, axis=0)
pred = np.where(pred >= threshold, 1, 0)

### submission

In [41]:
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sample_submission['target'] = pred
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [42]:
sample_submission['target'].value_counts()

1    29254
0    17150
Name: target, dtype: int64

In [43]:
sample_submission.to_csv(f"{SUBMIT_PATH}jp_1226_3.csv", index=False)