In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')
import re

# 데이터-속성 매칭
- [코드공유](https://dacon.io/competitions/official/235863/codeshare/3877?page=1&dtype=recent)

In [2]:
# Load the D/H/L attribute code tables as nested lookups.
# index_col=0 makes the first CSV column the row key; .T.to_dict() then gives
# {row key: {column name: value}}, which add_code() indexes as table[code][column].
d_code = pd.read_csv('./_data/속성_D_코드.csv', index_col=0).T.to_dict()
h_code = pd.read_csv('./_data/속성_H_코드.csv', index_col=0).T.to_dict()
l_code = pd.read_csv('./_data/속성_L_코드.csv', index_col=0).T.to_dict()

# Raw train/test sets; the first CSV column becomes the row index.
t1 = pd.read_csv('./_data/train.csv',index_col=0)
t2 = pd.read_csv('./_data/test.csv',index_col=0)

In [3]:
from typing import Dict
def add_code(
    df: pd.DataFrame,
    d_code: Dict[int, Dict[str, int]],
    h_code: Dict[int, Dict[str, int]],
    l_code: Dict[int, Dict[str, int]],
) -> pd.DataFrame:
    """Attach the hierarchy codes of every D/H/L attribute column.

    For each source column (e.g. ``person_prefer_d_1``) one new column per
    hierarchy level is appended, named ``<source>_<suffix>`` where the suffix
    is ``n``/``s``/``m``/``l``. The value is looked up as
    ``table[code][level_key]``.

    Args:
        df: Frame holding the raw attribute code columns. It is not modified.
        d_code: Lookup for D codes, ``{code: {level column name: value}}``.
        h_code: Lookup for H codes (only the ``l`` and ``m`` levels are used).
        l_code: Lookup for L codes.

    Returns:
        A copy of ``df`` with the 28 hierarchy columns appended, in the order
        D (person 1-3, contents), H (person 1-3, contents), L (contents).

    Raises:
        KeyError: If a source column is missing, or a code (or level key) is
            not present in the corresponding lookup table.
    """
    # (suffix, key inside the code table) pairs, in output-column order.
    d_levels = (
        ('n', '속성 D 세분류코드'),
        ('s', '속성 D 소분류코드'),
        ('m', '속성 D 중분류코드'),
        ('l', '속성 D 대분류코드'),
    )
    h_levels = (
        ('l', '속성 H 대분류코드'),
        ('m', '속성 H 중분류코드'),
    )
    l_levels = (
        ('n', '속성 L 세분류코드'),
        ('s', '속성 L 소분류코드'),
        ('m', '속성 L 중분류코드'),
        ('l', '속성 L 대분류코드'),
    )

    # (lookup table, levels to extract, source columns) for each attribute family.
    plan = (
        (d_code, d_levels, ('person_prefer_d_1', 'person_prefer_d_2',
                            'person_prefer_d_3', 'contents_attribute_d')),
        (h_code, h_levels, ('person_prefer_h_1', 'person_prefer_h_2',
                            'person_prefer_h_3', 'contents_attribute_h')),
        (l_code, l_levels, ('contents_attribute_l',)),
    )

    # Copy input data so the caller's frame stays untouched.
    out = df.copy()

    for table, levels, source_cols in plan:
        for source in source_cols:
            for suffix, level_key in levels:
                # Bind table/level_key as defaults so the lambda does not depend
                # on the loop variables' later values.
                out[f'{source}_{suffix}'] = out[source].apply(
                    lambda code, table=table, level_key=level_key: table[code][level_key]
                )

    return out

In [4]:
# Enrich train and test with the D/H/L hierarchy code columns.
# add_code() works on a copy, so t1/t2 keep the raw data.
df_train = add_code(t1, d_code, h_code, l_code)
df_test = add_code(t2, d_code, h_code, l_code)

## 클러스터 생성
- person_attribute, 선호 D 속성(person_prefer_d), 콘텐츠 속성 각각에 대해 KMeans로 클러스터를 생성하고, 클러스터 번호를 피처로 추가

In [5]:
from sklearn.cluster import KMeans

def k_means_features(df, cols: list, k=3, seed=42):
    """Fit KMeans on selected columns and return the model with its labels.

    Args:
        df: Frame containing the feature columns.
        cols: Names of the columns to cluster on.
        k: Number of clusters.
        seed: Value passed to KMeans as ``random_state``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        ``(km, y_km)``: the fitted KMeans estimator (reusable via
        ``km.predict`` on other frames) and the cluster label of each row
        of ``df``.
    """
    km = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=1e-04,
        random_state=seed,
    )
    y_km = km.fit_predict(df[cols])
    return km, y_km

In [6]:
# Column groups that are clustered independently with KMeans below.
# Person attribute columns.
cluster_cols = ['person_attribute_a','person_attribute_a_1', 'person_attribute_b']
# Person D-preference columns (raw codes).
cluster_d_cols = ['person_prefer_d_1','person_prefer_d_2', 'person_prefer_d_3']
# Contents attribute columns.
cluster_contents_cols = ['contents_attribute_a', 'contents_attribute_j_1',
       'contents_attribute_j', 'contents_attribute_c', 'contents_attribute_k',
       'contents_attribute_l', 'contents_attribute_m']

In [7]:
# For each column group: fit KMeans on train only, store the train labels,
# then assign test rows to the clusters learned from train.

# Person attributes -> 3 clusters.
km, y_km = k_means_features(df_train, cluster_cols, 3)
df_train['person_attribute_km'] = y_km
df_test['person_attribute_km'] = km.predict(df_test[cluster_cols])

# Person D preferences -> 2 clusters.
# NOTE(review): the column is named 'person_attribute_kmd' although it is
# built from the person_prefer_d_* columns.
km_d, y_kmd = k_means_features(df_train, cluster_d_cols, 2)
df_train['person_attribute_kmd'] = y_kmd
df_test['person_attribute_kmd'] = km_d.predict(df_test[cluster_d_cols])

# Contents attributes -> 3 clusters.
km_c, y_kmc = k_means_features(df_train, cluster_contents_cols, 3)
df_train['contents_attribute_kmc'] = y_kmc
df_test['contents_attribute_kmc'] = km_c.predict(df_test[cluster_contents_cols])

In [8]:
# Inspect the training frame after the code and cluster columns were added.
df_train

Unnamed: 0_level_0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_h_3_m,contents_attribute_h_l,contents_attribute_h_m,contents_attribute_l_n,contents_attribute_l_s,contents_attribute_l_m,contents_attribute_l_l,person_attribute_km,person_attribute_kmd,contents_attribute_kmc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,True,True,True,False,False,False,1,4,3,5,...,368,94,422,1607,1606,1605,2016,2,1,0
1,False,False,False,True,True,False,1,3,4,1,...,399,94,417,1607,1606,1605,2016,2,1,0
2,False,False,False,True,False,False,2,0,3,5,...,366,48,363,1599,1595,1572,2016,0,1,0
3,False,False,False,True,False,False,2,0,2,5,...,315,71,381,1607,1606,1605,2016,0,0,0
4,True,True,True,False,False,False,1,3,4,5,...,480,71,381,1607,1606,1605,2016,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,False,False,False,True,False,False,1,1,2,2,...,398,58,373,352,347,346,2006,0,0,2
501947,True,True,False,True,False,False,1,6,2,1,...,398,94,425,155,138,99,2006,1,1,2
501948,True,True,True,True,False,False,1,7,4,1,...,422,58,373,437,417,407,2006,1,1,2
501949,True,False,False,True,False,False,1,1,2,1,...,523,250,524,659,652,651,2006,0,1,2


In [9]:
# Inspect the test frame after the code and cluster columns were added.
df_test

Unnamed: 0_level_0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_h_3_m,contents_attribute_h_l,contents_attribute_h_m,contents_attribute_l_n,contents_attribute_l_s,contents_attribute_l_m,contents_attribute_l_l,person_attribute_km,person_attribute_kmd,contents_attribute_kmc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,True,False,False,True,True,True,1,1,2,1,...,359,250,528,1146,1128,1021,2010,0,0,1
1,False,False,False,True,False,False,2,0,2,2,...,359,250,528,1610,1606,1605,2016,0,0,0
2,True,False,False,True,True,True,2,3,2,1,...,452,169,453,1812,1811,1810,2020,2,0,0
3,True,False,False,True,True,True,1,2,2,5,...,452,169,453,101,100,99,2006,2,1,2
4,True,False,False,True,False,False,1,6,4,5,...,316,169,453,984,980,954,2009,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46399,True,True,True,False,False,False,2,0,4,1,...,398,85,396,759,759,759,2006,0,1,1
46400,True,False,False,True,False,False,2,0,4,1,...,542,85,396,759,759,759,2006,0,1,1
46401,True,True,False,True,True,True,2,0,3,1,...,542,277,547,759,759,759,2006,0,1,1
46402,True,True,False,True,True,True,1,3,3,2,...,542,277,547,759,759,759,2006,2,1,1


# 동일한 속성 매칭 및 순서형 데이터 처리
- 열람일시 데이터로 시간, 요일 정보 추가
- 같은 알파벳은 동일한 속성을 의미. 속성 매칭 여부 정보 추가

In [10]:
from sklearn.preprocessing import LabelEncoder

def match_cols(df, del_columns=False):
    """Build match/difference, one-hot and datetime features.

    Steps:
      * For every ``person_prefer_<d|h>_<i>_<level>`` column, compare it with
        the contents column of the same family and level and store the result
        as ``<d|h>_<i>_<level>_match_yn`` (0/1).
      * Add ``match_c``, ``person_prefer_c5``, ``match_e`` and ``differ_e``.
      * One-hot encode ``person_attribute_a_1``, ``person_attribute_b`` and
        ``person_prefer_e``.
      * Derive ``contents_open_hour`` / ``contents_open_day`` from
        ``contents_open_dt``.
      * Drop the columns listed in ``del_cols``.

    Args:
        df: Input frame. It is not modified; all work happens on a copy.
        del_columns: When True, additionally drop the hierarchy-code columns
            that were only needed to compute the match flags.

    Returns:
        A new DataFrame with the engineered features.

    Raises:
        KeyError: If a required column, or a column scheduled for dropping,
            is missing from ``df``.
    """
    # Work on a copy: the feature columns below used to be written onto the
    # caller's frame, silently changing df_train/df_test as a side effect.
    df = df.copy()

    del_cols = ['person_attribute_a','person_prefer_f','person_prefer_g', 'contents_open_dt', 'person_rn', 'contents_rn']

    # Collect the hierarchy-code columns of the D and H attributes.
    p_cols = [col for col in df.columns if re.search('person_prefer_._._', col)]
    c_d_cols = [col for col in df.columns if re.search('contents_attribute_d_.', col)]
    # NOTE(review): 'h+' also matches the raw 'contents_attribute_h' column, so
    # with del_columns=True that raw column is dropped while the raw
    # 'contents_attribute_d' is kept. Kept as-is to preserve the feature set;
    # confirm whether 'contents_attribute_h_.' was intended.
    c_h_cols = [col for col in df.columns if re.search('contents_attribute_h+', col)]

    # Match flags between person preference and contents for D and H attributes.
    for p_col in p_cols:
        parts = p_col.split('_')
        level = parts[-1]
        candidates = c_d_cols if parts[2] == 'd' else c_h_cols
        for c_col in candidates:
            # Same hierarchy level <=> same trailing character.
            if level == c_col[-1]:
                # p_col[-5:] is e.g. 'd_1_n' for 'person_prefer_d_1_n'.
                df[p_col[-5:]+'_match_yn'] = (df[p_col]==df[c_col]).astype(int)

    # Whether preference c equals contents attribute c.
    df['match_c'] = (df.person_prefer_c==df.contents_attribute_c).astype(int)
    # Person preference c ranges over 1-5 whereas contents attribute c only has 1-4.
    # 5 is probably "no answer"/"other"; performance drops without this flag.
    df['person_prefer_c5'] = (df.person_prefer_c==5).astype(int)

    # Whether preference e equals contents attribute e.
    df['match_e'] = (df.person_prefer_e==df.contents_attribute_e).astype(int)
    # Difference between person_prefer_e and contents_attribute_e.
    df['differ_e'] = df['person_prefer_e'] - df['contents_attribute_e']

    # Ordered attributes, one-hot encoded.
    # NOTE(review): train and test are encoded in separate calls, so the dummy
    # columns only line up if both frames contain the same category values.
    df = pd.get_dummies(df, columns=['person_attribute_a_1','person_attribute_b','person_prefer_e'])

    # contents_open_dt -> hour of day and day of week.
    df['contents_open_dt'] = pd.to_datetime(df['contents_open_dt'])
    df['contents_open_hour'] = df['contents_open_dt'].dt.hour
    df['contents_open_day'] = df['contents_open_dt'].dt.dayofweek

    if del_columns:
        del_cols = del_cols + p_cols + c_d_cols + c_h_cols

    new_df = df.drop(del_cols, axis=1)

    return new_df

In [11]:
# Build the final feature frames. del_columns=True also drops the hierarchy-code
# columns once their match flags have been computed.
train = match_cols(df_train,True)
test = match_cols(df_test,True)

In [12]:
# List the engineered feature columns of the training frame.
train.columns

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_prefer_c', 'person_prefer_d_1',
       'person_prefer_d_2', 'person_prefer_d_3', 'person_prefer_h_1',
       'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_i',
       'contents_attribute_a', 'contents_attribute_j_1',
       'contents_attribute_j', 'contents_attribute_c', 'contents_attribute_k',
       'contents_attribute_l', 'contents_attribute_d', 'contents_attribute_m',
       'contents_attribute_e', 'target', 'contents_attribute_l_n',
       'contents_attribute_l_s', 'contents_attribute_l_m',
       'contents_attribute_l_l', 'person_attribute_km', 'person_attribute_kmd',
       'contents_attribute_kmc', 'd_1_n_match_yn', 'd_1_s_match_yn',
       'd_1_m_match_yn', 'd_1_l_match_yn', 'd_2_n_match_yn', 'd_2_s_match_yn',
       'd_2_m_match_yn', 'd_2_l_match_yn', 'd_3_n_match_yn', 'd_3_s_match_yn',
       'd_3_m_match_yn', 'd_3_l_match_yn', 'h_1_l_match_yn'

In [13]:
# Sanity check of the column counts; train is expected to have one more column
# than test because it still contains 'target' (dropped in a later cell).
print(len(train.columns))
print(len(test.columns))

81
80


feature가 너무 많아서 줄이고 싶었지만, 줄이면 성능이 하락함.

# CatBoostClassifier 사용
- [코드공유](https://dacon.io/competitions/official/235863/codeshare/3887?page=1&dtype=recent) 참고

In [21]:
from catboost import Pool,CatBoostClassifier
import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 


In [16]:
# Separate the label from the training features.
y_train = train['target']
X_train = train.drop(['target'],axis=1)
# Align the test features to the training columns (same set, same order).
# The one-hot columns are created separately for train and test, so a category
# missing from one of them would otherwise yield mismatched feature matrices.
# A column absent from test is filled with 0 (i.e. "category not present").
X_test = test.reindex(columns=X_train.columns, fill_value=0)

In [17]:
# Treat every feature with more than two distinct values as categorical;
# binary flags (match_yn columns, one-hot dummies) stay numeric.
# The list is later passed to CatBoostClassifier as cat_features.
cat_features = X_train.columns[X_train.nunique() > 2].tolist()
cat_features

['person_prefer_c',
 'person_prefer_d_1',
 'person_prefer_d_2',
 'person_prefer_d_3',
 'person_prefer_h_1',
 'person_prefer_h_2',
 'person_prefer_h_3',
 'contents_attribute_i',
 'contents_attribute_a',
 'contents_attribute_j_1',
 'contents_attribute_c',
 'contents_attribute_l',
 'contents_attribute_d',
 'contents_attribute_m',
 'contents_attribute_e',
 'contents_attribute_l_n',
 'contents_attribute_l_s',
 'contents_attribute_l_m',
 'contents_attribute_l_l',
 'person_attribute_km',
 'contents_attribute_kmc',
 'differ_e',
 'contents_open_hour',
 'contents_open_day']

In [28]:
# Keep 'differ_e' and 'contents_attribute_l_l' as numeric features.
# Filtering (instead of list.pop(list.index(...))) makes this cell idempotent:
# re-running it no longer raises ValueError once the names are already gone.
cat_features = [
    col for col in cat_features
    if col not in ('differ_e', 'contents_attribute_l_l')
]

'contents_attribute_l_l'

In [29]:
# Number of features that will be handled as categorical.
len(cat_features)

22

In [30]:
# Training configuration.
SEED = 42            # random_state for the CV splitter and CatBoost
is_holdout = False   # when True, the training loop stops after the first fold
n_splits = 5         # number of CV folds
iterations = 3000    # maximum number of CatBoost iterations
patience = 100       # early_stopping_rounds passed to model.fit

# Shuffled (non-stratified) K-fold splitter, reused for training and for the
# threshold evaluation.
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [31]:
# Train one CatBoost model per CV fold and keep its best validation F1.
scores = []
models = []
for tri, vai in cv.split(X_train):
    print("="*50)

    model = CatBoostClassifier(
        iterations=iterations,
        random_state=SEED,
        task_type="CPU",
        eval_metric="F1",
        cat_features=cat_features,
    )  # optional: one_hot_max_size=4

    # cv.split() yields positional indices, so X and y must both be selected by
    # position. y_train[tri] was label-based and only correct while the 'id'
    # index happens to equal 0..n-1.
    model.fit(
        X_train.iloc[tri], y_train.iloc[tri],
        eval_set=[(X_train.iloc[vai], y_train.iloc[vai])],
        early_stopping_rounds=patience,
        verbose=100,
    )

    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        # Hold-out mode: train on the first fold only.
        break



Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.086395
0:	learn: 0.6143834	test: 0.6146433	best: 0.6146433 (0)	total: 632ms	remaining: 31m 35s
100:	learn: 0.6484057	test: 0.6654016	best: 0.6654016 (100)	total: 1m 39s	remaining: 47m 42s
200:	learn: 0.6559775	test: 0.6737208	best: 0.6737416 (199)	total: 3m 21s	remaining: 46m 39s
300:	learn: 0.6597906	test: 0.6766481	best: 0.6767321 (297)	total: 5m 6s	remaining: 45m 47s
400:	learn: 0.6628227	test: 0.6779664	best: 0.6780292 (395)	total: 6m 46s	remaining: 43m 56s
500:	learn: 0.6645922	test: 0.6788869	best: 0.6789015 (496)	total: 8m 25s	remaining: 42m 3s
600:	learn: 0.6664082	test: 0.6794776	best: 0.6796368 (592)	total: 10m 3s	remaining: 40m 10s
700:	learn: 0.6678415	test: 0.6799917	best: 0.6800383 (690)	total: 11m 47s	remaining: 38m 41s
800:	learn: 0.6690246	test: 0.6803845	best: 0.6806364 (789)	total: 13m 28s	remaining: 36m 58s
900:	learn: 0.6701059	test: 0.6806790	best: 0.6808676 (887)	total: 15m 14s	remaining: 35m 30s
Stopped by overfitting detector  (100 iterat

In [32]:
# Best validation F1 of each fold, and their mean.
print(scores)
print(np.mean(scores))

[0.6808675812055629, 0.6803442301898162, 0.6777615700835433, 0.677696136525243, 0.6769207470018647]
0.678718053001206


In [33]:
# Re-score each fold with a custom probability threshold and collect the
# per-fold test probabilities for the ensemble.
threshold = 0.4
pred_list = []
scores = []  # NOTE: overwrites the CV best-score list from the training cell
# cv has a fixed random_state, so split() reproduces the folds used in training.
for i,(tri, vai) in enumerate( cv.split(X_train) ):
    pred = models[i].predict_proba(X_train.iloc[vai])[:, 1]
    pred = np.where(pred >= threshold , 1, 0)
    # vai holds positional indices -> select the labels by position, matching
    # X_train.iloc[vai] (y_train[vai] was label-based).
    score = f1_score(y_train.iloc[vai],pred)
    scores.append(score)
    pred = models[i].predict_proba(X_test)[:, 1]
    pred_list.append(pred)
print(scores)
print(np.mean(scores))

[0.7111641035623905, 0.7083151796808185, 0.7062439182614337, 0.7073764023462606, 0.7056457824877768]
0.7077490772677361


In [34]:
# Average the positive-class probabilities of all fold models, then binarize
# with the same threshold used for the validation scores.
pred = np.mean( pred_list , axis = 0 )
pred = np.where(pred >= threshold , 1, 0)

In [35]:
# Load the submission template and preview it.
submission = pd.read_csv('./_data/sample_submission.csv')
submission.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [36]:
# Write the predictions into the template by position.
# NOTE(review): assumes the template rows are in the same order as X_test — confirm.
submission['target'] = pred
submission

Unnamed: 0,id,target
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [37]:
# Save the submission file without the pandas index column.
submission.to_csv('./submission21.csv', index=False)

# PUBLIC 0.6997740291