In [1]:
import pandas as pd
pd.set_option("display.max_columns", 200)
import numpy as np
import datetime
from glob import glob

# 4. Encoding & Interaction Features
from itertools import combinations
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from pycaret.classification import setup, get_config
from imblearn.over_sampling import ADASYN

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
rocauc = make_scorer(roc_auc_score)

from bayes_opt import BayesianOptimization
from sklearn.ensemble import ExtraTreesClassifier

# Others
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

- read data

In [2]:
pd.set_option('max_columns',100)

path = './data'

data = pd.read_csv(path + '/trainset.csv')
test = pd.read_csv( path + '/testset_final.csv')
sub = pd.read_csv( path + '/sample_submission_final.csv')

In [3]:
data['매니저생년월일'] = data['매니저생년월일'].apply(lambda x : x[:4])
data = data.drop(['매니저최초가입일' , '매니저최초서비스일' , '매니저주소'] , axis = 1 )

- 결측치 제거

In [4]:
#결측 제거
data[['반려동물','우선청소','매니저사용휴대폰','매니저이동방법']] = data[['반려동물','우선청소','매니저사용휴대폰','매니저이동방법']].fillna('정보없음')
data[['부재중여부','CS교육이수여부','청소교육이수여부']] = data[['부재중여부','CS교육이수여부','청소교육이수여부']].fillna(0)

test[['반려동물','우선청소','매니저사용휴대폰','매니저이동방법']] = test[['반려동물','우선청소','매니저사용휴대폰','매니저이동방법']].fillna('정보없음')
test[['부재중여부','CS교육이수여부','청소교육이수여부']] = test[['부재중여부','CS교육이수여부','청소교육이수여부']].fillna(0)

In [5]:
# test 결재형태 결측 제거 - train의 결재형태 최빈값으로 채워줌
test['결재형태'] = test['결재형태'].fillna('신용카드')

In [6]:
#평수 결측값 제거
new_feet = {
'10평대' : 0,
'20평대' : 1,
'30평대' : 2,
'40평대이상' : 3    
}

data['평수'] = data['평수'].map(new_feet).fillna(2)
test['평수'] = test['평수'].map(new_feet).fillna(2)

- Feature engineering

In [7]:
#일자 데이터 datetime으로 변경
time_cols = ['접수일','접수시각', '최초서비스일', '서비스일자', '서비스시작시간','서비스종료시간', '고객가입일']

data[time_cols] = data[time_cols].apply(pd.to_datetime)
test[time_cols] = test[time_cols].apply(pd.to_datetime)

In [8]:
#서비스요일 변경
data['서비스요일'] = data['서비스일자'].apply(lambda x: x.weekday())

test['서비스요일'] = test['서비스일자'].apply(lambda x: x.weekday())

In [9]:
#가입_최초서비스 : 최초서비스일 - 고객가입일

service_to_day = (data['서비스일자'] - data['최초서비스일']).mean()
na_index = test['고객가입일'].isna()
test.loc[na_index, '고객가입일'] = test[test['고객가입일'].isna()].apply(lambda x : x['최초서비스일'] - service_to_day, axis = 1)

In [10]:
# 고객가입일_time

data['고객가입_year'] = data['고객가입일'].dt.year
data['고객가입_month'] = data['고객가입일'].dt.month
data['고객가입_day'] = data['고객가입일'].dt.day
data['고객가입_요일'] = data['고객가입일'].dt.weekday

test['고객가입_year'] = test['고객가입일'].dt.year
test['고객가입_month'] = test['고객가입일'].dt.month
test['고객가입_day'] = test['고객가입일'].dt.day
test['고객가입_요일'] = test['고객가입일'].dt.weekday

In [11]:
# 서비스 주소 == 근무가능주소 
add_lst = list(data['서비스주소'].unique())

add_seoul = [i for i in add_lst if '서울' in i]
add_ch = [i for i in add_lst if '충남' in i]
add_gw = [i for i in add_lst if '강원' in i]
add_pt = [i for i in add_lst if '평택' in i]
add_gj = [i for i in add_lst if '광주' in i]
add_city = [i for i in add_lst if ('서울' in i) or ('경기' in i)]
add_bs = [i for i in add_lst if '부산' in i]

def add_modify(address):
    if address in add_seoul: return '서울'
    elif address in add_ch: return '천안/아산'
    elif address in add_gw: return '원주/춘천'
    elif address in add_pt: return '평택'
    elif address in add_gj: return '광주'
    elif address in add_city: return '수도권'
    elif address in add_bs: return '부산'
    else: return '기타'

def matching_add(service, possible):
    if service == possible:
        return 1
    elif (service == '서울') and (possible == '수도권'):
        return 1
    else :
        return 0

data['서비스주소_2'] = data['서비스주소'].apply(add_modify)
data['지역_매칭'] = data.apply(lambda x: matching_add(x.서비스주소_2,x.근무가능지역),axis=1)
data.drop('서비스주소_2',axis = 1,inplace = True)

test['서비스주소_2'] = test['서비스주소'].apply(add_modify)
test['지역_매칭'] = test.apply(lambda x: matching_add(x.서비스주소_2,x.근무가능지역),axis=1)
test.drop('서비스주소_2',axis = 1,inplace = True)

In [12]:
# 총서비스시간
data['서비스시작시간'] = data['서비스시작시간'].apply(lambda x: x.hour)
data['서비스종료시간'] = data['서비스종료시간'].apply(lambda x: x.hour)

data['총서비스시간'] = data['서비스종료시간'] - data['서비스시작시간']

test['서비스시작시간'] = test['서비스시작시간'].apply(lambda x: x.hour)
test['서비스종료시간'] = test['서비스종료시간'].apply(lambda x: x.hour)

test['총서비스시간'] = test['서비스종료시간'] - test['서비스시작시간']

In [13]:
#서비스 월
data['서비스월'] = data['서비스일자'].apply(lambda x: x.month)

test['서비스월'] = test['서비스일자'].apply(lambda x: x.month)

In [14]:
#서비스일자와 접수일자 차이
def 서비스_접수_차이(service_day, recep_day):
    time = service_day - recep_day
    return time.days

data['서비스-접수-차이'] = data.apply(lambda x: 서비스_접수_차이(x.서비스일자,x.접수일),axis=1)
test['서비스-접수-차이'] = test.apply(lambda x: 서비스_접수_차이(x.서비스일자,x.접수일),axis=1)

In [15]:
#매니저 연령대

def age(recep_day,birth):
    return recep_day.year - int(birth) + 1
       
data['매니저연령'] = data.apply(lambda x: age(x.접수일,x.매니저생년월일),axis=1)
data['매니저연령대'] = data['매니저연령'].apply(lambda x: int(str(x)[0]))
data['매니저연령_qcut'] = data['매니저연령'].apply(lambda x: 0 if x <= 50 else ( 1 if x <= 52 else ( 2 if x <= 56 else ( 3 if x <= 60 else 4))))

test['매니저연령'] = test.apply(lambda x: age(x.접수일,x.매니저생년월일),axis=1)
test['매니저연령대'] = test['매니저연령'].apply(lambda x: int(str(x)[0]))
test['매니저연령_qcut'] = test['매니저연령'].apply(lambda x: 0 if x <= 50 else ( 1 if x <= 52 else ( 2 if x <= 56 else ( 3 if x <= 60 else 4))))

In [16]:
#부재중 서비스가능여부와 부재중여부 일치
def 부재중(possible,whether):
    return '서비스가능여부' + str(possible) + '/부재중여부' + str(whether)

data['부재중_일치'] = data.apply(lambda x : 부재중(x.부재중서비스가능여부,x.부재중여부),axis = 1)

test['부재중_일치'] = test.apply(lambda x : 부재중(x.부재중서비스가능여부,x.부재중여부),axis = 1)

In [17]:
#반려동물여부
data['반려동물여부'] = data['반려동물'].apply(lambda x : 0 if x in ["없음" , '정보없음'] else 1)
test['반려동물여부'] = test['반려동물'].apply(lambda x : 0 if x in ["없음" , '정보없음'] else 1)

In [18]:
#서비스이용기간 : 서비스일자 - 최초서비스일
data['서비스이용기간'] = (data['서비스일자'] - data['최초서비스일']).astype(str).apply(lambda x : int(x[:-5]))

test['서비스이용기간'] = (test['서비스일자'] - test['최초서비스일']).astype(str).apply(lambda x : int(x[:-5]))

In [19]:
#회차별 일자
data['회차별일자'] = data.apply(lambda x : x['서비스이용기간']/x['현재회차'], axis = 1)

test['회차별일자'] = test.apply(lambda x : x['서비스이용기간']/x['현재회차'], axis = 1)

In [20]:
#가입_최초서비스 : 최초서비스일 - 고객가입일
data['가입_최초서비스'] = (data['최초서비스일'] - data['고객가입일']).astype(str).apply(lambda x :int(x.split()[0]))

test['가입_최초서비스'] = (test['최초서비스일'] - test['고객가입일']).astype(str).apply(lambda x : int(x.split()[0]))

In [21]:
#접수시간
data['접수시간'] = data['접수시각'].dt.hour.fillna(-1)

test['접수시간'] = test['접수시각'].dt.hour.fillna(-1)

In [22]:
#서비스진행비율
data['서비스진행비율'] = data['현재회차'] / data['전체회차']

test['서비스진행비율'] = test['현재회차'] / test['전체회차']

In [23]:
#서비스연도, 서비스 계절
data['서비스연도'] = data['서비스일자'].dt.year
data['서비스계절'] = data['서비스일자'].dt.month.apply(lambda x : '겨울' if x in [12,1,2] else ( '봄' if x in [3,4,5] else ( '여름' if x in [6,7,8] else '가을')) )

test['서비스연도'] = test['서비스일자'].dt.year
test['서비스계절'] = test['서비스일자'].dt.month.apply(lambda x : '겨울' if x in [12,1,2] else ( '봄' if x in [3,4,5] else ( '여름' if x in [6,7,8] else '가을')) )

In [24]:
# 가입 유지 기간
data['이용연도'] = 2021 - data['고객가입일'].dt.year + 1

test['이용연도'] = 2021 - test['고객가입일'].dt.year + 1

In [25]:
#지역
data['지역'] = data['서비스주소'].apply(lambda x : x.split(' ')[0])
test['지역'] = test['서비스주소'].apply(lambda x : x.split(' ')[0])

location_dict = { key : num for num, key in enumerate(data['지역'].unique())}

data['지역'] = data['지역'].map(location_dict)
test['지역'] = test['지역'].map(location_dict)

In [26]:
# 서비스시작시각
# 시간이 안적힌것도 있움
data['서비스시작시각'] = data['서비스시작시간'].apply(lambda x : x // 6).apply(lambda x : 1 if x == 0 else x)

test['서비스시작시각'] = test['서비스시작시간'].apply(lambda x : x // 6).apply(lambda x : 1 if x == 0 else x)

In [27]:
# 서비스진행비율_qcut
data['서비스진행비율_qcut'] = data['서비스진행비율'].apply(lambda x: 0.2 if x <= 0.2 else (0.4 if x<=0.4 else(0.6 if x<= 0.6 else(0.8 if x<= 0.8 else 1))))

test['서비스진행비율_qcut'] = test['서비스진행비율'].apply(lambda x: 0.2 if x <= 0.2 else (0.4 if x<=0.4 else(0.6 if x<= 0.6 else(0.8 if x<= 0.8 else 1))))

In [28]:
# 서비스_접수_차이_qcut
data['서비스_접수_차이_qcut'] = data['서비스-접수-차이'].apply(lambda x: 0 if x <= 4 else ( 1 if x <= 7 else ( 2 if x <= 14 else ( 3 if x <= 21 else 4))))

test['서비스_접수_차이_qcut'] = test['서비스-접수-차이'].apply(lambda x: 0 if x <= 4 else ( 1 if x <= 7 else ( 2 if x <= 14 else ( 3 if x <= 21 else 4))))

In [29]:
# 서비스이용기간_qcut
data['서비스이용기간_qcut'] = data['서비스이용기간'].apply(lambda x: 0 if x <= 7 else ( 1 if x <= 14 else 2 ))

test['서비스이용기간_qcut'] = test['서비스이용기간'].apply(lambda x: 0 if x <= 7 else ( 1 if x <= 14 else 2 ))

In [30]:
# 접수시간_qcut
data['접수시간_qcut'] = data['접수시간'].apply(lambda x: 0 if x <= 11 else ( 1 if x <= 14 else ( 2 if x <= 18 else 3 )))

test['접수시간_qcut'] = test['접수시간'].apply(lambda x: 0 if x <= 11 else ( 1 if x <= 14 else ( 2 if x <= 18 else 3 )))

In [31]:
# 가입_최초서비스_qcut
data['가입_최초서비스_qcut'] = data['가입_최초서비스'].apply(lambda x : 0 if x <= 0 else ( 1 if x <= 37 else ( 2 if x <= 96 else ( 3 if x <= 235 else 4))))

test['가입_최초서비스_qcut'] = test['가입_최초서비스'].apply(lambda x : 0 if x <= 0 else ( 1 if x <= 37 else ( 2 if x <= 96 else ( 3 if x <= 235 else 4))))

In [32]:
# 회차별일자_qcut
data['회차별일자_qcut'] = data['회차별일자'].apply(lambda x : 0 if x<= 2 else( 1 if x <= 6 else ( 2 if x <= 9 else ( 3 if x <= 27 else 4))))

test['회차별일자_qcut'] = test['회차별일자'].apply(lambda x : 0 if x<= 2 else( 1 if x <= 6 else ( 2 if x <= 9 else ( 3 if x <= 27 else 4))))

In [33]:
# 전체회차 이상치 12로 변경
data['전체회차'] = data['전체회차'].apply(lambda x : 12 if x > 10 else x )

test['전체회차'] = test['전체회차'].apply(lambda x : 12 if x > 10 else x )

In [34]:
# 매니저_교육이수정도
data['매니저_교육이수정도'] = data['CS교육이수여부'] + data['청소교육이수여부']

test['매니저_교육이수정도'] = test['CS교육이수여부'] + test['청소교육이수여부']

In [35]:
# cus_type
data['cus_type'] = (data['기존고객여부'].astype(str) + data['결재형태'] + data['평수'].astype(str) + data['반려동물여부'].astype(str) +data['주거형태'] + data['서비스주소'])
test['cus_type'] = (test['기존고객여부'].astype(str) + test['결재형태'] + test['평수'].astype(str) + test['반려동물여부'].astype(str) +test['주거형태'] + test['서비스주소'])

house = list( set(test['cus_type'].unique()) - set(data['cus_type'].unique()))
house = house + list(set(data['cus_type'].unique()) - set(test['cus_type'].unique())) 

data['cus_type'] = data['cus_type'].apply(lambda x : "기타" if x in house else x )
test['cus_type'] = test['cus_type'].apply(lambda x : "기타" if x in house else x )

In [36]:
# cus_type_count_1

print(set(test['cus_type'] )- set(data['cus_type']))

manager_satisfaction = data.query('매칭성공여부 == 1').groupby(['cus_type'])['매칭성공여부'].agg('count').to_dict() ####
data['cus_type_count_1'] = data['cus_type'].map(manager_satisfaction)
data['cus_type_count_1'] = data['cus_type_count_1'].fillna(0)

test['cus_type_count_1'] = test['cus_type'].map(manager_satisfaction)
test['cus_type_count_1'] = test['cus_type_count_1'].fillna(0)

set()


In [37]:
# house_type
data['house_type'] = (data['평수'].astype(str) + data['반려동물여부'].astype(str) +data['주거형태'] + data['서비스주소'])
test['house_type'] = (test['평수'].astype(str) + test['반려동물여부'].astype(str) +test['주거형태'] + test['서비스주소'])

house = list( set(test['house_type'].unique()) - set(data['house_type'].unique()))
house = house + list(set(data['house_type'].unique()) - set(test['house_type'].unique())) 

data['house_type'] = data['house_type'].apply(lambda x : "기타" if x in house else x )
test['house_type'] = test['house_type'].apply(lambda x : "기타" if x in house else x )

In [38]:
# house_type_count_1

print(set(test['house_type'] )- set(data['house_type']))

manager_satisfaction = data.query('매칭성공여부 == 1').groupby(['house_type'])['매칭성공여부'].agg('count').to_dict() ####
data['house_type_count_1'] = data['house_type'].map(manager_satisfaction)
data['house_type_count_1'] = data['house_type_count_1'].fillna(0)

test['house_type_count_1'] = test['house_type'].map(manager_satisfaction)
test['house_type_count_1'] = test['house_type_count_1'].fillna(0)

set()


In [39]:
# manager_type
data['manager_type'] = (data['매니저사용휴대폰'] + data['매니저이동방법'] + data['근무가능지역'] + data['CS교육이수여부'].astype(str) + data['청소교육이수여부'].astype(str)+data['추천인여부'].astype(str))
test['manager_type'] = (test['매니저사용휴대폰'] + test['매니저이동방법'] + test['근무가능지역'] + test['CS교육이수여부'].astype(str) + test['청소교육이수여부'].astype(str)+data['추천인여부'].astype(str))

manager = list(set(data['manager_type'].unique()) - set(test['manager_type'].unique()))
manager = manager + list(set(test['manager_type'].unique()) - set(data['manager_type'].unique()))

data['manager_type'] = data['manager_type'].apply(lambda x : "기타" if x in manager else x )
test['manager_type'] = test['manager_type'].apply(lambda x : "기타" if x in manager else x )

In [40]:
# manager_type_count_1
manager_satisfaction = data.query('매칭성공여부 == 1').groupby(['manager_type'])['매칭성공여부'].agg('count').to_dict() ####
data['manager_type_count_1'] = data['manager_type'].map(manager_satisfaction)
data['manager_type_count_1'] = data['manager_type_count_1'].fillna(0)

test['manager_type_count_1'] = test['manager_type'].map(manager_satisfaction)
test['manager_type_count_1'] = test['manager_type_count_1'].fillna(0)

label encoding

In [41]:
encoding_lst = ['결재형태','주거형태','우선청소','매니저사용휴대폰','매니저이동방법','서비스계절','부재중_일치','house_type' ,'manager_type' , 'cus_type']

In [42]:
#라벨인코딩 방식
for i in encoding_lst:
    encoder = LabelEncoder()
    encoder.fit(data[i])
    data[i] = encoder.transform(data[i])
    test[i] = encoder.transform(test[i])

- 불필요한 columns drop

In [43]:
data = data.drop(['접수시각','접수일','최초서비스일','서비스일자','서비스주소','고객가입일','매니저생년월일','매니저성별','근무가능지역','반려동물' ,'house_type' ], axis =1)
test = test.drop(['접수시각','접수일','최초서비스일','서비스일자','서비스주소','고객가입일','매니저생년월일','매니저성별','근무가능지역','반려동물' ,'house_type'], axis =1)

In [44]:
data.shape

(23009, 56)

In [45]:
categorical_columns = list(set(data.columns[data.nunique() < 10]) - set(['매칭성공여부']))
numerical_columns = list(data.columns[data.nunique() >= 10])

## Interaction Features

In [46]:
available = []
not_available = 0
for num in range(1, 3) : 
    l = len([x for x in combinations(categorical_columns, num)])
    for columns in tqdm(combinations(categorical_columns, num), total = l) : 
        columns = list(columns)
        column = '_'.join(columns)
        try : 
            data[column] = data[columns].astype('str').sum(axis = 1)
            test[column] = test[columns].astype('str').sum(axis = 1)
            mean_dict = data.groupby([column])['매칭성공여부'].agg('mean').to_dict()
            data[column] = data[column].map(mean_dict)
            test[column] = test[column].map(mean_dict)
            test[column] = test[column].fillna(data.groupby([column])['매칭성공여부'].agg('mean').mean())
            available.append(column)
        except : 
            not_available += 1
print(f'{not_available} Columns Are Not Available')

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 47.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 595/595 [00:49<00:00, 11.95it/s]

0 Columns Are Not Available





In [47]:
X_train = data.drop(columns  = ['SEQ', '매칭성공여부'])
y_train = data['매칭성공여부']
X_test = test.drop(columns = ['SEQ'])

## Numerical Data -> Polynomial Features

In [62]:
len(num_columns)

44

In [48]:
num_columns = X_train.columns[X_train.nunique() > 25 ]
X_train_num = X_train.loc[:,num_columns]
X_test_num = X_test.loc[:,num_columns]

poly = PolynomialFeatures(degree = 2, include_bias=False)
columns = X_train_num.columns
X_train_num = poly.fit_transform(X_train_num)
X_test_num = poly.fit_transform(X_test_num)

columns = poly.get_feature_names(columns)
X_train_num = pd.DataFrame(X_train_num,columns = columns)
X_test_num = pd.DataFrame(X_test_num,columns = columns)

X_train = pd.concat([X_train.drop(columns = num_columns), X_train_num], axis = 1)
X_test = pd.concat([X_test.drop(columns = num_columns), X_test_num], axis = 1)

## Remove Perfect Collinearity

In [49]:
train = X_train.copy()
train['매칭성공여부'] = y_train
test = X_test.copy()

clf = setup(train, target = '매칭성공여부', train_size = 0.99, silent=  True,
                            session_id = 162,  numeric_features= list(X_train.columns),)

Unnamed: 0,Description,Value
0,session_id,162
1,Target,매칭성공여부
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(23009, 1640)"
5,Missing Values,False
6,Numeric Features,1639
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [50]:
new_columns = get_config('X_train').columns
X_train = X_train[new_columns]
X_test = X_test[new_columns]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(23009, 1378)
(23009,)
(4244, 1378)


In [51]:
ros = ADASYN(sampling_strategy=0.2, random_state=1000)
X_train, y_train = ros.fit_resample(X_train, y_train)

X_train = pd.DataFrame(X_train, columns = new_columns)
X_test = pd.DataFrame(X_test, columns = new_columns)

In [52]:
train = X_train.copy()
train['매칭성공여부'] = y_train
test = X_test.copy()

### Save Feature Engineered Data

In [53]:
train.to_csv('./data/submission_3_train_data.csv', index = False)
test.to_csv('./data/submission_3_test_data.csv', index = False)

# Modeling

### Reload Data

In [54]:
train = pd.read_csv('./data/submission_3_train_data.csv')
test = pd.read_csv('./data/submission_3_test_data.csv')

## 1) Set Data For Modeling

In [55]:
X_train = train.drop(columns  = ['매칭성공여부'])
y_train = train['매칭성공여부']
X_test = test.copy()

## 2) Bayesian Optimization - ExtraTreesClassifier

In [57]:
def bayes_tune_ext(X_train, y_train, X_test, init_iter, n_iter) : 
    ext_pbounds = {
                'n_estimators': (30, 1000),
                'max_depth': (50,300),
                'min_samples_leaf' : (1,2 *3),
    }

    def ext_opt(n_estimators, max_depth, min_samples_leaf):

        params = {
            'n_estimators' : int(round(n_estimators)),
            'max_depth' : int(round(max_depth)),
            'min_samples_leaf' : int(round(min_samples_leaf)),
            'n_jobs' : -1,
        }

        ext = ExtraTreesClassifier(**params)
        skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=50)
        score = cross_val_score(ext, X_train, y_train , scoring = rocauc, cv=skf, n_jobs=-1)

        return np.mean(sorted(score)[:2])

    BO_ext = BayesianOptimization(f = ext_opt, pbounds = ext_pbounds, random_state=1) 

    BO_ext.maximize(init_points=init_iter, n_iter=n_iter)

    max_params = BO_ext.max['params']

    max_params['n_estimators'] = int(round(max_params['n_estimators']))
    max_params['max_depth'] = int(round(max_params['max_depth']))
    max_params['min_samples_leaf'] = int(round(max_params['min_samples_leaf']))
    max_params['n_jobs'] = -1

    print(max_params)

    ext_clf = ExtraTreesClassifier(**max_params)
    scores = cross_val_score(ext_clf, X_train, y_train, scoring=rocauc, cv=4, n_jobs=-1)

    print(scores)
    print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')
    return ext_clf

# 튜닝 중간 과정은 생략 -> 파라미터
# ext_clf = bayes_tune_ext(X_train, y_train, X_test, 1, 1)

params = {'max_depth': 166, 'min_samples_leaf': 1, 'n_estimators': 583, 'n_jobs': -1}
ext_clf = ExtraTreesClassifier(**params)

## 3) Make Prediction (StratifiedKFold)

In [58]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
ext_preds = []
for num, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    ext_X_train = X_train.iloc[train_idx]
    ext_y_train = y_train.iloc[train_idx]
    
    ext_X_valid = X_train.iloc[valid_idx]
    ext_y_valid = y_train.iloc[valid_idx]
    
    ext_clf.fit(ext_X_train, ext_y_train)
    
    pred = ext_clf.predict_proba(ext_X_valid)[:,1]
    score = roc_auc_score(ext_y_valid, pred)
    pred = ext_clf.predict_proba(X_test)[:,1]
    
    print('FOLD : ', num)
    print('SCORE : ', score)
    print('')
    
    ext_preds.append(pred)

FOLD :  0
SCORE :  0.9771649051165026

FOLD :  1
SCORE :  0.9793675694423415

FOLD :  2
SCORE :  0.9839957895700328

FOLD :  3
SCORE :  0.98023472002929

FOLD :  4
SCORE :  0.979828407816755



## 4) Save Submission File

In [60]:
today = datetime.datetime.today()
today = str(today.month) + str(today.day)
ext_preds_final = np.mean(ext_preds, axis = 0)

sample= pd.read_csv('./data/sample_submission_final.csv')
sample['pred'] = ext_preds_final

num = len(glob(f'./submissions/재성_{today}_ext_*'))
sample.to_csv(f'./submissions/submission_3.csv', index=False)

print(f' Submission File Saved In : "./submissions/submission_3.csv"')

print(sample['pred'].mean())

 Submission File Saved In : "./submissions/재성_1124_ext_1.csv"
0.12600806223456681
[[1.         0.99051877]
 [0.99051877 1.        ]]
