In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from jun_function import fmerge, kwd_len_mean, sess_dt_std
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
warnings.filterwarnings(action='ignore')
%matplotlib inline

# Data Loading

In [None]:
# Load the training label table and the engineered feature table.
target = pd.read_csv('cust_train.csv', encoding='UTF-8')
features = pd.read_csv('features.csv', encoding='UTF-8')

In [None]:
# Drop the first column by position.
# NOTE(review): presumably a saved index column from features.csv — confirm.
# Not idempotent: each re-run of this cell removes one more column.
features = features.iloc[:, 1:]

In [None]:
# Feature scaling (options considered: log transform, normalization, standardization ...).
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardize every column to zero mean and unit variance.
# The previous MinMaxScaler pass was removed: min-max scaling is a per-column
# affine map, and standardizing afterwards cancels it out, so the result is the
# same as standardizing the raw values directly.
# NOTE(review): the scaler is fit on every row of `features`, including rows
# that are later split off as the test set — confirm this is intended.
ss = StandardScaler()
features = ss.fit_transform(features)

In [None]:
# Wrap the 2-D array back into a DataFrame; columns are named by position
# as strings ("0", "1", ...).
features = pd.DataFrame(
    features,
    columns=list(map(str, range(features.shape[1]))),
)

In [None]:
# Re-read the raw feature file; the cells below use it for its CLNT_ID column.
features_clnt = pd.read_csv('features.csv', encoding = 'UTF-8')

In [None]:
# Append CLNT_ID as the last column. concat(axis=1) aligns on the row index,
# so this relies on both frames having the same index.
features = pd.concat([features, features_clnt[['CLNT_ID']]], axis=1)

In [None]:
# Load the sparse-PCA feature table; it is merged on CLNT_ID in the next cell.
sparse_pca = pd.read_csv('sparse_pca.csv', encoding = 'UTF-8')

In [None]:
# Join the scaled features with the sparse-PCA features on CLNT_ID.
# Later cells split and re-attach rows purely by position, so the merge must
# keep exactly one output row per input row:
#   * validate='one_to_one' makes pandas raise if CLNT_ID is duplicated on
#     either side (which would multiply rows);
#   * the length check catches customers missing from sparse_pca (which an
#     inner join would silently drop).
master = features.merge(sparse_pca, on='CLNT_ID', how='inner', validate='one_to_one')
assert len(master) == len(features), (
    f'merge changed the row count: {len(features)} -> {len(master)}'
)

In [None]:
# CLNT_ID was only needed as the merge key; remove it from the model inputs.
master = master.drop(columns='CLNT_ID')

In [None]:
# Split by position: the leading rows are the labelled (training) customers,
# the remaining rows are the test customers.
# The boundary is taken from the label table instead of the hard-coded 263104,
# so the split cannot drift out of sync with `target` (train_test_split below
# requires both to have the same number of rows anyway).
n_train = len(target)
master_tr = master.iloc[:n_train, :]
master_te = master.iloc[n_train:, :]

# Data Split

In [None]:
# Short aliases for the labelled / unlabelled feature matrices.
data, data_te = master_tr, master_te
# Keep only the label column. Not idempotent: after this runs, `target` is a
# Series and a second run of the cell fails.
target = target.loc[:, 'LABEL']

In [None]:
# Hold out 30% of the labelled rows for validation, with a fixed seed.
# NOTE(review): the split is not stratified, while the CV folds used later
# are — consider stratify=target.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=0,
)

# Modeling

In [None]:
import lightgbm as lgbm
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_validate
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

### LGBM_BO (6시간)

In [None]:
# Search space for the Bayesian optimization of LightGBM (continuous bounds;
# integer-valued parameters are rounded inside the objective).
pbounds = { 'learning_rate': (0.05, 1.5),
            'n_estimators': (50, 500),
            'max_depth': (3,10),
            'subsample': (0.8,0.95),
            'colsample_bytree': (0.75,0.9),
            'num_leaves': (2,10),
            'min_child_weight': (1, 7)}


def lgbm_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, num_leaves, min_child_weight):
    """Objective for BayesianOptimization: cross-validated score of one LightGBM configuration.

    Every argument arrives as a float sampled from `pbounds`; n_estimators,
    max_depth and num_leaves are rounded to integers before use.

    Returns the mean `neg_log_loss` over a shuffled 4-fold StratifiedKFold on
    (X_train, y_train). Higher is better, which is what `maximize` expects.
    """
    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default subsample_freq=0 the `subsample` value
        # being tuned here would be ignored.
        'subsample_freq': 1,
        'colsample_bytree' : colsample_bytree,
        'num_leaves' : int(round(num_leaves)),
        'min_child_weight' : min_child_weight,
        'n_jobs' : -1
    }

    # Named `model` (not `lgbm`) so the `lightgbm as lgbm` module alias is not shadowed.
    model = LGBMClassifier(**params)

    skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=1)

    score = cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

    return np.mean(score)

BO_lgbm = BayesianOptimization(f = lgbm_opt, pbounds = pbounds, random_state=1)
BO_lgbm.maximize(init_points=50, n_iter=50)
max_params = BO_lgbm.max['params']

# The optimizer reports floats; restore the integer-valued parameters.
max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['num_leaves'] = int(round(max_params['num_leaves']))
# Must match the objective, otherwise the tuned `subsample` has no effect.
max_params['subsample_freq'] = 1

# Refit the best configuration on the full training split and report its CV score.
lgbm_clf = LGBMClassifier(**max_params)
lgbm_clf.fit(X_train, y_train)
scores = cross_val_score(lgbm_clf, X_train, y_train, scoring='neg_log_loss', cv=4, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

In [None]:
# Raw feature rows from position 263104 onward (the rows not used for training),
# re-indexed from 0 so they line up with the prediction frame built below.
# The former column bound `:112760` was dropped: it is a row count used as a
# column slice and selected every column anyway.
features_clnt_te = features_clnt.iloc[263104:].reset_index()

In [None]:
# Class probabilities from the tuned LightGBM model for the unlabelled rows.
pred = pd.DataFrame(lgbm_clf.predict_proba(data_te))

# Put CLNT_ID next to the probabilities (aligned on the 0..n-1 row index).
# NOTE(review): the six labels are assigned by position; predict_proba orders
# its columns by the fitted model's classes_ — confirm that order is F20..M40.
result = pd.concat([features_clnt_te['CLNT_ID'], pred], axis=1)
result.columns = ['CLNT_ID', 'F20', 'F30', 'F40', 'M20', 'M30', 'M40']
result['CLNT_ID'] = result['CLNT_ID'].astype(int)
result

#result.to_csv('master_632_LGBM.csv',index=False)

### LR_BO (8시간)

In [None]:
# Search space for the Bayesian optimization of logistic regression.
pbounds = { 'C' : (0.1, 1),
            'max_iter' : (100, 1000)}


def lr_opt(C, max_iter):
    """Objective for BayesianOptimization: cross-validated score of one LogisticRegression configuration.

    C and max_iter arrive as floats sampled from `pbounds`. max_iter is rounded
    to an integer because LogisticRegression requires an integer iteration cap.

    Returns the mean `neg_log_loss` over a shuffled 4-fold StratifiedKFold on
    (X_train, y_train). Higher is better.
    """
    params = {
        'C' : C,
        'max_iter' : int(round(max_iter)),
        'n_jobs' : -1
    }

    lr = LogisticRegression(**params)

    skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=1)

    score = cross_val_score(lr, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

    return np.mean(score)

BO_lr = BayesianOptimization(f = lr_opt, pbounds = pbounds, random_state=1)
BO_lr.maximize(init_points=50, n_iter=50)
max_params = BO_lr.max['params']

# The optimizer reports floats; max_iter must be an integer for the final model too.
max_params['max_iter'] = int(round(max_params['max_iter']))

# Refit the best configuration on the full training split and report its CV score.
lr_clf = LogisticRegression(**max_params)
lr_clf.fit(X_train, y_train)
scores = cross_val_score(lr_clf, X_train, y_train, scoring='neg_log_loss', cv=4, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

In [None]:
# Raw feature rows from position 263104 onward (the rows not used for training),
# re-indexed from 0 so they line up with the prediction frame built below.
# The former column bound `:112760` was dropped: it is a row count used as a
# column slice and selected every column anyway.
features_clnt_te = features_clnt.iloc[263104:].reset_index()

In [None]:
# Class probabilities from the tuned logistic-regression model for the unlabelled rows.
pred = pd.DataFrame(lr_clf.predict_proba(data_te))

# Put CLNT_ID next to the probabilities (aligned on the 0..n-1 row index).
# NOTE(review): the six labels are assigned by position; predict_proba orders
# its columns by the fitted model's classes_ — confirm that order is F20..M40.
result = pd.concat([features_clnt_te['CLNT_ID'], pred], axis=1)
result.columns = ['CLNT_ID', 'F20', 'F30', 'F40', 'M20', 'M30', 'M40']
result['CLNT_ID'] = result['CLNT_ID'].astype(int)
result

# result.to_csv('master_632_LR.csv',index=False)

### RF_BO (iter당 한시간 반)

In [None]:
# Search space for the Bayesian optimization of the random forest.
pbounds = {
    'n_estimators': (50, 250),
    'max_depth': (3, 15),
    'max_features': (0.7, 0.95),
    'min_samples_split': (5, 50),
    'min_samples_leaf': (5, 50),
}


def rf_opt(n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf):
    """Objective for BayesianOptimization: cross-validated score of one RandomForest configuration.

    Arguments arrive as floats sampled from `pbounds`; everything except
    max_features is rounded to an integer before use.

    Returns the mean `neg_log_loss` over a shuffled 4-fold StratifiedKFold on
    (X_train, y_train). Higher is better.
    """
    forest = RandomForestClassifier(
        n_estimators=int(round(n_estimators)),
        max_depth=int(round(max_depth)),
        max_features=max_features,
        min_samples_split=int(round(min_samples_split)),
        min_samples_leaf=int(round(min_samples_leaf)),
        n_jobs=-1,
    )
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        forest, X_train, y_train, scoring='neg_log_loss', cv=folds, n_jobs=-1
    )
    return np.mean(fold_scores)


BO_rf = BayesianOptimization(f=rf_opt, pbounds=pbounds, random_state=1)
BO_rf.maximize(init_points=10, n_iter=10)
max_params = BO_rf.max['params']

# The optimizer reports floats; restore the integer-valued parameters in place.
max_params.update({
    key: int(round(max_params[key]))
    for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
})

# Refit the best configuration on the full training split and report its CV score.
rf_clf = RandomForestClassifier(**max_params)
rf_clf.fit(X_train, y_train)
scores = cross_val_score(rf_clf, X_train, y_train, scoring='neg_log_loss', cv=4, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

In [None]:
# Raw feature rows from position 263104 onward (the rows not used for training),
# re-indexed from 0 so they line up with the prediction frame built below.
# The former column bound `:112760` was dropped: it is a row count used as a
# column slice and selected every column anyway.
features_clnt_te = features_clnt.iloc[263104:].reset_index()

In [None]:
# Class probabilities from the tuned random forest for the unlabelled rows.
pred = pd.DataFrame(rf_clf.predict_proba(data_te))

# Put CLNT_ID next to the probabilities (aligned on the 0..n-1 row index).
# NOTE(review): the six labels are assigned by position; predict_proba orders
# its columns by the fitted model's classes_ — confirm that order is F20..M40.
result = pd.concat([features_clnt_te['CLNT_ID'], pred], axis=1)
result.columns = ['CLNT_ID', 'F20', 'F30', 'F40', 'M20', 'M30', 'M40']
result['CLNT_ID'] = result['CLNT_ID'].astype(int)
result

# result.to_csv('master_632_RF.csv',index=False)

### XGB_BO (iter당 4시간)

In [None]:
# Search space for the Bayesian optimization of XGBoost.
# Changes from the previous version:
#   * 'num_leaves' was removed — it is a LightGBM parameter that XGBoost does
#     not use, so it only added a dead dimension to the search;
#   * 'learning_rate' is capped at 1.0, the upper end of XGBoost's documented
#     range for eta.
pbounds = { 'learning_rate': (0.05, 1.0),
            'n_estimators': (50, 500),
            'max_depth': (3,10),
            'subsample': (0.8,0.95),
            'colsample_bytree': (0.75,0.9),
            'gamma': (0, 5)}


def xgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, num_leaves=None, gamma=0):
    """Objective for BayesianOptimization: cross-validated score of one XGBoost configuration.

    Arguments arrive as floats sampled from `pbounds`; n_estimators and
    max_depth are rounded to integers before use. `num_leaves` is accepted only
    so that existing calls keep working and is ignored (XGBoost has no such
    parameter).

    Returns the mean `neg_log_loss` over a shuffled 4-fold StratifiedKFold on
    (X_train, y_train). Higher is better.
    """
    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree' : colsample_bytree,
        'gamma' : gamma,
        'n_jobs' : -1
    }

    xgb = XGBClassifier(**params)

    skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=1)

    score = cross_val_score(xgb, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)

    return np.mean(score)

BO_xgb = BayesianOptimization(f = xgb_opt, pbounds = pbounds, random_state=1)
BO_xgb.maximize(init_points=10, n_iter=10)
max_params = BO_xgb.max['params']

# The optimizer reports floats; restore the integer-valued parameters.
max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))

# Refit the best configuration on the full training split and report its CV score.
xgb_clf = XGBClassifier(**max_params)
xgb_clf.fit(X_train, y_train)
scores = cross_val_score(xgb_clf, X_train, y_train, scoring='neg_log_loss', cv=4, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

In [None]:
# Raw feature rows from position 263104 onward (the rows not used for training),
# re-indexed from 0 so they line up with the prediction frame built below.
# The former column bound `:112760` was dropped: it is a row count used as a
# column slice and selected every column anyway.
features_clnt_te = features_clnt.iloc[263104:].reset_index()

In [26]:
# NOTE(review): this cell sits in the XGB section but predicts with lr_clf (the
# logistic-regression model). It looks like a leftover inspection cell; `pred`
# is reassigned from xgb_clf before `result` is built.
pred = pd.DataFrame(lr_clf.predict_proba(data_te))
pred

Unnamed: 0,0,1,2,3,4,5
0,0.076587,0.451968,0.362109,0.018908,0.037440,0.052989
1,0.319942,0.411618,0.211292,0.019998,0.018489,0.018661
2,0.109195,0.383553,0.388692,0.020166,0.035190,0.063205
3,0.017052,0.602806,0.290908,0.002618,0.043847,0.042769
4,0.182139,0.458194,0.291561,0.018170,0.025022,0.024913
...,...,...,...,...,...,...
112755,0.124490,0.258259,0.215649,0.074573,0.166346,0.160683
112756,0.028136,0.100923,0.746853,0.001462,0.016375,0.106251
112757,0.145911,0.392282,0.343681,0.029407,0.040762,0.047957
112758,0.063040,0.214862,0.360594,0.101131,0.107315,0.153057


In [None]:
# Class probabilities from the tuned XGBoost model for the unlabelled rows.
pred = pd.DataFrame(xgb_clf.predict_proba(data_te))

# Put CLNT_ID next to the probabilities (aligned on the 0..n-1 row index).
# NOTE(review): the six labels are assigned by position; predict_proba orders
# its columns by the fitted model's classes_ — confirm that order is F20..M40.
result = pd.concat([features_clnt_te['CLNT_ID'], pred], axis=1)
result.columns = ['CLNT_ID', 'F20', 'F30', 'F40', 'M20', 'M30', 'M40']
result['CLNT_ID'] = result['CLNT_ID'].astype(int)
result

# result.to_csv('master_632_XGB.csv',index=False)

### CatBoost (iter당 30분)

In [None]:
from catboost import CatBoostClassifier

# Search space for the Bayesian optimization of CatBoost.
# Tree depth and iteration count are not searched; CatBoost's defaults are used.
pbounds = {
    'learning_rate' : (0.01, 1), # step size: how strongly each boosting round corrects the current error
    'l2_leaf_reg' : (2, 20) # coefficient of the L2 regularization term on leaf values
}

def cat_opt(learning_rate,l2_leaf_reg):
    """Objective for BayesianOptimization: cross-validated score of one CatBoost configuration.

    Both arguments arrive as floats sampled from `pbounds`; l2_leaf_reg is
    rounded to an integer before use.

    Returns the mean `neg_log_loss` over a shuffled 4-fold StratifiedKFold on
    (X_train, y_train). Higher is better.
    """
    params = {
    'learning_rate' : learning_rate,
    'l2_leaf_reg' : int(round(l2_leaf_reg))
    }

    cat = CatBoostClassifier(**params)
    # shuffle=True is required here: scikit-learn raises ValueError when a
    # random_state is given together with shuffle=False. It also matches the
    # shuffled folds used by the other model searches in this notebook.
    skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=50)
    # Scoring on a single hold-out instead of CV would be much faster.
    score = cross_val_score(cat, X_train, y_train, scoring='neg_log_loss', cv=skf, n_jobs=-1)
    return np.mean(score)

BO_cat = BayesianOptimization(f = cat_opt, pbounds = pbounds, random_state=1)
BO_cat.maximize(init_points=20, n_iter=20)
max_params = BO_cat.max['params']

# The optimizer reports floats; restore the integer-valued parameter.
max_params['l2_leaf_reg'] = int(round(max_params['l2_leaf_reg']))

# Refit the best configuration on the full training split and report its CV score.
cat_clf = CatBoostClassifier(**max_params)
cat_clf.fit(X_train, y_train)
scores = cross_val_score(cat_clf, X_train, y_train, scoring='neg_log_loss', cv=4, n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

In [None]:
# Raw feature rows from position 263104 onward (the rows not used for training),
# re-indexed from 0 so they line up with the prediction frame built below.
# Fixes the broken second statement (its assignment target was missing, which
# is a syntax error) and drops the column bound `:112760`, a row count used as
# a column slice that selected every column anyway.
features_clnt_te = features_clnt.iloc[263104:].reset_index()

In [None]:
# Class probabilities from the tuned CatBoost model for the unlabelled rows.
pred = pd.DataFrame(cat_clf.predict_proba(data_te))

# Put CLNT_ID next to the probabilities (aligned on the 0..n-1 row index).
# NOTE(review): the six labels are assigned by position; predict_proba orders
# its columns by the fitted model's classes_ — confirm that order is F20..M40.
result = pd.concat([features_clnt_te['CLNT_ID'], pred], axis=1)
result.columns = ['CLNT_ID', 'F20', 'F30', 'F40', 'M20', 'M30', 'M40']
result['CLNT_ID'] = result['CLNT_ID'].astype(int)
result

# result.to_csv('master_632_catboost.csv',index=False)