In [None]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

from sklearn.preprocessing import OneHotEncoder
import random
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
!pip install Bayesian-Optimization

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score


In [None]:
import os

# Work from the competition folder so the bare CSV file names below resolve.
os.chdir('/content/drive/MyDrive/data/dacon/신용카드 사용자 연체 예측 AI')

# Training data, test data, and the submission template, in that order.
train, test, submit = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [None]:
# Number of missing values per column
train.isna().sum()

In [None]:
# Share of missing values per column; only columns that have any are shown
missing_df = (
    train.isna().sum()
    .rename_axis('column')
    .reset_index(name='count')
)
missing_df['ratio'] = missing_df['count'] / len(train)
missing_df[missing_df['ratio'].ne(0)]

In [None]:
# Basic summary statistics of the training data
train.describe()

In [None]:
# Partition the training columns by dtype: object-dtype columns go to
# object_col, every other column goes to int_col.
object_col, int_col = [], []
for col in train.columns:
    (object_col if train[col].dtype == 'object' else int_col).append(col)

In [None]:
# Categorical (object-dtype) columns
object_col

In [None]:
# Non-object columns (treated as continuous)
int_col

In [None]:
# Check the class balance of the target variable
train['credit'].value_counts().plot.bar()

In [None]:
# Bar chart of the value counts of every categorical column
for col in object_col:
    level_counts = train[col].value_counts()
    level_counts.plot.bar()
    plt.title(col)
    plt.show()

In [None]:
# Distribution of every non-object column, with nulls excluded.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay and a
# density-scaled histogram is the replacement for the same picture.
for col in int_col:
    sns.histplot(train.loc[train[col].notna(), col], kde=True, stat='density')
    plt.title(col)
    plt.show()

In [None]:
# Summary statistics of the non-object columns
train.loc[:,int_col].describe()

In [None]:
# Names of the non-object columns other than the target, in sorted order
numerical_feature = np.sort(
    list(set(train.columns).difference(object_col, ['credit']))
)
numerical_feature


In [None]:
# Preview the non-object columns of the training data
train.loc[:,int_col]

In [None]:
# Pairwise (two- and three-variable) exploration of the numeric features,
# coloured by the credit class
sns.pairplot(
    train.loc[:, [*numerical_feature, 'credit']],
    hue='credit',
    x_vars=numerical_feature,
    y_vars=numerical_feature,
)
plt.show()

In [None]:
# Explore the relationship between every categorical column and every numeric
# feature, split by credit class (one box plot per pair).
unique_list = train['credit'].unique()
# Rows with any missing value are left out of the plots; the filtered frame is
# identical for every pair, so build it once instead of once per figure.
train_no_na = train.dropna()
for row in object_col:
    for col in numerical_feature:
        plt.figure(figsize=(12, 6))
        sns.boxplot(x=row, y=col, hue='credit', data=train_no_na)
        plt.title(f"{row} - {col}")
        plt.show()



In [None]:
# Preview the non-object columns again (same expression as the earlier preview cell)
train.loc[:,int_col]

In [None]:

# Remove the 'index' column and replace every missing value, in any column,
# with the literal string 'NAN' — same treatment for train and test.
train = train.drop(columns=['index']).fillna('NAN')

test = test.drop(columns=['index']).fillna('NAN')

In [None]:
# MinMaxScaler
# from sklearn import preprocessing as pc

# for i in int_col:
#     if i == 'credit':
#         continue
#     train[[i]] =pc.MinMaxScaler((0,5)).fit_transform(train[[i]])
# # titanic[['Fare']] = pc.MinMaxScaler((0,10)).fit_transform(titanic[['Fare']])

In [None]:
# Preview the categorical columns after missing values were filled
train.loc[:,object_col]

In [None]:
# Display the training frame as it stands before encoding
train

In [None]:
# One-hot encode the categorical columns.
# The encoder is fitted on train only. handle_unknown='ignore' makes a category
# that appears only in test encode as all zeros instead of raising at transform.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train.loc[:, object_col])

# Newer scikit-learn exposes get_feature_names_out and dropped
# get_feature_names; use whichever this installation provides.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
onehot_columns = _onehot_names(object_col)


def _onehot_frame(frame):
    """Return the dense one-hot encoding of ``frame[object_col]`` as a DataFrame.

    The result reuses ``frame``'s index so that the column-wise concat below
    lines up row for row whatever that index looks like.
    """
    encoded = enc.transform(frame.loc[:, object_col]).toarray()
    return pd.DataFrame(encoded, columns=onehot_columns, index=frame.index)


# Replace the raw categorical columns with their one-hot columns.
train_onehot_df = _onehot_frame(train)
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

test_onehot_df = _onehot_frame(test)
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

In [None]:
# Shuffled, stratified 5-fold split on the target; keep every
# (train_idx, valid_idx) pair so later cells can reuse the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train, train['credit']))

In [None]:
# The data is split with StratifiedKFold (5 folds) so each fold keeps a similar
# target distribution.
# LightGBM is trained with its default parameters (n_estimators capped at 1000).
# Training stops when the validation metric has not improved for 30 rounds.
# The fitted model of every fold is stored in lgb_models, keyed by fold number.

random.seed(42)  # seeds Python's `random` only; the model gets its own random_state below
lgb_models={}
# Features and target are the same for every fold, so split them off once.
X_all = train.drop(['credit'], axis=1)
y_all = train['credit']
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # folds holds positional row numbers, so select by position for X and y alike.
    X_train, X_valid = X_all.iloc[train_idx].values, X_all.iloc[valid_idx].values
    y_train, y_valid = y_all.iloc[train_idx].values, y_all.iloc[valid_idx].values
    lgb = LGBMClassifier(n_estimators=1000, random_state=42)
    # NOTE(review): early_stopping_rounds / verbose as fit() keyword arguments
    # depend on the installed LightGBM version — confirm before upgrading.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=30,
            verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


In [None]:
    # Fits whatever `lgb`, X_train/y_train and X_valid/y_valid currently hold in
    # the kernel (they are assigned inside the fold-training loop), using the same
    # eval sets and early-stopping settings as that loop.
    # NOTE(review): looks like a leftover experiment — confirm it is still needed.
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)

In [None]:
    # Same call as the fold-training loop's fit: fits the current `lgb` object on
    # the X_train/y_train left in the kernel, evaluating on train and validation.
    # NOTE(review): this call appears twice in a row — confirm whether both cells are intended.
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)

In [None]:
def bayesOpt(train_x, train_y):
    """Run Bayesian optimisation over ``lgb_evaluate`` and return the optimizer.

    ``train_x`` and ``train_y`` are accepted but not referenced in this body;
    ``lgb_evaluate`` is handed to the optimizer unchanged.

    The search uses 5 initial points followed by 30 iterations. The optimizer's
    ``res`` attribute is printed before the optimizer itself is returned.
    """
    # Bounds for each argument of lgb_evaluate that the optimizer may vary.
    search_space = {
        'numLeaves': (5, 90),
        'maxDepth': (2, 90),
        'scaleWeight': (1, 10000),
        'minChildWeight': (0.01, 70),
        'subsample': (0.4, 1),
        'colSam': (0.4, 1),
    }
    optimizer = BayesianOptimization(lgb_evaluate, search_space)
    optimizer.maximize(init_points=5, n_iter=30)
    print(optimizer.res)
    return optimizer

In [None]:
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam, output = 'score'):
    """Objective for the Bayesian search: score one LGBMClassifier configuration.

    Every sampled value is now forwarded to the classifier. Previously
    ``numLeaves``, ``maxDepth``, ``subsample`` and ``colSam`` were ignored in
    favour of fixed constants, so the search over them had no effect.

    Parameters
    ----------
    numLeaves, maxDepth : float
        Sampled tree-shape values. The optimizer proposes floats, so they are
        rounded to the nearest int before being passed as ``num_leaves`` and
        ``max_depth``.
    scaleWeight, minChildWeight, subsample, colSam : float
        Passed through as ``scale_pos_weight``, ``min_child_weight``,
        ``subsample`` and ``colsample_bytree``.
    output : str
        ``'score'`` returns the mean of the 5-fold ``roc_auc`` scores;
        ``'model'`` returns the configured, unfitted classifier. Any other
        value returns ``None``.

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train`` rather than taking the
    data as arguments.
    NOTE(review): ``roc_auc`` and ``scale_pos_weight`` presume a binary target —
    confirm against the number of classes in ``credit``.
    NOTE(review): LightGBM may need a non-zero ``subsample_freq`` for
    ``subsample`` to take effect — confirm.
    """
    reg = LGBMClassifier(
        num_leaves=int(round(numLeaves)),
        max_depth=int(round(maxDepth)),
        scale_pos_weight=scaleWeight,
        min_child_weight=minChildWeight,
        subsample=subsample,
        colsample_bytree=colSam,
        learning_rate=0.05,
        n_estimators=20,
    )

    # The caller only wants the configured estimator; no need to pay for CV.
    if output == 'model':
        return reg

    if output == 'score':
        scores = cross_val_score(reg, X_train, y_train, cv=5, scoring='roc_auc')
        return np.mean(scores)

In [None]:
# Labels currently bound to y_train (assigned inside the fold-training loop)
y_train

In [None]:
# Run the Bayesian hyper-parameter search and keep the optimizer object.
# The arguments are the X_train / y_train left in the kernel by the fold-training loop.
lgbBO = bayesOpt(X_train, y_train)

In [None]:
# Average the class probabilities predicted by every stored fold model and
# write them into the prediction columns (everything after the first column).
# Averaging over however many models exist avoids a hard-coded fold count, and
# assigning whole columns avoids accumulating floats into columns that were
# just set to integer 0.
fold_probas = [model.predict_proba(test) for model in lgb_models.values()]
submit[submit.columns[1:]] = np.mean(fold_probas, axis=0)

In [None]:
# Inspect the filled-in submission frame
submit

In [None]:
# Write the submission to CSV without the DataFrame index.
# NOTE(review): the trailing number is presumably the score this file obtained — confirm.
submit.to_csv('baseline_submission2.csv', index=False) # 0.7272812144
