In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold # CV splitter used by the grid-search cells

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:
        print('\n'.join(full_paths))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data 살펴보기

In [None]:
# Load the three competition files: training data, test data and the sample submission.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sejongai-challenge-pretest-1/train.csv',
        '/kaggle/input/sejongai-challenge-pretest-1/test_data.csv',
        '/kaggle/input/sejongai-challenge-pretest-1/submit_sample.csv',
    )
)

In [None]:
# Preview the first rows of the training and test frames.
display(train_df.head(), test_df.head())

In [None]:
# Remove the 'Unnamed: 0' column from both frames (presumably a saved row index - confirm).
# The frames are rebound rather than mutated in place.
train_df = train_df.drop(columns=['Unnamed: 0'])
test_df = test_df.drop(columns=['Unnamed: 0'])

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Summary statistics for the training columns.
train_df.describe()

In [None]:
# Frequency of each value in column '8', which later cells use as the class label.
train_df['8'].value_counts()

## Baseline Model

In [None]:
from sklearn.model_selection import train_test_split

# Work on a copy so that later row drops never touch train_df itself.
df_copy = train_df.copy()

# Every column except the last is a feature; the last column is the target.
X_features, y_target = df_copy.iloc[:, :-1], df_copy.iloc[:, -1]

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.3,
    random_state=0,
)

print((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))

In [None]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix, accuracy, precision, recall and ROC-AUC.

    Args:
        y_test: True labels.
        pred: Predicted labels, compared against ``y_test``.
        pred_proba: Scores handed to ``roc_auc_score``; the callers in this
            notebook pass the second ``predict_proba`` column.

    Returns:
        None. The metrics are only printed.
    """
    confusion = confusion_matrix(y_test, pred)
    # Label-based metrics first, then ROC-AUC computed from the scores.
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        roc_auc_score(y_test, pred_proba),
    )

    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, AUC:{3:.4f}'.format(*scores))
    print('\n')

In [None]:
# Baseline models, all fitted on the same training split. For each model keep the
# predicted labels and the second predict_proba column (used as the score) on X_test.

# Logistic regression with scikit-learn defaults.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred, lr_pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)[:, 1]

# Random forest.
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0)
rf_clf.fit(X_train, y_train)
rf_pred, rf_pred_proba = rf_clf.predict(X_test), rf_clf.predict_proba(X_test)[:, 1]

# XGBoost with early stopping.
# NOTE(review): the hold-out split is also passed as an early-stopping eval set,
# so the scores reported for this model may be optimistic.
xgb_clf = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1.0,
    random_state=0,
)
xgb_clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)
xgb_pred, xgb_pred_proba = xgb_clf.predict(X_test), xgb_clf.predict_proba(X_test)[:, 1]

# Single decision tree.
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=0)
dt_clf.fit(X_train, y_train)
dt_pred, dt_pred_proba = dt_clf.predict(X_test), dt_clf.predict_proba(X_test)[:, 1]

In [None]:
def get_model_train_eval(y_test, clf, clf_pred, clf_pred_proba):
    """Print a header with the classifier's class name, then its metrics.

    Args:
        y_test: True labels.
        clf: Fitted classifier; only its class name is used here.
        clf_pred: Predicted labels.
        clf_pred_proba: Scores forwarded to ``get_clf_eval`` for ROC-AUC.
    """
    model_name = type(clf).__name__
    print('## {} 예측 성능'.format(model_name))
    get_clf_eval(y_test, clf_pred, clf_pred_proba)
    print('\n')

# Report all four baseline models in the same order they were fitted.
for fitted_clf, labels, scores in (
    (lr_clf, lr_pred, lr_pred_proba),
    (rf_clf, rf_pred, rf_pred_proba),
    (xgb_clf, xgb_pred, xgb_pred_proba),
    (dt_clf, dt_pred, dt_pred_proba),
):
    get_model_train_eval(y_test, fitted_clf, labels, scores)

## EDA & feature engineering

### IQR을 이용한 이상치 제거

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Pairwise correlation of the train_df columns, drawn as an annotated heatmap.
corr = train_df.corr()
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(corr, cmap='RdBu', annot=True, ax=ax)

In [None]:
# The features most negatively correlated with the class label are '1' and '5'.
# Outliers are searched for (and later removed) on feature '1' only.

# Feature '1' restricted to the rows whose label equals 1.
diabetes = train_df.loc[train_df['8'] == 1, '1']
quantile_25, quantile_75 = np.percentile(diabetes.values, [25, 75])

# Outlier fences: 1.5 * IQR below the first quartile and above the third quartile.
iqr = quantile_75 - quantile_25
iqr_weight = iqr * 1.5
lowest_val, highest_val = quantile_25 - iqr_weight, quantile_75 + iqr_weight

is_outlier = (diabetes < lowest_val) | (diabetes > highest_val)
outlier_index = diabetes[is_outlier].index
print('이상치 데이터 인덱스: ', outlier_index)

In [None]:
# Remove the feature-'1' outliers and renumber the rows.
# The frame is rebuilt from train_df instead of dropping in place: outlier_index holds
# train_df labels, and an in-place drop followed by reset_index would remove different
# rows if this cell were executed a second time.
df_copy = train_df.drop(index=outlier_index).reset_index(drop=True)

# Every column except the last is a feature; the last column is the target.
X_features = df_copy.iloc[:, :-1]
y_target = df_copy.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)

print((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))

In [None]:
# Re-fit the estimators created in the baseline cell (same hyperparameters) on the
# outlier-filtered split, then refresh their hold-out predictions.

lr_clf.fit(X_train, y_train)
lr_pred, lr_pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)[:, 1]

rf_clf.fit(X_train, y_train)
rf_pred, rf_pred_proba = rf_clf.predict(X_test), rf_clf.predict_proba(X_test)[:, 1]

xgb_clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)
xgb_pred, xgb_pred_proba = xgb_clf.predict(X_test), xgb_clf.predict_proba(X_test)[:, 1]

dt_clf.fit(X_train, y_train)
dt_pred, dt_pred_proba = dt_clf.predict(X_test), dt_clf.predict_proba(X_test)[:, 1]

In [None]:
# Report the four re-fitted models in the same order as before.
for fitted_clf, labels, scores in (
    (lr_clf, lr_pred, lr_pred_proba),
    (rf_clf, rf_pred, rf_pred_proba),
    (xgb_clf, xgb_pred, xgb_pred_proba),
    (dt_clf, dt_pred, dt_pred_proba),
):
    get_model_train_eval(y_test, fitted_clf, labels, scores)

In [None]:
# RF 특성 중요도
print("{} 특성 중요도 : \n{}".format(rf_clf.__class__.__name__, rf_clf.feature_importances_))


# 특성 중요도 시각화 하기
def plot_feature_importances_cancer(model):
    n_features = X_features.shape[1]
    plt.barh(range(n_features), rf_clf.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), X_features.columns)
    plt.xlabel("feature importances")
    plt.ylabel("feature")
    plt.ylim(-1, n_features)
    plt.title('Random Forest Feature Importance')

plt.show()

plot_feature_importances_cancer(rf_clf)

In [None]:
# XGB 특성 중요도
print("{} 특성 중요도 : \n{}".format(xgb_clf.__class__.__name__, xgb_clf.feature_importances_))


# 특성 중요도 시각화 하기
def plot_feature_importances_cancer(model):
    n_features = X_features.shape[1]
    plt.barh(range(n_features), xgb_clf.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), X_features.columns)
    plt.xlabel("feature importances")
    plt.ylabel("feature")
    plt.ylim(-1, n_features)
    plt.title('XGB Feature Importance')

plt.show()

plot_feature_importances_cancer(xgb_clf)

### feature 분포 확인

In [None]:
sns.set()
# One distribution plot per feature column, each on its own figure.
for col in X_features.columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.distplot(df_copy[col], ax=ax)

In [None]:
# log1p-transform every feature column (all columns except the label column '8').
# NOTE(review): log1p gives -inf at -1 and NaN below -1. Later cells filter on negative
# feature values, so confirm that no feature reaches -1 before relying on this transform.
log_df = np.log1p(df_copy.drop(columns='8'))
log_df

In [None]:
# Features are the log-transformed frame; the target is still the last column of df_copy.
X_features, y_target = log_df, df_copy.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.3,
    random_state=0,
)

print((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))

In [None]:
# Fresh estimators fitted on the current split. For each model keep the predicted
# labels and the second predict_proba column (used as the score) on X_test.

# Logistic regression with scikit-learn defaults.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred, lr_pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)[:, 1]

# Random forest.
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0)
rf_clf.fit(X_train, y_train)
rf_pred, rf_pred_proba = rf_clf.predict(X_test), rf_clf.predict_proba(X_test)[:, 1]

# XGBoost with early stopping.
# NOTE(review): the hold-out split is also passed as an early-stopping eval set,
# so the scores reported for this model may be optimistic.
xgb_clf = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1.0,
    random_state=0,
)
xgb_clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)
xgb_pred, xgb_pred_proba = xgb_clf.predict(X_test), xgb_clf.predict_proba(X_test)[:, 1]

# Single decision tree.
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=0)
dt_clf.fit(X_train, y_train)
dt_pred, dt_pred_proba = dt_clf.predict(X_test), dt_clf.predict_proba(X_test)[:, 1]

In [None]:
# Report the four models fitted on the log-transformed features.
for fitted_clf, labels, scores in (
    (lr_clf, lr_pred, lr_pred_proba),
    (rf_clf, rf_pred, rf_pred_proba),
    (xgb_clf, xgb_pred, xgb_pred_proba),
    (dt_clf, dt_pred, dt_pred_proba),
):
    get_model_train_eval(y_test, fitted_clf, labels, scores)

### 각 feature마다 target값에 따른 분포 확인하여 이상치 제거

In [None]:
# Separate copy for the manual, plot-driven outlier removal below, so df_copy stays unchanged.
df_drop_outlier = df_copy.copy()

In [None]:
# Strip plot of feature '0', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='0', jitter=True)

In [None]:
# Remove rows where feature '0' is above 0.25, then renumber the index.
# The threshold was presumably chosen from the preceding strip plot.
drop_index_0 = df_drop_outlier.loc[df_drop_outlier['0'] > 0.25].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_0).reset_index(drop=True)

In [None]:
# Strip plot of feature '1', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='1', jitter=True)

In [None]:
# Remove rows where feature '1' is below -0.4, then renumber the index.
# The threshold was presumably chosen from the preceding strip plot.
drop_index_1 = df_drop_outlier.loc[df_drop_outlier['1'] < -0.4].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_1).reset_index(drop=True)

In [None]:
# Strip plot of feature '2', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='2', jitter=True)

In [None]:
# Remove rows where feature '2' is above 0.6 or below -0.2, then renumber the index.
# The thresholds were presumably chosen from the preceding strip plot.
too_high_2 = df_drop_outlier['2'] > 0.6
too_low_2 = df_drop_outlier['2'] < -0.2
drop_index_2 = df_drop_outlier.loc[too_high_2 | too_low_2].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_2).reset_index(drop=True)

In [None]:
# Strip plot of feature '3', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='3', jitter=True)

In [None]:
# Remove rows where feature '3' is above 0.00, then renumber the index.
# The threshold was presumably chosen from the preceding strip plot.
drop_index_3 = df_drop_outlier.loc[df_drop_outlier['3'] > 0.00].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_3).reset_index(drop=True)

In [None]:
# Strip plot of feature '4', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='4', jitter=True)

In [None]:
# Remove rows where feature '4' is above 0.00, then renumber the index.
# This step previously filtered on column '3' again; those rows were already removed
# by the feature-'3' step, so nothing was dropped here.
# NOTE(review): the 0.00 cut-off is carried over unchanged - confirm it against the
# feature-'4' strip plot.
drop_index_4 = df_drop_outlier.loc[df_drop_outlier['4'] > 0.00].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_4).reset_index(drop=True)

In [None]:
# Strip plot of feature '5', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='5', jitter=True)

In [None]:
# Remove rows where feature '5' is above 0.5, then renumber the index.
# The threshold was presumably chosen from the preceding strip plot.
drop_index_5 = df_drop_outlier.loc[df_drop_outlier['5'] > 0.5].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_5).reset_index(drop=True)

In [None]:
# Strip plot of feature '6', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='6', jitter=True)

In [None]:
# Remove rows where feature '6' is above 0.2, then renumber the index.
# The threshold was presumably chosen from the preceding strip plot.
drop_index_6 = df_drop_outlier.loc[df_drop_outlier['6'] > 0.2].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_6).reset_index(drop=True)

In [None]:
# Strip plot of feature '7', grouped by the label column '8'.
sns.stripplot(data=df_drop_outlier, x="8", y='7', jitter=True)

In [None]:
# Remove rows where feature '7' is above 0.5, then renumber the index.
# The threshold was presumably chosen from the preceding strip plot.
drop_index_7 = df_drop_outlier.loc[df_drop_outlier['7'] > 0.5].index
df_drop_outlier = df_drop_outlier.drop(index=drop_index_7).reset_index(drop=True)

In [None]:
# Features / target from the manually filtered frame: the last column is the target.
X_features, y_target = df_drop_outlier.iloc[:, :-1], df_drop_outlier.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.3,
    random_state=0,
)

print((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))

In [None]:
# Fresh estimators fitted on the manually filtered split; the tree-based models are
# deeper here than in the earlier cells. For each model keep the predicted labels and
# the second predict_proba column (used as the score) on X_test.

# Logistic regression with scikit-learn defaults.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred, lr_pred_proba = lr_clf.predict(X_test), lr_clf.predict_proba(X_test)[:, 1]

# Random forest.
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=0)
rf_clf.fit(X_train, y_train)
rf_pred, rf_pred_proba = rf_clf.predict(X_test), rf_clf.predict_proba(X_test)[:, 1]

# XGBoost with early stopping.
# NOTE(review): the hold-out split is also passed as an early-stopping eval set,
# so the scores reported for this model may be optimistic.
xgb_clf = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1.0,
    random_state=0,
)
xgb_clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)
xgb_pred, xgb_pred_proba = xgb_clf.predict(X_test), xgb_clf.predict_proba(X_test)[:, 1]

# Single decision tree using the entropy criterion.
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=0)
dt_clf.fit(X_train, y_train)
dt_pred, dt_pred_proba = dt_clf.predict(X_test), dt_clf.predict_proba(X_test)[:, 1]

In [None]:
# Report the four models fitted on the manually filtered data.
for fitted_clf, labels, scores in (
    (lr_clf, lr_pred, lr_pred_proba),
    (rf_clf, rf_pred, rf_pred_proba),
    (xgb_clf, xgb_pred, xgb_pred_proba),
    (dt_clf, dt_pred, dt_pred_proba),
):
    get_model_train_eval(y_test, fitted_clf, labels, scores)

### XGBClassifier GridSearchCV 1

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the first XGBoost search.
params = {'learning_rate':[0.005,0.01,0.05], 
          'max_depth':[6,7,8],
          'min_child_weight':[1.0,1.2],
          'n_estimators':[500,800,1000],
          'colsample_bytree':[0.5,0.8]}

# shuffle=True is needed for random_state to have any effect; recent scikit-learn
# raises ValueError when random_state is set on an unshuffled KFold.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_model = XGBClassifier(random_state=0)
clf_cv = GridSearchCV(grid_model, params, cv=cv, n_jobs=4, verbose=1)
clf_cv.fit(X_train, y_train)

In [None]:
print('final params', clf_cv.best_params_)   # best parameter combination found
print('best score', clf_cv.best_score_)      # best score

In [None]:
# Rebuild XGBoost with the best parameters from the first grid search, fit it with
# early stopping, and score it on the hold-out split.
xgb_clf_gs = XGBClassifier(random_state=0, **clf_cv.best_params_)
xgb_clf_gs.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)
xgb_pred_gs, xgb_pred_gs_proba = xgb_clf_gs.predict(X_test), xgb_clf_gs.predict_proba(X_test)[:, 1]

get_model_train_eval(y_test, xgb_clf_gs, xgb_pred_gs, xgb_pred_gs_proba)

### RandomForestClassifier GridSearchCV 1

In [None]:
# Hyperparameter grid for the random forest search.
params = {'max_depth':[6,7,8],
         'n_estimators':[500, 700, 900],
         'min_samples_leaf':[3,5,7,10],
         'min_samples_split':[2,3,5,10]}

# shuffle=True is needed for random_state to have any effect; recent scikit-learn
# raises ValueError when random_state is set on an unshuffled KFold.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_model_rf = RandomForestClassifier(random_state=0)
clf_cv_rf = GridSearchCV(grid_model_rf, params, cv=cv, n_jobs=4, verbose=1)
clf_cv_rf.fit(X_train, y_train)

In [None]:
print('final params', clf_cv_rf.best_params_)   # best parameter combination found
print('best score', clf_cv_rf.best_score_)      # best score

In [None]:
# Rebuild the random forest with the best grid-search parameters.
# random_state=0 matches the estimator used during the search, so the final model is
# reproducible and corresponds to the configuration that was actually scored.
rf_clf_gs = RandomForestClassifier(**clf_cv_rf.best_params_, random_state=0)
rf_clf_gs.fit(X_train, y_train)
rf_pred_gs = rf_clf_gs.predict(X_test)
rf_pred_gs_proba = rf_clf_gs.predict_proba(X_test)[:, 1]

get_model_train_eval(y_test, rf_clf_gs, rf_pred_gs, rf_pred_gs_proba)

In [None]:
result_rf2 = rf_clf_gs.predict(test_df)
ensemble_pred = 0.6 * result_xgb2 + 0.4 * result_rf2 # xgb, rf2
submission['Label'] = np.clip(ensemble_pred, 0 , max(ensemble_pred))
submission['Label'] = submission['Label'].astype(int)
submission.to_csv('diabetes_result6.csv', index=False)

### XGBClassifier GridSearchCV 2

In [None]:
params ={'learning_rate':[0.01, 0.05, 0.1],
         'silent':[True],
         'max_depth':[7,8,9],
         'min_child_weight':[1.2,1.5,1.8],
         'colsample_bytree':[0.5,0.8],
         'colsample_bylevel':[0.9],
         'n_estimators':[500, 700]}

cv = KFold(n_splits=5, random_state=1)
grid_model2 = XGBClassifier(random_state=0)
clf_cv2 = GridSearchCV(grid_model2, params, cv=cv, n_jobs=4, verbose=1)
clf_cv2.fit(X_train, y_train)

In [None]:
print('final params', clf_cv2.best_params_)   # 최적의 파라미터 값 출력
print('best score', clf_cv2.best_score_)      # 최고의 점수

In [None]:
xgb_clf_gs2 = XGBClassifier(**clf_cv2.best_params_)
xgb_clf_gs2.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_train, y_train), (X_test, y_test)])
xgb_pred_gs2 = xgb_clf_gs2.predict(X_test)
xgb_pred_gs2_proba = xgb_clf_gs2.predict_proba(X_test)[:, 1]

get_model_train_eval(y_test, xgb_clf_gs2, xgb_pred_gs2, xgb_pred_gs2_proba)

In [None]:
result_xgb2 = xgb_clf_gs2.predict(test_df)
ensemble_pred = 0.6 * result_xgb2 + 0.4 * result # xgb, xgb2
submission['Label'] = np.clip(ensemble_pred, 0 , max(ensemble_pred))
submission['Label'] = submission['Label'].astype(int)
submission.to_csv('diabetes_result5.csv', index=False)

In [None]:
# xgb with grid_search
result = xgb_clf_gs.predict(test_df)

In [None]:
ensemble_pred = 0.6 * result + 0.4 * result_rf # xgb, rf
submission['Label'] = np.clip(ensemble_pred, 0 , max(ensemble_pred))
submission['Label'] = submission['Label'].astype(int)
submission.to_csv('diabetes_result4.csv', index=False)