In [1]:
# Mount Google Drive inside the Colab runtime so files stored on Drive are
# reachable under the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/psat_summer

/content/drive/My Drive/psat_summer


In [8]:
import pandas as pd
import numpy as np
import os

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.model_selection import KFold

import pickle
import joblib

In [4]:
# Read the training table from the current working directory.
# NOTE(review): the file name suggests the rows were already oversampled with
# MWMOTE before this notebook runs — confirm, since that affects how the
# cross-validation scores below should be interpreted.
TRAIN_CSV = "train_mwmote.csv"
train = pd.read_csv(TRAIN_CSV)

In [5]:
# Split the training table into the feature matrix (every column except the
# label) and the label vector.
TARGET_COL = 'target'
x = train.drop(columns=[TARGET_COL])
y = train[TARGET_COL]

시도 1

In [14]:
# Attempt 1: 5-fold cross-validation of the baseline XGBoost configuration
# (learning_rate = 0.01).
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.01,
    'n_estimators': 400,
    'max_depth': 4,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    # If the training file was oversampled before this split, synthetic rows
    # may also leak across folds — confirm.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.218319
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.203037
Stopping. Best iteration:
[5]	validation_0-error:0.20254

              precision    recall  f1-score   support

     class 0       0.80      0.78      0.79      5020
     class 1       0.79      0.81      0.80      5057

    accuracy                           0.80     10077
   macro avg       0.80      0.80      0.80     10077
weighted avg       0.80      0.80      0.80     10077

[0]	validation_0-error:0.217525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.20135
[200]	validation_0-error:0.192518
[300]	validation_0-error:0.185869
[399]	validation_0-error:0.176938
              precision    recall  f1-score   support

     class 0       0.84      0.80      0.82      5053
     class 1       0.81      0.85      0.83      5024

    accuracy                           0.82     10077
   macro avg       0.82      0

시도 2. learning rate 조정 (0.01 -> **0.05**)

In [15]:
# Attempt 2: same 5-fold cross-validation, with learning_rate raised from
# 0.01 to 0.05.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.05,
    'n_estimators': 400,
    'max_depth': 4,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.218319
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.172075
[200]	validation_0-error:0.141014
[300]	validation_0-error:0.119182
[399]	validation_0-error:0.107572
              precision    recall  f1-score   support

     class 0       0.90      0.88      0.89      5020
     class 1       0.88      0.91      0.89      5057

    accuracy                           0.89     10077
   macro avg       0.89      0.89      0.89     10077
weighted avg       0.89      0.89      0.89     10077

[0]	validation_0-error:0.217525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.169197
[200]	validation_0-error:0.133075
[300]	validation_0-error:0.112732
[399]	validation_0-error:0.100824
              precision    recall  f1-score   support

     class 0       0.92      0.88      0.90      5053
     class 1       0.88      0.92      0.90      5024

    accuracy                           

시도 3. learning rate 조정 (0.05 -> **0.1**)

In [16]:
# Attempt 3: same 5-fold cross-validation, with learning_rate raised from
# 0.05 to 0.1.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.1,
    'n_estimators': 400,
    'max_depth': 4,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.218319
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.143694
[200]	validation_0-error:0.110053
[300]	validation_0-error:0.095862
[399]	validation_0-error:0.082763
              precision    recall  f1-score   support

     class 0       0.93      0.91      0.92      5020
     class 1       0.91      0.93      0.92      5057

    accuracy                           0.92     10077
   macro avg       0.92      0.92      0.92     10077
weighted avg       0.92      0.92      0.92     10077

[0]	validation_0-error:0.217525
Will train until validation_0-error hasn't improved in 100 rounds.


KeyboardInterrupt: ignored

시도 4. learning rate 조정 (0.1 -> **0.3**)

In [17]:
# Attempt 4: same 5-fold cross-validation, with learning_rate raised from
# 0.1 to 0.3.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.3,
    'n_estimators': 400,
    'max_depth': 4,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.218319
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.099831
[200]	validation_0-error:0.075816
[300]	validation_0-error:0.063908
[399]	validation_0-error:0.057358
              precision    recall  f1-score   support

     class 0       0.95      0.93      0.94      5020
     class 1       0.93      0.95      0.94      5057

    accuracy                           0.94     10077
   macro avg       0.94      0.94      0.94     10077
weighted avg       0.94      0.94      0.94     10077

[0]	validation_0-error:0.217525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.095961
[200]	validation_0-error:0.076412
[300]	validation_0-error:0.066687
[399]	validation_0-error:0.059145
              precision    recall  f1-score   support

     class 0       0.96      0.92      0.94      5053
     class 1       0.93      0.96      0.94      5024

    accuracy                           

시도 5. learning rate 조정 (0.3 -> **0.5**)

In [18]:
# Attempt 5: same 5-fold cross-validation, with learning_rate raised from
# 0.3 to 0.5.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.5,
    'n_estimators': 400,
    'max_depth': 4,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.218319
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.091793
[200]	validation_0-error:0.069366
[300]	validation_0-error:0.058648
[399]	validation_0-error:0.052397
              precision    recall  f1-score   support

     class 0       0.96      0.93      0.95      5020
     class 1       0.94      0.96      0.95      5057

    accuracy                           0.95     10077
   macro avg       0.95      0.95      0.95     10077
weighted avg       0.95      0.95      0.95     10077

[0]	validation_0-error:0.217525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.089114
[200]	validation_0-error:0.065496
[300]	validation_0-error:0.058748
[399]	validation_0-error:0.053389
              precision    recall  f1-score   support

     class 0       0.97      0.93      0.95      5053
     class 1       0.93      0.97      0.95      5024

    accuracy                           

n_estimators 조정 (400 -> 800) \
learning_rate = 0.5

In [19]:
# Same 5-fold cross-validation with learning_rate = 0.5, with n_estimators
# raised from 400 to 800.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.5,
    'n_estimators': 800,
    'max_depth': 4,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.218319
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.091793
[200]	validation_0-error:0.069366
[300]	validation_0-error:0.058648
[400]	validation_0-error:0.052793
[500]	validation_0-error:0.048129
[600]	validation_0-error:0.045748
[700]	validation_0-error:0.043763
[799]	validation_0-error:0.04287
              precision    recall  f1-score   support

     class 0       0.97      0.94      0.96      5020
     class 1       0.95      0.97      0.96      5057

    accuracy                           0.96     10077
   macro avg       0.96      0.96      0.96     10077
weighted avg       0.96      0.96      0.96     10077

[0]	validation_0-error:0.217525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.089114
[200]	validation_0-error:0.065496
[300]	validation_0-error:0.058748
[400]	validation_0-error:0.05319
[500]	validation_0-error:0.048923
Stopping. Best iteration:
[477]	val

max_depth 조정 (4 -> 5) \
n_estimators = 800 \
learning_rate = 0.5

In [24]:
# Same 5-fold cross-validation with learning_rate = 0.5 and
# n_estimators = 800, with max_depth raised from 4 to 5.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.5,
    'n_estimators': 800,
    'max_depth': 5,
    'min_child_weight': 2.5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

KeyboardInterrupt: ignored

min_child_weight 조정 (2.5 -> 3) \
max_depth = 5 \
n_estimators = 800 \
learning_rate = 0.5

In [27]:
# Same 5-fold cross-validation with learning_rate = 0.5, n_estimators = 800
# and max_depth = 5, with min_child_weight raised from 2.5 to 3.
# The hyper-parameters are identical for every fold, so they are defined once
# before the loop.
params = {
    'learning_rate': 0.5,
    'n_estimators': 800,
    'max_depth': 5,
    'min_child_weight': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

CM = []          # one confusion matrix per fold
f1_scores = []   # one F1 score per fold

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x)):
    train_X, valid_X = x.iloc[trn_idx], x.iloc[val_idx]
    train_y, valid_y = y.iloc[trn_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is used both to drive early stopping and
    # to compute the reported metrics, so the scores are likely optimistic.
    model = XGBClassifier(**params)
    model.fit(
        train_X,
        train_y,
        eval_set=[(valid_X, valid_y)],
        early_stopping_rounds=100,
        verbose=100,  # log the validation metric every 100 boosting rounds
    )

    y_pred = model.predict(valid_X)

    CM.append(confusion_matrix(valid_y, y_pred))
    f1_scores.append(f1_score(valid_y, y_pred))
    print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Collapse the per-fold results: element-wise total of the confusion matrices
# (kept in CM, not printed here) and the mean F1 across folds.
CM = sum(CM)
f1_scores = np.mean(f1_scores)
print("F1 score : %f" % f1_scores)

[0]	validation_0-error:0.214846
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.074328
[200]	validation_0-error:0.055969
[300]	validation_0-error:0.048427
[400]	validation_0-error:0.046343
[500]	validation_0-error:0.043565
[600]	validation_0-error:0.042572
[700]	validation_0-error:0.041778
Stopping. Best iteration:
[667]	validation_0-error:0.040389

              precision    recall  f1-score   support

     class 0       0.97      0.95      0.96      5020
     class 1       0.95      0.97      0.96      5057

    accuracy                           0.96     10077
   macro avg       0.96      0.96      0.96     10077
weighted avg       0.96      0.96      0.96     10077

[0]	validation_0-error:0.211869
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.072343
[200]	validation_0-error:0.053389
[300]	validation_0-error:0.047236
[400]	validation_0-error:0.043664
[500]	validation_0-error:0.042671
[600]	

최종 모델 학습 및 제출 파일 저장

In [38]:
# Final classifier: the same hyper-parameter values as the last
# cross-validation cell (learning_rate 0.5, 800 estimators, depth 5,
# min_child_weight 3).
final_params = {
    'learning_rate': 0.5,
    'n_estimators': 800,
    'max_depth': 5,
    'min_child_weight': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_lambda': 3,
    'objective': 'binary:logistic',
    'random_state': 0,
}
model = XGBClassifier(**final_params)

In [39]:
# Fit the final classifier on all rows of x / y (no hold-out split).
# NOTE(review): no eval_set is passed here, so there is no early stopping;
# presumably every configured boosting round is trained, whereas the
# cross-validation runs stopped early on some folds — confirm this is intended.
model.fit(x, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0,
              learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=None, n_estimators=800, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=3, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7, verbosity=1)

In [40]:
# Read the test table from the current working directory.
TEST_CSV = "test_pca.csv"
test = pd.read_csv(TEST_CSV)

In [41]:
# Display the test table's dimensions as (rows, columns).
test.shape

(2000, 189)

In [42]:
# Predict a class label for every test row with the final model.
# assumes `test` carries the same feature columns as `x` — TODO confirm
prediction = model.predict(test)

In [43]:
# Build the submission frame with one row per prediction and ids numbered
# from 1. The id range is derived from the number of predictions instead of a
# hard-coded 2001, so the cell keeps working if the test set size changes.
submission = pd.DataFrame(columns = ['id', 'target'])
submission['id'] = range(1, len(prediction) + 1)

In [44]:
# Fill the 'target' column with the predicted labels.
submission = submission.assign(target=prediction)

In [45]:
# Sanity check: count how many test rows were assigned to each class.
# NOTE(review): the recorded run predicts far fewer 1s than 0s, while the
# cross-validation folds were close to balanced — verify this matches the
# expected test distribution.
submission['target'].value_counts()

0    1843
1     157
Name: target, dtype: int64

In [46]:
# Write the submission file: header row included (the to_csv default),
# DataFrame index omitted.
SUBMISSION_CSV = "xgboost_ver2.csv"
submission.to_csv(SUBMISSION_CSV, index=False)