In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Change the working directory to the project folder on the mounted Drive, so the relative file paths used below resolve there.
cd /content/drive/My Drive/psat_summer

/content/drive/My Drive/psat_summer


In [82]:
import pandas as pd
import numpy as np
import os

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.model_selection import KFold

from imblearn.over_sampling import SMOTE

import pickle
import joblib

In [83]:
# Read the training table from the current working directory.
train = pd.read_csv(filepath_or_buffer="train_pca.csv")

In [84]:
# Split the table into the feature frame (everything except 'target') and the label series.
x = train.drop(columns=['target'])
y = train['target']

In [101]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=1000, reg_lambda=1e-5.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 1000,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 1e-5,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.241964
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.197143
[200]	validation_0-error:0.192143
[300]	validation_0-error:0.1875
[400]	validation_0-error:0.182321
[500]	validation_0-error:0.179821
[600]	validation_0-error:0.17375
[700]	validation_0-error:0.171071
[800]	validation_0-error:0.16625
[900]	validation_0-error:0.162321
[999]	validation_0-error:0.160179
              precision    recall  f1-score   support

     class 0       0.95      0.87      0.91      5041
     class 1       0.33      0.57      0.42       559

    accuracy                           0.84      5600
   macro avg       0.64      0.72      0.66      5600
weighted avg       0.89      0.84      0.86      5600





[0]	validation_0-error:0.203214
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.2075
Stopping. Best iteration:
[0]	validation_0-error:0.203214

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5030
     class 1       0.29      0.71      0.41       570

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.203214
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.222679
Stopping. Best iteration:
[0]	validation_0-error:0.203214

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5028
     class 1       0.29      0.71      0.42       572

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.205
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.211071
Stopping. Best iteration:
[30]	validation_0-error:0.201429

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5029
     class 1       0.30      0.72      0.42       571

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.251607
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.2075
[200]	validation_0-error:0.198929
[300]	validation_0-error:0.191607
[400]	validation_0-error:0.185714
[500]	validation_0-error:0.17875
[600]	validation_0-error:0.171071
[700]	validation_0-error:0.166429
[800]	validation_0-error:0.16125
[900]	validation_0-error:0.156786
[999]	validation_0-error:0.151964
              precision    recall  f1-score   support

     class 0       0.95      0.88      0.91      5063
     class 1       0.33      0.58      0.42       537

    accuracy                           0.85      5600
   macro avg       0.64      0.73      0.67      5600
weighted avg       0.89      0.85      0.87      5600

F1 score : 0.418091


In [121]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=1500, reg_lambda=1e-5.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 1500,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 1e-5,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.241964
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.197143
[200]	validation_0-error:0.192143
[300]	validation_0-error:0.1875
[400]	validation_0-error:0.182321
[500]	validation_0-error:0.179821
[600]	validation_0-error:0.17375
[700]	validation_0-error:0.171071
[800]	validation_0-error:0.16625
[900]	validation_0-error:0.162321
[1000]	validation_0-error:0.160536
[1100]	validation_0-error:0.156964
[1200]	validation_0-error:0.154464
[1300]	validation_0-error:0.151786
[1400]	validation_0-error:0.149643
[1499]	validation_0-error:0.148571
              precision    recall  f1-score   support

     class 0       0.94      0.89      0.92      5041
     class 1       0.34      0.51      0.41       559

    accuracy                           0.85      5600
   macro avg       0.64      0.70      0.66      5600
weighted avg       0.88      0.85      0.86      5600





[0]	validation_0-error:0.203214
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.2075
Stopping. Best iteration:
[0]	validation_0-error:0.203214

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5030
     class 1       0.29      0.71      0.41       570

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.203214
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.222679
Stopping. Best iteration:
[0]	validation_0-error:0.203214

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5028
     class 1       0.29      0.71      0.42       572

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.205
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.211071
Stopping. Best iteration:
[30]	validation_0-error:0.201429

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5029
     class 1       0.30      0.72      0.42       571

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.251607
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.2075
[200]	validation_0-error:0.198929
[300]	validation_0-error:0.191607
[400]	validation_0-error:0.185714
[500]	validation_0-error:0.17875
[600]	validation_0-error:0.171071
[700]	validation_0-error:0.166429
[800]	validation_0-error:0.16125
[900]	validation_0-error:0.156786
[1000]	validation_0-error:0.152321
[1100]	validation_0-error:0.148929
[1200]	validation_0-error:0.146429
[1300]	validation_0-error:0.14125
[1400]	validation_0-error:0.138571
Stopping. Best iteration:
[1366]	validation_0-error:0.138214

              precision    recall  f1-score   support

     class 0       0.95      0.90      0.92      5063
     class 1       0.35      0.53      0.42       537

    accuracy                           0.86      5600
   macro avg       0.65      0.71      0.67      5600
weighted avg       0.89      0.86      0.87      5600

F1 score : 0.415904


In [124]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=1000, reg_lambda=3.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 1000,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 3,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.24125
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.20375
[200]	validation_0-error:0.196786
[300]	validation_0-error:0.190893
[400]	validation_0-error:0.184286
[500]	validation_0-error:0.181607
[600]	validation_0-error:0.1775
[700]	validation_0-error:0.171964
[800]	validation_0-error:0.169821
[900]	validation_0-error:0.165357
[999]	validation_0-error:0.163036
              precision    recall  f1-score   support

     class 0       0.95      0.86      0.91      5041
     class 1       0.33      0.60      0.42       559

    accuracy                           0.84      5600
   macro avg       0.64      0.73      0.66      5600
weighted avg       0.89      0.84      0.86      5600





[0]	validation_0-error:0.202679
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.210714
Stopping. Best iteration:
[0]	validation_0-error:0.202679

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5030
     class 1       0.29      0.71      0.41       570

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.204286
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.226429
Stopping. Best iteration:
[0]	validation_0-error:0.204286

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5028
     class 1       0.29      0.71      0.42       572

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.210893
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.212321
Stopping. Best iteration:
[30]	validation_0-error:0.207321

              precision    recall  f1-score   support

     class 0       0.96      0.80      0.87      5029
     class 1       0.29      0.73      0.42       571

    accuracy                           0.79      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.79      0.83      5600





[0]	validation_0-error:0.249821
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.21
[200]	validation_0-error:0.20125
[300]	validation_0-error:0.19375
[400]	validation_0-error:0.187143
[500]	validation_0-error:0.179107
[600]	validation_0-error:0.174464
[700]	validation_0-error:0.168036
[800]	validation_0-error:0.162679
[900]	validation_0-error:0.159464
[999]	validation_0-error:0.154286
              precision    recall  f1-score   support

     class 0       0.95      0.87      0.91      5063
     class 1       0.33      0.61      0.43       537

    accuracy                           0.85      5600
   macro avg       0.64      0.74      0.67      5600
weighted avg       0.90      0.85      0.87      5600

F1 score : 0.420491


In [127]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=1000, reg_lambda=10.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 1000,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 10,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.245
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.209286
[200]	validation_0-error:0.200714
[300]	validation_0-error:0.196071
[400]	validation_0-error:0.188393
[500]	validation_0-error:0.18375
[600]	validation_0-error:0.178393
[700]	validation_0-error:0.173929
[800]	validation_0-error:0.171607
[900]	validation_0-error:0.167857
[999]	validation_0-error:0.164821
              precision    recall  f1-score   support

     class 0       0.95      0.86      0.90      5041
     class 1       0.33      0.62      0.43       559

    accuracy                           0.84      5600
   macro avg       0.64      0.74      0.67      5600
weighted avg       0.89      0.84      0.86      5600





[0]	validation_0-error:0.27125
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.2125
[200]	validation_0-error:0.205714
[300]	validation_0-error:0.198036
[400]	validation_0-error:0.194286
[500]	validation_0-error:0.18875
[600]	validation_0-error:0.183214
[700]	validation_0-error:0.176964
[800]	validation_0-error:0.174107
[900]	validation_0-error:0.17125
[999]	validation_0-error:0.167143
              precision    recall  f1-score   support

     class 0       0.95      0.86      0.90      5030
     class 1       0.33      0.64      0.44       570

    accuracy                           0.83      5600
   macro avg       0.64      0.75      0.67      5600
weighted avg       0.89      0.83      0.85      5600





[0]	validation_0-error:0.221071
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.23125
Stopping. Best iteration:
[0]	validation_0-error:0.221071

              precision    recall  f1-score   support

     class 0       0.96      0.79      0.86      5028
     class 1       0.27      0.71      0.39       572

    accuracy                           0.78      5600
   macro avg       0.62      0.75      0.63      5600
weighted avg       0.89      0.78      0.82      5600





[0]	validation_0-error:0.198214
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.219286
Stopping. Best iteration:
[0]	validation_0-error:0.198214

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5029
     class 1       0.30      0.70      0.42       571

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.23375
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.215179
[200]	validation_0-error:0.203929
[300]	validation_0-error:0.195536
[400]	validation_0-error:0.190536
[500]	validation_0-error:0.185
[600]	validation_0-error:0.17875
[700]	validation_0-error:0.1725
[800]	validation_0-error:0.166964
[900]	validation_0-error:0.163214
[999]	validation_0-error:0.158571
              precision    recall  f1-score   support

     class 0       0.95      0.87      0.91      5063
     class 1       0.33      0.61      0.43       537

    accuracy                           0.84      5600
   macro avg       0.64      0.74      0.67      5600
weighted avg       0.89      0.84      0.86      5600

F1 score : 0.421896


In [132]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=1000, reg_lambda=20.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 1000,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 20,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.2525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.210536
[200]	validation_0-error:0.205
[300]	validation_0-error:0.200536
[400]	validation_0-error:0.192679
[500]	validation_0-error:0.188571
[600]	validation_0-error:0.182857
[700]	validation_0-error:0.178214
[800]	validation_0-error:0.174464
[900]	validation_0-error:0.168393
[999]	validation_0-error:0.165536
              precision    recall  f1-score   support

     class 0       0.95      0.86      0.90      5041
     class 1       0.33      0.63      0.43       559

    accuracy                           0.83      5600
   macro avg       0.64      0.74      0.67      5600
weighted avg       0.89      0.83      0.86      5600





[0]	validation_0-error:0.267321
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.215179
[200]	validation_0-error:0.208214
[300]	validation_0-error:0.202143
[400]	validation_0-error:0.197679
[500]	validation_0-error:0.191964
[600]	validation_0-error:0.188393
[700]	validation_0-error:0.183571
[800]	validation_0-error:0.176607
[900]	validation_0-error:0.172321
[999]	validation_0-error:0.168393
              precision    recall  f1-score   support

     class 0       0.96      0.85      0.90      5030
     class 1       0.33      0.66      0.44       570

    accuracy                           0.83      5600
   macro avg       0.65      0.76      0.67      5600
weighted avg       0.89      0.83      0.85      5600





[0]	validation_0-error:0.277679
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.231964
[200]	validation_0-error:0.226429
[300]	validation_0-error:0.218214
[400]	validation_0-error:0.209821
[500]	validation_0-error:0.202679
[600]	validation_0-error:0.193036
[700]	validation_0-error:0.185714
[800]	validation_0-error:0.180536
[900]	validation_0-error:0.174643
[999]	validation_0-error:0.170179
              precision    recall  f1-score   support

     class 0       0.96      0.85      0.90      5028
     class 1       0.33      0.67      0.44       572

    accuracy                           0.83      5600
   macro avg       0.65      0.76      0.67      5600
weighted avg       0.89      0.83      0.85      5600





[0]	validation_0-error:0.198571
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.221607
Stopping. Best iteration:
[0]	validation_0-error:0.198571

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5029
     class 1       0.30      0.70      0.42       571

    accuracy                           0.80      5600
   macro avg       0.63      0.76      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.235179
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.215536
[200]	validation_0-error:0.2075
[300]	validation_0-error:0.201071
[400]	validation_0-error:0.192857
[500]	validation_0-error:0.188214
[600]	validation_0-error:0.18125
[700]	validation_0-error:0.175536
[800]	validation_0-error:0.170714
[900]	validation_0-error:0.164821
[999]	validation_0-error:0.1625
              precision    recall  f1-score   support

     class 0       0.96      0.86      0.91      5063
     class 1       0.32      0.62      0.42       537

    accuracy                           0.84      5600
   macro avg       0.64      0.74      0.66      5600
weighted avg       0.89      0.84      0.86      5600

F1 score : 0.432508


In [135]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=2000, reg_lambda=30.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 2000,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 30,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.2525
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.213393
[200]	validation_0-error:0.206786
[300]	validation_0-error:0.201786
[400]	validation_0-error:0.194107
[500]	validation_0-error:0.191429
[600]	validation_0-error:0.185536
[700]	validation_0-error:0.181607
[800]	validation_0-error:0.178214
[900]	validation_0-error:0.174464
[1000]	validation_0-error:0.170714
[1100]	validation_0-error:0.167143
[1200]	validation_0-error:0.164286
[1300]	validation_0-error:0.160714
[1400]	validation_0-error:0.157679
[1500]	validation_0-error:0.155536
[1600]	validation_0-error:0.154107
[1700]	validation_0-error:0.150536
[1800]	validation_0-error:0.148214
[1900]	validation_0-error:0.147143
[1999]	validation_0-error:0.145714
              precision    recall  f1-score   support

     class 0       0.95      0.89      0.92      5041
     class 1       0.35      0.55      0.43       559

    accuracy                           0.85     



[0]	validation_0-error:0.2675
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.220714
[200]	validation_0-error:0.212857
[300]	validation_0-error:0.205
[400]	validation_0-error:0.200536
[500]	validation_0-error:0.194286
[600]	validation_0-error:0.189821
[700]	validation_0-error:0.185536
[800]	validation_0-error:0.180714
[900]	validation_0-error:0.176071
[1000]	validation_0-error:0.171964
[1100]	validation_0-error:0.167679
[1200]	validation_0-error:0.164821
[1300]	validation_0-error:0.161964
[1400]	validation_0-error:0.16
[1500]	validation_0-error:0.157143
[1600]	validation_0-error:0.152857
[1700]	validation_0-error:0.150357
[1800]	validation_0-error:0.148571
[1900]	validation_0-error:0.146607
[1999]	validation_0-error:0.145357
              precision    recall  f1-score   support

     class 0       0.95      0.89      0.92      5030
     class 1       0.37      0.59      0.45       570

    accuracy                           0.85      5600
 



[0]	validation_0-error:0.279821
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.235357
[200]	validation_0-error:0.228393
[300]	validation_0-error:0.221607
[400]	validation_0-error:0.211429
[500]	validation_0-error:0.204464
[600]	validation_0-error:0.197143
[700]	validation_0-error:0.188393
[800]	validation_0-error:0.1825
[900]	validation_0-error:0.177679
[1000]	validation_0-error:0.172321
[1100]	validation_0-error:0.16875
[1200]	validation_0-error:0.163036
[1300]	validation_0-error:0.16
[1400]	validation_0-error:0.1575
[1500]	validation_0-error:0.154821
[1600]	validation_0-error:0.15125
[1700]	validation_0-error:0.1475
[1800]	validation_0-error:0.144107
[1900]	validation_0-error:0.143393
[1999]	validation_0-error:0.141964
              precision    recall  f1-score   support

     class 0       0.95      0.89      0.92      5028
     class 1       0.38      0.59      0.46       572

    accuracy                           0.86      5600
   m



[0]	validation_0-error:0.199464
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.223571
Stopping. Best iteration:
[0]	validation_0-error:0.199464

              precision    recall  f1-score   support

     class 0       0.96      0.81      0.88      5029
     class 1       0.30      0.70      0.42       571

    accuracy                           0.80      5600
   macro avg       0.63      0.75      0.65      5600
weighted avg       0.89      0.80      0.83      5600





[0]	validation_0-error:0.237321
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.217679
[200]	validation_0-error:0.20875
[300]	validation_0-error:0.203393
[400]	validation_0-error:0.197679
[500]	validation_0-error:0.188393
[600]	validation_0-error:0.183393
[700]	validation_0-error:0.1775
[800]	validation_0-error:0.171786
[900]	validation_0-error:0.167143
[1000]	validation_0-error:0.162857
[1100]	validation_0-error:0.158214
[1200]	validation_0-error:0.15625
[1300]	validation_0-error:0.152143
[1400]	validation_0-error:0.149464
[1500]	validation_0-error:0.145357
[1600]	validation_0-error:0.144821
[1700]	validation_0-error:0.141786
[1800]	validation_0-error:0.13875
[1900]	validation_0-error:0.137143
[1999]	validation_0-error:0.136429
              precision    recall  f1-score   support

     class 0       0.95      0.90      0.92      5063
     class 1       0.36      0.55      0.44       537

    accuracy                           0.86      56

In [137]:
# 5-fold cross-validation of an XGBoost classifier trained on SMOTE-resampled folds.
# This run: n_estimators=3000, reg_lambda=50.
# NOTE(review): each validation fold both drives early stopping and is the fold
# being scored, so the reported scores may be optimistic — confirm whether a
# separate early-stopping split is wanted.

# The hyper-parameters are the same for every fold, so build them once.
params = {'learning_rate': 0.01,
          'n_estimators': 3000,
          'max_depth': 5,
          'min_child_weight' : 5,
          'subsample' : 0.5,
          'colsample_bytree' : 0.5,
          'reg_lambda' : 50,
          'objective': 'binary:logistic',
          'random_state': 0}

folds = KFold(n_splits = 5, shuffle = True, random_state = 2021)

fold_cms = []   # one confusion matrix per fold
fold_f1s = []   # one F1 score per fold

for train_idx, valid_idx in folds.split(x):
  train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
  valid_x, valid_y = x.iloc[valid_idx], y.iloc[valid_idx]

  # Oversample the training rows only; the validation rows are left untouched.
  # fit_resample is the supported imbalanced-learn entry point (the old
  # fit_sample alias has been removed in recent releases).
  smote = SMOTE(random_state = 2021)
  smote_x, smote_y = smote.fit_resample(train_x, train_y)

  # Wrap in a DataFrame and reuse the validation column names so both frames
  # present identical feature names to XGBoost.
  smote_x = pd.DataFrame(smote_x)
  smote_x.columns = valid_x.columns

  model = XGBClassifier(**params)
  model.fit(smote_x, smote_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)

  y_pred = model.predict(valid_x)

  fold_cms.append(confusion_matrix(valid_y, y_pred))
  fold_f1s.append(f1_score(valid_y, y_pred))
  print(classification_report(valid_y, y_pred, target_names=['class 0', 'class 1']))

# Aggregate across folds: element-wise summed confusion matrix and mean F1.
CM = sum(fold_cms)
f1_scores = np.mean(fold_f1s)
print("F1 score : %f" % f1_scores)



[0]	validation_0-error:0.244464
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.218393
[200]	validation_0-error:0.210536
[300]	validation_0-error:0.206071
[400]	validation_0-error:0.198214
[500]	validation_0-error:0.196071
[600]	validation_0-error:0.19
[700]	validation_0-error:0.18375
[800]	validation_0-error:0.181429
[900]	validation_0-error:0.177143
[1000]	validation_0-error:0.173036
[1100]	validation_0-error:0.16875
[1200]	validation_0-error:0.164107
[1300]	validation_0-error:0.1625
[1400]	validation_0-error:0.160179
[1500]	validation_0-error:0.157321
[1600]	validation_0-error:0.157143
[1700]	validation_0-error:0.155357
[1800]	validation_0-error:0.153571
[1900]	validation_0-error:0.15125
[2000]	validation_0-error:0.150536
[2100]	validation_0-error:0.148036
[2200]	validation_0-error:0.147143
[2300]	validation_0-error:0.144643
[2400]	validation_0-error:0.143571
[2500]	validation_0-error:0.142321
[2600]	validation_0-error:0.141429
[2700]	va



[0]	validation_0-error:0.267857
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.22125
[200]	validation_0-error:0.214464
[300]	validation_0-error:0.208929
[400]	validation_0-error:0.205179
[500]	validation_0-error:0.198036
[600]	validation_0-error:0.193214
[700]	validation_0-error:0.188393
[800]	validation_0-error:0.185536
[900]	validation_0-error:0.180893
[1000]	validation_0-error:0.175
[1100]	validation_0-error:0.170714
[1200]	validation_0-error:0.167857
[1300]	validation_0-error:0.164643
[1400]	validation_0-error:0.161964
[1500]	validation_0-error:0.16125
[1600]	validation_0-error:0.156964
[1700]	validation_0-error:0.154821
[1800]	validation_0-error:0.153571
[1900]	validation_0-error:0.15
[2000]	validation_0-error:0.147679
[2100]	validation_0-error:0.145536
[2200]	validation_0-error:0.145
[2300]	validation_0-error:0.14375
[2400]	validation_0-error:0.142321
[2500]	validation_0-error:0.142321
[2600]	validation_0-error:0.141071
[2700]	valida



[0]	validation_0-error:0.269821
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.237857
[200]	validation_0-error:0.232679
[300]	validation_0-error:0.226429
[400]	validation_0-error:0.216964
[500]	validation_0-error:0.206429
[600]	validation_0-error:0.2
[700]	validation_0-error:0.194464
[800]	validation_0-error:0.188214
[900]	validation_0-error:0.183214
[1000]	validation_0-error:0.177679
[1100]	validation_0-error:0.172679
[1200]	validation_0-error:0.167679
[1300]	validation_0-error:0.164821
[1400]	validation_0-error:0.163036
[1500]	validation_0-error:0.160893
[1600]	validation_0-error:0.1575
[1700]	validation_0-error:0.154464
[1800]	validation_0-error:0.153214
[1900]	validation_0-error:0.150357
[2000]	validation_0-error:0.14875
[2100]	validation_0-error:0.145536
[2200]	validation_0-error:0.142857
[2300]	validation_0-error:0.141607
[2400]	validation_0-error:0.138929
[2500]	validation_0-error:0.137143
[2600]	validation_0-error:0.135536
Stopping



[0]	validation_0-error:0.250893
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.228571
[200]	validation_0-error:0.219821
[300]	validation_0-error:0.212143
[400]	validation_0-error:0.205179
[500]	validation_0-error:0.196607
[600]	validation_0-error:0.188571
[700]	validation_0-error:0.184107
[800]	validation_0-error:0.179286
[900]	validation_0-error:0.174107
[1000]	validation_0-error:0.17
[1100]	validation_0-error:0.165179
[1200]	validation_0-error:0.161429
[1300]	validation_0-error:0.158036
[1400]	validation_0-error:0.155357
[1500]	validation_0-error:0.154821
[1600]	validation_0-error:0.152143
[1700]	validation_0-error:0.149821
[1800]	validation_0-error:0.14875
[1900]	validation_0-error:0.147321
[2000]	validation_0-error:0.14625
[2100]	validation_0-error:0.144643
[2200]	validation_0-error:0.141607
[2300]	validation_0-error:0.140179
[2400]	validation_0-error:0.139643
[2500]	validation_0-error:0.138214
[2600]	validation_0-error:0.136429
[2700]



[0]	validation_0-error:0.235893
Will train until validation_0-error hasn't improved in 100 rounds.
[100]	validation_0-error:0.222857
[200]	validation_0-error:0.214464
[300]	validation_0-error:0.204821
[400]	validation_0-error:0.200179
[500]	validation_0-error:0.19375
[600]	validation_0-error:0.188393
[700]	validation_0-error:0.18
[800]	validation_0-error:0.176071
[900]	validation_0-error:0.170357
[1000]	validation_0-error:0.166786
[1100]	validation_0-error:0.162857
[1200]	validation_0-error:0.15875
[1300]	validation_0-error:0.155536
[1400]	validation_0-error:0.152679
[1500]	validation_0-error:0.150179
[1600]	validation_0-error:0.148214
[1700]	validation_0-error:0.146429
[1800]	validation_0-error:0.143571
[1900]	validation_0-error:0.141607
[2000]	validation_0-error:0.139107
[2100]	validation_0-error:0.13875
[2200]	validation_0-error:0.136786
[2300]	validation_0-error:0.135714
[2400]	validation_0-error:0.133929
[2500]	validation_0-error:0.133036
Stopping. Best iteration:
[2449]	validatio

  params = {'learning_rate': 0.01,
              'n_estimators': 3000, 
              'max_depth': 5,  
              'min_child_weight' : 5,
              'subsample' : 0.5,
              'colsample_bytree' : 0.5,
              'reg_lambda' : 50,
              'objective': 'binary:logistic',  
              'random_state': 0}

In [None]:
# Apply SMOTE to the train set (NOTE: this cell contains no code; the SMOTE step is not implemented here)

In [None]:
# Final classifier configuration.
# `early_stopping_rounds` is intentionally NOT set: the fit call for this model
# passes no eval_set, and early stopping needs a validation set to monitor
# (without one the setting is either ignored or rejected, depending on the
# installed xgboost version).
# NOTE(review): the saved output under the fit cell reports learning_rate=0.01
# and reg_lambda=100, which differ from the values below -- confirm which
# configuration produced the submitted predictions.
model = XGBClassifier(
    learning_rate=0.5,
    n_estimators=800,
    max_depth=5,
    min_child_weight=3,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=3,
    objective='binary:logistic',
    random_state=0,
)

In [None]:
# Fit the model on x and y (defined earlier in the notebook); no eval_set is
# supplied to this call. As the last expression in the cell, the returned
# estimator is what gets displayed.
model.fit(x, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=None, n_estimators=800, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=100, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7, verbosity=1)

In [None]:
# Load the test set from test_pca.csv (path is relative to the current working directory).
test = pd.read_csv("test_pca.csv")

In [None]:
# Show the (rows, columns) of the loaded test frame.
test.shape

(2000, 189)

In [None]:
# Predict with the fitted model for every row of the test frame.
# NOTE(review): `test` is passed as loaded -- confirm its columns match the
# features the model was trained on.
prediction = model.predict(test)

In [None]:
# Submission skeleton: an 'id' column holding 1..2000 and a still-empty
# 'target' column. The hard-coded 2000 matches the row count shown by
# test.shape for the loaded test set.
submission = pd.DataFrame(columns=['id', 'target']).assign(id=range(1, 2001))

In [None]:
# Fill the 'target' column with the model predictions.
# Assumes `prediction` has one entry per id row, in the same order -- TODO confirm.
submission['target'] = prediction

In [None]:
# Count how many rows were assigned to each predicted value.
submission['target'].value_counts()

0    1523
1     477
Name: target, dtype: int64

In [None]:
# Write the submission to xgboost_ver2.csv with a header row and without the DataFrame index.
submission.to_csv("xgboost_ver2.csv", header = True, index = False)