In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
def get_eval(y_test, y_pred):
    cf = confusion_matrix(y_test, preds)
    print("confusion matrix : ")
    print(cf)
    p = precision_score(y_test, preds)
    print("precision score :", p)
    r = recall_score(y_test, preds)
    print("recall score :", r)
    f1 = f1_score(y_test, preds)
    print("f1 score :", f1)
    auc = roc_auc_score(y_test, preds)
    print("auc score :", auc)

In [None]:
train  = pd.read_csv('competition_data/train.csv')
test  = pd.read_csv('competition_data/test.csv')

In [None]:
train.head()

In [None]:
#train.info()

In [None]:
#test.info()

# EDA

In [None]:
# 연속형 변수간 상관관계
numerical_feats = train.dtypes[train.dtypes != "object"].index 
colormap = plt.cm.PuBu 
sns.set(font_scale=1.0) 

corr_data = train[numerical_feats]
f , ax = plt.subplots(figsize = (14,12)) 
plt.title('Correlation of Numeric Features',y=1,size=18) 
sns.heatmap(corr_data.corr(),square = True, linewidths = 0.1, cmap = colormap, linecolor = "white", vmax=0.8)

In [None]:
# nerdiness와 높은 상관성 가진 변수들 10개
corrmat=train.corr()
col1 = corrmat.nlargest(15, 'nerdiness')['nerdiness'].index
corrmat1 = train[col1].corr()
plt.subplots(figsize = (10,6))
sns.heatmap(corrmat1, annot=True,  cmap = colormap, linecolor = "white")
plt.show()

# 모델링

In [12]:
train  = pd.read_csv('competition_data/train.csv')
test  = pd.read_csv('competition_data/test.csv')

In [13]:
# 쓸모없는 변수 제거
train = train.drop(['index', 'country'],axis = 1)
test = test.drop(['index', 'country'],axis = 1)

In [14]:
# train을 target과 feature로 나눔
train_x=train.drop(['nerdiness'], axis=1)
train_y=train['nerdiness']

In [15]:
# test, validation 데이터 셋 분리
x_train, x_valid, y_train, y_valid=train_test_split(train_x, train_y, test_size=0.8, random_state=345)

In [26]:
# XGBoost 파라미터 설정
params = {'max_depth':[3,4,5,6,7,8,9],
         'eta':[0.1,0.2,0.3],
         'gamma':[1,2,3,4,5],
         'lambda':[1,2,3,4,5]}
'''
'early_stoppings':100
'''

num_rounds=400

In [28]:
# 모델 학습

kf = KFold(n_splits=5, shuffle=True, random_state=42)

model = xgb.XGBClassifier(params = params)

xgb_model_gridCV = GridSearchCV(xgb_model, param_grid=params, cv=kf,n_jobs = 5,verbose=True)
xgb_model_gridCV = xgb_model_gridCV.fit(x_train, y_train, eval_metric = 'auc', eval_set = [(x_valid, y_valid)])

print(xgb_model_gridCV.best_score_)
print(xgb_model_gridCV.best_params_)

Fitting 5 folds for each of 525 candidates, totalling 2625 fits
Parameters: { "params" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.71818
[1]	validation_0-auc:0.75234
[2]	validation_0-auc:0.76286
[3]	validation_0-auc:0.76864
[4]	validation_0-auc:0.77378
[5]	validation_0-auc:0.77776
[6]	validation_0-auc:0.77926
[7]	validation_0-auc:0.78043
[8]	validation_0-auc:0.78162
[9]	validation_0-auc:0.78286
[10]	validation_0-auc:0.78355
[11]	validation_0-auc:0.78298
[12]	validation_0-auc:0.78323
[13]	validation_0-auc:0.78325
[14]	validation_0-auc:0.78334
[15]	validation_0-auc:0.78357
[16]	validation_0-auc:0.78297
[17]	validation_0-auc:0.78299
[18]	validation_0-auc:0.78353
[19]	validation_0-auc:0.78379
[20]	validation_0-auc:0.78411
[21

In [10]:
# 예측(validation)
pred_probs = xgb_model.predict(x_valid)
preds=[1 if x > 0.5 else 0 for x in pred_probs]
print('예측값 10개:',preds[:10])

예측값 10개: [1, 0, 1, 1, 0, 0, 1, 1, 1, 1]


In [11]:
get_eval(y_valid, preds)

confusion matrix : 
[[3497 1883]
 [1526 5094]]
precision score : 0.7301132291815967
recall score : 0.7694864048338369
f1 score : 0.7492829300581011
auc score : 0.7097432024169184


# 제출 파일 생성

In [None]:
submission = pd.read_csv('competition_data/sample_submission.csv')

submission

In [None]:
pred_test = xgb_model.predict(test)
submission["nerdiness"]=[1 if x > 0.5 else 0 for x in pred_test]

In [None]:
submission

In [None]:
submission.to_csv("baseline.csv", index = False)