In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance

In [None]:
# Load the training and test sets.
df_train = pd.read_csv("src/data/train.csv")
df_test = pd.read_csv("src/data/test.csv")

# Drop every column that contains at least one missing value.
# NOTE(review): each frame is filtered independently, so the surviving
# columns of train and test are not guaranteed to be the same.
df_train = df_train.dropna(axis=1)
df_test = df_test.dropna(axis=1)

# Separate the features from the target.
x = df_train.drop(['ID', 'Y_LABEL'], axis=1)
y = df_train['Y_LABEL']

# The ID column is not a feature; remove it from the test frame as well.
df_test = df_test.drop(['ID'], axis=1)

# Stratified 80/20 hold-out split. A fixed random_state makes the split (and
# every model trained on it) reproducible after a kernel restart.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Work on explicit copies: the preprocessing code assigns columns on these
# frames, which would otherwise raise SettingWithCopyWarning.
x_train = x_train.copy()
x_valid = x_valid.copy()

# Shared preprocessing objects: a standardizer used on the non-categorical
# columns and an integer encoder used on the categorical ones.
scaler, le = StandardScaler(), LabelEncoder()

# Feature normalization
def get_values(value):
  """Return the data of ``value`` as a 2-D array with a single column.

  ``value`` must expose a ``.values`` array (e.g. a pandas Series). The
  array is reshaped to ``(-1, 1)`` so that it can be handed to the
  scaler, which is called on one column at a time.
  """
  column = value.values
  return column.reshape(-1, 1)

# Columns that are integer-encoded instead of standardized.
categorical_features = ['COMPONENT_ARBITRARY','YEAR']
# Feature subset the second-stage regressor is trained on.
# NOTE(review): presumably the columns available in the test file — confirm.
test_stage_features = ['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']

# Standardize every non-categorical column. The scaler is re-fit per column
# on the training split only and then applied to the validation split and,
# when the column exists there, to the test frame.
for col in x:
  if col not in categorical_features:
    x_train[col] = scaler.fit_transform(get_values(x_train[col]))
    x_valid[col] = scaler.transform(get_values(x_valid[col]))
    if col in df_test.columns:
      df_test[col] = scaler.transform(get_values(df_test[col]))

# Integer-encode the categorical columns.
# The encoder is fit on the union of all values that will be transformed:
# LabelEncoder.transform raises ValueError on labels it has not seen, so
# fitting on the training split alone fails whenever a category occurs only
# in the validation or test data. When no such category exists, the resulting
# codes are identical to fitting on the training split.
for col in categorical_features:
  in_test = col in df_test.columns
  observed = [x_train[col], x_valid[col]]
  if in_test:
    observed.append(df_test[col])
  le.fit(pd.concat(observed, ignore_index=True))

  x_train[col] = le.transform(x_train[col])
  x_valid[col] = le.transform(x_valid[col])
  if in_test:
    df_test[col] = le.transform(df_test[col])

In [None]:
# Tune the XGBoost binary classifier with an exhaustive grid search.
# NOTE(review): recent xgboost releases reportedly expect
# early_stopping_rounds / eval_metric on the estimator constructor instead of
# fit() — confirm against the installed version.
params = dict(
    n_estimators=[100, 200, 400, 600, 800, 1000],
    max_depth=[3, 4, 6, 8, 10, 12],
    eta=[0.1],
    objective=['binary:logistic'],
)

model = XGBClassifier()
model_grid = GridSearchCV(model, param_grid=params, cv=3)

# The extra keyword arguments are forwarded to the estimator's fit(): the
# hold-out split is monitored with AUC and training stops after 30 rounds
# without improvement.
model_grid.fit(
    x_train,
    y_train,
    eval_set=[(x_valid, y_valid)],
    eval_metric='auc',
    early_stopping_rounds=30,
)

# Print the best parameter combination and its cross-validated score.
print(model_grid.best_params_, model_grid.best_score_, sep='\n')

In [None]:
# Fit the classifier with the chosen hyperparameters and derive soft labels.
# NOTE(review): the values are hard-coded rather than read from
# model_grid.best_params_, and min_child_weight is not part of the grid that
# was searched.
model = XGBClassifier(n_estimators=200, eta=0.1, max_depth=3, min_child_weight = 3,objective='binary:logistic')
model.fit(x_train, y_train, early_stopping_rounds=30, eval_metric='auc', eval_set = [(x_valid, y_valid)])

# Class probabilities for every training row. The frame keeps x_train's index
# so that the soft labels stay aligned with their feature rows instead of
# being renumbered 0..n-1.
pro = pd.DataFrame(model.predict_proba(x_train), index=x_train.index)

# Regression target for the second stage: column 1 of predict_proba
# (presumably the positive class — confirm), kept as a one-column DataFrame.
y_reg = pro.iloc[:, 1:2].copy()

plot_importance(model)

In [None]:
# Grid-search the hyperparameters of the XGBoost regressor.
xgb_reg = XGBRegressor()
parameters = {'learning_rate': [.03, 0.05, .07],  # so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.4, 0.6, 0.8],
              'colsample_bytree': [0.2, 0.6, 0.8],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb_reg, parameters, cv=3)

# Hold out 20% of the training rows for the regressor. Only the
# test_stage_features columns are used as inputs and the classifier's soft
# labels (y_reg) are the target. The fixed random_state keeps the split, and
# therefore the selected parameters, reproducible between runs.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x_train[test_stage_features], y_reg, test_size=0.2, random_state=42
)
xgb_grid.fit(x_train2, y_train2)

# Print the best parameter combination and its cross-validated score.
print(xgb_grid.best_params_)
print(xgb_grid.best_score_)

In [None]:
# Train the regressor with the selected hyperparameters.
# NOTE(review): the values are hard-coded rather than read from
# xgb_grid.best_params_; min_child_weight=2 lies outside the searched grid
# ([4]) — confirm this is intentional.
model_reg = XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
)

# Monitor RMSE on the held-out rows and stop after 30 rounds without
# improvement.
model_reg.fit(
    x_train2,
    y_train2,
    eval_set=[(x_test2, y_test2)],
    eval_metric='rmse',
    early_stopping_rounds=30,
)

plot_importance(model_reg)

In [None]:
# Apply the regressor to the test data.
# The regressor was fit on x_train[test_stage_features], so the same columns
# are selected here in the same order. Passing df_test as-is only works when
# its column set and order happen to match the training frame exactly.
y_hat = model_reg.predict(df_test[test_stage_features])

# Wrap the predictions in a one-column DataFrame for the submission step.
y_hat = pd.DataFrame(y_hat)
y_hat

In [None]:
# Re-read the test file to recover the ID column, which was dropped from
# df_test before modelling, and attach the raw predictions to it.
submit = pd.read_csv("src/data/test.csv")
submit['Y_LABEL'] = y_hat

# Submission frame: the IDs plus the (still continuous) predicted score.
final_submit = submit[['ID']].copy()
final_submit['Y_LABEL'] = y_hat


def get_reg(v, threshold=0.355):
  """Binarize a predicted score.

  Args:
    v: Predicted score to classify.
    threshold: Decision cut-off. Scores greater than or equal to it map
      to 1. Defaults to 0.355, the value that was previously hard-coded.

  Returns:
    1 if ``v >= threshold``, otherwise 0.
  """
  return 1 if v >= threshold else 0

# Convert the continuous scores into 0/1 labels.
final_submit['Y_LABEL'] = final_submit['Y_LABEL'].apply(get_reg)

# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail at the very end of the run.
submit_path = 'src/submit/submit9.csv'
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
final_submit.to_csv(submit_path, index=False)