In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 목차  

## 1. LGBM Modeling  
## 2. XGboost Modeling

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import warnings
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

## 1. LGBM Modeling

### 1.1 Input Data 전처리 및 모델 관련 함수 정의

In [0]:
base_path='/content/drive/My Drive/big_con/model/true_lgbm'  #모델 저장 경로

def save_model(filename,model,path=base_path):               #모델 저장
  file_path=os.path.join(path,filename)  
  joblib.dump(model,file_path) 

def load_model(filename,path=base_path):                     #모델 불러오기
  file_path=os.path.join(path,filename) 
  model = joblib.load(file_path)
  return model


def met(x,y): #x: y_test, y: y_pred                          #성능 출력
    accuracy = accuracy_score(x, y)
    precision=precision_score(x, y)
    recall=recall_score(x, y)
    roc=roc_auc_score(x, y)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Precision: %.2f%% " % (precision *100))
    print("Recall: %.2f%% " % (recall * 100))
    print("AUC: %.2f%% " % (roc *100))

def pre_data(data_x):                                       #Biinary Category 정리 및 One Hot Encoding
    data_x.reset_index(inplace=True)
    afsnt_c02_binary=data_x[(data_x.DRR=='C02')|(data_x.DLY=='N')] 
    afsnt_c02_binary.AOD=np.where(afsnt_c02_binary.AOD=='D',1,0)
    afsnt_c02_binary.IRR=np.where(afsnt_c02_binary.IRR=='Y',1,0)
    x=afsnt_c02_binary.drop(['DRR'],axis=1)
    x=pd.get_dummies(data=x,columns=['ARP','ODP','FLO'])     
    x.DLY=np.where(x.DLY=='Y',1,0)
    return x
    

### 1.2 EDA에서 전처리한 파일 불러오기

In [0]:
afsnt_out=pd.read_csv('/content/drive/My Drive/test_colab/afsnt_pre.csv',encoding='utf-8',index_col=0)  #EDA 전처리한 파일
afsnt_out.index=pd.to_datetime(afsnt_out.Date)
afsnt_out['Month']=afsnt_out.index.month
afsnt_out.drop(['Date','FLT'],axis=1,inplace=True)

In [0]:
X=pre_data(afsnt_out)
X.index=X.Date
X.drop(['Date'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### 1.3 Train -Test set 나누기  
- Test set을 Delay_DLY와 비슷하게 구성함

In [0]:
X_train=pd.concat([X[:'2018-09-15'],X['2018-10-01':]])
X_test=X['2018-09-15':'2018-10-01']
Y_train=X_train.DLY
Y_test=X_test.DLY
X_train.drop('DLY',axis=1,inplace=True)
X_test.drop('DLY',axis=1,inplace=True)
X_train.reset_index(drop=True)
X_test.reset_index(drop=True)
Y_train.reset_index(drop=True)
Y_test.reset_index(drop=True)
X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)
X_train.drop('Date',axis=1,inplace=True)
X_test.drop('Date',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


### 1.4 LGBM Parameter 설정
- RandomGridSearchCV로 탐색 후 lr 변경하며 휴리스틱하게 탐색
- Imbalanced Data 해결을 위한 class weight 설정  
- Random GridSearchCV를 통한 Hyperparameter 탐색

In [0]:
weight_1=np.sqrt((Y_train[Y_train==0].shape[0])/(Y_train[Y_train==1].shape[0])) #
weight_2=(Y_train[Y_train==0].shape[0])/(Y_train[Y_train==1].shape[0])   
params = {
        'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'scale_pos_weight':weight_2,
    'random_state': 0,
    'n_estimators':10000,
    'objective': 'binary',
    'reg_lambda':1,
    'reg_alpha':1.2,
    'metric' : ['auc']
        }

### 1.5  LGBM Training

In [0]:
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42, stratify=Y_train)

model =lgb.LGBMClassifier(**params)
model.fit(X_train,Y_train,eval_set=[(X_train,Y_train),(X_val,Y_val)],verbose=1,early_stopping_rounds=500)
save_model('xxxx',model) 


### 1.6 LGBM 성능 평가

1. Accuracy  
2. Precision  
3. Recall 
4. AUC

In [0]:
def predict_prob(model,data):   
  y_prob=[]
  for i in model:
    y_probab=i.predict_proba(data)
    y_prob.append(y_probab)
  prediction=np.array(y_prob).mean(axis=0)
  return np.argmax(prediction,axis=1)

model_1=load_model('lightlight')
y_pred=model_1.predict(X_test)
met(Y_test,y_pred)

Accuracy: 74.75%
Precision: 19.20% 
Recall: 70.10% 
AUC: 72.62% 


## 2. Xgboost Modeling  

1. RandomGridSearchCV 로 Hyperparameter 탐색
2. Model Training  


### 2.1 Xgboost Training

In [0]:
import xgboost as xgb

base_path='/content/drive/My Drive/big_con/model/xgboost'   #xgboost 저장 경로
  
params = {
        'gamma':0.1,
        'learning_rate':0.1,
        'max_depth':3,
        'n_estimators':10000,
        'n_jobs':-1,
        'reg_lambda':1,
        'eval_metric':'auc',
        'objective':'binary:logistic',
        'scale_pos_weight':weight_2,
        'min_child_weight' : 1,
        'random_state': 0

        }

x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42, stratify=y_train)

model = xgb.XGBClassifier(**params)
model.fit(X_train,Y_train,eval_set=[(x_train,y_train),(x_val,y_val)],verbose=1,early_stopping_rounds=500)
save_model('xgbxgb',model)

## 2.2 Xgoost Evaluation

In [0]:
model_2=load_model('xgbxgb')
y_pred=model_2.predict(X_test)
met(Y_test,y_pred)

Accuracy: 72.80%
Precision: 17.78% 
Recall: 69.01% 
AUC: 71.06% 
