In [18]:
#Upload csv excel file
import pandas as pd
X_train = pd.read_csv('/dshome/WoongLab/heo/oil_kamp/Data/X_train.csv')
X_test = pd.read_csv('/dshome/WoongLab/heo/oil_kamp/Data/X_test.csv')
y_train = pd.read_csv('/dshome/WoongLab/heo/oil_kamp/Data/y_train.csv')
y_test = pd.read_csv('/dshome/WoongLab/heo/oil_kamp/Data/y_test.csv')
 
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(584640, 4)
(250560, 4)
(584640, 1)
(250560, 1)


## Classification Model 구축

In [19]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix 
 
import torch
import torch.nn as nn


In [20]:
### 머신러닝 분류모델 라이브러리 로드

from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression  
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier 
from xgboost import XGBClassifier 
from sklearn.model_selection import GridSearchCV 

### 딥러닝 모델 'Tabnet' 라이브러리 로드

from pytorch_tabnet.tab_model import TabNetClassifier

In [21]:
import warnings 
warnings.filterwarnings('ignore')


In [22]:
# 변수중요도가 낮은 INSP 변수는 분류모델 구축에서 제외함

X_train = X_train[['MOTORSPEED','MELT_WEIGHT','MELT_TEMP']]
X_test = X_test[['MOTORSPEED','MELT_WEIGHT','MELT_TEMP']]


In [23]:
X_train

Unnamed: 0,MOTORSPEED,MELT_WEIGHT,MELT_TEMP
0,74,128,410
1,1740,658,740
2,222,569,411
3,1720,316,764
4,1703,512,769
...,...,...,...
584635,76,2,456
584636,1725,28,749
584637,136,452,496
584638,167,382,466


In [24]:
#Scaling data
# 트리계열 모델을 제외한 모델들은 각 변수들의 단위가 달라서 scaling을 통해 맞춰줘야함

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_transform = scaler.fit_transform(X_train)
X_test_transform=scaler.transform(X_test)

In [25]:
X_train_transform

array([[-0.60388753, -0.37476763, -0.77362127],
       [ 2.0001629 ,  0.06229296,  1.79738435],
       [-0.37255532, -0.01110024, -0.76583034],
       ...,
       [-0.50697809, -0.10758342, -0.10360162],
       [-0.45852337, -0.1653084 , -0.3373294 ],
       [-0.43038837,  0.09115545, -0.75803942]])

## Decision Tree 구축

In [26]:
parameters={'max_depth':[3,5,8], 'min_samples_split': [5,10,15]}

In [27]:
dtree=DecisionTreeClassifier(random_state=42)

In [28]:
grid_dtree=GridSearchCV(dtree, param_grid=parameters,cv=5,refit=True,scoring ="roc_auc")

In [29]:
## Find the optimized parameters 

grid_dtree.fit(X_train,y_train)

In [30]:
# GridSearchCV 결과를 추출해 DataFrame으로 변환 

scores_df=pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params','mean_test_score','rank_test_score',
           'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 3, 'min_samples_split': 5}",0.781966,7,0.782163,0.781187,0.780641
1,"{'max_depth': 3, 'min_samples_split': 10}",0.781966,7,0.782163,0.781187,0.780641
2,"{'max_depth': 3, 'min_samples_split': 15}",0.781966,7,0.782163,0.781187,0.780641
3,"{'max_depth': 5, 'min_samples_split': 5}",0.796417,4,0.797177,0.796039,0.794497
4,"{'max_depth': 5, 'min_samples_split': 10}",0.796417,4,0.797177,0.796039,0.794497
5,"{'max_depth': 5, 'min_samples_split': 15}",0.796417,4,0.797177,0.796039,0.794497
6,"{'max_depth': 8, 'min_samples_split': 5}",0.800427,2,0.801262,0.799699,0.797962
7,"{'max_depth': 8, 'min_samples_split': 10}",0.800439,1,0.801262,0.799718,0.797976
8,"{'max_depth': 8, 'min_samples_split': 15}",0.800423,3,0.801179,0.799718,0.797976


In [31]:
print('GridSearchCV 최적 파라미터:', grid_dtree.best_params_)
print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_dtree.best_score_))


GridSearchCV 최적 파라미터: {'max_depth': 8, 'min_samples_split': 10}
GridSearchCV 최고 정확도:0.8004


In [32]:
# GridSearchCV의 refit으로 이미 학습된 estimator 반환. 즉 최적의 하이퍼파라미터 반환

best_dtree=grid_dtree.best_estimator_ 


In [33]:
pred_dtree=best_dtree.predict(X_test)
prob_dtree=best_dtree.predict_proba(X_test)[:,1]                  # roc_auc_score 함수에서는 인수로는 실제값과 예측확률값이 들어간다. 

print('Test set accuracy: ', accuracy_score(y_test, pred_dtree))
print('Test set precision: ',precision_score(y_test,pred_dtree))
print('Test set recall: ', recall_score(y_test,pred_dtree))
print('Test set F1 score: ', f1_score(y_test, pred_dtree))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_dtree))

Test set accuracy:  0.7941570881226053
Test set precision:  0.7969447249275268
Test set recall:  0.9913695299837926
Test set F1 score:  0.8835882342319569
Test set AUC Score:  0.7992216849697209


## 로지스틱 회귀모형 만들기

In [34]:
lg_model=LogisticRegression(random_state=42)


In [35]:
param_grid = {'C': [ 0.01, 0.1, 1, 10, 100]}

In [36]:
grid_lg = GridSearchCV(lg_model, param_grid, cv=5, refit=True ,scoring ="roc_auc")      

In [37]:
grid_lg.fit(X_train_transform,y_train)

In [38]:
scores_df=pd.DataFrame(grid_lg.cv_results_)
scores_df[['params','mean_test_score','rank_test_score',
           'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,{'C': 0.01},0.760185,5,0.762202,0.759428,0.757892
1,{'C': 0.1},0.761106,4,0.763031,0.760222,0.758837
2,{'C': 1},0.761195,3,0.763108,0.760296,0.758927
3,{'C': 10},0.761203,2,0.763116,0.760303,0.758936
4,{'C': 100},0.761204,1,0.763117,0.760304,0.758937


In [39]:
print('GridSearchCV 최적 파라미터:', grid_lg.best_params_)
print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_lg.best_score_))

GridSearchCV 최적 파라미터: {'C': 100}
GridSearchCV 최고 정확도:0.7612


In [40]:
# GridSearchCV의 refit으로 이미 학습된 estimator 반환 

best_lg=grid_lg.best_estimator_

In [41]:
pred_lg=best_lg.predict(X_test_transform)
prob_lg=best_lg.predict_proba(X_test_transform)[:,1]

print('Test accuracy: ', accuracy_score(y_test, pred_lg))
print('Test set precision: ',precision_score(y_test,pred_lg))
print('Test set recall: ', recall_score(y_test,pred_lg))
print('Test set F1 score: ', f1_score(y_test, pred_lg))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_lg))

Test accuracy:  0.7826947637292465
Test set precision:  0.7927380523436732
Test set recall:  0.9806118314424636


Test set F1 score:  0.8767229980619102
Test set AUC Score:  0.7608525751894583


## KNN 모델 구축

In [42]:
grid_params = {
    'n_neighbors' : list(range(1,10)),
    'metric' : ['euclidean', 'manhattan']
}

In [43]:
## Simple model

knn=KNeighborsClassifier()

In [44]:
grid_knn = GridSearchCV(knn, grid_params, cv=5,scoring="roc_auc")

In [45]:
grid_knn.fit(X_train_transform,y_train)

In [46]:
scores_df=pd.DataFrame(grid_knn.cv_results_)
scores_df[['params','mean_test_score','rank_test_score',
           'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'metric': 'euclidean', 'n_neighbors': 1}",0.600098,18,0.600603,0.600177,0.598121
1,"{'metric': 'euclidean', 'n_neighbors': 2}",0.656225,16,0.656728,0.656227,0.654932
2,"{'metric': 'euclidean', 'n_neighbors': 3}",0.690611,13,0.691799,0.691161,0.688205
3,"{'metric': 'euclidean', 'n_neighbors': 4}",0.712614,12,0.713815,0.713825,0.710074
4,"{'metric': 'euclidean', 'n_neighbors': 5}",0.728446,9,0.729568,0.729534,0.726014
5,"{'metric': 'euclidean', 'n_neighbors': 6}",0.739388,8,0.740244,0.740277,0.737429
6,"{'metric': 'euclidean', 'n_neighbors': 7}",0.748084,5,0.748568,0.74837,0.746657
7,"{'metric': 'euclidean', 'n_neighbors': 8}",0.754548,3,0.754943,0.754444,0.753308
8,"{'metric': 'euclidean', 'n_neighbors': 9}",0.759088,2,0.759583,0.758808,0.7575
9,"{'metric': 'manhattan', 'n_neighbors': 1}",0.600169,17,0.600241,0.60028,0.597937


In [47]:
print('GridSearchCV 최적 파라미터:', grid_knn.best_params_)
print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_knn.best_score_))

GridSearchCV 최적 파라미터: {'metric': 'manhattan', 'n_neighbors': 9}
GridSearchCV 최고 정확도:0.7594


In [48]:
# GridSearchCV의 refit으로 이미 학습된 estimator 반환 

best_knn=grid_knn.best_estimator_ 


In [49]:
pred_knn=best_knn.predict(X_test_transform)
prob_knn=best_knn.predict_proba(X_test_transform)[:,1]

print('Test accuracy: ', accuracy_score(y_test, pred_knn))
print('Test set precision: ',precision_score(y_test,pred_knn))
print('Test set recall: ', recall_score(y_test,pred_knn))
print('Test set F1 score: ', f1_score(y_test, pred_knn))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_knn))

Test accuracy:  0.7738625478927204
Test set precision:  0.8205014957449812
Test set recall:  0.9126873987034035
Test set F1 score:  0.8641428273426317
Test set AUC Score:  0.7578260266806692


## 랜덤포레스트 모형 및 최적의 하이퍼 파라미터 및 튜닝하기

In [50]:
params={
    'n_estimators': [100,200],
    'max_depth':[5,7,9],
    'min_samples_split': [2,4,8]
}

In [51]:
# RandomForestClassifier 객체 생성 후 GridSearchCV 수행 

rf_clf=RandomForestClassifier(random_state=42)
grid_rf=GridSearchCV(rf_clf,param_grid=params,cv=5,n_jobs=-1,scoring='roc_auc')
grid_rf.fit(X_train,y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [52]:
scores_df=pd.DataFrame(grid_rf.cv_results_)
scores_df[['params','mean_test_score','rank_test_score',
           'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 5, 'min_samples_split': 2, 'n_es...",0.794402,14,0.794292,0.793511,0.793159
1,"{'max_depth': 5, 'min_samples_split': 2, 'n_es...",0.794248,17,0.794559,0.793392,0.792615
2,"{'max_depth': 5, 'min_samples_split': 4, 'n_es...",0.794404,13,0.794293,0.793511,0.793163
3,"{'max_depth': 5, 'min_samples_split': 4, 'n_es...",0.794259,16,0.794529,0.79338,0.792564
4,"{'max_depth': 5, 'min_samples_split': 8, 'n_es...",0.794387,15,0.794297,0.793483,0.793163
5,"{'max_depth': 5, 'min_samples_split': 8, 'n_es...",0.794244,18,0.79449,0.793371,0.79251
6,"{'max_depth': 7, 'min_samples_split': 2, 'n_es...",0.798781,9,0.799301,0.798156,0.797162
7,"{'max_depth': 7, 'min_samples_split': 2, 'n_es...",0.798737,11,0.79927,0.798014,0.79703
8,"{'max_depth': 7, 'min_samples_split': 4, 'n_es...",0.798814,7,0.799287,0.798196,0.796854
9,"{'max_depth': 7, 'min_samples_split': 4, 'n_es...",0.798775,10,0.799218,0.798027,0.796991


In [53]:
print('GridSearchCV 최적 하이퍼 파라미터: ',grid_rf.best_params_)
print('GridSearchCV  최고 예측 정확도: {0:.4f}'.format(grid_rf.best_score_))

GridSearchCV 최적 하이퍼 파라미터:  {'max_depth': 9, 'min_samples_split': 8, 'n_estimators': 200}
GridSearchCV  최고 예측 정확도: 0.8009


In [54]:
# GridSearchCV의 refit으로 이미 학습된 estimator 반환 

best_rf=grid_rf.best_estimator_


In [55]:
# GridSearchCV의 best_estimator_는 이미 최적 학습이 됐으므로 별도 학습이 필요 없음

pred_rf=grid_rf.predict(X_test)
prob_rf=best_rf.predict_proba(X_test)[:,1]

print('Test accuracy: ', accuracy_score(y_test, pred_rf))
print('Test set precision: ',precision_score(y_test,pred_rf))
print('Test set recall: ', recall_score(y_test,pred_rf))
print('Test set F1 score: ', f1_score(y_test, pred_rf))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_rf))

Test accuracy:  0.7937060983397191
Test set precision:  0.7948002249163643
Test set recall:  0.9951225688816856
Test set F1 score:  0.883751683935202
Test set AUC Score:  0.7988611837411181


## XGBoost 모형 만들기

In [56]:

xgb_model=XGBClassifier(random_state=42)

# 후보 파라미터 선정 

params={'n_estimators':[100, 200],'max_depth':[5,7,9,11]}

In [57]:
grid_xg = GridSearchCV (xgb_model, param_grid = params, cv = 5, scoring='roc_auc')
grid_xg.fit( X_train, y_train )


In [58]:
scores_df=pd.DataFrame(grid_xg.cv_results_)
scores_df[['params','mean_test_score','rank_test_score',
           'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 5, 'n_estimators': 100}",0.803744,1,0.804595,0.80236,0.802092
1,"{'max_depth': 5, 'n_estimators': 200}",0.803125,2,0.804186,0.801708,0.801449
2,"{'max_depth': 7, 'n_estimators': 100}",0.8023,3,0.803099,0.801069,0.800334
3,"{'max_depth': 7, 'n_estimators': 200}",0.800492,4,0.801668,0.798741,0.798622
4,"{'max_depth': 9, 'n_estimators': 100}",0.800035,5,0.800462,0.79931,0.798297
5,"{'max_depth': 9, 'n_estimators': 200}",0.796707,6,0.797707,0.795552,0.795423
6,"{'max_depth': 11, 'n_estimators': 100}",0.796636,7,0.797181,0.79577,0.795335
7,"{'max_depth': 11, 'n_estimators': 200}",0.792062,8,0.792971,0.790569,0.791733


In [59]:
print('최적 하이퍼 파라미터:\n',grid_xg.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_xg.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 5, 'n_estimators': 100}
최고 예측 정확도: 0.8037


In [60]:
best_xg = grid_xg.best_estimator_
best_xg

In [61]:
# GridSearchCV의 best_estimator_는 이미 최적 학습이 됐으므로 별도 학습이 필요 없음 

pred_xg=best_xg.predict(X_test)
prob_xg=best_xg.predict_proba(X_test)[:,1]

print('Test accuracy: ', accuracy_score(y_test, pred_xg))
print('Test set precision: ',precision_score(y_test,pred_xg))
print('Test set recall: ', recall_score(y_test,pred_xg))
print('Test set F1 score: ', f1_score(y_test, pred_xg))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_xg))

Test accuracy:  0.7953983077905492
Test set precision:  0.7988860399209923
Test set recall:  0.9894347649918963
Test set F1 score:  0.8840087516658258
Test set AUC Score:  0.8020046028643291


## Voting classifier

In [62]:
# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기

from sklearn.ensemble import VotingClassifier 
vo_clf=VotingClassifier(estimators=[('LR',best_lg),('KNN',best_knn),('Dtree',best_dtree),('XGB',best_xg),('Rf',best_rf)],voting='soft')

In [63]:
# VotingClassifier 학습/예측/평가.

vo_clf.fit(X_train,y_train)


## Test data 평가하기

In [64]:
pred_vo=vo_clf.predict(X_test)
prob_vo=vo_clf.predict_proba(X_test)[:,1]

print('Test accuracy: ', accuracy_score(y_test, pred_vo))
print('Test set precision: ',precision_score(y_test,pred_vo))
print('Test set recall: ', recall_score(y_test,pred_vo))
print('Test set F1 score: ', f1_score(y_test, pred_vo))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_vo))

Test accuracy:  0.7942608556832694
Test set precision:  0.7966329144809077
Test set recall:  0.9922001620745543
Test set F1 score:  0.8837261757076802
Test set AUC Score:  0.797722502350493


## TabNet 코드 구현

In [None]:
!pip install pytorch_tabnet

In [None]:
import os 
import numpy as np  
from sklearn.metrics import roc_auc_score 
from sklearn.preprocessing import LabelEncoder 

import torch 
import torch.nn as nn 
from pytorch_tabnet.tab_model import TabNetClassifier 

In [66]:

#Separate the training dataset into the training set and validation set to optimize the deeplearning model "tabnet"

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True, stratify=y_train)

In [76]:
# Feature importance가 낮은 INSP 변수는 분류모델 구축에서 제외함

X_train = X_train[['MOTORSPEED','MELT_WEIGHT','MELT_TEMP']]
X_val = X_val[['MOTORSPEED','MELT_WEIGHT','MELT_TEMP']]
X_test = X_test[['MOTORSPEED','MELT_WEIGHT','MELT_TEMP']]

In [83]:
tab_classifier=TabNetClassifier(verbose=0, optimizer_fn=torch.optim.Adam,seed=42)
tab_classifier.fit(X_train=X_train.values, y_train=y_train.values.ravel(), eval_set=[(X_val.values, y_val.values.ravel())], batch_size=500, eval_metric=['auc'], patience=5, max_epochs=30)
     


Early stopping occurred at epoch 6 with best_epoch = 1 and best_val_0_auc = 0.79389


In [88]:

pred_tab=tab_classifier.predict(X_test.values)
prob_tab=tab_classifier.predict_proba(X_test.values)[:,1]

print('Test accuracy: ', accuracy_score(y_test, pred_tab))
print('Test set precision: ',precision_score(y_test,pred_tab))
print('Test set recall: ', recall_score(y_test,pred_tab))
print('Test set F1 score: ', f1_score(y_test, pred_tab))
print('Test set AUC Score: ', roc_auc_score(y_test,prob_tab))


Test accuracy:  0.7880946679438059
Test set precision:  0.7883981410812257
Test set recall:  0.9992858589951378
Test set F1 score:  0.8814031043594607
Test set AUC Score:  0.7901565271735748
