In [1]:
# 安裝所需套件
from sklearn import metrics
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean

## EDA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## 過採樣
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN


# 績效指標
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from collections import Counter

## 建模
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#load data
df1 = pd.read_csv("C:/Users/USER/Desktop/firstphase_test.csv")

# 將與weather_delay無關的特徵刪除
df1.drop(["CRS_DEP_TIME","DEP_TIME","DEP_DELAY","CRS_ARR_TIME","ARR_TIME","ARR_DELAY"
          ,"CRS_ELAPSED_TIME","ACTUAL_ELAPSED_TIME","AIR_TIME"],axis=1,inplace=True)


#將target欄位移至最後一欄
target_col = df1.pop(df1.columns[4])
df1.insert(18, target_col.name, target_col)

In [3]:
# 1. 類別變數轉換
# 值之間沒有大小的意義，用One-Hot(使用get_dummies)

dummy = pd.get_dummies(df1[['ORIGIN','DEST']])


# 將轉換後產生的dataframe塞入原始df1，並刪除ORIGIN、DEST
df1 = pd.concat([dummy,df1],axis=1)
df1.drop(['ORIGIN','DEST'],axis=1,inplace=True)

In [4]:
## 由於features中有不用標準化的欄位(dummy)，加上標準化、split的返回結果是ndarray，
## 因此要把不需要標準化的欄位獨立出來

# 1.切分成訓練集、驗證集
X = df1.iloc[:,:-1].values 
y = df1.iloc[:,-1].values

X_trainval,X_test,y_trainval,y_test = train_test_split(X,y,test_size = 0.3,random_state = 1) 

# 2. 記錄不用標準化的欄位(type = dataframe)

# ndarray to df
Xtrainval = pd.DataFrame(X_trainval,columns = [str(i) for i in range(X_trainval.shape[1])])
Xtest = pd.DataFrame(X_test,columns = [str(i) for i in range(X_test.shape[1])])

## 目標df
X_trainval_temp = Xtrainval[["0","1","2","3","4","5",'6',"7","8","10","11"]]
X_test_temp = Xtest[["0","1","2","3","4","5",'6',"7","8","10","11"]]

# 3. df to ndarray
X_trainval_new = Xtrainval.to_numpy()
X_test_new = Xtest.to_numpy()

# 4. 標準化
sc = StandardScaler().fit(X_trainval)
x_train_std = sc.transform(X_trainval) ## 將規則用在訓練集
x_test_std = sc.transform(X_test) ## 將規則用在測試集

# 5. 將沒標準化的features放回df中
xtrainval = pd.DataFrame(X_trainval,columns = [str(i) for i in range(X_trainval.shape[1])])
xtest = pd.DataFrame(X_test,columns = [str(i) for i in range(X_test.shape[1])])

xtrainval.drop(["0","1","2","3","4","5",'6',"7","8","10","11"],axis=1,inplace=True)
xtest.drop(["0","1","2","3","4","5",'6',"7","8","10","11"],axis=1,inplace=True)

del X_trainval
del X_test

X_trainval = pd.concat([X_trainval_temp,xtrainval],axis=1)
X_test = pd.concat([X_test_temp,xtest],axis=1)

# df to ndarray(為了進行SMOTE&CV)
X_trainval = X_trainval.to_numpy()
X_test = X_test.to_numpy()

In [6]:
##未進行補值前
clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_trainval, y_trainval)
y_pred = clf.predict(X_test)
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.96      1.00      0.98       554
     class 1       0.00      0.00      0.00        21

    accuracy                           0.96       575
   macro avg       0.48      0.50      0.49       575
weighted avg       0.93      0.96      0.95       575



  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
## 確定補值成功
print('Original dataset shape %s' % Counter(y_trainval))
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_trainval, y_trainval)
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 1298, 1: 42})
Resampled dataset shape Counter({0: 1298, 1: 1298})


In [8]:
## 重跑一次RF
clf = RandomForestClassifier(n_estimators=300, max_depth=4, random_state=0)
clf.fit(X_res, y_res)
y_pred = clf.predict(X_test)
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))
print('ROC AUC: %.3f' %roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

     class 0       0.99      0.82      0.90       554
     class 1       0.15      0.86      0.26        21

    accuracy                           0.82       575
   macro avg       0.57      0.84      0.58       575
weighted avg       0.96      0.82      0.88       575

ROC AUC: 0.839


* SMOTE()

In [28]:
model = RandomForestClassifier(n_estimators=300, max_depth=4)
over = SMOTE() 

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))


pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print(roc_auc_score(y_test, y_test_pred))

Mean ROC AUC: 0.905
0.8621282447997248


* SVMSMOTE()

In [36]:
## xxxxx
model = RandomForestClassifier(n_estimators=300, max_depth=4,random_state=1)
over = SVMSMOTE() #BorderlineSMOTE() #SVMSMOTE() #ADASYN()
under = RandomUnderSampler()
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))


pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print(roc_auc_score(y_test, y_test_pred))

Mean ROC AUC: 0.908
0.8060426336599621


* BorderlineSMOTE()

In [27]:
model = RandomForestClassifier(n_estimators=300, max_depth=4)
over = BorderlineSMOTE() 
under = RandomUnderSampler()
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("BorderlineSMOTE():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

BorderlineSMOTE():

模型平均ROC AUC分數: 0.903
測試集ROC AUC分數: 0.802
----------------------------
混淆矩陣:
[[466  88]
 [  5  16]]


In [6]:
model = RandomForestClassifier(n_estimators=300, max_depth=4)
over = BorderlineSMOTE() 

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("BorderlineSMOTE():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

BorderlineSMOTE():

模型平均ROC AUC分數: 0.904
測試集ROC AUC分數: 0.803
----------------------------
混淆矩陣:
[[468  86]
 [  5  16]]


* ADASYN

In [29]:
model = RandomForestClassifier(n_estimators=300, max_depth=4)
over = ADASYN() 
under = RandomUnderSampler()
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("ADASYN():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

ADASYN():

模型平均ROC AUC分數: 0.904
測試集ROC AUC分數: 0.866
----------------------------
混淆矩陣:
[[458  96]
 [  2  19]]


In [13]:
model = RandomForestClassifier(n_estimators=300, max_depth=4)
over = ADASYN() 

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("ADASYN():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

ADASYN():

模型平均ROC AUC分數: 0.903
測試集ROC AUC分數: 0.864
----------------------------
混淆矩陣:
[[456  98]
 [  2  19]]


* SMOTEENN()

In [31]:
model = RandomForestClassifier(n_estimators=300, max_depth=4)
over = SMOTEENN() 

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("SMOTEENN():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

SMOTEENN():

模型平均ROC AUC分數: 0.901
測試集ROC AUC分數: 0.851
----------------------------
混淆矩陣:
[[442 112]
 [  2  19]]


* SMOTETomek()

In [52]:
model = RandomForestClassifier(n_estimators=300, max_depth=4,random_state=1)
over = SMOTETomek() 

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("SMOTETomek():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

SMOTETomek():

模型平均ROC AUC分數: 0.905
測試集ROC AUC分數: 0.865
----------------------------
混淆矩陣:
[[457  97]
 [  2  19]]


In [53]:
model = RandomForestClassifier(n_estimators=300, max_depth=4,random_state=1)
over = SMOTETomek() 

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline,X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print("SMOTETomek():\n")
print('模型平均ROC AUC分數: %.3f' % mean(scores))

pipeline.fit(X_trainval, y_trainval)
y_test_pred = pipeline.predict(X_test)
print("測試集ROC AUC分數: %.3f" % roc_auc_score(y_test, y_test_pred))
print("----------------------------")
print("混淆矩陣:")
print(confusion_matrix(y_test, y_test_pred))

SMOTETomek():

模型平均ROC AUC分數: 0.905
測試集ROC AUC分數: 0.862
----------------------------
混淆矩陣:
[[454 100]
 [  2  19]]
