#### 1.) 패키지 업로드 

In [1]:
import joblib
import pickle
import pandas as pd
import numpy as np
from numpy import array
import datetime
from tqdm import tqdm
import scipy
from scipy import stats
from scipy.stats import skew,kurtosis,describe,variation

import optuna
from optuna.integration import XGBoostPruningCallback
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import GridSearchCV,cross_validate
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Binarizer
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, KFold 
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier,IsolationForest,StackingClassifier
from sklearn.metrics import accuracy_score,balanced_accuracy_score,precision_score,recall_score,confusion_matrix,f1_score,fbeta_score,roc_auc_score,classification_report,make_scorer,balanced_accuracy_score
from mlxtend.classifier import StackingClassifier,StackingCVClassifier

from imblearn.over_sampling import SMOTE,ADASYN,BorderlineSMOTE,SVMSMOTE
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
def get_clf_eval(y_test,pred=None,pred_proba=None):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    roc_auc = roc_auc_score(y_test,pred_proba)
    f1 = f1_score(y_test,pred)
    f2 = fbeta_score(y_test,pred,beta=2)
    balanced_acc = balanced_accuracy_score(y_test,pred)
    print('Confusion Matrix')
    print(confusion)
    print('\n')
    print('정확도:',accuracy.round(3),'정밀도:', precision.round(3),'재현율:',recall.round(3),'AUC:',roc_auc.round(3),'F1:',f1.round(3),'F2:',f2.round(3),'Balanced_Accuracy:',balanced_acc.round(3))

def get_eval_by_threshold(y_test,pred_proba_c1,thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임계값',custom_threshold)
        get_clf_eval(y_test,custom_predict,pred_proba_c1)
        print('\n')

scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

#### 2.) 저장된 학습데이터 업로드 
- Matched Dataset
- NonMatched Dataset

In [5]:
#### 1.) Ex. 사출기 18호 데이터 업로드 
Final_Matched = pd.read_csv('machine18.csv',encoding='cp949')
Final_Matched = Final_Matched.set_index('index1')

Final_Original = pd.read_csv('machine18_Original.csv',encoding='cp949')
Final_Original = Final_Original.set_index('index')

In [7]:
Final_Matched.CreateDate # 사출 + PLC 매칭 데이터 

index1
17557.0    2023-03-16 04:18:24.270
17558.0    2023-03-16 04:19:19.259
17559.0    2023-03-16 04:20:29.258
17560.0    2023-03-16 04:21:29.263
17561.0    2023-03-16 04:22:44.261
                    ...           
34901.0    2023-06-14 10:00:17.279
34902.0    2023-06-14 10:01:27.290
34903.0    2023-06-14 10:02:42.379
34904.0    2023-06-14 10:03:52.354
34905.0    2023-06-14 10:05:02.298
Name: CreateDate, Length: 5180, dtype: object

In [8]:
Final_Original.CreateDate # 기존 사출 Only 데이터 

index
7510      2023-01-18 23:31:48.679
7513      2023-01-18 23:32:33.926
7518      2023-01-18 23:34:33.923
7524      2023-01-18 23:35:53.934
7529      2023-01-18 23:36:43.927
                   ...           
420428    2023-06-14 10:00:17.279
420429    2023-06-14 10:01:27.290
420430    2023-06-14 10:02:42.379
420431    2023-06-14 10:03:52.354
420432    2023-06-14 10:05:02.298
Name: CreateDate, Length: 34906, dtype: object

#### 3.) Train/Test Split & Target Label Hybrid Sampling 1단계 
- 학습데이터에서 추가 이상치 제거 시도
- Train/Test UnderSampling 진행 

In [9]:
def Train_Test_Setting(data,machine_number,matched):
    #### 1.) 학습데이터 Train/Test Split & Stratify=y로 정상/불량 불균형 비율 반영
    X = data.drop(['_id','UniqeNum','PlantNo','PlantName','ItmeCode','ItemName','YMdate','CreateDate','PassOrFail','FaultyType'],axis=1)
    y = data['PassOrFail']
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2021,stratify=y)

    #### 2.) Isolation Forest 기반 Train Data 이상치 라벨링 부분집합 제거 & Train/Test UnderSampling 진행 
    if matched =='yes': 
        while machine_number <= 16: 
            if machine_number == 13:
                clf=IsolationForest(random_state=2021,contamination=0.0093) 
                ros_train = RandomUnderSampler(sampling_strategy=0.5485,random_state=2021)
                ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)
            elif machine_number == 14:
                clf=IsolationForest(random_state=2021,contamination=0.01)
                ros_train = RandomUnderSampler(sampling_strategy=0.5,random_state=2021)
                ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)
            elif machine_number == 15:
                clf=IsolationForest(random_state=2021,contamination=0.014)
                ros_train = RandomUnderSampler(sampling_strategy=0.45,random_state=2021)
                ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)
            elif machine_number == 16:
                clf=IsolationForest(random_state=2021,contamination=0.005)
                ros_train = RandomUnderSampler(sampling_strategy=0.5,random_state=2021)
                ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)   

            total_train = pd.concat([X_train,y_train],axis=1)
            total_pass = total_train[total_train['PassOrFail']==0]
            total_fault = total_train[total_train['PassOrFail']==1]
            clf.fit(total_pass)
            pred = clf.predict(total_pass)
            total_pass['anomaly']=pred
            outliers = total_pass.loc[total_pass['anomaly']==-1]
            outlier_index=list(outliers.index)

            total_pass = total_pass[total_pass['anomaly']==1].drop(['anomaly'],axis=1)
            total_train = pd.concat([total_pass,total_fault],axis=0)
            X_train = total_train.drop(['PassOrFail'],axis=1)
            y_train = total_train['PassOrFail']
            original_scale = y_train.value_counts()[0]/y_train.value_counts()[1]

            X_train,y_train = ros_train.fit_resample(X_train,y_train)
            X_test,y_test = ros_test.fit_resample(X_test,y_test)    
            return X_train, y_train, X_test, y_test , original_scale

        else: 
            if machine_number == 17:
                ros_train = RandomUnderSampler(sampling_strategy=0.45,random_state=2021)
                ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)   
            if machine_number == 18:
                ros_train = RandomUnderSampler(sampling_strategy=0.5,random_state=2021)
                ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)   

            original_scale = y_train.value_counts()[0]/y_train.value_counts()[1]
            X_train,y_train = ros_train.fit_resample(X_train,y_train)
            X_test,y_test = ros_test.fit_resample(X_test,y_test)      
            return X_train, y_train, X_test, y_test , original_scale    
          
    #### 3.) PLC매칭 데이터 아닐 경우 UnderSampling만 단순 진행 
    else :
        if machine_number == 13:
            ros_train = RandomUnderSampler(sampling_strategy=0.23,random_state=2021)
            ros_test = RandomUnderSampler(sampling_strategy=0.7,random_state=2021)
        elif machine_number == 14:
            ros_train = RandomUnderSampler(sampling_strategy=0.15,random_state=2021)
            ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)
        elif machine_number == 15:
            ros_train = RandomUnderSampler(sampling_strategy=0.4,random_state=2021)
            ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)
        elif machine_number == 16:
            ros_train = RandomUnderSampler(sampling_strategy=0.45,random_state=2021)
            ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021) 
        elif machine_number == 17:
            ros_train = RandomUnderSampler(sampling_strategy=0.55,random_state=2021)
            ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)  
        elif machine_number == 18:
            ros_train = RandomUnderSampler(sampling_strategy=0.5,random_state=2021)
            ros_test = RandomUnderSampler(sampling_strategy=0.8,random_state=2021)   
        original_scale = y_train.value_counts()[0]/y_train.value_counts()[1]
        X_train,y_train = ros_train.fit_resample(X_train,y_train)
        X_test,y_test = ros_test.fit_resample(X_test,y_test)    
        return X_train, y_train, X_test, y_test , original_scale

#### 4.) Train/Test 데이터 완성 예시 (사출기 18호)

In [10]:
# 데이터 & 사출기 번호 지정 
data = Final_Matched # Final_Original
machine_number = 18
matched='yes' # no

X_train, y_train, X_test, y_test, original_scale = Train_Test_Setting(data,machine_number,matched)

In [12]:
y_train.value_counts()

0.0    164
1.0     82
Name: PassOrFail, dtype: int64

In [13]:
y_test.value_counts()

0.0    25
1.0    20
Name: PassOrFail, dtype: int64

#### 5.) 지도학습 예측모델 나열 
- Tree ML 모델 정의 

In [14]:
rf = RandomForestClassifier(random_state=2021,class_weight='balanced')
ada = AdaBoostClassifier(random_state=2021)
gbm = GradientBoostingClassifier(random_state=2021)
xgb = XGBClassifier(random_state=2021,scale_pos_weight=y_train.value_counts()[0]/y_train.value_counts()[1])
xgb_original_scale = XGBClassifier(random_state=2021,scale_pos_weight=original_scale)
lgbm = LGBMClassifier(random_state=2021,scale_pos_weight=y_train.value_counts()[0]/y_train.value_counts()[1]) 
lgb_original_scale = LGBMClassifier(random_state=2021,scale_pos_weight=original_scale) 
models = [rf,ada,gbm,xgb,xgb_original_scale,lgbm,lgb_original_scale]

#### 6.) 예측모델 성능평가 진행 

In [15]:
def test_model(data,machine_number,model):
    X_train, y_train, X_test, y_test, original_scale = Train_Test_Setting(data,machine_number,matched)
    ml = model
    ml.fit(X_train,y_train)      
    pred = ml.predict(X_test)
    pred_proba = ml.predict_proba(X_test)[:,1]
    get_clf_eval(y_test,pred,pred_proba)  
    print('\n')
    print(classification_report(y_test, pred,target_names=['양품','불량']))
    if model == rf:
        print('RandomForest Prediction Result')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')
    elif model == ada:
        print('AdaBoost Prediction Result')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')
    elif model == gbm:
        print('GradientBoostingMachine Prediction Result')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')
    elif model == xgb:
        print('XGBoost Prediction Result')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')
    elif model == xgb_original_scale:
        print('XGBoost(Original Scale Weight) Prediction Result')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')
    elif model == lgbm:
        print('LightGBM Prediction')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')   
    elif model == lgb_original_scale:
        print('LightGBM(Original Scale Weight) Prediction Result')
        print('------------------------------------------------------------------------------------------------------')
        print('\n')  
    for i in range(1,len(models)+1):
        
        print('Machine_'+str(machine_number)+'/ML'+str(i)+'.pkl')
        joblib.dump(ml.fit(X_train,y_train),'Machine_'+str(machine_number)+'/ML'+str(i)+'.pkl')  # --> 모델 저장  
        
    #return ml

In [17]:
#### Ex.) 사출기 18호기 사출+PLC데이터에 대한 다수 예측모델 성능 평가 
for i in range(0,len(models)):
    model = models[i]
    test_model(data,machine_number,model)

Confusion Matrix
[[22  3]
 [ 5 15]]


정확도: 0.822 정밀도: 0.833 재현율: 0.75 AUC: 0.933 F1: 0.789 F2: 0.765 Balanced_Accuracy: 0.815


              precision    recall  f1-score   support

          양품       0.81      0.88      0.85        25
          불량       0.83      0.75      0.79        20

    accuracy                           0.82        45
   macro avg       0.82      0.81      0.82        45
weighted avg       0.82      0.82      0.82        45

RandomForest Prediction Result
------------------------------------------------------------------------------------------------------


Confusion Matrix
[[23  2]
 [ 5 15]]


정확도: 0.844 정밀도: 0.882 재현율: 0.75 AUC: 0.906 F1: 0.811 F2: 0.773 Balanced_Accuracy: 0.835


              precision    recall  f1-score   support

          양품       0.82      0.92      0.87        25
          불량       0.88      0.75      0.81        20

    accuracy                           0.84        45
   macro avg       0.85      0.83      0.84        45
weighted 

#### 6.) 예측모델 성능평가 진행 2 
- 기존 UnderSampling된 Train Data에 OverSampling 추가 적용 (2단계 Hybrid Sampling)

In [18]:
models = [rf,ada,gbm,xgb,xgb_original_scale,lgbm,lgb_original_scale]

In [19]:
def Hybrid_Sampling_Test(data,machine_number,model):
    
    X_train, y_train, X_test, y_test, original_scale = Train_Test_Setting(data,machine_number,matched)
    for i in list(np.arange(0.7,1.0,0.05).round(2)):
        oversampler = SMOTEENN(sampling_strategy=i,random_state=2021)
        X_train_over,y_train_over = oversampler.fit_resample(X_train,y_train)
        ml = model
        ml.fit(X_train_over,y_train_over) 
        pred = ml.predict(X_test)
        pred_proba = ml.predict_proba(X_test)[:,1]
        get_clf_eval(y_test,pred,pred_proba)  
        print('\n')
        print(classification_report(y_test, pred,target_names=['양품','불량']))
        
        if model == rf:
            print('RandomForest SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')
        elif model == ada:
            print('AdaBoost SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')
        elif model == gbm:
            print('GradientBoostingMachine SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')
        elif model == xgb:
            print('XGBoost SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')
        elif model == xgb_original_scale:
            print('XGBoost(Original Scale Weight) SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')
        elif model == lgbm:
            print('LightGBM SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')   
        elif model == lgb_original_scale:
            print('LightGBM(Original Scale Weight) SMOTEENN'+'='+str(i),'Prediction Result')
            print('------------------------------------------------------------------------------------------------------')
            print('\n')          
        for i in range(1,len(list(np.arange(0.7,1.0,0.05).round(2)))+1):
            
            print('Machine_'+str(machine_number)+'/ML_HybridOver'+str(i)+'.pkl')
            joblib.dump(ml.fit(X_train,y_train),'Machine_'+str(machine_number)+'/ML_HybridOver'+str(i)+'.pkl')  # --> 모델 저장  

In [21]:
#### Ex.) 사출기 18호기 사출+PLC데이터에 대한 예측모델 성능펴가 
model = xgb
Hybrid_Sampling_Test(data,machine_number,model)

Confusion Matrix
[[22  3]
 [ 7 13]]


정확도: 0.778 정밀도: 0.812 재현율: 0.65 AUC: 0.924 F1: 0.722 F2: 0.677 Balanced_Accuracy: 0.765


              precision    recall  f1-score   support

          양품       0.76      0.88      0.81        25
          불량       0.81      0.65      0.72        20

    accuracy                           0.78        45
   macro avg       0.79      0.77      0.77        45
weighted avg       0.78      0.78      0.77        45

XGBoost SMOTEENN=0.7 Prediction Result
------------------------------------------------------------------------------------------------------


Confusion Matrix
[[20  5]
 [ 5 15]]


정확도: 0.778 정밀도: 0.75 재현율: 0.75 AUC: 0.89 F1: 0.75 F2: 0.75 Balanced_Accuracy: 0.775


              precision    recall  f1-score   support

          양품       0.80      0.80      0.80        25
          불량       0.75      0.75      0.75        20

    accuracy                           0.78        45
   macro avg       0.78      0.78      0.78        45
weigh

#### 7.) 예측모델 성능평가 진행 3
- 1개 사출기당 다수 예측모델 대상으로 Stacking Classifier 학습모델 구축 

In [22]:
clf1 = rf
clf2 = ada
clf3 = gbm
clf4 = xgb
clf5 = xgb_original_scale
clf6 = lgbm
clf7 =lgb_original_scale
meta_clf = RandomForestClassifier(random_state=2021,class_weight='balanced')

In [23]:
sclf = StackingCVClassifier(random_state=2021,classifiers=[clf1, clf2, clf3, clf4, clf5, clf6, clf7],meta_classifier=meta_clf,use_probas=True)

model_names = ['rf','ada','gbm','xgb','xgb2','lgbm','lgbm2','Stacking']
for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, sclf],model_names):
    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy') # accuracy, recall, f1 
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    
print('----------------------------------------------------------------------------------')
print('\n')    
Stacking = sclf.fit(X_train, y_train)
stacking_pred = Stacking.predict(X_test)
stacking_pred_proba = Stacking.predict_proba(X_test)[:,1]

get_clf_eval(y_test,stacking_pred,stacking_pred_proba)  
print('\n')
print(classification_report(y_test,stacking_pred,target_names=['양품','불량']))

Accuracy: 0.81 (+/- 0.04) [rf]
Accuracy: 0.81 (+/- 0.05) [ada]
Accuracy: 0.80 (+/- 0.04) [gbm]
Accuracy: 0.81 (+/- 0.03) [xgb]
Accuracy: 0.78 (+/- 0.03) [xgb2]
Accuracy: 0.78 (+/- 0.03) [lgbm]
Accuracy: 0.80 (+/- 0.09) [lgbm2]
Accuracy: 0.76 (+/- 0.04) [Stacking]
----------------------------------------------------------------------------------


Confusion Matrix
[[20  5]
 [ 8 12]]


정확도: 0.711 정밀도: 0.706 재현율: 0.6 AUC: 0.867 F1: 0.649 F2: 0.619 Balanced_Accuracy: 0.7


              precision    recall  f1-score   support

          양품       0.71      0.80      0.75        25
          불량       0.71      0.60      0.65        20

    accuracy                           0.71        45
   macro avg       0.71      0.70      0.70        45
weighted avg       0.71      0.71      0.71        45



In [24]:
sclf = StackingCVClassifier(random_state=2021,classifiers=[clf1, clf2, clf3, clf4, clf5, clf6, clf7], meta_classifier=meta_clf, use_probas=True, cv=5)

# Stacking Cross Validate 
model_names = ['rf','ada','gbm','xgb','xgb2','lgbm','lgbm2','Stacking']

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, sclf],model_names):

    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy') # accuracy, recall, f1 
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    
print('----------------------------------------------------------------------------------')
print('\n')    
StackingCV = sclf.fit(X_train, y_train)
stackingcv_pred = StackingCV.predict(X_test)
stackingcv_pred_proba = StackingCV.predict_proba(X_test)[:,1]

get_clf_eval(y_test,stackingcv_pred,stackingcv_pred_proba)  
print('\n')
print(classification_report(y_test,stacking_pred,target_names=['양품','불량']))

Accuracy: 0.81 (+/- 0.04) [rf]
Accuracy: 0.81 (+/- 0.05) [ada]
Accuracy: 0.80 (+/- 0.04) [gbm]
Accuracy: 0.81 (+/- 0.03) [xgb]
Accuracy: 0.78 (+/- 0.03) [xgb2]
Accuracy: 0.78 (+/- 0.03) [lgbm]
Accuracy: 0.80 (+/- 0.09) [lgbm2]
Accuracy: 0.77 (+/- 0.03) [Stacking]
----------------------------------------------------------------------------------


Confusion Matrix
[[21  4]
 [ 5 15]]


정확도: 0.8 정밀도: 0.789 재현율: 0.75 AUC: 0.879 F1: 0.769 F2: 0.758 Balanced_Accuracy: 0.795


              precision    recall  f1-score   support

          양품       0.71      0.80      0.75        25
          불량       0.71      0.60      0.65        20

    accuracy                           0.71        45
   macro avg       0.71      0.70      0.70        45
weighted avg       0.71      0.71      0.71        45



In [26]:
from sklearn.ensemble import StackingClassifier
estimators = [('rf', clf1),('ada',clf2),('gbm',clf3),('xgb1',clf4),('xgb2',clf5),('lgbm1',clf6),('lgbm2',clf7)]
Stacking = StackingClassifier(estimators = estimators, final_estimator = meta_clf)
Stacking_Train = Stacking.fit(X_train, y_train)
stacking_pred = Stacking_Train.predict(X_test)
stacking_pred_proba = Stacking.predict_proba(X_test)[:,1]
get_clf_eval(y_test,stacking_pred,stacking_pred_proba)  
print('\n')
print(classification_report(y_test,stacking_pred,target_names=['양품','불량']))

Confusion Matrix
[[22  3]
 [ 5 15]]


정확도: 0.822 정밀도: 0.833 재현율: 0.75 AUC: 0.948 F1: 0.789 F2: 0.765 Balanced_Accuracy: 0.815


              precision    recall  f1-score   support

          양품       0.81      0.88      0.85        25
          불량       0.83      0.75      0.79        20

    accuracy                           0.82        45
   macro avg       0.82      0.81      0.82        45
weighted avg       0.82      0.82      0.82        45

