In [1]:
from IPython.display import display, HTML
display(HTML("<style.container { width:95% !important; }</style>"))

In [114]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


#from pycaret.classification import *
#%matplotlib inline

In [75]:
data = pd.read_csv('InjectionMolding_Raw_Data.csv')

In [76]:
data_lot = to_lot(data_drop)

In [77]:
# 분석에 사용하지 않는 변수 제거
data_drop=data.copy().drop(['Minimum_Cushion','Weighing_Start_Position','_ID'],axis=1)

In [78]:
# Lot 단위 데이터 정리

# Lot별 분리함수
# Lot을 판단할 수 있는 공정변수 'No_Shot'을 기준으로 분리하여 리스트에 저장
def to_lot(data):
    
    lot_list = [0,data.shape[0]]
    for i in range(data.shape[0]):
        if data['No_Shot'][i] == 0:
            lot_list.append(i)
    lot_list.sort()
    data_lot = []
    
    for i in range(len(lot_list)-1):
        data_lot.append(data.iloc[lot_list[i]:lot_list[i+1]])

    return data_lot

In [79]:
data_lot = to_lot(data_drop)
data_lot

[Empty DataFrame
 Columns: [No_Shot, Machine_Cycle_Time, Cycle_Time, Barrel_Temp_Z1, Barrel_Temp_Z2, Barrel_Temp_Z3, Barrel_Temp_Z4, Hopper_Temp, Injection_Pressure_Real_Time, Screw_Position, Injection_Peak_Press, Max_Injection_Rate, Screw_Velocity, VP_Time, VP_Position, VP_Press, Plasticizing_Time, Plasticizing_Start_Position, Plasticizing_End_Position, Plasticizing_RPM, Cooling_Time, Back_Flow, Decompression_Time]
 Index: []
 
 [0 rows x 23 columns],
      No_Shot  Machine_Cycle_Time  Cycle_Time  Barrel_Temp_Z1  Barrel_Temp_Z2  \
 0          0               16.54       16.52           210.2           200.7   
 1          1               16.56       16.54           210.2           200.8   
 2          2               16.58       16.56           210.1           200.7   
 3          3               16.62       16.58           210.1           200.6   
 4          4               16.62       16.62           210.0           200.5   
 ..       ...                 ...         ...            

In [80]:
# 데이터 갯수가 100 미만인 Lot 제거 함수

def error_drop(data_lot):
    
    normal_lot = []
    
    for i in range(len(data_lot)):
        if data_lot[i].shape[0]>=100:
            normal_lot.append(data_lot[i])
    
    return normal_lot

In [81]:
# 분석에 사용할 Lot 데이터
# Shot의 갯수가 100개 미만인 Lot을 이상치라 판단, 제거
data_n_lot = error_drop(data_lot)
data_n_lot

[     No_Shot  Machine_Cycle_Time  Cycle_Time  Barrel_Temp_Z1  Barrel_Temp_Z2  \
 0          0               16.54       16.52           210.2           200.7   
 1          1               16.56       16.54           210.2           200.8   
 2          2               16.58       16.56           210.1           200.7   
 3          3               16.62       16.58           210.1           200.6   
 4          4               16.62       16.62           210.0           200.5   
 ..       ...                 ...         ...             ...             ...   
 767      767               16.70       16.68           210.0           199.9   
 768      768               16.70       16.70           209.9           200.0   
 769      769               16.68       16.72           210.0           199.9   
 770      770               16.70       16.68           210.0           200.1   
 771      771               16.68       16.70           210.1           199.9   
 
      Barrel_Temp_Z3  Barr

In [82]:
# Lot별 대표값 데이터프레임
# 대표값은 평균으로
data_list = []

for i in range(len(data_n_lot)):
    data_list.append(data_n_lot[i].mean())

lot_mean = pd.DataFrame(data_list).drop(['No_Shot'], axis = 1)

lot_mean.head()

Unnamed: 0,Machine_Cycle_Time,Cycle_Time,Barrel_Temp_Z1,Barrel_Temp_Z2,Barrel_Temp_Z3,Barrel_Temp_Z4,Hopper_Temp,Injection_Pressure_Real_Time,Screw_Position,Injection_Peak_Press,...,VP_Time,VP_Position,VP_Press,Plasticizing_Time,Plasticizing_Start_Position,Plasticizing_End_Position,Plasticizing_RPM,Cooling_Time,Back_Flow,Decompression_Time
0,16.668679,16.681723,209.998316,200.042358,194.962824,189.962435,40.085622,1.367863,6.535635,1259.353666,...,2.0,8.11978,1184.873031,4.797461,11.00851,63.111632,28.27,8.0,49.03,0.322306
1,16.660866,16.67778,209.999803,200.038804,195.007241,190.01889,40.089532,1.363695,6.509563,1270.757599,...,2.0,8.119437,1161.96571,4.789536,11.000386,63.111625,28.27,8.0,49.03,0.321893
2,16.681288,16.699753,209.996035,200.036514,195.001296,190.016765,40.078406,1.382933,6.640133,1252.482285,...,2.0,8.121991,1184.553149,4.684229,11.08764,63.111457,28.27,8.0,49.03,0.321735
3,16.634533,16.653553,209.998039,200.040196,195.001059,190.008471,40.090627,1.338847,6.524275,1244.939608,...,2.0,8.123667,1113.798486,4.731149,11.006322,63.111573,28.27,8.0,49.03,0.321718
4,16.610939,16.624291,209.996946,200.031504,195.009702,190.020371,40.125474,1.308349,6.465195,1239.34087,...,2.0,8.124291,1109.827781,4.630274,10.976884,63.111736,28.27,8.0,49.03,0.321701


In [83]:
# Lot별 대표값 데이터프레임
# 대표값은 중앙값으로
data_list = []

for I in range(len(data_n_lot)):
    data_list.append(data_n_lot[i].median())

    lot_median = pd.DataFrame(data_list).drop(['No_Shot'], axis=1)
    
# Lot 별 중앙값 데이터프레임 확인
lot_median.head()

Unnamed: 0,Machine_Cycle_Time,Cycle_Time,Barrel_Temp_Z1,Barrel_Temp_Z2,Barrel_Temp_Z3,Barrel_Temp_Z4,Hopper_Temp,Injection_Pressure_Real_Time,Screw_Position,Injection_Peak_Press,...,VP_Time,VP_Position,VP_Press,Plasticizing_Time,Plasticizing_Start_Position,Plasticizing_End_Position,Plasticizing_RPM,Cooling_Time,Back_Flow,Decompression_Time
0,15.1,15.12,290.0,270.0,260.0,250.0,55.8,0.81,9.23,776.95,...,1.0,10.16,776.45,2.07,11.68,26.47,28.27,6.5,49.03,0.4
1,15.1,15.12,290.0,270.0,260.0,250.0,55.8,0.81,9.23,776.95,...,1.0,10.16,776.45,2.07,11.68,26.47,28.27,6.5,49.03,0.4
2,15.1,15.12,290.0,270.0,260.0,250.0,55.8,0.81,9.23,776.95,...,1.0,10.16,776.45,2.07,11.68,26.47,28.27,6.5,49.03,0.4
3,15.1,15.12,290.0,270.0,260.0,250.0,55.8,0.81,9.23,776.95,...,1.0,10.16,776.45,2.07,11.68,26.47,28.27,6.5,49.03,0.4
4,15.1,15.12,290.0,270.0,260.0,250.0,55.8,0.81,9.23,776.95,...,1.0,10.16,776.45,2.07,11.68,26.47,28.27,6.5,49.03,0.4


In [84]:
# Lot별 대표값 데이터프레임
# 대표값은 3분위수로
data_list = []

for I in range(len(data_n_lot)):
    data_list.append(data_n_lot[i].quantile(.75))

lot_75 = pd.DataFrame(data_list).drop(['No_Shot'], axis=1).reset_index(drop=True)

# Lot별 3분위수 데이터프레임 확인
lot_75.head()

Unnamed: 0,Machine_Cycle_Time,Cycle_Time,Barrel_Temp_Z1,Barrel_Temp_Z2,Barrel_Temp_Z3,Barrel_Temp_Z4,Hopper_Temp,Injection_Pressure_Real_Time,Screw_Position,Injection_Peak_Press,...,VP_Time,VP_Position,VP_Press,Plasticizing_Time,Plasticizing_Start_Position,Plasticizing_End_Position,Plasticizing_RPM,Cooling_Time,Back_Flow,Decompression_Time
0,15.12,15.12,290.0,270.1,260.0,250.1,58.2,0.81,9.27,777.45,...,1.0,10.17,776.83,2.09,11.71,26.47,28.27,6.5,49.03,0.4
1,15.12,15.12,290.0,270.1,260.0,250.1,58.2,0.81,9.27,777.45,...,1.0,10.17,776.83,2.09,11.71,26.47,28.27,6.5,49.03,0.4
2,15.12,15.12,290.0,270.1,260.0,250.1,58.2,0.81,9.27,777.45,...,1.0,10.17,776.83,2.09,11.71,26.47,28.27,6.5,49.03,0.4
3,15.12,15.12,290.0,270.1,260.0,250.1,58.2,0.81,9.27,777.45,...,1.0,10.17,776.83,2.09,11.71,26.47,28.27,6.5,49.03,0.4
4,15.12,15.12,290.0,270.1,260.0,250.1,58.2,0.81,9.27,777.45,...,1.0,10.17,776.83,2.09,11.71,26.47,28.27,6.5,49.03,0.4


In [85]:
# Lot별 대표값 데이터프레임
# 대표값은 1분위수로
data_list = []

for I in range(len(data_n_lot)):
    data_list.append(data_n_lot[i].quantile(.25))

lot_25 = pd.DataFrame(data_list).drop(['No_Shot'], axis=1).reset_index(drop=True)
                                       
# Lot별 1분위수 데이터프레임 확인
lot_25.head()

Unnamed: 0,Machine_Cycle_Time,Cycle_Time,Barrel_Temp_Z1,Barrel_Temp_Z2,Barrel_Temp_Z3,Barrel_Temp_Z4,Hopper_Temp,Injection_Pressure_Real_Time,Screw_Position,Injection_Peak_Press,...,VP_Time,VP_Position,VP_Press,Plasticizing_Time,Plasticizing_Start_Position,Plasticizing_End_Position,Plasticizing_RPM,Cooling_Time,Back_Flow,Decompression_Time
0,15.1,15.12,290.0,270.0,260.0,250.0,52.6,0.8,9.2,776.45,...,1.0,10.16,775.95,2.05,11.65,26.46,28.27,6.5,49.03,0.4
1,15.1,15.12,290.0,270.0,260.0,250.0,52.6,0.8,9.2,776.45,...,1.0,10.16,775.95,2.05,11.65,26.46,28.27,6.5,49.03,0.4
2,15.1,15.12,290.0,270.0,260.0,250.0,52.6,0.8,9.2,776.45,...,1.0,10.16,775.95,2.05,11.65,26.46,28.27,6.5,49.03,0.4
3,15.1,15.12,290.0,270.0,260.0,250.0,52.6,0.8,9.2,776.45,...,1.0,10.16,775.95,2.05,11.65,26.46,28.27,6.5,49.03,0.4
4,15.1,15.12,290.0,270.0,260.0,250.0,52.6,0.8,9.2,776.45,...,1.0,10.16,775.95,2.05,11.65,26.46,28.27,6.5,49.03,0.4


In [86]:
# IQR을 통한 이상치 분류 함수
def outliers_iqr(data):
    
    q1, q3 = np.percentile(data,[25,75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr*1.5)
    upper_bound = q3 + (iqr*1.5)
    
    return np.where((data>upper_bound)|(data<lower_bound))

In [87]:
# Lot별 대표값을 통한 abnormal 탐색 함수
# 각 컬럼별 이상치가 4개 이상인 Lot 번호(index)를 abnormal이라 판단
def outlier_search(lot_rep):
    
    rep_index = []
    
    for i in range(lot_rep.shape[1]):
        rep_index.append(outliers_iqr(lot_rep[lot_rep.columns[i]])[0])
        
    a = 0
    outlier_index = []
    
    for i in range(len(lot_rep)):
        a = 0
        for j in rep_index:
            if i in j:
                a = a + 1
        if a > 3:
            outlier_index.append(i)
            
    return outlier_index

In [88]:
# 각 대표값 별 abnormal 합집합
IQR_error= sorted((list(set(outlier_search(lot_mean)+outlier_search(lot_median)+
                            outlier_search(lot_75)+outlier_search(lot_25)))))

In [89]:
print(IQR_error)

[85, 86, 87, 88, 89, 90, 91, 92, 93, 100, 101, 102, 103, 113, 114, 328, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380]


In [90]:
# x: 독립변인들의value값만추출
# y: 종속변인추출
x = lot_mean.drop(['Machine_Cycle_Time'], axis=1).values
y = lot_mean['Machine_Cycle_Time'].values

print(x)
print(y)

[[ 16.6817228  209.99831606 200.04235751 ...   8.          49.03
    0.3223057 ]
 [ 16.6777804  209.99980323 200.03880362 ...   8.          49.03
    0.32189296]
 [ 16.69975265 209.99603455 200.03651355 ...   8.          49.03
    0.32173537]
 ...
 [ 15.03512623 290.0004279  270.00937099 ...   6.5        294.095092
    0.40015832]
 [ 15.04763788 289.99823101 270.00551509 ...   6.5        270.89907041
    0.40002775]
 [ 15.11659289 289.99698189 270.00449363 ...   6.5         49.03
    0.40004695]]
[16.66867876 16.6608658  16.68128779 16.63453333 16.61093931 16.6151929
 16.58832691 16.57856909 16.57410649 16.58617455 16.57820364 16.72306075
 16.75241763 16.75701087 16.75403175 16.63455809 16.59659591 16.62254495
 16.80313367 16.84316706 17.68217195 17.77893662 17.84900671 15.36076759
 15.36549466 15.34459016 15.34107973 15.34620347 15.33777937 15.33592357
 15.40422806 15.42330417 15.39479226 15.3918394  16.54459239 16.54473563
 16.58713622 16.53754898 16.53392337 16.5463401  16.56390929 

In [91]:
# x객체에 x를 표준화한 데이터를 저장 - 독립변인 표준화
x = MinMaxScaler().fit_transform(x)

In [92]:
# 2개의 주성분(n_components)으로 이루어진 데이터프레임 구성
pca = PCA(n_components=2)
printcipalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=printcipalComponents, columns = ['principal component1', 'principal component2'])

print(principalDf)

     principal component1  principal component2
0               -0.873603              0.343205
1               -0.865574              0.340311
2               -0.837708              0.320043
3               -0.785490              0.296093
4               -0.766725              0.286019
..                    ...                   ...
376              1.738420              0.916883
377              1.738848              0.917567
378              1.709417              0.978951
379              1.706065              0.974776
380              1.743391              0.915489

[381 rows x 2 columns]


In [93]:
#PCA를 통한 차원축소 확인
principalDf.head()

Unnamed: 0,principal component1,principal component2
0,-0.873603,0.343205
1,-0.865574,0.340311
2,-0.837708,0.320043
3,-0.78549,0.296093
4,-0.766725,0.286019


In [94]:
# epsilon, 최소 샘플 개수 설정
# 모델을 DBSCAN 알고리즘 사용
# DBSCAN 알고리즘 파라미터 설정 및 저장
a = 0.165
b = 10
model = DBSCAN(eps=a, min_samples=b)

In [95]:
# 군집화 모델 학습 및 클러스터 예측 결과 반환
model.fit(principalDf)
principalDf['cluster'] = model.fit_predict(principalDf)

In [96]:
# DBSCAN 군집화를 통해 발견한 abnormal 예상 Lot 분류
DB_error=[]

for i in range(len(principalDf)):
    if principalDf['cluster'][i]==4:
        DB_error.append(i)
    elif principalDf['cluster'][i]==-1:
        DB_error.append(i)
        
# abnormal이라 판단되는 Lot 번호를 DB_error 리스트 변수에 저장한다. 이후 해당 Lot 번호를 라벨링에 사용

In [97]:
# IQR을 통한 abnormal Lot 번호와 DBSCAN을 통한 abnormal Lot 번호의 합집합
# 총 43개의 Lot을 abnormal이라 판단
error_list = sorted(list(set(IQR_error + DB_error)))
print('abnormal Lot 번호:', error_list)
print('abnormal Lot 개수:', len(error_list))

abnormal Lot 번호: [85, 86, 87, 88, 89, 90, 91, 92, 93, 100, 101, 102, 103, 113, 114, 328, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380]
abnormal Lot 개수: 41


In [98]:
# Shot 데이터에 Lot 번호 라벨링

n_Lot_list = []

for i in range(len(data_n_lot)):
    a = i
    Lot_label = data_n_lot[i].copy()
    Lot_label.loc[:,['Lot']] = a
    n_Lot_list.append(Lot_label)

  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_la

  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_label.loc[:,['Lot']] = a
  Lot_la

In [99]:
# concat 함수를 통한 데이터 결합
data_lot_label = n_Lot_list[0]
for i in range(len(n_Lot_list)):
    if i == 0:
        pass
    else:
        data_lot_label = pd.concat([data_lot_label, n_Lot_list[i]])

data_lot_label = data_lot_label.reset_index(drop=True)

In [100]:
# abnormal shot 라벨링

data_lot_label.loc[:, ['PassOrFail']] = 0

for i in range(len(data_lot_label)):
    if data_lot_label['Lot'][i] in error_list:
        data_lot_label.loc[i,['PassOrFail']]=1

data_shot_label = data_lot_label.copy().drop(['No_Shot'],axis=1)

  data_lot_label.loc[:, ['PassOrFail']] = 0


In [101]:
data_shot_label

Unnamed: 0,Machine_Cycle_Time,Cycle_Time,Barrel_Temp_Z1,Barrel_Temp_Z2,Barrel_Temp_Z3,Barrel_Temp_Z4,Hopper_Temp,Injection_Pressure_Real_Time,Screw_Position,Injection_Peak_Press,...,VP_Press,Plasticizing_Time,Plasticizing_Start_Position,Plasticizing_End_Position,Plasticizing_RPM,Cooling_Time,Back_Flow,Decompression_Time,Lot,PassOrFail
0,16.54,16.52,210.2,200.7,193.5,187.3,40.1,1.23,6.02,1211.65,...,1062.00,4.74,10.46,63.11,28.27,8.0,49.03,0.32,0,0
1,16.56,16.54,210.2,200.8,193.5,187.2,40.2,1.24,6.02,1211.65,...,1062.00,4.81,10.49,63.11,28.27,8.0,49.03,0.32,0,0
2,16.58,16.56,210.1,200.7,193.5,187.2,40.2,1.26,6.02,1209.16,...,1101.41,4.80,10.53,63.11,28.27,8.0,49.03,0.32,0,0
3,16.62,16.58,210.1,200.6,193.5,187.2,40.2,1.30,6.28,1228.24,...,1092.55,4.84,10.78,63.11,28.27,8.0,49.03,0.33,0,0
4,16.62,16.62,210.0,200.5,193.4,187.3,40.1,1.31,6.46,1287.11,...,1153.41,4.86,10.94,63.11,28.27,8.0,49.03,0.33,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030409,15.12,15.12,290.0,270.0,260.0,250.0,51.8,0.82,9.29,778.07,...,777.45,2.05,11.72,26.47,28.27,6.5,49.03,0.40,380,1
1030410,15.10,15.14,290.0,269.9,260.0,250.0,52.0,0.80,9.29,777.58,...,777.08,2.04,11.74,26.46,28.27,6.5,49.03,0.40,380,1
1030411,15.10,15.12,290.0,270.0,260.0,250.0,52.2,0.80,9.23,777.20,...,776.70,2.05,11.66,26.47,28.27,6.5,49.03,0.40,380,1
1030412,15.10,15.10,290.0,270.0,259.9,249.9,52.4,0.81,9.24,777.83,...,777.58,2.02,11.67,26.47,28.27,6.5,49.03,0.40,380,1


In [102]:
# 라벨링 갯수 비교
data_shot_label['PassOrFail'].value_counts()

0    940818
1     89596
Name: PassOrFail, dtype: int64

In [103]:
# 분석에 영향을 끼칠 수 있는 ‘Lot’ 컬럼을 drop()함수를 통해 삭제
data_shot_label = data_shot_label.copy().drop(['Lot'],axis=1)

In [106]:
# MinMaxScaler 초기화
scaler = MinMaxScaler()

# 스케일링 적용
final_df = scaler.fit_transform(data_shot_label)

# 스케일링된 데이터를 원래의 컬럼명으로 DataFrame 생성
final_df = pd.DataFrame(final_df, columns=data_shot_label.columns)

In [107]:
final_df

Unnamed: 0,Machine_Cycle_Time,Cycle_Time,Barrel_Temp_Z1,Barrel_Temp_Z2,Barrel_Temp_Z3,Barrel_Temp_Z4,Hopper_Temp,Injection_Pressure_Real_Time,Screw_Position,Injection_Peak_Press,...,VP_Position,VP_Press,Plasticizing_Time,Plasticizing_Start_Position,Plasticizing_End_Position,Plasticizing_RPM,Cooling_Time,Back_Flow,Decompression_Time,PassOrFail
0,0.231624,0.108827,0.696082,0.725657,0.725411,0.730238,0.582121,0.410000,0.217235,0.773513,...,0.156275,0.763308,0.194501,0.305040,0.999469,0.473611,0.181818,0.137909,0.428571,0.0
1,0.232479,0.109230,0.696082,0.726043,0.725411,0.729820,0.584200,0.413333,0.217235,0.773513,...,0.156275,0.763308,0.197374,0.305924,0.999469,0.473611,0.181818,0.137909,0.428571,0.0
2,0.233333,0.109633,0.695729,0.725657,0.725411,0.729820,0.584200,0.420000,0.217235,0.771588,...,0.155476,0.791662,0.196963,0.307103,0.999469,0.473611,0.181818,0.137909,0.428571,0.0
3,0.235043,0.110036,0.695729,0.725270,0.725411,0.729820,0.584200,0.433333,0.226571,0.786339,...,0.155476,0.785287,0.198605,0.314471,0.999469,0.473611,0.181818,0.137909,0.457143,0.0
4,0.235043,0.110842,0.695376,0.724884,0.725010,0.730238,0.582121,0.436667,0.233034,0.831853,...,0.154676,0.829073,0.199426,0.319187,0.999469,0.473611,0.181818,0.137909,0.457143,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030409,0.170940,0.080613,0.977762,0.993431,0.992373,0.992472,0.825364,0.273333,0.334650,0.438301,...,0.237010,0.558588,0.084120,0.342175,0.027328,0.473611,0.045455,0.137909,0.657143,1.0
1030410,0.170085,0.081016,0.977762,0.993045,0.992373,0.992472,0.829522,0.266667,0.334650,0.437922,...,0.237010,0.558322,0.083709,0.342765,0.027063,0.473611,0.045455,0.137909,0.657143,1.0
1030411,0.170085,0.080613,0.977762,0.993431,0.992373,0.992472,0.833680,0.266667,0.332496,0.437628,...,0.237810,0.558049,0.084120,0.340407,0.027328,0.473611,0.045455,0.137909,0.657143,1.0
1030412,0.170085,0.080210,0.977762,0.993431,0.991971,0.992054,0.837838,0.270000,0.332855,0.438115,...,0.237010,0.558682,0.082889,0.340701,0.027328,0.473611,0.045455,0.137909,0.657143,1.0


#### Auto ML - Pycaret

In [111]:
import pycaret

from pycaret.classification import *

In [113]:
# 데이터 프레임을 PyCaret 형식으로 변환
exp_clf = setup(data = final_df, target = 'PassOrFail', session_id = 400)

# AutoML 실행
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,400
1,Target,PassOrFail
2,Target type,Binary
3,Original data shape,"(1030414, 23)"
4,Transformed data shape,"(1030414, 23)"
5,Transformed train set shape,"(721289, 23)"
6,Transformed test set shape,"(309125, 23)"
7,Numeric features,22
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9988,1.0,0.9899,0.9961,0.993,0.9923,0.9923,1.04
rf,Random Forest Classifier,0.9987,1.0,0.9877,0.9977,0.9927,0.992,0.992,8.039
et,Extra Trees Classifier,0.9987,1.0,0.9863,0.9984,0.9923,0.9916,0.9916,4.609
xgboost,Extreme Gradient Boosting,0.9987,1.0,0.99,0.9948,0.9924,0.9916,0.9916,1.538
catboost,CatBoost Classifier,0.9987,1.0,0.9893,0.9962,0.9928,0.9921,0.9921,28.229
ada,Ada Boost Classifier,0.9983,1.0,0.9874,0.9931,0.9902,0.9893,0.9893,9.082
gbc,Gradient Boosting Classifier,0.9983,1.0,0.9831,0.997,0.99,0.989,0.9891,37.782
knn,K Neighbors Classifier,0.9981,0.9984,0.9849,0.993,0.9889,0.9879,0.9879,57.917
dt,Decision Tree Classifier,0.9981,0.9944,0.99,0.9886,0.9893,0.9883,0.9883,0.75
svm,SVM - Linear Kernel,0.9979,0.9998,0.9758,0.9998,0.9877,0.9865,0.9866,0.244


In [123]:
def get_clf_eval(y_test, y_pred=None):
    confusion = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['T[0]', 'F[1]'], columns = ['pred_T[0]', 'pred_F[1]'])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

#### LGBM

In [125]:
x_train, x_val, y_train, y_val = train_test_split(
    final_df.drop("PassOrFail", axis=1),
    final_df["PassOrFail"],
    test_size = 0.3,
    stratify = final_df["PassOrFail"],
    random_state = 400,
)

In [None]:
# LGBMClassifier 모델 초기화
lgbm = LGBMClassifier(random_state=400)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': np.arange(500, 1501, 100),
    'learning_rate': np.arange(0.01, 0.15, 0.02),
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

# GridSearchCV 설정
grid_lgbm = GridSearchCV(lgbm, param_grid, verbose=True, scoring='f1')

# 모델 학습
grid_lgbm.fit(x_train, y_train)

# 최적의 파라미터 출력
print('최적의 파라미터 :', grid_lgbm.best_params_)

Fitting 5 folds for each of 77 candidates, totalling 385 fits
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454


[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3228
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3214
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3211
[LightGBM] [Info] Number of data points in the train set: 577032, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351434
[LightGBM] [Info] Start training from score -2.351434
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGB

[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3228
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3214
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGBM] [Info] Number of positive: 50174, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3211
[LightGBM] [Info] Number of data points in the train set: 577032, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351434
[LightGBM] [Info] Start training from score -2.351434
[LightGB

[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025887 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3228
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3214
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3211
[LightGBM] [Info] Number of data points in the train set: 577032, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351434
[LightGBM] [Info] Start training from score -2.351434
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGB

[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3228
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3214
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGBM] [Info] Number of positive: 50174, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3211
[LightGBM] [Info] Number of data points in the train set: 577032, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351434
[LightGBM] [Info] Start training from score -2.351434
[LightGB

[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022677 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3228
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGBM] [Info] Number of positive: 50174, number of negative: 526857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3214
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351432
[LightGBM] [Info] Start training from score -2.351432
[LightGB

[LightGBM] [Info] Number of positive: 50174, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3211
[LightGBM] [Info] Number of data points in the train set: 577032, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086952 -> initscore=-2.351434
[LightGBM] [Info] Start training from score -2.351434
[LightGBM] [Info] Number of positive: 50173, number of negative: 526858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 577031, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086950 -> initscore=-2.351454
[LightGBM] [Info] Start training from score -2.351454
[LightGB

In [122]:
lgbm_model.fit(x_train, y_train)

In [None]:
y_pred = lgbm.predict(x_val)
get_clf_eval(y_val, y_pred)

#### LGBM UnderSampling

In [115]:
# 언더샘플링
X_resampled, y_resampled = RandomUnderSampler(random_state = 400).fit_resample(final_df.drop("PassOrFail", axis=1), final_df["PassOrFail"])

In [116]:
x_train, x_val, y_train, y_val = train_test_split(
    X_resampled,
    y_resampled,
    test_size = 0.3,
    random_state = 400,
)

In [None]:
under_lgbm = LGBMClassifier(random_state = 400)

#### Voting

In [None]:
vote = VotingClassifier(estimators= [('lgbm', lgbm), ('lgbm_under', lgbm_under)], voting= 'soft')
vote.fit(x_train, y_train)

y_pred = vote.predict(x_val)
get_clf_eval(y_val, y_pred)

In [None]:
'''
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환"""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series
'''

In [None]:
'''
# 레이블 인코딩할 칼럼들
label_columns = [

]


for col in label_columns:
    df[col] = label_encoding(df[col])
    
'''