# Upload the data (데이터 업로드)

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
import shap
from tqdm import tqdm

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))

Fix the shifting (여기에서는 이미 shifting이 완료된 train_mod.csv 와 test_mod.csv를 사용함)

In [3]:
# def data_shifting(data) :
#   THRESHOLD = len(data) / 2
#   data = data.dropna(thresh=THRESHOLD, axis=1)

#   l = ['Dam', 'Fill1', 'Fill2']

#   def selection(process):
#       start_col = f'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_{process}'
#       data[start_col] = data[start_col].apply(lambda x: np.nan if x == 'OK' else x)

#       end_col = f'WorkMode Collect Result_{process}'

#       selected_columns = data.loc[:, start_col:end_col]
#       return selected_columns, start_col, end_col

#   def shifting(df, column):
#       df_shifted = df.copy()
#       nan_rows = df[df[column].isna()]

#       for index in nan_rows.index:
#           row = df_shifted.loc[index]
#           shifted_row = row.shift(-1).fillna(pd.NA)
#           df_shifted.loc[index] = shifted_row

#       return df_shifted

#   for i in l:
#       selected_columns_i, start_col_i, end_col_i = selection(i)
#       df_shifted = shifting(selected_columns_i, start_col_i)
#       data.loc[:, start_col_i:end_col_i] = df_shifted

#   return data


# Preprocessing the data (데이터 전처리)


Data encoding (categorical -> numerical)

In [4]:
# Data encoding
def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    print("Non-numeric columns:", non_numeric_columns)

    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)

    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

Generating the new features (파생변수 생성하기)

In [5]:
def generating_features(df):
    # 1. Thickness features from three differen stages at DAM
    # Thickness difference
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']
    df['Thickness_Diff_1_3'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']

    # Thickness average
    df['Thickness_Avg'] = (
        df['THICKNESS 1 Collect Result_Dam'] +
        df['THICKNESS 2 Collect Result_Dam'] +
        df['THICKNESS 3 Collect Result_Dam']) / 3

    # Thickness standard deviation
    df['Thickness_Std'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].std(axis=1)

    # 2. Pressure features from three different stages at AUTOCLAVE
    # Pressure difference
    df['Pressure_Diff_1st_2nd'] = df['1st Pressure Collect Result_AutoClave'] - df['2nd Pressure Collect Result_AutoClave']
    df['Pressure_Diff_2nd_3rd'] = df['2nd Pressure Collect Result_AutoClave'] - df['3rd Pressure Collect Result_AutoClave']
    df['Pressure_Diff_1st_3rd'] = df['1st Pressure Collect Result_AutoClave'] - df['3rd Pressure Collect Result_AutoClave']

    # Pressure average
    df['Pressure_Avg'] = (
        df['1st Pressure Collect Result_AutoClave'] +
        df['2nd Pressure Collect Result_AutoClave'] +
        df['3rd Pressure Collect Result_AutoClave']) / 3

    # Pressure standard deviation
    df['Pressure_Std'] = df[['1st Pressure Collect Result_AutoClave', '2nd Pressure Collect Result_AutoClave', '3rd Pressure Collect Result_AutoClave']].std(axis=1)

    return df

In [6]:
  # 3. Cartesian coordinates to spherical Coordinates
def cart2sph(df):


    coordinate_columns = [
        ("HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam", "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam", "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam", "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam", "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam", "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam"),
        ("HEAD Standby Position X Collect Result_Dam", "HEAD Standby Position Y Collect Result_Dam", "HEAD Standby Position Z Collect Result_Dam"),
        ("Head Clean Position X Collect Result_Dam", "Head Clean Position Y Collect Result_Dam", "Head Clean Position Z Collect Result_Dam"),
        ("Head Purge Position X Collect Result_Dam", "Head Purge Position Y Collect Result_Dam", "Head Purge Position Z Collect Result_Dam"),
        ("Head Zero Position X Collect Result_Dam", "Head Zero Position Y Collect Result_Dam", "Head Zero Position Z Collect Result_Dam"),

        ("HEAD Standby Position X Collect Result_Fill1", "HEAD Standby Position Y Collect Result_Fill1", "HEAD Standby Position Z Collect Result_Fill1"),
        ("Head Clean Position X Collect Result_Fill1", "Head Clean Position Y Collect Result_Fill1", "Head Clean Position Z Collect Result_Fill1"),
        ("Head Purge Position X Collect Result_Fill1", "Head Purge Position Y Collect Result_Fill1", "Head Purge Position Z Collect Result_Fill1"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1", "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1", "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1", "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1", "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1", "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1"),

        ("HEAD Standby Position X Collect Result_Fill2", "HEAD Standby Position Y Collect Result_Fill2", "HEAD Standby Position Z Collect Result_Fill2"),
        ("Head Clean Position X Collect Result_Fill2", "Head Clean Position Y Collect Result_Fill2", "Head Clean Position Z Collect Result_Fill2"),
        ("Head Purge Position X Collect Result_Fill2", "Head Purge Position Y Collect Result_Fill2", "Head Purge Position Z Collect Result_Fill2"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2", "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2", "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2", "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2", "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2", "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2"),
    ]

    cylindrical_columns = [
        ("CURE START POSITION X Collect Result_Dam", "CURE START POSITION Z Collect Result_Dam", "CURE START POSITION Θ Collect Result_Dam"),
        ("CURE STANDBY POSITION X Collect Result_Dam", "CURE STANDBY POSITION Z Collect Result_Dam", "CURE STANDBY POSITION Θ Collect Result_Dam"),
        ("CURE END POSITION X Collect Result_Dam", "CURE END POSITION Z Collect Result_Dam", "CURE END POSITION Θ Collect Result_Dam"),
        ("CURE START POSITION X Collect Result_Fill2", "CURE START POSITION Z Collect Result_Fill2", "CURE START POSITION Θ Collect Result_Fill2"),
        ("CURE STANDBY POSITION X Collect Result_Fill2", "CURE STANDBY POSITION Z Collect Result_Fill2", "CURE STANDBY POSITION Θ Collect Result_Fill2"),
        ("CURE END POSITION X Collect Result_Fill2", "CURE END POSITION Z Collect Result_Fill2", "CURE END POSITION Θ Collect Result_Fill2"),
    ]

    for x_col, y_col, z_col in coordinate_columns:

      df[x_col] = np.sqrt(df[x_col]**2 + df[y_col]**2 + df[z_col]**2)
      df[y_col] = np.arctan2(df[y_col], df[x_col])
      df[z_col] = np.arccos(df[z_col] / df[x_col])

      r_col = x_col.replace("X", "r").replace("Y", "θ").replace("Z", "φ")
      theta_col = y_col.replace("X", "r").replace("Y", "θ").replace("Z", "φ")
      phi_col = z_col.replace("X", "r").replace("Y", "θ").replace("Z", "φ")

      df.rename(columns={x_col: r_col, y_col: theta_col, z_col: phi_col}, inplace=True)

    for x_col, z_col, theta_col in cylindrical_columns:

      df[x_col] = np.sqrt(df[x_col]**2 + df[z_col]**2)
      # df[theta_col] = df[theta_col]
      df[z_col] = np.arctan2(df[x_col], df[z_col])

      r_col = x_col.replace("X", "r").replace("Z", "φ").replace("Θ", "θ")
      phi_col = z_col.replace("X", "r").replace("Z", "φ").replace("Θ", "θ")

      df.rename(columns={x_col: r_col, z_col: phi_col}, inplace=True)

    return df

Exclude columns where every value is unique for each row (feature의 값이 다 다르거나 다 같은경우 feature 제거)

In [7]:
def preprocess(df) :
  # Exclude columns where every value is unique for each row

  #Identify columns where all values are the same
    same_rows_columns = [column for column in df.columns if df[column].nunique() == 1]

  #Identify columns where every row's value is unique
    row_count = len(df)
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

    columns_to_remove = [ 'Wip Line_Dam', 'Process Desc._Dam', 'Equipment_Dam',
       'Model.Suffix_Dam', 'Workorder_Dam',
       'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave',
       'Model.Suffix_AutoClave', 'Workorder_AutoClave',
       'Wip Line_Fill1', 'Process Desc._Fill1', 'Equipment_Fill1', 'Model.Suffix_Fill1',
       'Workorder_Fill1', 'Wip Line_Fill2', 'Process Desc._Fill2', 'Equipment_Fill2', 'Model.Suffix_Fill2', 'Workorder_Fill2']

  #Drop columns identified in the above steps
    all_columns_to_remove = list(set(same_rows_columns + matching_row_columns + columns_to_remove))
    df.drop(columns=[col for col in all_columns_to_remove if col in df.columns], inplace=True)
    
    print("same_rows_columns",same_rows_columns)
    print("matching_row_columns",matching_row_columns)
    return df


In [8]:
train_data = preprocess(train_data)

same_rows_columns ['Wip Line_Dam', 'Process Desc._Dam', 'Insp. Seq No._Dam', 'Insp Judge Code_Dam', 'CURE STANDBY POSITION X Collect Result_Dam', 'CURE STANDBY POSITION Z Collect Result_Dam', 'CURE STANDBY POSITION Θ Collect Result_Dam', 'CURE START POSITION Z Collect Result_Dam', 'HEAD Standby Position X Collect Result_Dam', 'HEAD Standby Position Y Collect Result_Dam', 'HEAD Standby Position Z Collect Result_Dam', 'Head Clean Position X Collect Result_Dam', 'Head Clean Position Y Collect Result_Dam', 'Head Purge Position X Collect Result_Dam', 'Head Purge Position Y Collect Result_Dam', 'Head Zero Position X Collect Result_Dam', 'WorkMode Collect Result_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave', 'Insp. Seq No._AutoClave', 'Insp Judge Code_AutoClave', '1st Pressure Judge Value_AutoClave', '2nd Pressure Judge Value_AutoClave', '3rd Pressure Judge Value_AutoClave', 'Wip Line_Fill1', 'Process Desc._Fill1', 'Insp. Seq No._Fill1', 'Insp Judge Code_Fill1',

In [9]:
train_data = cat2num(train_data)
train_data = generating_features(train_data)
train_data = cart2sph(train_data)

Non-numeric columns: Index(['Chamber Temp. Judge Value_AutoClave', 'target'], dtype='object')


In [10]:
# Min-Max 스케일러 객체 생성
scaler = MinMaxScaler()

columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# Sampling (샘플링)

Downsampling

In [11]:
# Downsampling
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")


  Total: Normal: 38156, AbNormal: 2350


target
0    2350
1    2350
Name: count, dtype: int64

In [12]:
train_x = df_concat.copy()
train_y = train_x.target
train_x.drop(columns = ['target'],inplace =True)
print("train :", train_y)

train : 0       1
1       1
2       1
3       1
4       1
       ..
4695    0
4696    0
4697    0
4698    0
4699    0
Name: target, Length: 4700, dtype: int64


In [13]:
#Target encoding
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(train_y)
print("train :", train_y)

train : [1 1 1 ... 0 0 0]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)


In [15]:
import catboost as cb
# CatBoostClassifier 모델 정의
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # 학습 과정을 출력하지 않도록 설정
)

# 모델 학습
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 메트릭 계산
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# 결과 출력
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

F1 Score: 0.6481
Recall: 0.6552
Accuracy: 0.6326
Precision: 0.6411


In [16]:
# import xgboost as xgb
# # XGBoost model
# model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# # F1 score calculation
# f1 = f1_score(y_test, y_pred)
# print(f"F1 Score: {f1:.4f}")

In [17]:
#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
shap_importance

Dispense Volume(Stage2) Collect Result_Fill1                  0.020486
Pressure_Std                                                  0.016696
Machine Tact time Collect Result_Fill1                        0.014888
HEAD NORMAL COORDINATE φ ArIS(Stage2) Collect Result_Fill1    0.014287
HEAD NORMAL COORDINATE θ ArIS(Stage3) Collect Result_Dam      0.014264
                                                                ...   
CURE START POSITION Θ Collect Result_Dam                      0.000328
CURE SPEED Collect Result_Dam                                 0.000232
CURE END POSITION X Collect Result_Fill2                      0.000016
CURE START POSITION X Collect Result_Fill2                    0.000008
CURE END POSITION Θ Collect Result_Dam                        0.000000
Length: 117, dtype: float64

In [18]:
important_features = shap_importance[shap_importance > 0.006].index.tolist()
important_features

['Dispense Volume(Stage2) Collect Result_Fill1',
 'Pressure_Std',
 'Machine Tact time Collect Result_Fill1',
 'HEAD NORMAL COORDINATE φ ArIS(Stage2) Collect Result_Fill1',
 'HEAD NORMAL COORDINATE θ ArIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE r ArIS(Stage2) Collect Result_Fill1',
 '3rd Pressure Unit Time_AutoClave',
 'Stage2 Line2 Distance Speed Collect Result_Dam',
 'Production Qty Collect Result_Fill1',
 'Stage2 Circle3 Distance Speed Collect Result_Dam',
 'Stage2 Line1 Distance Speed Collect Result_Dam',
 'Stage2 Line4 Distance Speed Collect Result_Dam',
 'Dispense Volume(Stage2) Collect Result_Dam',
 'Stage2 Circle2 Distance Speed Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
 'Stage2 Circle4 Distance Speed Collect Result_Dam',
 'Production Qty Collect Result_Fill2',
 'HEAD NORMAL COORDINATE r ArIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE φ ArIS(Stage1) Collect Result_Fill2',
 'HEAD NORMAL COORDINATE θ ArIS(Stage3) Collect 

# Model

In [19]:
train_x_shap = train_x[important_features]

In [20]:
# from sklearn.ensemble import VotingClassifier
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# catboost_model = CatBoostClassifier(verbose=0, random_state=42)
# xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# randomforest_model = RandomForestClassifier(random_state=42)

# # VotingClassifier
# voting_clf = VotingClassifier(
#     estimators=[
#         ('catboost', catboost_model),
#         ('xgboost', xgboost_model),
#         ('randomforest', randomforest_model)
#     ],
#     voting='soft'
# )

# # train the model
# voting_clf.fit(train_x_shap, train_y)

In [21]:
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # 학습 과정을 출력하지 않도록 설정
)

model.fit(train_x_shap, train_y)

<catboost.core.CatBoostClassifier at 0x7f0acc52a0b0>

# Test data

In [22]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))

In [23]:
test_data = preprocess(test_data)
test_data = cat2num(test_data)
test_data = generating_features(test_data)
test_data = cart2sph(test_data)


test_data

same_rows_columns ['Wip Line_Dam', 'Process Desc._Dam', 'Insp. Seq No._Dam', 'Insp Judge Code_Dam', 'CURE STANDBY POSITION X Collect Result_Dam', 'CURE STANDBY POSITION Z Collect Result_Dam', 'CURE STANDBY POSITION Θ Collect Result_Dam', 'CURE START POSITION Z Collect Result_Dam', 'HEAD Standby Position X Collect Result_Dam', 'HEAD Standby Position Y Collect Result_Dam', 'HEAD Standby Position Z Collect Result_Dam', 'Head Clean Position X Collect Result_Dam', 'Head Clean Position Y Collect Result_Dam', 'Head Purge Position X Collect Result_Dam', 'Head Purge Position Y Collect Result_Dam', 'Head Zero Position X Collect Result_Dam', 'WorkMode Collect Result_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave', 'Insp. Seq No._AutoClave', 'Insp Judge Code_AutoClave', '1st Pressure Judge Value_AutoClave', '2nd Pressure Judge Value_AutoClave', '3rd Pressure Judge Value_AutoClave', 'Wip Line_Fill1', 'Process Desc._Fill1', 'Insp. Seq No._Fill1', 'Insp Judge Code_Fill1',

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Thickness_Diff_1_2,Thickness_Diff_2_3,Thickness_Diff_1_3,Thickness_Avg,Thickness_Std,Pressure_Diff_1st_2nd,Pressure_Diff_2nd_3rd,Pressure_Diff_1st_3rd,Pressure_Avg,Pressure_Std
0,1000.0,12.5,90,70,280,90,10,17.0,4.9,17.0,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.182,-0.183,0.376333,0.105368
1,1000.0,12.5,90,70,280,90,16,14.2,8.3,14.2,...,0.165,-0.226,-0.061,-0.088667,0.11692,-0.279,-0.017,-0.296,0.495667,0.166206
2,240.0,2.5,-90,70,1030,-90,10,9.7,4.9,9.7,...,0.000,0.000,0.000,0.000000,0.00000,0.000,-0.192,-0.192,0.372000,0.110851
3,1000.0,12.5,90,70,280,90,10,21.3,10.6,21.3,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.198,-0.199,0.367667,0.114605
4,240.0,2.5,-90,70,1030,-90,16,13.2,7.5,13.2,...,0.000,0.000,0.000,0.000000,0.00000,-0.190,-0.005,-0.195,0.430333,0.111168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1000.0,12.5,90,70,280,90,10,21.3,10.6,21.3,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.189,-0.190,0.370667,0.109409
17357,1000.0,12.5,90,70,280,90,16,13.2,7.6,13.2,...,0.000,0.000,0.000,0.000000,0.00000,-0.190,-0.005,-0.195,0.430333,0.111168
17358,240.0,2.5,-90,70,1030,-90,16,13.2,6.6,13.2,...,0.000,0.000,0.000,0.000000,0.00000,0.000,-0.189,-0.189,0.371000,0.109119
17359,240.0,2.5,-90,70,1030,-90,10,9.7,3.9,9.7,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.194,-0.195,0.368333,0.112296


In [24]:
# Min-Max 스케일러 객체 생성
scaler = MinMaxScaler()

columns_to_scale = [col for col in train_data.columns if col != 'target']
test_data[columns_to_scale] = scaler.fit_transform(test_data[columns_to_scale])

In [25]:
# Apply SHAP features to the test set
test_x_shap = test_data[important_features]
test_x_shap

Unnamed: 0,Dispense Volume(Stage2) Collect Result_Fill1,Pressure_Std,Machine Tact time Collect Result_Fill1,HEAD NORMAL COORDINATE φ ArIS(Stage2) Collect Result_Fill1,HEAD NORMAL COORDINATE θ ArIS(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE r ArIS(Stage2) Collect Result_Fill1,3rd Pressure Unit Time_AutoClave,Stage2 Line2 Distance Speed Collect Result_Dam,Production Qty Collect Result_Fill1,Stage2 Circle3 Distance Speed Collect Result_Dam,...,Pressure_Diff_1st_2nd,HEAD NORMAL COORDINATE r ArIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE r ArIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE φ ArIS(Stage1) Collect Result_Fill1,Head Purge Position Z Collect Result_Dam,2nd Pressure Collect Result_AutoClave,HEAD NORMAL COORDINATE θ ArIS(Stage1) Collect Result_Dam,Stage1 Line2 Distance Speed Collect Result_Dam,HEAD NORMAL COORDINATE φ ArIS(Stage3) Collect Result_Fill2,1st Pressure 1st Pressure Unit Time_AutoClave
0,0.034483,0.300602,0.127313,0.916849,0.986799,0.819520,0.400000,0.6250,0.322314,0.6250,...,0.981967,0.864550,0.985341,0.999511,0.705556,0.031509,0.997112,0.2,1.0,0.803333
1,0.034483,0.474167,0.123612,0.916102,0.992985,0.819382,0.400000,0.1875,0.423140,0.1875,...,0.526230,0.987739,0.983338,0.999235,0.705556,0.474295,0.999362,0.4,1.0,0.803333
2,0.034483,0.316247,0.366395,0.005563,0.003072,0.001220,0.400000,0.6250,0.161983,0.6250,...,0.983607,0.004631,0.005472,0.705303,1.000000,0.018242,0.001966,1.0,0.0,0.803333
3,0.068966,0.326956,0.204293,0.916849,0.983570,0.819520,0.400000,0.0000,0.000000,0.0000,...,0.981967,0.865274,0.986523,0.999584,0.705556,0.008292,0.996861,0.0,1.0,0.803333
4,0.034483,0.317150,0.100666,0.003089,0.020510,0.000805,0.400000,0.3125,0.355372,0.3125,...,0.672131,0.003560,0.004498,0.704816,0.705556,0.323383,0.003475,0.5,0.0,0.803333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,0.034483,0.312132,0.238342,0.916849,0.986120,0.819520,0.400000,0.0000,0.216529,0.0000,...,0.981967,0.864630,0.986149,0.999584,0.705556,0.018242,0.997154,0.0,1.0,0.803333
17357,0.034483,0.317150,0.121392,0.916010,0.987140,0.819205,0.400000,0.3125,0.461157,0.3125,...,0.672131,0.989342,0.985762,0.999157,0.705556,0.323383,0.997458,0.5,1.0,0.803333
17358,0.077132,0.311305,0.092524,0.002909,0.009634,0.000730,0.400000,0.3125,0.109091,0.3125,...,0.983607,0.005228,0.006631,0.704816,0.705556,0.018242,0.000990,0.5,0.0,0.803333
17359,0.155172,0.320368,0.372317,0.006616,0.000000,0.001959,0.496667,1.0000,0.193388,1.0000,...,0.981967,0.004734,0.004971,0.704560,1.000000,0.011609,0.002550,1.0,0.0,1.000000


In [26]:
y_pred = model.predict(test_x_shap)

In [27]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred

df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is : ", ratio)

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

The ratio of abnormal is :  0.7804849951039686
