# Upload the data (데이터 업로드)

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
import shap
from tqdm import tqdm

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))

Fix the shifting (여기에서는 이미 shifting이 완료된 train_mod.csv 와 test_mod.csv를 사용함)

In [3]:
"""제츨할때 사용할 코드
def data_shifting(data) :
  THRESHOLD = len(data) / 2
  data = data.dropna(thresh=THRESHOLD, axis=1)

  l = ['Dam', 'Fill1', 'Fill2']

  def selection(process):
      start_col = f'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_{process}'
      data[start_col] = data[start_col].apply(lambda x: np.nan if x == 'OK' else x)

      end_col = f'WorkMode Collect Result_{process}'

      selected_columns = data.loc[:, start_col:end_col]
      return selected_columns, start_col, end_col

  def shifting(df, column):
      df_shifted = df.copy()
      nan_rows = df[df[column].isna()]

      for index in nan_rows.index:
          row = df_shifted.loc[index]
          shifted_row = row.shift(-1).fillna(pd.NA)
          df_shifted.loc[index] = shifted_row

      return df_shifted

  for i in l:
      selected_columns_i, start_col_i, end_col_i = selection(i)
      df_shifted = shifting(selected_columns_i, start_col_i)
      data.loc[:, start_col_i:end_col_i] = df_shifted

  return data
  """

"제츨할때 사용할 코드\ndef data_shifting(data) :\n  THRESHOLD = len(data) / 2\n  data = data.dropna(thresh=THRESHOLD, axis=1)\n\n  l = ['Dam', 'Fill1', 'Fill2']\n\n  def selection(process):\n      start_col = f'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_{process}'\n      data[start_col] = data[start_col].apply(lambda x: np.nan if x == 'OK' else x)\n\n      end_col = f'WorkMode Collect Result_{process}'\n\n      selected_columns = data.loc[:, start_col:end_col]\n      return selected_columns, start_col, end_col\n\n  def shifting(df, column):\n      df_shifted = df.copy()\n      nan_rows = df[df[column].isna()]\n\n      for index in nan_rows.index:\n          row = df_shifted.loc[index]\n          shifted_row = row.shift(-1).fillna(pd.NA)\n          df_shifted.loc[index] = shifted_row\n\n      return df_shifted\n\n  for i in l:\n      selected_columns_i, start_col_i, end_col_i = selection(i)\n      df_shifted = shifting(selected_columns_i, start_col_i)\n      data.loc[:, start_col_i:end

# Preprocessing the data (데이터 전처리)


Data encoding (categorical -> numerical)

In [4]:
# Separate features(X) and target(y)
X = train_data.drop(columns=['target'],axis=1)
y = train_data["target"]

# Data encoding
def cat2num(X) :
  non_numeric_columns = X.select_dtypes(include=['object']).columns

  encoder = LabelEncoder()
  encoded_df = pd.DataFrame(index=X.index)
  for column in non_numeric_columns:
      encoded_df[column] = encoder.fit_transform(X[column])

  X = X.drop(columns=non_numeric_columns)
  X = pd.concat([X, encoded_df], axis=1)
  return X

Generating the new features (파생변수 생성하기)

In [5]:
def generating_features(df):
    # 1. Thickness features from three differen stages at DAM
    # Thickness difference
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']
    df['Thickness_Diff_1_3'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']

    # Thickness average
    df['Thickness_Avg'] = (
        df['THICKNESS 1 Collect Result_Dam'] +
        df['THICKNESS 2 Collect Result_Dam'] +
        df['THICKNESS 3 Collect Result_Dam']) / 3

    # Thickness standard deviation
    df['Thickness_Std'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].std(axis=1)

    # 2. Pressure features from three different stages at AUTOCLAVE
    # Pressure difference
    df['Pressure_Diff_1st_2nd'] = df['1st Pressure Collect Result_AutoClave'] - df['2nd Pressure Collect Result_AutoClave']
    df['Pressure_Diff_2nd_3rd'] = df['2nd Pressure Collect Result_AutoClave'] - df['3rd Pressure Collect Result_AutoClave']
    df['Pressure_Diff_1st_3rd'] = df['1st Pressure Collect Result_AutoClave'] - df['3rd Pressure Collect Result_AutoClave']

    # Pressure average
    df['Pressure_Avg'] = (
        df['1st Pressure Collect Result_AutoClave'] +
        df['2nd Pressure Collect Result_AutoClave'] +
        df['3rd Pressure Collect Result_AutoClave']) / 3

    # Pressure standard deviation
    df['Pressure_Std'] = df[['1st Pressure Collect Result_AutoClave', '2nd Pressure Collect Result_AutoClave', '3rd Pressure Collect Result_AutoClave']].std(axis=1)

    return df

In [6]:
  # 3. Cartesian coordinates to spherical Coordinates
def cart2sph(df):


    coordinate_columns = [
        ("HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam", "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam", "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam", "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam", "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam", "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam"),
        ("HEAD Standby Position X Collect Result_Dam", "HEAD Standby Position Y Collect Result_Dam", "HEAD Standby Position Z Collect Result_Dam"),
        ("Head Clean Position X Collect Result_Dam", "Head Clean Position Y Collect Result_Dam", "Head Clean Position Z Collect Result_Dam"),
        ("Head Purge Position X Collect Result_Dam", "Head Purge Position Y Collect Result_Dam", "Head Purge Position Z Collect Result_Dam"),
        ("Head Zero Position X Collect Result_Dam", "Head Zero Position Y Collect Result_Dam", "Head Zero Position Z Collect Result_Dam"),

        ("HEAD Standby Position X Collect Result_Fill1", "HEAD Standby Position Y Collect Result_Fill1", "HEAD Standby Position Z Collect Result_Fill1"),
        ("Head Clean Position X Collect Result_Fill1", "Head Clean Position Y Collect Result_Fill1", "Head Clean Position Z Collect Result_Fill1"),
        ("Head Purge Position X Collect Result_Fill1", "Head Purge Position Y Collect Result_Fill1", "Head Purge Position Z Collect Result_Fill1"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1", "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1", "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1", "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1", "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1", "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1"),

        ("HEAD Standby Position X Collect Result_Fill2", "HEAD Standby Position Y Collect Result_Fill2", "HEAD Standby Position Z Collect Result_Fill2"),
        ("Head Clean Position X Collect Result_Fill2", "Head Clean Position Y Collect Result_Fill2", "Head Clean Position Z Collect Result_Fill2"),
        ("Head Purge Position X Collect Result_Fill2", "Head Purge Position Y Collect Result_Fill2", "Head Purge Position Z Collect Result_Fill2"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2", "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2", "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2", "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2", "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2"),
        ("HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2", "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2", "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2"),
    ]

    cylindrical_columns = [
        ("CURE START POSITION X Collect Result_Dam", "CURE START POSITION Z Collect Result_Dam", "CURE START POSITION Θ Collect Result_Dam"),
        ("CURE STANDBY POSITION X Collect Result_Dam", "CURE STANDBY POSITION Z Collect Result_Dam", "CURE STANDBY POSITION Θ Collect Result_Dam"),
        ("CURE END POSITION X Collect Result_Dam", "CURE END POSITION Z Collect Result_Dam", "CURE END POSITION Θ Collect Result_Dam"),
        ("CURE START POSITION X Collect Result_Fill2", "CURE START POSITION Z Collect Result_Fill2", "CURE START POSITION Θ Collect Result_Fill2"),
        ("CURE STANDBY POSITION X Collect Result_Fill2", "CURE STANDBY POSITION Z Collect Result_Fill2", "CURE STANDBY POSITION Θ Collect Result_Fill2"),
        ("CURE END POSITION X Collect Result_Fill2", "CURE END POSITION Z Collect Result_Fill2", "CURE END POSITION Θ Collect Result_Fill2"),
    ]

    for x_col, y_col, z_col in coordinate_columns:

      df[x_col] = np.sqrt(df[x_col]**2 + df[y_col]**2 + df[z_col]**2)
      df[y_col] = np.arctan2(df[y_col], df[x_col])
      df[z_col] = np.arccos(df[z_col] / df[x_col])

      r_col = x_col.replace("X", "r").replace("Y", "θ").replace("Z", "φ")
      theta_col = y_col.replace("X", "r").replace("Y", "θ").replace("Z", "φ")
      phi_col = z_col.replace("X", "r").replace("Y", "θ").replace("Z", "φ")

      df.rename(columns={x_col: r_col, y_col: theta_col, z_col: phi_col}, inplace=True)

    for x_col, z_col, theta_col in cylindrical_columns:

      df[x_col] = np.sqrt(df[x_col]**2 + df[z_col]**2)
      # df[theta_col] = df[theta_col]
      df[z_col] = np.arctan2(df[x_col], df[z_col])

      r_col = x_col.replace("X", "r").replace("Z", "φ").replace("Θ", "θ")
      phi_col = z_col.replace("X", "r").replace("Z", "φ").replace("Θ", "θ")

      df.rename(columns={x_col: r_col, z_col: phi_col}, inplace=True)

    return df

Exclude columns where every value is unique for each row (feature의 값이 다 다르거나 다 같은경우 feature 제거)

In [7]:
def preprocess(df) :
  # Exclude columns where every value is unique for each row

  #Identify columns where all values are the same
  same_rows_columns = [column for column in df.columns if df[column].nunique() == 1]

  #Identify columns where every row's value is unique
  row_count = len(df)
  matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

  #Drop columns identified in the above steps
  df.drop(columns=same_rows_columns, inplace=True)
  df.drop(columns=matching_row_columns, inplace=True)

  return df


In [8]:
X = cat2num(X)
X = generating_features(X)
X = cart2sph(X)
X = preprocess(X)

#concat X with target y
train_data = pd.concat([X, y], axis=1)

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max 스케일러 객체 생성
scaler = MinMaxScaler()

# 수치형 컬럼만 선택하여 스케일링 적용
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])

# Sampling (샘플링)

Downsampling

In [10]:
# Downsampling
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")


  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
Name: count, dtype: int64

In [11]:
train_x = df_concat.copy()
train_y = train_x.target
train_x.drop(columns = ['target'],inplace =True)
train_x = train_x.select_dtypes(exclude=['object'])

In [12]:
#Target encoding
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(train_y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)


In [14]:
import catboost as cb
# CatBoostClassifier 모델 정의
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # 학습 과정을 출력하지 않도록 설정
)

# 모델 학습
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 메트릭 계산
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# 결과 출력
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

F1 Score: 0.6473
Recall: 0.6566
Accuracy: 0.6305
Precision: 0.6382


In [15]:
# import xgboost as xgb
# # XGBoost model
# model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# # F1 score calculation
# f1 = f1_score(y_test, y_pred)
# print(f"F1 Score: {f1:.4f}")

In [16]:
#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
shap_importance

Dispense Volume(Stage2) Collect Result_Fill1                  0.026253
Workorder_AutoClave                                           0.022255
HEAD NORMAL COORDINATE φ ArIS(Stage1) Collect Result_Fill2    0.020984
Workorder_Fill1                                               0.018951
Workorder_Dam                                                 0.018915
                                                                ...   
CURE END POSITION φ Collect Result_Fill2                      0.000043
Head Purge Position r Collect Result_Fill2                    0.000000
HEAD NORMAL COORDINATE φ ArIS(Stage2) Collect Result_Fill2    0.000000
Equipment_Fill1                                               0.000000
CURE END POSITION φ Collect Result_Dam                        0.000000
Length: 139, dtype: float64

In [17]:
important_features = shap_importance[shap_importance > 0.02].index.tolist()
important_features

['Dispense Volume(Stage2) Collect Result_Fill1',
 'Workorder_AutoClave',
 'HEAD NORMAL COORDINATE φ ArIS(Stage1) Collect Result_Fill2']

# Model

In [18]:
train_x_shap = train_x[important_features]

In [19]:
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
randomforest_model = RandomForestClassifier(random_state=42)

# VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('xgboost', xgboost_model),
        ('randomforest', randomforest_model)
    ],
    voting='soft'
)

# train the model
voting_clf.fit(train_x_shap, train_y)

Parameters: { "use_label_encoder" } are not used.



In [20]:
# xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
# xgb_model.fit(train_x_shap, train_y)

# Test data

In [21]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))

In [22]:
test_data = cat2num(test_data)
test_data = generating_features(test_data)
test_data = cart2sph(test_data)
test_data = preprocess(test_data)

test_data

Unnamed: 0,CURE END POSITION r Collect Result_Dam,CURE END POSITION φ Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION r Collect Result_Dam,CURE START POSITION φ Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,...,Thickness_Diff_1_2,Thickness_Diff_2_3,Thickness_Diff_1_3,Thickness_Avg,Thickness_Std,Pressure_Diff_1st_2nd,Pressure_Diff_2nd_3rd,Pressure_Diff_1st_3rd,Pressure_Avg,Pressure_Std
0,1000.078122,1.558298,90,70,281.996897,1.452555,90,10,17.0,4.9,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.182,-0.183,0.376333,0.105368
1,1000.078122,1.558298,90,70,281.996897,1.452555,90,16,14.2,8.3,...,0.165,-0.226,-0.061,-0.088667,0.11692,-0.279,-0.017,-0.296,0.495667,0.166206
2,240.013020,1.560381,-90,70,1030.544638,1.538301,-90,10,9.7,4.9,...,0.000,0.000,0.000,0.000000,0.00000,0.000,-0.192,-0.192,0.372000,0.110851
3,1000.078122,1.558298,90,70,281.996897,1.452555,90,10,21.3,10.6,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.198,-0.199,0.367667,0.114605
4,240.013020,1.560381,-90,70,1030.544638,1.538301,-90,16,13.2,7.5,...,0.000,0.000,0.000,0.000000,0.00000,-0.190,-0.005,-0.195,0.430333,0.111168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1000.078122,1.558298,90,70,281.996897,1.452555,90,10,21.3,10.6,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.189,-0.190,0.370667,0.109409
17357,1000.078122,1.558298,90,70,281.996897,1.452555,90,16,13.2,7.6,...,0.000,0.000,0.000,0.000000,0.00000,-0.190,-0.005,-0.195,0.430333,0.111168
17358,240.013020,1.560381,-90,70,1030.544638,1.538301,-90,16,13.2,6.6,...,0.000,0.000,0.000,0.000000,0.00000,0.000,-0.189,-0.189,0.371000,0.109119
17359,240.013020,1.560381,-90,70,1030.544638,1.538301,-90,10,9.7,3.9,...,0.000,0.000,0.000,0.000000,0.00000,-0.001,-0.194,-0.195,0.368333,0.112296


In [23]:
# Scaler
scaler = MinMaxScaler()
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
test_data[numeric_columns] = scaler.fit_transform(test_data[numeric_columns])

test_data

Unnamed: 0,CURE END POSITION r Collect Result_Dam,CURE END POSITION φ Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION r Collect Result_Dam,CURE START POSITION φ Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,...,Thickness_Diff_1_2,Thickness_Diff_2_3,Thickness_Diff_1_3,Thickness_Avg,Thickness_Std,Pressure_Diff_1st_2nd,Pressure_Diff_2nd_3rd,Pressure_Diff_1st_3rd,Pressure_Avg,Pressure_Std
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.632479,0.161765,...,0.0,0.69969,0.38125,0.801205,0.0,0.981967,0.432099,0.690705,0.175453,0.300602
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.393162,0.661765,...,1.0,0.00000,0.00000,0.000000,1.0,0.526230,0.941358,0.509615,0.470346,0.474167
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.008547,0.161765,...,0.0,0.69969,0.38125,0.801205,0.0,0.983607,0.401235,0.676282,0.164745,0.316247
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.000000,1.000000,...,0.0,0.69969,0.38125,0.801205,0.0,0.981967,0.382716,0.665064,0.154036,0.326956
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.307692,0.544118,...,0.0,0.69969,0.38125,0.801205,0.0,0.672131,0.978395,0.671474,0.308896,0.317150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.000000,1.000000,...,0.0,0.69969,0.38125,0.801205,0.0,0.981967,0.410494,0.679487,0.161450,0.312132
17357,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.307692,0.558824,...,0.0,0.69969,0.38125,0.801205,0.0,0.672131,0.978395,0.671474,0.308896,0.317150
17358,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.307692,0.411765,...,0.0,0.69969,0.38125,0.801205,0.0,0.983607,0.410494,0.681090,0.162273,0.311305
17359,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.008547,0.014706,...,0.0,0.69969,0.38125,0.801205,0.0,0.981967,0.395062,0.671474,0.155684,0.320368


In [24]:
# Apply SHAP features to the test set
test_x_shap = test_data[important_features]
test_x_shap

Unnamed: 0,Dispense Volume(Stage2) Collect Result_Fill1,Workorder_AutoClave,HEAD NORMAL COORDINATE φ ArIS(Stage1) Collect Result_Fill2
0,0.034483,0.363086,1.0
1,0.034483,0.770045,1.0
2,0.034483,0.193646,0.0
3,0.068966,0.462935,1.0
4,0.034483,0.627837,0.0
...,...,...,...
17356,0.034483,0.419062,1.0
17357,0.034483,0.664145,1.0
17358,0.077132,0.475038,0.0
17359,0.155172,0.012103,0.0


In [25]:
y_pred = voting_clf.predict(test_x_shap)

In [26]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred

df_sub['target'] = df_sub['target'].map({1: 'AbNormal', 0: 'Normal'})

counts = df_sub['target'].value_counts()
ratio = counts['Normal'] / counts['AbNormal']

print("Ratio", ratio)

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

Ratio 94.39010989010988
