In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.feature_selection import RFE
import shap
import catboost as cb
import xgboost as xgb
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, CondensedNearestNeighbour, OneSidedSelection, InstanceHardnessThreshold, AllKNN, RepeatedEditedNearestNeighbours, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

ROOT_DIR = "data"
RANDOM_STATE = 110

train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))
train_data = train_data[train_data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] != 0]

In [21]:
def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    # print("Non-numeric columns:", non_numeric_columns)

    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)

    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

def featuregen(train_data):
    axis = ['X', 'Y', 'Z']
    process = ['Dam', 'Fill1', 'Fill2']

    for ax in axis:
        for proc in process:
            stage1_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage1) Collect Result_{proc}'
            stage2_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage2) Collect Result_{proc}'
            new_col_1_2 = f'Head_DIFF_{ax}_Stage1&2_{proc}'
        
            train_data[new_col_1_2] = (train_data[stage1_col] - train_data[stage2_col]).abs()
        
            stage3_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage3) Collect Result_{proc}'
            new_col_2_3 = f'Head_DIFF_{ax}_Stage2&3_{proc}'
        
            train_data[new_col_2_3] = (train_data[stage2_col] - train_data[stage3_col]).abs()

            train_data = train_data.drop(columns=[stage1_col, stage2_col, stage3_col])

    return train_data

def generating_features(df):
 
    # Thickness difference
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']
    df['Thickness_Std'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].std(axis=1)
    
    df['Cure_Position_Diff_X'] = (df['CURE START POSITION X Collect Result_Dam'] - df['CURE END POSITION X Collect Result_Dam']).abs()
    df['Head_Z_Position_Dam'] = (df['Head Clean Position Z Collect Result_Dam'] - df['Head Purge Position Z Collect Result_Dam']).abs() 
    df['Temperature_Change_Rate'] = df['Chamber Temp. Collect Result_AutoClave'] / df['Chamber Temp. Unit Time_AutoClave']

    return df

def generate_volume_to_speed_ratio(df):
    df['Volume_to_Speed_Ratio_Stage1'] = df['Dispense Volume(Stage1) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_to_Speed_Ratio_Stage2'] = df['Dispense Volume(Stage2) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_to_Speed_Ratio_Stage3'] = df['Dispense Volume(Stage3) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    
    return df

def generate_pressure_change_rate(df):
    df['Pressure_Change_Rate_1st'] = df['1st Pressure Collect Result_AutoClave'] / df['1st Pressure 1st Pressure Unit Time_AutoClave']
    df['Pressure_Change_Rate_2nd'] = df['2nd Pressure Collect Result_AutoClave'] / df['2nd Pressure Unit Time_AutoClave']
    df['Pressure_Change_Rate_3rd'] = df['3rd Pressure Collect Result_AutoClave'] / df['3rd Pressure Unit Time_AutoClave']
    
    # 무한대 값을 drop
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df['Pressure_Change_Rate_1st'].fillna(df['Pressure_Change_Rate_1st'].mean(), inplace=True)
    df['Pressure_Change_Rate_2nd'].fillna(df['Pressure_Change_Rate_2nd'].mean(), inplace=True)
    df['Pressure_Change_Rate_3rd'].fillna(df['Pressure_Change_Rate_3rd'].mean(), inplace=True)
    
    df['Pressure_Change_Rate_Std'] = df[['Pressure_Change_Rate_1st', 'Pressure_Change_Rate_2nd', 'Pressure_Change_Rate_3rd']].std(axis=1)
    
    return df

def generate_volume_to_time_ratio(df):
    df['Volume_to_Time_Ratio_Stage1'] = df['Dispense Volume(Stage1) Collect Result_Dam'] / df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
    df['Volume_to_Time_Ratio_Stage2'] = df['Dispense Volume(Stage2) Collect Result_Dam'] / df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
    df['Volume_to_Time_Ratio_Stage3'] = df['Dispense Volume(Stage3) Collect Result_Dam'] / df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
    
    return df


train_data = cat2num(train_data)
train_data = featuregen(train_data)
train_data = generating_features(train_data)
train_data = generate_volume_to_speed_ratio(train_data)
train_data = generate_pressure_change_rate(train_data)
train_data = generate_volume_to_time_ratio(train_data)



train_data = train_data[['Receip No Collect Result_Dam', 
                               'Receip No Collect Result_Fill1',  
#                          'Receip No Collect Result_Fill2',
                         'Head Clean Position Z Collect Result_Dam',
                         'Head_DIFF_X_Stage1&2_Dam',     
                         'Head_DIFF_X_Stage2&3_Dam',
                         'Head_DIFF_X_Stage1&2_Fill1',
                         'Head_DIFF_X_Stage2&3_Fill1',
#                       'Head_DIFF_X_Stage1&2_Fill2',
                         'Head_DIFF_X_Stage2&3_Fill2',
                         'Head_DIFF_Y_Stage1&2_Dam',
                         'Head_DIFF_Y_Stage2&3_Dam',
                         'Head_DIFF_Y_Stage1&2_Fill1',
                         'Head_DIFF_Y_Stage2&3_Fill1',
#                       'Head_DIFF_Y_Stage1&2_Fill2',
                         'Head_DIFF_Y_Stage2&3_Fill2',
                         'Head_DIFF_Z_Stage1&2_Fill1',
#                       'Thickness_Diff_1_2', 'Thickness_Diff_2_3', 
                         'Thickness_Std',
                         'Volume_to_Speed_Ratio_Stage1',
                         'Volume_to_Speed_Ratio_Stage2',
                         'Volume_to_Speed_Ratio_Stage3',
#                              'Chamber Temp. Collect Result_AutoClave',
#                          'Pressure_Change_Rate_1st',
#                          'Pressure_Change_Rate_2nd',
#                          'Pressure_Change_Rate_3rd',
                         'Pressure_Change_Rate_Std',
                         'Temperature_Change_Rate',
                         'Stage2 Line2 Distance Speed Collect Result_Dam',
                         'Cure_Position_Diff_X',
                         'Volume_to_Time_Ratio_Stage1', 'Volume_to_Time_Ratio_Stage2', 'Volume_to_Time_Ratio_Stage3',
                         'CURE SPEED Collect Result_Dam',
                         'Head_Z_Position_Dam',
                         'target']]

In [22]:
train_data.columns

Index(['Receip No Collect Result_Dam', 'Receip No Collect Result_Fill1',
       'Head Clean Position Z Collect Result_Dam', 'Head_DIFF_X_Stage1&2_Dam',
       'Head_DIFF_X_Stage2&3_Dam', 'Head_DIFF_X_Stage1&2_Fill1',
       'Head_DIFF_X_Stage2&3_Fill1', 'Head_DIFF_X_Stage2&3_Fill2',
       'Head_DIFF_Y_Stage1&2_Dam', 'Head_DIFF_Y_Stage2&3_Dam',
       'Head_DIFF_Y_Stage1&2_Fill1', 'Head_DIFF_Y_Stage2&3_Fill1',
       'Head_DIFF_Y_Stage2&3_Fill2', 'Head_DIFF_Z_Stage1&2_Fill1',
       'Thickness_Std', 'Volume_to_Speed_Ratio_Stage1',
       'Volume_to_Speed_Ratio_Stage2', 'Volume_to_Speed_Ratio_Stage3',
       'Pressure_Change_Rate_Std', 'Temperature_Change_Rate',
       'Stage2 Line2 Distance Speed Collect Result_Dam',
       'Cure_Position_Diff_X', 'Volume_to_Time_Ratio_Stage1',
       'Volume_to_Time_Ratio_Stage2', 'Volume_to_Time_Ratio_Stage3',
       'CURE SPEED Collect Result_Dam', 'Head_Z_Position_Dam', 'target'],
      dtype='object')

In [23]:
X = train_data.drop(columns=['target'])
y = train_data['target']

# 오버샘플링
# oversampler = BorderlineSMOTE(sampling_strategy={0 : 10000}, random_state=RANDOM_STATE)
# borderline_smote = BorderlineSMOTE(sampling_strategy="auto", random_state=RANDOM_STATE)
oversampler = SMOTEENN(sampling_strategy= {0 : 10000}, random_state=RANDOM_STATE)
X, y = oversampler.fit_resample(X, y)

# 언더샘플링
# undersampler = RandomUnderSampler(sampling_strategy={1: 10000}, random_state=RANDOM_STATE)
# undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=RANDOM_STATE)
# undersampler = NearMiss(sampling_strategy="auto")

# undersampler = CondensedNearestNeighbour(sampling_strategy='auto')

#------------------------------------------------------------------------------------------------
# undersampler = TomekLinks(sampling_strategy='auto')
# undersampler = OneSidedSelection(sampling_strategy='auto', random_state=RANDOM_STATE)
# undersampler = AllKNN(sampling_strategy='auto')
undersampler = RepeatedEditedNearestNeighbours(sampling_strategy='auto')
# undersampler = EditedNearestNeighbours(sampling_strategy='auto')
#------------------------------------------------------------------------------------------------
X, y = undersampler.fit_resample(X, y)

# undersampler = NearMiss(sampling_strategy="auto")
# clf = RandomForestClassifier(random_state=RANDOM_STATE)
# undersampler = InstanceHardnessThreshold(estimator=clf, sampling_strategy='auto', random_state=RANDOM_STATE)
undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=RANDOM_STATE)
X, y = undersampler.fit_resample(X, y)

# 데이터프레임으로 다시 결합
train_data = pd.concat([pd.DataFrame(X, columns=X.columns), pd.Series(y, name='target')], axis=1)

df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]
num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# 스케일링 (정규화 또는 표준화)
scaler = MinMaxScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# 데이터를 다시 피처와 타겟으로 분리
train_x = train_data.drop(columns=['target'])
train_y = train_data['target']

# 학습 및 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.22,
    random_state=RANDOM_STATE,
)

Total: Normal: 3123, AbNormal: 3123


In [24]:
train_x

Unnamed: 0,Receip No Collect Result_Dam,Receip No Collect Result_Fill1,Head Clean Position Z Collect Result_Dam,Head_DIFF_X_Stage1&2_Dam,Head_DIFF_X_Stage2&3_Dam,Head_DIFF_X_Stage1&2_Fill1,Head_DIFF_X_Stage2&3_Fill1,Head_DIFF_X_Stage2&3_Fill2,Head_DIFF_Y_Stage1&2_Dam,Head_DIFF_Y_Stage2&3_Dam,...,Volume_to_Speed_Ratio_Stage3,Pressure_Change_Rate_Std,Temperature_Change_Rate,Stage2 Line2 Distance Speed Collect Result_Dam,Cure_Position_Diff_X,Volume_to_Time_Ratio_Stage1,Volume_to_Time_Ratio_Stage2,Volume_to_Time_Ratio_Stage3,CURE SPEED Collect Result_Dam,Head_Z_Position_Dam
0,0.0,0.0,1.000000,0.002296,0.997697,0.969466,0.004460,1.000000,0.004444,0.004452,...,0.107633,0.001985,0.019184,0.6250,1.0,0.017613,0.023319,0.000000,0.000000,0.000000
1,0.0,0.0,0.819113,0.005510,0.997236,0.996183,0.001574,1.000000,0.004444,0.004452,...,0.060619,0.789743,0.075292,0.3125,1.0,0.009632,0.022006,0.015295,0.000000,0.000000
2,0.0,0.0,0.351536,0.003214,0.999539,0.992366,0.002361,1.000000,0.008000,0.008014,...,0.019485,0.791345,0.062504,0.1625,1.0,0.017784,0.023898,0.007725,0.857143,0.570833
3,0.0,0.0,1.000000,0.002296,0.997697,0.969466,0.004460,1.000000,0.004444,0.004452,...,0.107633,0.002039,0.029201,0.6250,1.0,0.017613,0.007935,0.017613,0.000000,0.000000
4,0.0,0.0,0.819113,0.998623,0.001842,0.997455,0.000525,0.011009,0.996444,0.998219,...,0.184023,0.791356,0.055424,0.3125,0.0,0.015295,0.030778,0.015295,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21315,0.0,0.0,1.000000,0.002296,0.997697,0.975827,0.003148,1.000000,0.004444,0.004452,...,0.164538,0.000034,0.029201,1.0000,1.0,0.017613,0.019532,0.017613,0.000000,0.000000
24795,0.0,0.0,0.351536,0.005510,0.997236,0.996183,0.001574,1.000000,0.004444,0.008014,...,0.060619,0.791390,0.075292,0.1875,1.0,0.015295,0.017241,0.015295,0.000000,0.570833
28579,0.0,0.0,0.351536,0.005510,0.997236,0.996183,0.001574,1.000000,0.004444,0.008014,...,0.060619,0.792999,0.071981,0.1875,1.0,0.012069,0.017241,0.012069,1.000000,0.570833
27334,0.0,0.0,0.819113,0.997245,0.001382,0.982188,0.002623,0.011009,0.007111,0.007124,...,0.184023,0.001904,0.020911,0.6250,0.0,0.022712,0.023319,0.022712,0.000000,0.000000


In [25]:
# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=4,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.05,
    colsample_bylevel = 0.9,
    verbose=0  # No output during training
)

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
print(shap_importance)

F1 Score: 0.9080
Recall: 0.9447
Accuracy: 0.9018
Precision: 0.8740
Temperature_Change_Rate                           0.368399
Pressure_Change_Rate_Std                          0.321210
Volume_to_Speed_Ratio_Stage2                      0.249483
Volume_to_Time_Ratio_Stage2                       0.201177
Volume_to_Time_Ratio_Stage1                       0.199838
Volume_to_Time_Ratio_Stage3                       0.180577
Volume_to_Speed_Ratio_Stage3                      0.174497
Volume_to_Speed_Ratio_Stage1                      0.157677
Head_DIFF_Y_Stage2&3_Fill1                        0.135328
Head_DIFF_X_Stage2&3_Fill2                        0.128298
Head_DIFF_X_Stage1&2_Dam                          0.116032
Head_DIFF_X_Stage1&2_Fill1                        0.109144
Head_DIFF_X_Stage2&3_Dam                          0.093271
Head Clean Position Z Collect Result_Dam          0.087041
Head_DIFF_Y_Stage2&3_Fill2                        0.085333
Receip No Collect Result_Fill1                  

In [26]:
# Process test data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
test_data = test_data[test_data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] != 0]
test_data = cat2num(test_data)
test_data = featuregen(test_data)
test_data = generating_features(test_data) 
test_data = generate_volume_to_speed_ratio(test_data)
test_data = generate_pressure_change_rate(test_data)
test_data = generate_volume_to_time_ratio(test_data)




# Scale the test data
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Select the same features as the training data
test_x_rfe = test_data[X_train.columns]

# Predict on test data
y_pred = model.predict(test_x_rfe)

# Prepare submission
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# Calculate the ratio of abnormal cases
counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is:", ratio)

# Save the submission file
df_sub.to_csv("submission.csv", index=False)

The ratio of abnormal is: 0.09037497839986176
