In [75]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.feature_selection import RFE
import shap
import catboost as cb
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, CondensedNearestNeighbour, OneSidedSelection, InstanceHardnessThreshold, AllKNN, RepeatedEditedNearestNeighbours, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

ROOT_DIR = "data"
RANDOM_STATE = 110

train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))
train_data = train_data[train_data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] != 0]

In [76]:
def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    # print("Non-numeric columns:", non_numeric_columns)

    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)

    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

def featuregen(train_data):
    axis = ['X', 'Y', 'Z']
    process = ['Dam', 'Fill1', 'Fill2']

    for ax in axis:
        for proc in process:
            stage1_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage1) Collect Result_{proc}'
            stage2_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage2) Collect Result_{proc}'
            new_col_1_2 = f'Head_DIFF_{ax}_Stage1&2_{proc}'
        
            train_data[new_col_1_2] = (train_data[stage1_col] - train_data[stage2_col]).abs()
        
            stage3_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage3) Collect Result_{proc}'
            new_col_2_3 = f'Head_DIFF_{ax}_Stage2&3_{proc}'
        
            train_data[new_col_2_3] = (train_data[stage2_col] - train_data[stage3_col]).abs()

            train_data = train_data.drop(columns=[stage1_col, stage2_col, stage3_col])
            print(new_col_1_2)
            print(new_col_2_3)
    return train_data

def preprocess(df) :
  # Exclude columns where every value is unique for each row

  #Identify columns where all values are the same
    same_rows_columns = [column for column in df.columns if df[column].nunique() == 1]

  #Identify columns where every row's value is unique
    row_count = len(df)
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

  #Drop columns identified in the above steps
    df.drop(columns=same_rows_columns, inplace=True)
    df.drop(columns=matching_row_columns, inplace=True)

    return df

def generating_features(df):
    # 1. Thickness features from three differen stages at DAM
    # Thickness difference
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']

    # Thickness standard deviation
    df['Thickness_Std'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].std(axis=1)

    return df

def generate_volume_to_speed_ratio(df):
    df['Volume_to_Speed_Ratio_Stage1'] = df['Dispense Volume(Stage1) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_to_Speed_Ratio_Stage2'] = df['Dispense Volume(Stage2) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_to_Speed_Ratio_Stage3'] = df['Dispense Volume(Stage3) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    return df

def generate_pressure_change_rate(df):
    df['Pressure_Change_Rate_1st'] = df['1st Pressure Collect Result_AutoClave'] / df['1st Pressure 1st Pressure Unit Time_AutoClave']
    df['Pressure_Change_Rate_2nd'] = df['2nd Pressure Collect Result_AutoClave'] / df['2nd Pressure Unit Time_AutoClave']
    df['Pressure_Change_Rate_3rd'] = df['3rd Pressure Collect Result_AutoClave'] / df['3rd Pressure Unit Time_AutoClave']
    
    # 무한대 값을 9999로 대체
    df.replace([float('inf'), -float('inf')], 9999, inplace=True)
    
    return df

def generate_temperature_change_rate(df):
    df['Temperature_Change_Rate'] = df['Chamber Temp. Collect Result_AutoClave'] / df['Chamber Temp. Unit Time_AutoClave']
    return df

def generate_TtoP_change_rate(df):
    df['TtoP_1st'] = df['Temperature_Change_Rate'] / df['Pressure_Change_Rate_1st']
    df['TtoP_2nd'] = df['Temperature_Change_Rate'] / df['Pressure_Change_Rate_2nd']
    df['TtoP_3rd'] = df['Temperature_Change_Rate'] / df['Pressure_Change_Rate_3rd']
    return df

train_data = cat2num(train_data)
train_data = featuregen(train_data)
train_data = generating_features(train_data)
train_data = generate_volume_to_speed_ratio(train_data)
train_data = generate_temperature_change_rate(train_data)
train_data = generate_pressure_change_rate(train_data)
train_data = generate_TtoP_change_rate(train_data)


train_data = train_data[['Receip No Collect Result_Dam', 
#                                '1st Pressure 1st Pressure Unit Time_AutoClave',
#                          '2nd Pressure Collect Result_AutoClave',
#                                '3rd Pressure Collect Result_AutoClave', 
                               'Chamber Temp. Unit Time_AutoClave', 
                               'Receip No Collect Result_Fill1',  
                               'Receip No Collect Result_Fill2',
                               'Head Clean Position Z Collect Result_Dam',
                               'Head_DIFF_X_Stage1&2_Dam',     
                               'Head_DIFF_X_Stage2&3_Dam',
                         'Head_DIFF_X_Stage1&2_Fill1',
                         'Head_DIFF_X_Stage2&3_Fill1','Head_DIFF_X_Stage1&2_Fill2','Head_DIFF_X_Stage2&3_Fill2','Head_DIFF_Y_Stage1&2_Dam','Head_DIFF_Y_Stage2&3_Dam','Head_DIFF_Y_Stage1&2_Fill1','Head_DIFF_Y_Stage2&3_Fill1','Head_DIFF_Y_Stage1&2_Fill2','Head_DIFF_Y_Stage2&3_Fill2','Head_DIFF_Z_Stage1&2_Dam','Head_DIFF_Z_Stage2&3_Dam','Head_DIFF_Z_Stage1&2_Fill1','Head_DIFF_Z_Stage2&3_Fill1','Head_DIFF_Z_Stage1&2_Fill2','Head_DIFF_Z_Stage2&3_Fill2',
                               'Thickness_Diff_1_2', 'Thickness_Diff_2_3', 
                         'Thickness_Std',
                              'Volume_to_Speed_Ratio_Stage1',
                              'Volume_to_Speed_Ratio_Stage2',
                              'Volume_to_Speed_Ratio_Stage3',
#                              'Chamber Temp. Collect Result_AutoClave',
                    'Pressure_Change_Rate_1st','Pressure_Change_Rate_2nd','Pressure_Change_Rate_3rd',
                    # 'Temperature_Change_Rate',
                                                      'CURE END POSITION X Collect Result_Dam', 
                                                      
                                                    #   'TtoP_1st', 'TtoP_2nd', 'TtoP_3rd',
                               'target']]

Head_DIFF_X_Stage1&2_Dam
Head_DIFF_X_Stage2&3_Dam
Head_DIFF_X_Stage1&2_Fill1
Head_DIFF_X_Stage2&3_Fill1
Head_DIFF_X_Stage1&2_Fill2
Head_DIFF_X_Stage2&3_Fill2
Head_DIFF_Y_Stage1&2_Dam
Head_DIFF_Y_Stage2&3_Dam
Head_DIFF_Y_Stage1&2_Fill1
Head_DIFF_Y_Stage2&3_Fill1
Head_DIFF_Y_Stage1&2_Fill2
Head_DIFF_Y_Stage2&3_Fill2
Head_DIFF_Z_Stage1&2_Dam
Head_DIFF_Z_Stage2&3_Dam
Head_DIFF_Z_Stage1&2_Fill1
Head_DIFF_Z_Stage2&3_Fill1
Head_DIFF_Z_Stage1&2_Fill2
Head_DIFF_Z_Stage2&3_Fill2


In [77]:
train_data.columns

Index(['Receip No Collect Result_Dam', 'Chamber Temp. Unit Time_AutoClave',
       'Receip No Collect Result_Fill1', 'Receip No Collect Result_Fill2',
       'Head Clean Position Z Collect Result_Dam', 'Head_DIFF_X_Stage1&2_Dam',
       'Head_DIFF_X_Stage2&3_Dam', 'Head_DIFF_X_Stage1&2_Fill1',
       'Head_DIFF_X_Stage2&3_Fill1', 'Head_DIFF_X_Stage1&2_Fill2',
       'Head_DIFF_X_Stage2&3_Fill2', 'Head_DIFF_Y_Stage1&2_Dam',
       'Head_DIFF_Y_Stage2&3_Dam', 'Head_DIFF_Y_Stage1&2_Fill1',
       'Head_DIFF_Y_Stage2&3_Fill1', 'Head_DIFF_Y_Stage1&2_Fill2',
       'Head_DIFF_Y_Stage2&3_Fill2', 'Head_DIFF_Z_Stage1&2_Dam',
       'Head_DIFF_Z_Stage2&3_Dam', 'Head_DIFF_Z_Stage1&2_Fill1',
       'Head_DIFF_Z_Stage2&3_Fill1', 'Head_DIFF_Z_Stage1&2_Fill2',
       'Head_DIFF_Z_Stage2&3_Fill2', 'Thickness_Diff_1_2',
       'Thickness_Diff_2_3', 'Thickness_Std', 'Volume_to_Speed_Ratio_Stage1',
       'Volume_to_Speed_Ratio_Stage2', 'Volume_to_Speed_Ratio_Stage3',
       'Pressure_Change_Rate_1st', '

In [78]:
# NaN 값이 있는 컬럼 확인
nan_columns = train_data.columns[train_data.isna().any()].tolist()

print("NaN이 있는 컬럼:", nan_columns)

NaN이 있는 컬럼: []


In [79]:
X = train_data.drop(columns=['target'])
y = train_data['target']

# 오버샘플링
# oversampler = BorderlineSMOTE(sampling_strategy={0 : 10000}, random_state=RANDOM_STATE)
# borderline_smote = BorderlineSMOTE(sampling_strategy="auto", random_state=RANDOM_STATE)
oversampler = SMOTEENN(sampling_strategy= {0 : 25000}, random_state=RANDOM_STATE)
X, y = oversampler.fit_resample(X, y)

# 언더샘플링
# undersampler = RandomUnderSampler(sampling_strategy={1: 10000}, random_state=RANDOM_STATE)
# undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=RANDOM_STATE)
# undersampler = NearMiss(sampling_strategy="auto")

# undersampler = CondensedNearestNeighbour(sampling_strategy='auto')

#------------------------------------------------------------------------------------------------
# undersampler = TomekLinks(sampling_strategy='auto')
# undersampler = OneSidedSelection(sampling_strategy='auto', random_state=RANDOM_STATE)
# undersampler = AllKNN(sampling_strategy='auto')
# undersampler = RepeatedEditedNearestNeighbours(sampling_strategy='auto')
# undersampler = EditedNearestNeighbours(sampling_strategy='auto')
#------------------------------------------------------------------------------------------------
# X, y = undersampler.fit_resample(X, y)

undersampler = NearMiss(sampling_strategy="auto")
# clf = RandomForestClassifier(random_state=RANDOM_STATE)
# undersampler = InstanceHardnessThreshold(estimator=clf, sampling_strategy='auto', random_state=RANDOM_STATE)
# undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=RANDOM_STATE)
X, y = undersampler.fit_resample(X, y)

# 데이터프레임으로 다시 결합
train_data = pd.concat([pd.DataFrame(X, columns=X.columns), pd.Series(y, name='target')], axis=1)

df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]
num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# 스케일링 (정규화 또는 표준화)
scaler = MinMaxScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# 데이터를 다시 피처와 타겟으로 분리
train_x = train_data.drop(columns=['target'])
train_y = train_data['target']

# 학습 및 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.22,
    random_state=RANDOM_STATE,
)

Total: Normal: 10780, AbNormal: 10780


In [80]:
train_x

Unnamed: 0,Receip No Collect Result_Dam,Chamber Temp. Unit Time_AutoClave,Receip No Collect Result_Fill1,Receip No Collect Result_Fill2,Head Clean Position Z Collect Result_Dam,Head_DIFF_X_Stage1&2_Dam,Head_DIFF_X_Stage2&3_Dam,Head_DIFF_X_Stage1&2_Fill1,Head_DIFF_X_Stage2&3_Fill1,Head_DIFF_X_Stage1&2_Fill2,...,Thickness_Diff_1_2,Thickness_Diff_2_3,Thickness_Std,Volume_to_Speed_Ratio_Stage1,Volume_to_Speed_Ratio_Stage2,Volume_to_Speed_Ratio_Stage3,Pressure_Change_Rate_1st,Pressure_Change_Rate_2nd,Pressure_Change_Rate_3rd,CURE END POSITION X Collect Result_Dam
0,0.0,0.630480,0.0,0.0,0.721053,0.998623,0.001842,0.997455,0.000525,0.0,...,0.000000,0.699690,0.000000,0.663356,0.034544,0.055671,2.440217e-08,4.880489e-05,0.005203,1.0
1,0.0,0.630480,0.0,0.0,0.721053,0.998623,0.001842,0.997455,0.000525,0.0,...,0.000000,0.699690,0.000000,0.663356,0.034544,0.055671,2.523212e-08,4.880489e-05,0.005229,1.0
2,0.0,0.874739,0.0,0.0,1.000000,0.002296,0.997697,0.969466,0.004460,1.0,...,0.000000,0.699690,0.000000,0.673414,0.135488,0.083884,5.333867e-09,6.317300e-07,0.005313,0.0
3,0.0,0.626305,0.0,0.0,0.000000,1.000000,0.001842,0.988550,0.004722,0.0,...,0.436364,0.482972,0.350701,0.796028,0.060192,0.427831,2.575258e-08,4.900491e-05,0.005286,1.0
4,0.0,0.881002,0.0,0.0,1.000000,0.004591,1.000000,0.994911,0.001836,1.0,...,0.000000,0.699690,0.000000,0.700597,0.034544,0.160137,3.187179e-08,6.281621e-08,0.005283,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21555,0.0,0.630480,0.0,0.0,0.000000,0.005510,0.997236,0.996183,0.001574,1.0,...,0.000000,0.699690,0.000000,0.700597,0.034544,0.179724,2.357221e-08,4.890490e-05,0.005256,0.0
21556,0.0,0.630480,0.0,0.0,0.000000,0.005510,0.997236,0.996183,0.001574,1.0,...,0.000000,0.699690,0.000000,0.700597,0.034544,0.179724,2.357221e-08,4.890490e-05,0.005256,0.0
21557,0.0,0.881002,0.0,0.0,0.721053,0.999082,0.002764,0.982188,0.002623,0.0,...,0.000000,0.699690,0.000000,0.707580,0.034544,0.179724,2.647706e-08,5.207133e-08,0.005176,1.0
21558,0.0,0.881002,0.0,0.0,0.721053,0.999082,0.002764,0.982188,0.002623,0.0,...,0.000000,0.699690,0.000000,0.707580,0.034544,0.179724,2.647706e-08,5.207133e-08,0.005176,1.0


In [81]:
# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.05,
    verbose=0  # No output during training
)

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
print(shap_importance)

F1 Score: 0.9540
Recall: 0.9780
Accuracy: 0.9522
Precision: 0.9311
Pressure_Change_Rate_3rd                    1.034864
Volume_to_Speed_Ratio_Stage2                0.629373
Pressure_Change_Rate_2nd                    0.476580
Pressure_Change_Rate_1st                    0.439601
Volume_to_Speed_Ratio_Stage3                0.436432
Volume_to_Speed_Ratio_Stage1                0.270306
CURE END POSITION X Collect Result_Dam      0.166425
Head_DIFF_Y_Stage1&2_Fill2                  0.133870
Head Clean Position Z Collect Result_Dam    0.130420
Head_DIFF_X_Stage2&3_Fill1                  0.117682
Head_DIFF_Y_Stage1&2_Fill1                  0.113212
Receip No Collect Result_Fill2              0.112566
Head_DIFF_X_Stage1&2_Fill1                  0.091364
Chamber Temp. Unit Time_AutoClave           0.091013
Head_DIFF_X_Stage2&3_Dam                    0.065595
Receip No Collect Result_Fill1              0.047030
Head_DIFF_Y_Stage2&3_Dam                    0.043413
Head_DIFF_X_Stage2&3_Fill2      

In [337]:
# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # No output during training
)

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
print(shap_importance)

F1 Score: 0.7881
Recall: 0.8239
Accuracy: 0.7835
Precision: 0.7553
Pressure_Change_Rate_3rd                    0.470191
Volume_to_Speed_Ratio_Stage2                0.183617
Head Clean Position Z Collect Result_Dam    0.128834
Pressure_Change_Rate_1st                    0.072869
Head_DIFF_Y_Stage2&3_Fill1                  0.060403
Pressure_Change_Rate_2nd                    0.052694
Volume_to_Speed_Ratio_Stage3                0.048530
Head_DIFF_X_Stage1&2_Dam                    0.037061
Head_DIFF_X_Stage1&2_Fill1                  0.034935
Head_DIFF_X_Stage2&3_Dam                    0.028983
CURE END POSITION X Collect Result_Dam      0.021727
Volume_to_Speed_Ratio_Stage1                0.018765
Head_DIFF_X_Stage2&3_Fill1                  0.015000
Receip No Collect Result_Fill2              0.013540
Chamber Temp. Unit Time_AutoClave           0.012414
Head_DIFF_Y_Stage2&3_Dam                    0.011763
Head_DIFF_Y_Stage1&2_Fill1                  0.010365
Receip No Collect Result_Fill1  

In [326]:
# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # No output during training
)

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
print(shap_importance)

F1 Score: 0.7654
Recall: 0.8023
Accuracy: 0.7523
Precision: 0.7317
Pressure_Change_Rate_3rd                    0.350161
Volume_to_Speed_Ratio_Stage2                0.150529
Head Clean Position Z Collect Result_Dam    0.118204
Head_DIFF_Y_Stage2&3_Fill1                  0.079854
Pressure_Change_Rate_1st                    0.047857
Pressure_Change_Rate_2nd                    0.045401
Volume_to_Speed_Ratio_Stage3                0.044123
Head_DIFF_X_Stage1&2_Fill1                  0.042590
Head_DIFF_X_Stage1&2_Dam                    0.035746
Head_DIFF_X_Stage2&3_Dam                    0.022918
CURE END POSITION X Collect Result_Dam      0.019577
Head_DIFF_Y_Stage2&3_Fill2                  0.019473
Head_DIFF_X_Stage2&3_Fill1                  0.019090
Head_DIFF_Y_Stage2&3_Dam                    0.015419
Volume_to_Speed_Ratio_Stage1                0.015324
Head_DIFF_Y_Stage1&2_Dam                    0.015012
Head_DIFF_X_Stage1&2_Fill2                  0.014070
Head_DIFF_Y_Stage1&2_Fill1      

In [328]:
# Process test data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
test_data = test_data[test_data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] != 0]
test_data = cat2num(test_data)
test_data = featuregen(test_data)
test_data = generating_features(test_data) 
test_data = generate_volume_to_speed_ratio(test_data)
test_data = generate_pressure_change_rate(test_data)
test_data = generate_temperature_change_rate(test_data)
test_data = generate_temperature_to_pressure_ratio(test_data)


# Scale the test data
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Select the same features as the training data
test_x_rfe = test_data[X_train.columns]

# Predict on test data
y_pred = model.predict(test_x_rfe)

# Prepare submission
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# Calculate the ratio of abnormal cases
counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is:", ratio)

# Save the submission file
df_sub.to_csv("submission.csv", index=False)

Head_DIFF_X_Stage1&2_Dam
Head_DIFF_X_Stage2&3_Dam
Head_DIFF_X_Stage1&2_Fill1
Head_DIFF_X_Stage2&3_Fill1
Head_DIFF_X_Stage1&2_Fill2
Head_DIFF_X_Stage2&3_Fill2
Head_DIFF_Y_Stage1&2_Dam
Head_DIFF_Y_Stage2&3_Dam
Head_DIFF_Y_Stage1&2_Fill1
Head_DIFF_Y_Stage2&3_Fill1
Head_DIFF_Y_Stage1&2_Fill2
Head_DIFF_Y_Stage2&3_Fill2
Head_DIFF_Z_Stage1&2_Dam
Head_DIFF_Z_Stage2&3_Dam
Head_DIFF_Z_Stage1&2_Fill1
Head_DIFF_Z_Stage2&3_Fill1
Head_DIFF_Z_Stage1&2_Fill2
Head_DIFF_Z_Stage2&3_Fill2
The ratio of abnormal is: 0.20868613559126778
