In [23]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.feature_selection import RFE
import catboost as cb
from tqdm import tqdm

ROOT_DIR = "data"
RANDOM_STATE = 110

train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))

def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    # print("Non-numeric columns:", non_numeric_columns)

    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)

    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

def featuregen(train_data):
    axis = ['X', 'Y', 'Z']
    process = ['Dam', 'Fill1', 'Fill2']

    for ax in axis:
        for proc in process:
            stage1_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage1) Collect Result_{proc}'
            stage2_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage2) Collect Result_{proc}'
            new_col_1_2 = f'Head_DIFF_{ax}_Stage1&2_{proc}'
        
            train_data[new_col_1_2] = (train_data[stage1_col] - train_data[stage2_col]).abs()
        
            stage3_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage3) Collect Result_{proc}'
            new_col_2_3 = f'Head_DIFF_{ax}_Stage2&3_{proc}'

            train_data[new_col_2_3] = (train_data[stage2_col] - train_data[stage3_col]).abs()

            print(new_col_1_2)
            print(new_col_2_3)
    return train_data

def preprocess(df) :
  # Exclude columns where every value is unique for each row

  #Identify columns where all values are the same
    same_rows_columns = [column for column in df.columns if df[column].nunique() == 1]

  #Identify columns where every row's value is unique
    row_count = len(df)
    matching_row_columns = [column for column in df.columns if df[column].value_counts().size == row_count]

  #Drop columns identified in the above steps
    df.drop(columns=same_rows_columns, inplace=True)
    df.drop(columns=matching_row_columns, inplace=True)

    return df

def generating_features(df):
    # 1. Thickness features from three differen stages at DAM
    # Thickness difference
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']

    # Thickness standard deviation
    df['Thickness_Std'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].std(axis=1)

    return df
  
def condition(df):
    # Creating Condition1
    df['Condition1'] = df['Head Clean Position Z Collect Result_Dam'] <= 120
    df['Condition1'] = df['Condition1'].astype(int)
    
    # Creating Condition2
    df['Condition2'] = (df['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'] >= 284) & (df['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] >= 1000)
    df['Condition2'] = df['Condition2'].astype(int)

    try:
        df = df[['Receip No Collect Result_Dam', 
                 '1st Pressure 1st Pressure Unit Time_AutoClave', 
                 '3rd Pressure Collect Result_AutoClave', 
                 'Chamber Temp. Unit Time_AutoClave', 
                 'Receip No Collect Result_Fill1', 
                 'Receip No Collect Result_Fill2',
                 'Chamber Temp. Judge Value_AutoClave',
                 
                #  'Head_DIFF_X_Stage2&3_Dam','Head_DIFF_X_Stage1&2_Fill1','Head_DIFF_X_Stage2&3_Fill1','Head_DIFF_X_Stage1&2_Fill2','Head_DIFF_X_Stage2&3_Fill2','Head_DIFF_Y_Stage1&2_Dam','Head_DIFF_Y_Stage2&3_Dam','Head_DIFF_Y_Stage1&2_Fill1','Head_DIFF_Y_Stage2&3_Fill1','Head_DIFF_Y_Stage1&2_Fill2','Head_DIFF_Y_Stage2&3_Fill2','Head_DIFF_Z_Stage1&2_Dam','Head_DIFF_Z_Stage2&3_Dam','Head_DIFF_Z_Stage1&2_Fill1','Head_DIFF_Z_Stage2&3_Fill1','Head_DIFF_Z_Stage1&2_Fill2','Head_DIFF_Z_Stage2&3_Fill2',
                #  'Thickness_Diff_1_2', 'Thickness_Diff_2_3', 'Thickness_Std',
                 
                #  'Condition1', 
                #  'Condition2',
                 
                 'target']]
    except KeyError:
        df = df[['Receip No Collect Result_Dam', 
                 '1st Pressure 1st Pressure Unit Time_AutoClave', 
                 '3rd Pressure Collect Result_AutoClave', 
                 'Chamber Temp. Unit Time_AutoClave', 
                 'Receip No Collect Result_Fill1', 
                 'Receip No Collect Result_Fill2',
                 'Chamber Temp. Judge Value_AutoClave',
                 
                #  'Head_DIFF_X_Stage2&3_Dam','Head_DIFF_X_Stage1&2_Fill1','Head_DIFF_X_Stage2&3_Fill1','Head_DIFF_X_Stage1&2_Fill2','Head_DIFF_X_Stage2&3_Fill2','Head_DIFF_Y_Stage1&2_Dam','Head_DIFF_Y_Stage2&3_Dam','Head_DIFF_Y_Stage1&2_Fill1','Head_DIFF_Y_Stage2&3_Fill1','Head_DIFF_Y_Stage1&2_Fill2','Head_DIFF_Y_Stage2&3_Fill2','Head_DIFF_Z_Stage1&2_Dam','Head_DIFF_Z_Stage2&3_Dam','Head_DIFF_Z_Stage1&2_Fill1','Head_DIFF_Z_Stage2&3_Fill1','Head_DIFF_Z_Stage1&2_Fill2','Head_DIFF_Z_Stage2&3_Fill2',
                #  'Thickness_Diff_1_2', 'Thickness_Diff_2_3', 'Thickness_Std',
                 
                #  'Condition1', 
                #  'Condition2'
                 ]]
    
    return df

train_data = cat2num(train_data)
train_data = featuregen(train_data)
train_data = generating_features(train_data)
train_data = condition(train_data)

Head_DIFF_X_Stage1&2_Dam
Head_DIFF_X_Stage2&3_Dam
Head_DIFF_X_Stage1&2_Fill1
Head_DIFF_X_Stage2&3_Fill1
Head_DIFF_X_Stage1&2_Fill2
Head_DIFF_X_Stage2&3_Fill2
Head_DIFF_Y_Stage1&2_Dam
Head_DIFF_Y_Stage2&3_Dam
Head_DIFF_Y_Stage1&2_Fill1
Head_DIFF_Y_Stage2&3_Fill1
Head_DIFF_Y_Stage1&2_Fill2
Head_DIFF_Y_Stage2&3_Fill2
Head_DIFF_Z_Stage1&2_Dam
Head_DIFF_Z_Stage2&3_Dam
Head_DIFF_Z_Stage1&2_Fill1
Head_DIFF_Z_Stage2&3_Fill1
Head_DIFF_Z_Stage1&2_Fill2
Head_DIFF_Z_Stage2&3_Fill2


In [24]:
train_data

Unnamed: 0,Receip No Collect Result_Dam,1st Pressure 1st Pressure Unit Time_AutoClave,3rd Pressure Collect Result_AutoClave,Chamber Temp. Unit Time_AutoClave,Receip No Collect Result_Fill1,Receip No Collect Result_Fill2,Chamber Temp. Judge Value_AutoClave,target
0,1,240,0.499,361,1,1,1,1
1,1,241,0.498,483,1,1,1,1
2,1,241,0.498,363,1,1,1,1
3,1,241,0.500,483,1,1,1,1
4,1,240,0.498,480,1,1,0,1
...,...,...,...,...,...,...,...,...
40501,1,241,0.500,483,1,1,1,1
40502,1,240,0.498,361,1,1,0,1
40503,1,241,0.498,363,1,1,0,1
40504,1,241,0.499,483,1,1,1,1


In [25]:
# Scale the data
scaler = MinMaxScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# Undersample the data
normal_ratio = 10.0  # 1:1 ratio
df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

# Split the data into features and target
train_x = df_concat.drop(columns=['target'])
train_y = df_concat['target']

X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=6,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.01,
    verbose=0  # No output during training
)

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

# Process test data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
z120 = test_data['Head Clean Position Z Collect Result_Dam']

test_data = cat2num(test_data)
test_data = featuregen(test_data)
test_data = generating_features(test_data)
# test_data = condition(test_data)

# Scale the test data
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Select the same features as the training data
test_x_rfe = test_data[X_train.columns]

# Predict on test data
y_pred = model.predict(test_x_rfe)
y_pred_s = pd.Series(y_pred, name='target')

test_x_rfe = pd.concat([test_x_rfe, z120], axis=1)
test_x_rfe = pd.concat([test_x_rfe, y_pred_s], axis=1)
test_x_rfe.loc[test_x_rfe['Head Clean Position Z Collect Result_Dam'] <= 120, 'target'] = 'AbNormal'
y_pred = test_x_rfe['target']

# filtered_df = test_x_rfe[test_x_rfe['Head Clean Position Z Collect Result_Dam'] <= 120]
# print(filtered_df[['Head Clean Position Z Collect Result_Dam', 'target']])

# Prepare submission
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# Calculate the ratio of abnormal cases
counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is:", ratio)

# Save the submission file
df_sub.to_csv("submission.csv", index=False)

Total: Normal: 38156, AbNormal: 2350
F1 Score: 0.9530
Recall: 0.9990
Accuracy: 0.9105
Precision: 0.9110
Head_DIFF_X_Stage1&2_Dam
Head_DIFF_X_Stage2&3_Dam
Head_DIFF_X_Stage1&2_Fill1
Head_DIFF_X_Stage2&3_Fill1
Head_DIFF_X_Stage1&2_Fill2
Head_DIFF_X_Stage2&3_Fill2
Head_DIFF_Y_Stage1&2_Dam
Head_DIFF_Y_Stage2&3_Dam
Head_DIFF_Y_Stage1&2_Fill1
Head_DIFF_Y_Stage2&3_Fill1
Head_DIFF_Y_Stage1&2_Fill2
Head_DIFF_Y_Stage2&3_Fill2
Head_DIFF_Z_Stage1&2_Dam
Head_DIFF_Z_Stage2&3_Dam
Head_DIFF_Z_Stage1&2_Fill1
Head_DIFF_Z_Stage2&3_Fill1
Head_DIFF_Z_Stage1&2_Fill2
Head_DIFF_Z_Stage2&3_Fill2
The ratio of abnormal is: 0.003227665706051873
