In [36]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.feature_selection import RFE
import shap
import catboost as cb
import xgboost as xgb
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, CondensedNearestNeighbour, OneSidedSelection, InstanceHardnessThreshold, AllKNN, RepeatedEditedNearestNeighbours, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


ROOT_DIR = "data"
RANDOM_STATE = 110

train_data = pd.read_csv(os.path.join(ROOT_DIR, "train_mod.csv"))

In [37]:
def cat2num(X):
    non_numeric_columns = X.select_dtypes(include=['object']).columns
    # print("Non-numeric columns:", non_numeric_columns)

    encoded_columns = {}

    for column in non_numeric_columns:
        encoder = LabelEncoder()
        encoded_columns[column] = encoder.fit_transform(X[column])

    encoded_df = pd.DataFrame(encoded_columns, index=X.index)

    X = X.drop(columns=non_numeric_columns)
    X = pd.concat([X, encoded_df], axis=1)

    return X

def featuregen(train_data):
    axis = ['X', 'Y', 'Z']
    process = ['Dam', 'Fill1', 'Fill2']

    for ax in axis:
        for proc in process:
            stage1_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage1) Collect Result_{proc}'
            stage2_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage2) Collect Result_{proc}'
            stage3_col = f'HEAD NORMAL COORDINATE {ax} AXIS(Stage3) Collect Result_{proc}'
            new_col_1_3 = f'Head_DIFF_{ax}_Stage1&3_{proc}'
            new_col_max_min = f'Head_MinMax_{ax}_{proc}'
            
        
            train_data[new_col_1_3] = (train_data[stage1_col] - train_data[stage3_col]).abs()
            train_data[new_col_max_min] = train_data[[stage1_col, stage2_col, stage3_col]].max(axis=1) - train_data[[stage1_col, stage2_col, stage3_col]].min(axis=1)

    return train_data
                                                                                                            
def generate_stage_averages(df):
    stages = ['Stage1', 'Stage2', 'Stage3']
    
    for stage in stages:
        stage_columns = [col for col in df.columns if stage in col and ('Circle' in col or 'Line' in col)]
        df[f'{stage}_Average'] = df[stage_columns].mean(axis=1)
    
    return df
                                                                                                            
def generating_features(df):
 
    # Thickness difference
    df['Thickness_Diff_1_2'] = df['THICKNESS 1 Collect Result_Dam'] - df['THICKNESS 2 Collect Result_Dam']
    df['Thickness_Diff_2_3'] = df['THICKNESS 2 Collect Result_Dam'] - df['THICKNESS 3 Collect Result_Dam']
    df['Thickness_Std'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].std(axis=1)
    df['Thickness_Max_Min_Diff'] = df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].max(axis=1) - df[['THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam']].min(axis=1)
    df['Temperature_Change_Rate'] = df['Chamber Temp. Collect Result_AutoClave'] * df['Chamber Temp. Unit Time_AutoClave']

    return df

def generate_volume_to_speed_ratio(df):
    df['Volume_to_Speed_Ratio_Stage1'] = df['Dispense Volume(Stage1) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_to_Speed_Ratio_Stage2'] = df['Dispense Volume(Stage2) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_to_Speed_Ratio_Stage3'] = df['Dispense Volume(Stage3) Collect Result_Fill1'] / df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    df['Volume_Sum_Fill1'] = df['Dispense Volume(Stage1) Collect Result_Fill1'] + df['Dispense Volume(Stage2) Collect Result_Fill1'] + df['Dispense Volume(Stage3) Collect Result_Fill1']
    
    return df

def generate_pressure_change_rate(df):
    df['Pressure_Change_Rate_1st'] = df['1st Pressure Collect Result_AutoClave'] * df['1st Pressure 1st Pressure Unit Time_AutoClave']
    df['Pressure_Change_Rate_2nd'] = df['2nd Pressure Collect Result_AutoClave'] * df['2nd Pressure Unit Time_AutoClave']
    df['Pressure_Change_Rate_3rd'] = df['3rd Pressure Collect Result_AutoClave'] * df['3rd Pressure Unit Time_AutoClave']
    
    return df

def generate_volume_to_time_ratio(df):
    df['Volume_to_Time_Ratio_Stage1'] = df['Dispense Volume(Stage1) Collect Result_Dam'] / df['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
    df['Volume_to_Time_Ratio_Stage2'] = df['Dispense Volume(Stage2) Collect Result_Dam'] / df['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
    df['Volume_to_Time_Ratio_Stage3'] = df['Dispense Volume(Stage3) Collect Result_Dam'] / df['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
    
    return df


train_data = cat2num(train_data)
train_data = featuregen(train_data)
train_data = generating_features(train_data)
train_data = generate_volume_to_speed_ratio(train_data)
train_data = generate_pressure_change_rate(train_data)
train_data = generate_volume_to_time_ratio(train_data)
train_data = generate_stage_averages(train_data)

                                                                                                            



train_data = train_data[[
    
                         'Head_DIFF_X_Stage1&3_Dam',
                         'Head_DIFF_X_Stage1&3_Fill1',
                         'Head_DIFF_X_Stage1&3_Fill2',
                         
                         'Head_MinMax_Y_Dam',
                         'Head_MinMax_Y_Fill1',
                         'Head_MinMax_Y_Fill2',
                         
                         'Stage1_Average',
                         'Stage2_Average',
                         'Stage3_Average',

                         'Thickness_Max_Min_Diff',
#                          'Thickness_Std',
                         
                         'Temperature_Change_Rate',
                         
                         'Pressure_Change_Rate_1st',
                         'Pressure_Change_Rate_2nd',
                         'Pressure_Change_Rate_3rd',
                         
                         'Volume_to_Time_Ratio_Stage1',
                         'Volume_to_Time_Ratio_Stage2',
                         'Volume_to_Time_Ratio_Stage3',
    
                         'Volume_Sum_Fill1',
                         'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
    
#                          'Receip No Collect Result_Dam', 
#                          'Receip No Collect Result_Fill1',  
#     'Production Qty Collect Result_Fill1',
#     'Production Qty Collect Result_Fill2',
#     'Production Qty Collect Result_Dam',
    
    
                         'target']]

In [38]:
X = train_data.drop(columns=['target'])
y = train_data['target']

# 오버샘플링
# oversampler = BorderlineSMOTE(sampling_strategy={0 : 10000}, random_state=RANDOM_STATE)
# borderline_smote = BorderlineSMOTE(sampling_strategy="auto", random_state=RANDOM_STATE)
oversampler = SMOTEENN(sampling_strategy= {0 : 10000}, random_state=RANDOM_STATE)
X, y = oversampler.fit_resample(X, y)

# 언더샘플링
# undersampler = RandomUnderSampler(sampling_strategy={1: 10000}, random_state=RANDOM_STATE)
# undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=RANDOM_STATE)
# undersampler = NearMiss(sampling_strategy="auto")

# undersampler = CondensedNearestNeighbour(sampling_strategy='auto')

#------------------------------------------------------------------------------------------------
# undersampler = TomekLinks(sampling_strategy='auto')
# undersampler = OneSidedSelection(sampling_strategy='auto', random_state=RANDOM_STATE)
# undersampler = AllKNN(sampling_strategy='auto')
# undersampler = RepeatedEditedNearestNeighbours(sampling_strategy='auto')
# undersampler = EditedNearestNeighbours(sampling_strategy='auto')
#------------------------------------------------------------------------------------------------
# X, y = undersampler.fit_resample(X, y)

# undersampler = NearMiss(sampling_strategy="auto")
# clf = RandomForestClassifier(random_state=RANDOM_STATE)
# undersampler = InstanceHardnessThreshold(estimator=clf, sampling_strategy='auto', random_state=RANDOM_STATE)
undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=RANDOM_STATE)
X, y = undersampler.fit_resample(X, y)

# 데이터프레임으로 다시 결합
train_data = pd.concat([pd.DataFrame(X, columns=X.columns), pd.Series(y, name='target')], axis=1)

df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]
num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# 스케일링 (정규화 또는 표준화)
scaler = MinMaxScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# 데이터를 다시 피처와 타겟으로 분리
train_x = train_data.drop(columns=['target'])
train_y = train_data['target']

# 학습 및 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.22,
    random_state=RANDOM_STATE,
)

Total: Normal: 10000, AbNormal: 10000


In [None]:
# Recursive Feature Elimination (RFE) for backward feature selection
model = cb.CatBoostClassifier(
    depth=6,
    iterations=500,
    l2_leaf_reg=3,
    learning_rate=0.01,
    verbose=0  # No output during training
)

# catboost_model = cb.CatBoostClassifier(
#     depth=6,
#     iterations=500,
#     l2_leaf_reg=3,
#     learning_rate=0.05,
#     verbose=0  # No output during training
# )

# xgb_model = xgb.XGBClassifier(
#     max_depth = 4,
#     min_child_weight = 1,
#     gamma = 0.3,
#     colsample_bytree = 1.0,
#     n_etimators = 50,
#     reg_alpha = 0.01,
#     reg_lambda = 1.5,
#     sub_sample = 1.0,
#     learning_rate = 0.005, RANDOM_STATE = 110)

# model = StackingClassifier(
#     estimators=[('catboost', catboost_model), ('xgb', xgb_model)],
#     final_estimator=LogisticRegression()
# )

# Train final model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

#SHAP
# SHAP value caculation
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Features importance
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
shap_importance = df_shap.abs().mean().sort_values(ascending=False)
print(shap_importance)

F1 Score: 0.8130
Recall: 0.8668
Accuracy: 0.8020
Precision: 0.7655
Pressure_Change_Rate_3rd                          0.329578
Volume_Sum_Fill1                                  0.157052
Pressure_Change_Rate_2nd                          0.117980
Volume_to_Time_Ratio_Stage1                       0.117130
Stage2_Average                                    0.093866
Head_DIFF_X_Stage1&3_Dam                          0.092403
Head_MinMax_Y_Fill1                               0.090238
Head_MinMax_Y_Dam                                 0.083981
Volume_to_Time_Ratio_Stage3                       0.069989
Stage3_Average                                    0.066497
Volume_to_Time_Ratio_Stage2                       0.064517
Head_MinMax_Y_Fill2                               0.055372
Stage1_Average                                    0.050399
Temperature_Change_Rate                           0.033667
Head_DIFF_X_Stage1&3_Fill1                        0.029103
Head_DIFF_X_Stage1&3_Fill2                      

In [40]:
# Process test data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
test_data = cat2num(test_data)
test_data = featuregen(test_data)
test_data = generating_features(test_data) 
test_data = generate_volume_to_speed_ratio(test_data)
test_data = generate_pressure_change_rate(test_data)
test_data = generate_volume_to_time_ratio(test_data)
test_data = generate_stage_averages(test_data)

# Scale the test data
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Select the same features as the training data
test_x_rfe = test_data[X_train.columns]

# Predict on test data
y_pred = model.predict(test_x_rfe)

# Prepare submission
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# Calculate the ratio of abnormal cases
counts = df_sub['target'].value_counts()
ratio = counts['AbNormal'] / (counts['AbNormal'] + counts['Normal'])
print("The ratio of abnormal is:", ratio)

# Save the submission file
df_sub.to_csv("submission.csv", index=False)

The ratio of abnormal is: 0.13132884050457924


In [None]:
# 0.16615760537568722

In [None]:
# import pandas as pd
# from catboost import CatBoostClassifier
# from sklearn.model_selection import train_test_split, RandomizedSearchCV,  GridSearchCV
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# # 타겟 변수와 피처 변수 분리
# y = train_data['target']
# X = train_data.drop(columns=['target'])

# # 학습 데이터와 테스트 데이터로 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=RANDOM_STATE)

# # CatBoostClassifier 모델 설정 (GPU 사용)
# model = CatBoostClassifier(
#     task_type='CPU',  # GPU 사용 설정
#     verbose=0  # 출력 로그를 줄이기 위해 설정
# )

# # 하이퍼파라미터 그리드 정의
# param_grid = {
#     'depth': [4, 5, 6],
#     'iterations': [300, 400, 500],
#     'l2_leaf_reg': [3, 5, 7],
#     'learning_rate': [0.05, 0.1, 0.2]
# }

# # GridSearchCV 설정
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='accuracy',
#     cv=3,  # 교차 검증 폴드 수
#     verbose=1,
#     n_jobs=-1  # 병렬 처리
# )

# # Grid Search 학습
# grid_search.fit(X_train, y_train)

# # 최적의 하이퍼파라미터와 모델
# best_model = grid_search.best_estimator_
# print(f'Best Parameters: {grid_search.best_params_}')

# # 테스트 데이터에 대한 예측
# y_pred = best_model.predict(X_test)

# # 성능 지표 계산
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')

# print(f'Accuracy: {accuracy:.4f}')
# print(f'Precision: {precision:.4f}')
# print(f'Recall: {recall:.4f}')
# print(f'F1-score: {f1:.4f}')
