In [126]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, SMOTENC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from tqdm import tqdm

In [127]:
def read_file(file_path):
    return pd.read_csv(file_path)

In [128]:
def describe_data(df, describe_type):
    if describe_type == 'describe' or describe_type == None:
        print(df.describe())
    elif describe_type == 'info':
        print(df.info())

In [129]:
def missing_values_delete_with_threshold(df, threshold):
    missing_percentage = (df.isnull().sum() / len(df)) * 100
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    cleaned_df = df.drop(columns = columns_to_drop)
    print('Following columns are going to drop:')
    print(columns_to_drop)

    return cleaned_df

In [130]:
def fill_na(df):
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in num_cols:
        if df[col].isnull().sum() > 0:
            df[col].fillna(df[col].median(), inplace=True)

    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        if df[col].isnull().sum() > 0:
            df[col].fillna(df[col].mode()[0], inplace=True) # 최빈값

    return df

In [131]:
def change_target_type(df):
    if df['target'].dtype != 'int':
        print('Changing the target type...')
        label_encoder = LabelEncoder()
        df['target'] = label_encoder.fit_transform(df['target'])
        print(f'Changed target type = {df['target'].dtype}')
        return df
    
    else:
        print('target is int')
        return df

In [132]:
def label_encoding_categorical_data(df):
    cat_cols = df.select_dtypes(include=['object']).columns
    
    label_encoder = LabelEncoder()
    for col in cat_cols:
        df[col] = label_encoder.fit_transform(df[col].astype(str))

    return df

In [133]:
def outliers_processing(df):
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    num_cols = num_cols[num_cols != 'target']

    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df[col] = df[col].apply(lambda x: lower_bound if x < lower_bound else x)
        df[col] = df[col].apply(lambda x: upper_bound if x > upper_bound else x)

    return df

In [134]:
def scaling_processing(df, scaler):
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    num_cols = num_cols[num_cols != 'target']

    df_copy = df.copy()
    df_copy[num_cols] = scaler.fit_transform(df_copy[num_cols])

    return df_copy

In [135]:
def data_split(df, test_size = None):
    X = df.drop('target', axis = 1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=42)

    return X_train, X_test, y_train, y_test

In [136]:
def data_augmentation(X, y):
    smote = SMOTE(random_state = 42)
    # smote = SMOTENC(random_state=42)
    
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    X_resampled_df = pd.DataFrame(X_resampled)
    y_resampled_df = pd.DataFrame(y_resampled, columns = ['target'])

    oversampled_df = pd.concat([X_resampled_df, y_resampled_df], axis = 1)

    return oversampled_df

In [137]:
def model_fit_transform(X, y, model):
    fitted_model = model.fit(X, y)

    return fitted_model

In [138]:
def dataframe_to_csv(df, file_name):
    df.to_csv(file_name, index = False)
    print(f'Data Saved as {file_name}')

In [139]:
def model_evaluation(model, x, y):
    y_pred = model.predict(x)
    print('Accuracy:', accuracy_score(y, y_pred))
    print(classification_report(y, y_pred))

In [140]:
def best_features(X_train, X_test, y_train):
    k_best_features = int(0.2 * X_train.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k_best_features)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    return X_train_selected, X_test_selected

In [141]:
def k_fold_cross_validation(k, X, y, model):
    k = k
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    for train_index, val_index in kf.split(X):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y[train_index], y[val_index]

        model.fit(X_fold_train, y_fold_train)

    return model

In [142]:
def save_model(model, model_name):
    joblib.dump(model, model_name +'.pkl')
    print(f'모델이 {model_name + '.pkl'}로 저장되었습니다.')

In [143]:
def drop_one_unique_column(df):
    columns_to_drop = df.columns[df.nunique() <= 1]
    df_cleaned = df.drop(columns_to_drop, axis = 1)

    return df_cleaned

In [144]:
# def ml_pipeline(df, model_name, model):
#     X = df.drop('target', axis = 1)
#     y = df['target']

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

#     k_best_faetures = int(0.2 * X_train.shape[1])
#     selector = SelectKBest(score_func = f_classif, k = k_best_faetures)
#     X_train_selected = selector.fit_transform(X_train, y_train)
#     X_test_selected = selector.transform(X_test)

#     k = 5
#     kf = KFold(n_splits=k, shuffle = True, random_state=42)

#     for train_index, val_index in kf.split(X_selected):
#         X_fold_train, X_fold_val = X_selected[train_index], X_selected[val_index]
#         y_fold_train, y_fold_val = y[train_index], y[val_index]
    
#         model.fit(X_fold_train, y_fold_train)

#     joblib.dump(model, model_name+'.pkl')
#     print(f"모델이 {model_name+'.pkl'}로 저장되었습니다.")

#     loaded_model = joblib.load(model_name+'.pkl')

#     y_test_pred = loaded_model.predict(X_)

#     print(f'Training {model_name}...')
#     y_pred = cross_val_predict(model, X_selected, y, cv = kf)
#     accuracy = accuracy_score(y, y_pred)

#     print(f"{k}-fold 교차 검증 결과 (정확도): {accuracy}")
#     print(classification_report(y, y_pred))
#     print("\n" + "="*60 + "\n")

#     return model
    

In [145]:
train_df = read_file('D:\\LGAimers\\Hackerton\\data\\train.csv')

In [146]:
describe_data(train_df, 'describe')

       Insp. Seq No._Dam  CURE END POSITION X Collect Result_Dam  \
count            40506.0                            40506.000000   
mean                 1.0                              530.370809   
std                  0.0                              369.283055   
min                  1.0                              240.000000   
25%                  1.0                              240.000000   
50%                  1.0                              240.000000   
75%                  1.0                             1000.000000   
max                  1.0                             1000.000000   

       CURE END POSITION X Unit Time_Dam  CURE END POSITION X Judge Value_Dam  \
count                                0.0                                  0.0   
mean                                 NaN                                  NaN   
std                                  NaN                                  NaN   
min                                  NaN                       

In [147]:
train_df = train_df.replace('OK', np.nan)
missing_values_delete_df = missing_values_delete_with_threshold(train_df, 50.0)

Following columns are going to drop:
Index(['Insp Judge Code_Dam', 'CURE END POSITION X Unit Time_Dam',
       'CURE END POSITION X Judge Value_Dam',
       'CURE END POSITION Z Unit Time_Dam',
       'CURE END POSITION Z Judge Value_Dam',
       'CURE END POSITION Θ Unit Time_Dam',
       'CURE END POSITION Θ Judge Value_Dam', 'CURE SPEED Unit Time_Dam',
       'CURE SPEED Judge Value_Dam', 'CURE STANDBY POSITION X Unit Time_Dam',
       ...
       'Machine Tact time Unit Time_Fill2',
       'Machine Tact time Judge Value_Fill2', 'PalletID Unit Time_Fill2',
       'PalletID Judge Value_Fill2', 'Production Qty Unit Time_Fill2',
       'Production Qty Judge Value_Fill2', 'Receip No Unit Time_Fill2',
       'Receip No Judge Value_Fill2', 'WorkMode Unit Time_Fill2',
       'WorkMode Judge Value_Fill2'],
      dtype='object', length=294)


  train_df = train_df.replace('OK', np.nan)


In [148]:
describe_data(missing_values_delete_df['target'], 'info')

<class 'pandas.core.series.Series'>
RangeIndex: 40506 entries, 0 to 40505
Series name: target
Non-Null Count  Dtype 
--------------  ----- 
40506 non-null  object
dtypes: object(1)
memory usage: 316.6+ KB
None


In [149]:
filled_na_df = fill_na(missing_values_delete_df)
describe_data(filled_na_df, 'info')
describe_data(filled_na_df['target'], 'describe')
print(type(filled_na_df['target'][0]) == 'str')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 170 entries, Wip Line_Dam to target
dtypes: float64(72), int64(77), object(21)
memory usage: 52.5+ MB
None
count      40506
unique         2
top       Normal
freq       38156
Name: target, dtype: object
False


In [150]:
filled_na_df['target'].dtype

dtype('O')

In [151]:
target_type_changed_df = change_target_type(filled_na_df)

Changing the target type...
Changed target type = int64


In [152]:
label_encoded_df = label_encoding_categorical_data(target_type_changed_df)
describe_data(label_encoded_df, 'describe')

       Wip Line_Dam  Process Desc._Dam  Equipment_Dam  Model.Suffix_Dam  \
count       40506.0            40506.0   40506.000000      40506.000000   
mean            0.0                0.0       0.382067          0.346739   
std             0.0                0.0       0.485899          0.927665   
min             0.0                0.0       0.000000          0.000000   
25%             0.0                0.0       0.000000          0.000000   
50%             0.0                0.0       0.000000          0.000000   
75%             0.0                0.0       1.000000          0.000000   
max             0.0                0.0       1.000000          6.000000   

       Workorder_Dam  Insp. Seq No._Dam  \
count   40506.000000            40506.0   
mean      307.362218                1.0   
std       183.213263                0.0   
min         0.000000                1.0   
25%       162.000000                1.0   
50%       298.000000                1.0   
75%       449.000000   

In [153]:
describe_data(label_encoded_df['target'], 'describe')

count    40506.000000
mean         0.941984
std          0.233777
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: target, dtype: float64


In [154]:
outlier_eliminated_df = outliers_processing(label_encoded_df)
describe_data(outlier_eliminated_df, 'info')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 170 entries, Wip Line_Dam to target
dtypes: float64(110), int64(60)
memory usage: 52.5 MB
None


In [155]:
print(outlier_eliminated_df.shape)
one_unique_dropped_df = drop_one_unique_column(outlier_eliminated_df)
print(one_unique_dropped_df.shape)

(40506, 170)
(40506, 127)


In [156]:
# scaler = StandardScaler()
# scaler = RobustScaler()
scaler = Normalizer()
scaled_df = scaling_processing(one_unique_dropped_df, scaler)
dataframe_to_csv(scaled_df, 'preprocessed_train_data_RobustScaler.csv')

describe_data(scaled_df, 'describe')

Data Saved as preprocessed_train_data_Normalizer.csv
       Equipment_Dam  Workorder_Dam  CURE END POSITION X Collect Result_Dam  \
count   40506.000000   40506.000000                            40506.000000   
mean        0.000013       0.010613                                0.017924   
std         0.000018       0.006746                                0.014192   
min         0.000000       0.000000                                0.004908   
25%         0.000000       0.004652                                0.007394   
50%         0.000000       0.011451                                0.008646   
75%         0.000031       0.015061                                0.031084   
max         0.000050       0.023868                                0.049704   

       CURE END POSITION Z Collect Result_Dam  \
count                            40506.000000   
mean                                 0.000215   
std                                  0.000184   
min                                  0.

In [157]:
# plt.rcParams['figure.figsize'] = [20, 200]
# scaled_df.hist(bins = 30, layout=(100, 2))
# plt.show()

In [158]:
X_train, X_test, y_train, y_test = data_split(pd.read_csv('./scaled_data.csv'), test_size=0.3)

FileNotFoundError: [Errno 2] No such file or directory: './scaled_data.csv'

In [None]:
X_train_selected, X_test_selected = best_features(X_train = X_train, X_test = X_test, y_train=y_train)

In [None]:
X_train_selected.shape

In [None]:
oversampled_train_df = data_augmentation(X_train_selected, y_train)

In [None]:
oversampled_train_df.head()

In [None]:
X_train_oversampled = oversampled_train_df.drop('target', axis = 1)
y_train_oversampled = oversampled_train_df['target']

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
lgbm = LGBMClassifier(random_state=42)
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)

rf_model = k_fold_cross_validation(k = 5, X = X_train_oversampled, y = y_train_oversampled, model = rf)
save_model(model=rf_model, model_name='randomForest')

xgboost_model = k_fold_cross_validation(k = 5, X = X_train_oversampled, y = y_train_oversampled, model = xgboost)
save_model(model=xgboost_model, model_name='XGBoost')

lgbm_model = k_fold_cross_validation(k = 5, X = X_train_oversampled, y = y_train_oversampled, model = lgbm)
save_model(model=lgbm_model, model_name='LGBM')

adaboost_model = k_fold_cross_validation(k = 5, X = X_train_oversampled, y = y_train_oversampled, model = adaboost)
save_model(model=adaboost_model, model_name='AdaBoost')