In [532]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, SMOTENC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from tqdm import tqdm

In [533]:
def read_file(file_path):
    """Load a CSV source into a pandas DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

In [534]:
def describe_data(df, describe_type=None):
    """Print a summary of ``df``.

    Args:
        df: Object exposing ``describe()`` / ``info()`` (DataFrame or Series).
        describe_type: ``'describe'`` or ``None`` prints ``df.describe()``;
            ``'info'`` prints the ``df.info()`` summary. Any other value
            prints nothing.

    Returns:
        None.
    """
    if describe_type is None or describe_type == 'describe':
        print(df.describe())
    elif describe_type == 'info':
        # info() writes its report to stdout itself and returns None, so
        # wrapping it in print() would emit a stray "None" line afterwards.
        df.info()

In [535]:
def missing_values_delete_with_threshold(df, threshold):
    """Drop columns whose share of missing values exceeds ``threshold``.

    Args:
        df: Input DataFrame (not modified).
        threshold: Percentage (0-100). A column is dropped only when its
            missing percentage is strictly greater than this value.

    Returns:
        A new DataFrame without the dropped columns. The dropped column
        labels are printed as a side effect.
    """
    null_share_pct = (df.isnull().sum() / len(df)) * 100
    exceeds_threshold = null_share_pct > threshold
    columns_to_drop = null_share_pct[exceeds_threshold].index
    cleaned_df = df.drop(columns=columns_to_drop)

    print('Following columns are going to drop:')
    print(columns_to_drop)
    return cleaned_df

In [536]:
def fill_na_train(df):
    """Impute missing values in ``df`` in place and report the columns used.

    Numeric columns (``float64`` / ``int64``) are filled with their median,
    object columns with their most frequent value (mode).

    Args:
        df: DataFrame to impute. It is modified and also returned.

    Returns:
        Tuple ``(df, num_cols, cat_cols)`` where ``num_cols`` / ``cat_cols``
        are the ``Index`` objects of numeric and object column labels.
    """
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in num_cols:
        if df[col].isnull().any():
            # Assign back instead of `df[col].fillna(..., inplace=True)`:
            # the chained in-place form acts on a temporary and is a no-op
            # under pandas Copy-on-Write (and warns in pandas 2.x).
            df[col] = df[col].fillna(df[col].median())

    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            # An all-NaN column has no mode; leave it untouched rather than
            # failing on `mode()[0]`.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    return df, num_cols, cat_cols

In [537]:
def fill_na_test(df, num_cols, cat_cols):
    """Impute missing values in ``df`` in place for the given columns.

    Columns in ``num_cols`` are filled with the median of that column in
    ``df``; columns in ``cat_cols`` with the mode of that column in ``df``.
    Note that the fill values are computed from ``df`` itself, not from the
    frame the column lists were derived from.

    Args:
        df: DataFrame to impute. It is modified and also returned.
        num_cols: Iterable of numeric column labels present in ``df``.
        cat_cols: Iterable of categorical column labels present in ``df``.

    Returns:
        The same ``df`` object after imputation.

    Raises:
        KeyError: If a listed column is missing from ``df``.
    """
    for col in num_cols:
        if df[col].isnull().any():
            # Assign back: chained `fillna(..., inplace=True)` does not
            # update `df` under pandas Copy-on-Write.
            df[col] = df[col].fillna(df[col].median())

    for col in cat_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            # An all-NaN column has no mode; skip instead of raising.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    return df

In [538]:
def change_target_type(df):
    """Ensure ``df['target']`` holds integers, label-encoding it if needed.

    Args:
        df: DataFrame with a ``'target'`` column. When encoding is required
            the column is overwritten in ``df``.

    Returns:
        The same ``df`` object.
    """
    # `dtype != 'int'` only matches the platform's default integer width, so
    # e.g. an int32 target would be re-encoded (and possibly relabelled).
    # Check for any integer dtype instead.
    if pd.api.types.is_integer_dtype(df['target']):
        print('target is int')
        return df

    print('Changing the target type...')
    label_encoder = LabelEncoder()
    df['target'] = label_encoder.fit_transform(df['target'])
    # Hoisted out of the f-string: reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    target_dtype = df['target'].dtype
    print(f'Changed target type = {target_dtype}')
    return df

In [539]:
def label_encoding_categorical_data(train_df, test_df):
    """Label-encode the object columns of train and apply them to test.

    The four ``Workorder_*`` columns are removed from the returned train
    frame only; the returned test frame keeps every original column
    (including ``'Set ID'`` and ``Workorder_*``) with the train object
    columns replaced by their integer codes.

    Args:
        train_df: Training DataFrame containing the ``Workorder_*`` columns.
        test_df: Test DataFrame containing every object column of the
            reduced training frame.

    Returns:
        Tuple ``(encoded_train_df, encoded_test_df)``. Neither input frame
        is modified.

    Raises:
        KeyError: If a ``Workorder_*`` column is missing from ``train_df``
            or an encoded column is missing from ``test_df``.
    """
    workorder_cols = ['Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2']
    train_df = train_df.drop(workorder_cols, axis=1)
    # Work on a copy so the caller's test frame is not overwritten.
    test_df = test_df.copy()

    cat_cols = train_df.select_dtypes(include=['object']).columns
    print(cat_cols)
    for col in cat_cols:
        # One encoder per column, fitted on train only and reused for test.
        # NOTE(review): per the scikit-learn docs, transform() raises
        # ValueError for test values never seen in train — confirm that is
        # the desired behaviour for this data.
        label_encoder = LabelEncoder()
        train_df[col] = label_encoder.fit_transform(train_df[col].astype(str))
        test_df[col] = label_encoder.transform(test_df[col].astype(str))

    return train_df, test_df

In [540]:
def outliers_processing(train_df, test_df):
    """Clip numeric columns of both frames to the train-set IQR fences.

    For every ``float64`` / ``int64`` column of ``train_df`` except
    ``'target'``, values below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` (quartiles computed on ``train_df``) are replaced by
    the respective fence in both frames.

    Args:
        train_df: Training DataFrame; modified in place.
        test_df: Test DataFrame containing the same numeric columns;
            modified in place.

    Returns:
        Tuple ``(train_df, test_df)`` — the same objects that were passed in.
    """
    num_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
    num_cols = num_cols[num_cols != 'target']

    for col in num_cols:
        q1 = train_df[col].quantile(0.25)
        q3 = train_df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Vectorised clip replaces four per-element Python lambda passes.
        train_df[col] = train_df[col].clip(lower=lower_bound, upper=upper_bound)
        test_df[col] = test_df[col].clip(lower=lower_bound, upper=upper_bound)

    return train_df, test_df

In [541]:
def scaling_processing(train_df, test_df, scaler):
    """Scale the numeric feature columns of train and test with ``scaler``.

    The scaler is fitted on the ``float64`` / ``int64`` columns of
    ``train_df`` (excluding ``'target'``) and then applied to the same
    columns of ``test_df`` after its ``'Set ID'`` column has been removed.

    Args:
        train_df: Training DataFrame (left unmodified).
        test_df: Test DataFrame with a ``'Set ID'`` column (left unmodified).
        scaler: Object providing ``fit_transform`` and ``transform``.

    Returns:
        Tuple ``(scaled_train, scaled_test)`` of new DataFrames.
    """
    numeric_columns = train_df.select_dtypes(include=['float64', 'int64']).columns
    feature_columns = numeric_columns[numeric_columns != 'target']

    scaled_train = train_df.copy()
    scaled_test = test_df.drop(columns='Set ID').copy()

    train_values = scaler.fit_transform(scaled_train[feature_columns])
    scaled_train[feature_columns] = train_values
    test_values = scaler.transform(scaled_test[feature_columns])
    scaled_test[feature_columns] = test_values

    return scaled_train, scaled_test

In [542]:
def data_split(df, test_size=None):
    """Split ``df`` into train/test features and targets.

    Args:
        df: DataFrame with a ``'target'`` column.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with ``random_state=42``.
    """
    features = df.drop(columns='target')
    labels = df['target']

    parts = train_test_split(features, labels, test_size=test_size, random_state=42)
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

In [543]:
def data_augmentation(X, y):
    """Oversample with SMOTE and return features and target in one frame.

    Args:
        X: Feature matrix accepted by ``SMOTE.fit_resample``.
        y: Target labels accepted by ``SMOTE.fit_resample``.

    Returns:
        DataFrame holding the resampled features plus a ``'target'`` column.
    """
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_resampled_df = pd.DataFrame(X_resampled)
    # Build the target from raw values: `pd.DataFrame(series, columns=['target'])`
    # selects by the Series' name, so a target Series named anything other
    # than 'target' would silently produce an all-NaN column.
    target = pd.Series(
        np.asarray(y_resampled).ravel(),
        index=X_resampled_df.index,
        name='target',
    )

    return pd.concat([X_resampled_df, target], axis=1)

In [544]:
def model_fit_transform(X, y, model):
    """Fit ``model`` on ``(X, y)`` and return whatever ``model.fit`` returns.

    Args:
        X: Training features, passed through to ``model.fit``.
        y: Training targets, passed through to ``model.fit``.
        model: Object exposing ``fit(X, y)``.

    Returns:
        The return value of ``model.fit(X, y)``.
    """
    return model.fit(X, y)

In [545]:
def dataframe_to_csv(df, file_name):
    """Write ``df`` as CSV (without the index) and print a confirmation.

    Args:
        df: DataFrame to serialise.
        file_name: Destination passed to ``DataFrame.to_csv``.

    Returns:
        None.
    """
    df.to_csv(file_name, index=False)
    confirmation = f'Data Saved as {file_name}'
    print(confirmation)

In [546]:
def model_evaluation(model, x, y):
    """Print accuracy and a classification report for ``model`` on ``(x, y)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Features to predict on.
        y: Ground-truth labels for ``x``.

    Returns:
        None.
    """
    predictions = model.predict(x)

    accuracy = accuracy_score(y, predictions)
    print('Accuracy:', accuracy)

    report = classification_report(y, predictions)
    print(report)

In [547]:
def best_features(X_train, X_test, y_train):
    """Keep the top 20 % of features ranked by the ANOVA F-score.

    The selector is fitted on the training data only and then applied to
    the test data.

    Args:
        X_train: Training feature matrix (must expose ``shape``).
        X_test: Test feature matrix with the same columns.
        y_train: Training labels.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)`` as returned by
        ``SelectKBest``.
    """
    n_features = X_train.shape[1]
    # int(0.2 * n) is 0 for fewer than five features, which would select
    # nothing; always keep at least one feature (never more than exist).
    k_best_features = min(n_features, max(1, int(0.2 * n_features)))
    selector = SelectKBest(score_func=f_classif, k=k_best_features)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    return X_train_selected, X_test_selected

In [548]:
def k_fold_cross_validation(k, X, y, model):
    """Fit ``model`` once per K-fold training split and return it.

    The validation part of each fold is not used and no score is computed.
    ``model.fit`` is called once per fold on that fold's training rows, so
    the last call uses the last fold's training split.

    Args:
        k: Number of folds.
        X: Feature matrix (NumPy array or pandas object).
        y: Targets aligned row-by-row with ``X``.
        model: Estimator exposing ``fit``.

    Returns:
        The ``model`` object that was passed in.
    """
    def _take_rows(data, positions):
        # KFold yields positional indices. Plain `data[positions]` is
        # label-based on pandas objects (column lookup on a DataFrame,
        # index-label lookup on a Series), which breaks or picks wrong rows
        # whenever the index is not 0..n-1.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    for train_index, _val_index in kf.split(X):
        model.fit(_take_rows(X, train_index), _take_rows(y, train_index))

    return model

In [549]:
def save_model(model, model_name):
    """Persist ``model`` with joblib as ``<model_name>.pkl``.

    Args:
        model: Object to serialise.
        model_name: Output path without the ``.pkl`` extension.

    Returns:
        None. A confirmation message is printed.
    """
    # Build the file name once: nesting the same quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    file_name = model_name + '.pkl'
    joblib.dump(model, file_name)
    print(f'모델이 {file_name}로 저장되었습니다.')

In [550]:
def drop_one_unique_column(df):
    """Remove columns that have at most one distinct non-null value.

    Args:
        df: Input DataFrame (not modified).

    Returns:
        A new DataFrame without the columns whose ``nunique()`` is <= 1.
    """
    is_constant = df.nunique().le(1)
    constant_columns = df.columns[is_constant]
    return df.drop(columns=constant_columns)

In [551]:
# def ml_pipeline(df, model_name, model):
#     X = df.drop('target', axis = 1)
#     y = df['target']

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

#     k_best_faetures = int(0.2 * X_train.shape[1])
#     selector = SelectKBest(score_func = f_classif, k = k_best_faetures)
#     X_train_selected = selector.fit_transform(X_train, y_train)
#     X_test_selected = selector.transform(X_test)

#     k = 5
#     kf = KFold(n_splits=k, shuffle = True, random_state=42)

#     for train_index, val_index in kf.split(X_selected):
#         X_fold_train, X_fold_val = X_selected[train_index], X_selected[val_index]
#         y_fold_train, y_fold_val = y[train_index], y[val_index]
    
#         model.fit(X_fold_train, y_fold_train)

#     joblib.dump(model, model_name+'.pkl')
#     print(f"모델이 {model_name+'.pkl'}로 저장되었습니다.")

#     loaded_model = joblib.load(model_name+'.pkl')

#     y_test_pred = loaded_model.predict(X_)

#     print(f'Training {model_name}...')
#     y_pred = cross_val_predict(model, X_selected, y, cv = kf)
#     accuracy = accuracy_score(y, y_pred)

#     print(f"{k}-fold 교차 검증 결과 (정확도): {accuracy}")
#     print(classification_report(y, y_pred))
#     print("\n" + "="*60 + "\n")

#     return model
    

In [552]:
# Data directory: defaults to the original local path but can be overridden
# with the LGAIMERS_DATA_DIR environment variable so the notebook can run on
# another machine without editing this cell.
DATA_DIR = os.environ.get('LGAIMERS_DATA_DIR', r'D:\LGAimers\Hackerton\Inheon\data')
train_df = read_file(os.path.join(DATA_DIR, 'train.csv'))
test_df = read_file(os.path.join(DATA_DIR, 'test.csv'))

In [553]:
# Display one raw training column as this cell's output.
train_df['Receip No Collect Result_Dam']

0        127
1          1
2         73
3          1
4          1
        ... 
40501      1
40502    197
40503     27
40504      1
40505      1
Name: Receip No Collect Result_Dam, Length: 40506, dtype: int64

In [554]:
# Print the info() summary (row count, column count, dtypes) of the raw frames.
describe_data(train_df, 'info')
describe_data(test_df, 'info')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 464 entries, Wip Line_Dam to target
dtypes: float64(350), int64(77), object(37)
memory usage: 143.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 465 entries, Set ID to target
dtypes: float64(351), int64(77), object(37)
memory usage: 61.6+ MB
None


In [555]:
# Treat the literal string 'OK' as a missing value in both frames.
train_df = train_df.replace('OK', np.nan)
test_df = test_df.replace('OK', np.nan)
describe_data(train_df, 'info')
describe_data(test_df, 'info')
# Drop every column whose missing-value share is strictly above 50 %.
# NOTE(review): the threshold is evaluated on train and test independently,
# so the two frames are not guaranteed to lose the same columns; the
# following cell inspects the difference.
missing_values_delete_train_df = missing_values_delete_with_threshold(train_df, 50.0)
missing_values_delete_test_df = missing_values_delete_with_threshold(test_df, 50.0)
describe_data(missing_values_delete_train_df, 'info')
describe_data(missing_values_delete_test_df, 'info')

  train_df = train_df.replace('OK', np.nan)
  test_df = test_df.replace('OK', np.nan)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 464 entries, Wip Line_Dam to target
dtypes: float64(362), int64(77), object(25)
memory usage: 143.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 465 entries, Set ID to target
dtypes: float64(363), int64(77), object(25)
memory usage: 61.6+ MB
None
Following columns are going to drop:
Index(['Insp Judge Code_Dam', 'CURE END POSITION X Unit Time_Dam',
       'CURE END POSITION X Judge Value_Dam',
       'CURE END POSITION Z Unit Time_Dam',
       'CURE END POSITION Z Judge Value_Dam',
       'CURE END POSITION Θ Unit Time_Dam',
       'CURE END POSITION Θ Judge Value_Dam', 'CURE SPEED Unit Time_Dam',
       'CURE SPEED Judge Value_Dam', 'CURE STANDBY POSITION X Unit Time_Dam',
       ...
       'Machine Tact time Unit Time_Fill2',
       'Machine Tact time Judge Value_Fill2', 'PalletID Unit Time_Fill2',
       'PalletID Judge Value_Fill2', 'Production Qty U

In [556]:
# Columns that exist in the filtered test frame but not in the filtered train frame.
set(missing_values_delete_test_df.columns) - set(missing_values_delete_train_df.columns)

{'Set ID'}

In [557]:
# Impute train (median for numeric, mode for object columns) and keep the
# column lists so the same columns can be imputed in test.
filled_na_train_df, num_cols, cat_cols= fill_na_train(missing_values_delete_train_df)
print(num_cols)
print(cat_cols)
# cat_cols[:-1] leaves out the last object-column label when imputing test.
# NOTE(review): presumably that last label is 'target', which the filtered
# test frame no longer contains — this depends on column order; confirm.
filled_na_test_df = fill_na_test(missing_values_delete_test_df, num_cols, cat_cols[:-1])

Index(['Insp. Seq No._Dam', 'CURE END POSITION X Collect Result_Dam',
       'CURE END POSITION Z Collect Result_Dam',
       'CURE END POSITION Θ Collect Result_Dam',
       'CURE SPEED Collect Result_Dam',
       'CURE STANDBY POSITION X Collect Result_Dam',
       'CURE STANDBY POSITION Z Collect Result_Dam',
       'CURE STANDBY POSITION Θ Collect Result_Dam',
       'CURE START POSITION X Collect Result_Dam',
       'CURE START POSITION Z Collect Result_Dam',
       ...
       'Head Clean Position Y Collect Result_Fill2',
       'Head Clean Position Z Collect Result_Fill2',
       'Head Purge Position X Collect Result_Fill2',
       'Head Purge Position Y Collect Result_Fill2',
       'Head Purge Position Z Collect Result_Fill2',
       'Machine Tact time Collect Result_Fill2',
       'PalletID Collect Result_Fill2', 'Production Qty Collect Result_Fill2',
       'Receip No Collect Result_Fill2', 'WorkMode Collect Result_Fill2'],
      dtype='object', length=149)
Index(['Wip Line_D

In [558]:
# Print the info() summary of both frames after imputation.
describe_data(filled_na_train_df, 'info')
describe_data(filled_na_test_df, 'info')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 170 entries, Wip Line_Dam to target
dtypes: float64(72), int64(77), object(21)
memory usage: 52.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 170 entries, Set ID to WorkMode Collect Result_Fill2
dtypes: float64(72), int64(77), object(21)
memory usage: 22.5+ MB
None


In [559]:
# Label-encode the 'target' column when it is not already integer-typed.
# change_target_type writes into the frame it receives and returns it, so
# target_type_changed_df refers to the same object as filled_na_train_df.
target_type_changed_df = change_target_type(filled_na_train_df)

Changing the target type...
Changed target type = int32


In [560]:
# Show the object columns of the test frame, then label-encode the object
# columns (encoders fitted on train, applied to test).
print(filled_na_test_df.select_dtypes(include=['O']).columns)
label_encoded_train_df, label_encoded_test_df = \
    label_encoding_categorical_data(target_type_changed_df, filled_na_test_df)
describe_data(label_encoded_train_df, 'info')
describe_data(label_encoded_test_df, 'info')


Index(['Set ID', 'Wip Line_Dam', 'Process Desc._Dam', 'Equipment_Dam',
       'Model.Suffix_Dam', 'Workorder_Dam', 'Wip Line_AutoClave',
       'Process Desc._AutoClave', 'Equipment_AutoClave',
       'Model.Suffix_AutoClave', 'Workorder_AutoClave', 'Wip Line_Fill1',
       'Process Desc._Fill1', 'Equipment_Fill1', 'Model.Suffix_Fill1',
       'Workorder_Fill1', 'Wip Line_Fill2', 'Process Desc._Fill2',
       'Equipment_Fill2', 'Model.Suffix_Fill2', 'Workorder_Fill2'],
      dtype='object')
Index(['Wip Line_Dam', 'Process Desc._Dam', 'Equipment_Dam',
       'Model.Suffix_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave',
       'Equipment_AutoClave', 'Model.Suffix_AutoClave', 'Wip Line_Fill1',
       'Process Desc._Fill1', 'Equipment_Fill1', 'Model.Suffix_Fill1',
       'Wip Line_Fill2', 'Process Desc._Fill2', 'Equipment_Fill2',
       'Model.Suffix_Fill2'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 166 entries, Wip L

In [561]:
# Compare the column sets of the encoded frames.
print(label_encoded_train_df.columns.shape)
print(label_encoded_test_df.columns.shape)
print(set(label_encoded_test_df.columns) - set(label_encoded_train_df.columns))
# label_encoding_categorical_data removes the Workorder_* columns from the
# train frame only, so drop them from the test frame here as well.
label_encoded_test_df = label_encoded_test_df.drop(['Workorder_Fill2', 'Workorder_Fill1', 'Workorder_AutoClave', 'Workorder_Dam'], axis = 1)
print(set(label_encoded_test_df.columns) - set(label_encoded_train_df.columns))

(166,)
(170,)
{'Workorder_Fill2', 'Workorder_Fill1', 'Workorder_AutoClave', 'Set ID', 'Workorder_Dam'}
{'Set ID'}


In [562]:
# Print describe() statistics of the encoded target column.
describe_data(label_encoded_train_df['target'], 'describe')

count    40506.000000
mean         0.941984
std          0.233777
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: target, dtype: float64


In [563]:
# Cap numeric columns at the IQR fences computed on the train frame.
# outliers_processing assigns into the frames it receives and returns them,
# so the label_encoded_* frames are modified by this call as well.
outlier_eliminated_train_df, outlier_eliminated_test_df = \
    outliers_processing(label_encoded_train_df, label_encoded_test_df)
describe_data(outlier_eliminated_train_df, 'info')
describe_data(outlier_eliminated_test_df, 'info')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 166 entries, Wip Line_Dam to target
dtypes: float64(106), int32(17), int64(43)
memory usage: 48.7 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 166 entries, Set ID to WorkMode Collect Result_Fill2
dtypes: float64(106), int32(16), int64(43), object(1)
memory usage: 20.9+ MB
None


In [564]:
# Drop columns with at most one distinct value and compare shapes before/after.
# NOTE(review): the check runs on train and test separately, so a column that
# is constant in only one of the frames would be dropped from that frame
# alone; the next cell compares the resulting column sets.
print(outlier_eliminated_train_df.shape)
print(outlier_eliminated_test_df.shape)
one_unique_dropped_train_df = drop_one_unique_column(outlier_eliminated_train_df)
one_unique_dropped_test_df = drop_one_unique_column(outlier_eliminated_test_df)
print(one_unique_dropped_train_df.shape)
print(one_unique_dropped_test_df.shape)

(40506, 166)
(17361, 166)
(40506, 127)
(17361, 127)


In [565]:
# Column labels found in only one of the two frames (train-only, then test-only).
print(set(one_unique_dropped_train_df.columns) - set(one_unique_dropped_test_df.columns))
print(set(one_unique_dropped_test_df.columns) - set(one_unique_dropped_train_df.columns))

{'target'}
{'Set ID'}


In [566]:
# Scale the numeric features; the scaler is fitted on train and reused on test.
scaler = StandardScaler()
# scaler = RobustScaler()
# scaler = Normalizer()
scaled_train_df, scaled_test_df = \
    scaling_processing(one_unique_dropped_train_df, one_unique_dropped_test_df, scaler)

# info() prints its own report and returns None, so call it directly rather
# than wrapping it in print(), which would add a stray "None" line.
scaled_train_df.info()
scaled_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 127 entries, Equipment_Dam to target
dtypes: float64(119), int32(8)
memory usage: 38.0 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 126 entries, Equipment_Dam to WorkMode Collect Result_Fill2
dtypes: float64(119), int32(7)
memory usage: 16.2 MB
None


In [567]:
# Write the scaled frames to CSV files (without the index) in the working directory.
dataframe_to_csv(scaled_train_df, 'use_this_train_data_standard.csv')
dataframe_to_csv(scaled_test_df, 'use_this_test_data_standard.csv')

Data Saved as use_this_train_data_standard.csv
Data Saved as use_this_test_data_standard.csv
