### Library imports for data preprocessing

In [None]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer
from impyute.imputation import cs
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### Prepare Data

In [None]:
df = pd.read_csv('data/train.csv', encoding='utf-8')
df.head()

In [None]:
df_zero = df.replace(np.nan, 0.0)
df_nan = df_zero.replace(0.0, np.nan)

In [None]:
df_drop = df_nan.dropna()

In [None]:
df_drop.describe().T

### Impute NaN data

In [None]:
feats = df_nan[['누적전력량', '유효전력평균', '무효전력평균', '주파수', '전류평균', '상전압평균', '선간전압평균',
       '온도', 'R상유효전력', 'R상무효전력', 'R상전류', 'R상전압', 'R상선간전압', 'S상유효전력', 'S상무효전력',
       'S상전류', 'S상전압', 'S상선간전압', 'T상유효전력', 'T상무효전력', 'T상전류', 'T상전압', 'T상선간전압']]

In [None]:
imp = IterativeImputer(sample_posterior=True, random_state=42)
imp_df = imp.fit_transform(feats)

In [None]:
imp_df = pd.DataFrame(imp_df, columns=feats.columns)

In [None]:
imp_df.head()

In [None]:
# Line plot of the first 1000 imputed rows of 'T상선간전압'.
# reset_index() exposes the row position as an 'index' column for the x-axis;
# the frame is built here so the cell does not rely on a variable that no
# cell in this notebook defines.
imp_df_idx = imp_df.reset_index()

plt.figure(figsize=(10,6))
sns.lineplot(data=imp_df_idx[:1000], x='index', y='T상선간전압')
plt.show()

In [None]:
sing = SingleImputer(strategy='norm')

In [None]:
sing_df = sing.fit_transform(feats)

In [None]:
sing_df.head()

In [None]:
multim = MultipleImputer(n=1, strategy='least squares', return_list=True)

In [None]:
imputed_df1 = multim.fit_transform(feats)

In [None]:
dff = imputed_df1[0][1].copy()

In [None]:
dff.describe().T

In [None]:
em_imput_df = cs.em(np.array(feats1), loop=5)

In [None]:
em_imput_df = pd.DataFrame(em_imput_df, columns=feats1.columns)

In [None]:
em_imput_df.head()

In [None]:
knn_imput_df = cs.fast_knn(np.array(feats1), k=7)

### Library imports for model training

In [None]:
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
df_X_train = df_zero[['누적전력량', '유효전력평균', '무효전력평균', '주파수', '전류평균', '상전압평균', '선간전압평균',
       '온도', 'R상유효전력', 'R상무효전력', 'R상전류', 'R상전압', 'R상선간전압', 'S상유효전력', 'S상무효전력',
       'S상전류', 'S상전압', 'S상선간전압', 'T상유효전력', 'T상무효전력', 'T상전류', 'T상전압', 'T상선간전압']]

df_y_train = df_zero[['label_역률평균', 'label_전류고조파평균', 'label_전압고조파평균']].replace({'정상': 0,'주의': 1,'경고': 2})

In [None]:
df_X_train = sing_df.copy()
df_y_train = df[['label_역률평균', 'label_전류고조파평균', 'label_전압고조파평균']].replace({'정상': 0,'주의': 1,'경고': 2})

In [None]:
# Training set, iterative-imputation variant: rebinds df_X_train / df_y_train.
# Features come from `imp_df`, the IterativeImputer output built earlier in
# this notebook. NOTE(review): the original read `imp_df_comp`, which no cell
# defines; confirm `imp_df` is the intended frame.
df_X_train = imp_df.copy()
df_y_train = df_nan[['label_역률평균', 'label_전류고조파평균', 'label_전압고조파평균']].replace({'정상': 0,'주의': 1,'경고': 2})

In [None]:
st_scaler = StandardScaler()
st_scaled_train = st_scaler.fit_transform(df_X_train)
st_scaled_X_train = pd.DataFrame(st_scaled_train, columns=df_X_train.columns)
st_scaled_X_train.head()

In [None]:
mm_scaler = MinMaxScaler()
mm_scaled_train = mm_scaler.fit_transform(df_X_train)
mm_scaled_X_train = pd.DataFrame(mm_scaled_train, columns=df_X_train.columns)
mm_scaled_X_train.head()

In [None]:
# Settings shared by the three per-label classifiers (one model per label column).
# The obsolete `silent=True` flag has been removed: it contradicted
# `verbosity=3`, and `verbosity` is the parameter that controls logging.
xgb_gpu_params = dict(booster='gbtree',
                      max_depth=7,
                      n_jobs=-1,
                      objective='multi:softmax',
                      tree_method='gpu_hist', gpu_id=0, verbosity=3)

# Three independent estimator instances built from the same settings.
xgb = XGBClassifier(**xgb_gpu_params)

xgb2 = XGBClassifier(**xgb_gpu_params)

xgb3 = XGBClassifier(**xgb_gpu_params)

In [None]:
# Rebind xgb / xgb2 / xgb3 to three separate GPU classifiers that leave all
# other hyperparameters at the library defaults.
xgb, xgb2, xgb3 = (
    XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    for _unused in range(3)
)

In [None]:
# Hyperparameter grid passed to GridSearchCV. Every list holds a single
# candidate, so the search evaluates exactly one configuration.
# L2 regularisation uses the scikit-learn wrapper name `reg_lambda`
# (matching `reg_alpha`) rather than the native alias `lambda`, so it cannot
# conflict with the estimator's own `reg_lambda` setting.
param_grid={ 'booster':['gbtree'],
             'max_depth':[10],
             'min_child_weight':[3],
             'subsample':[0.75],
             'colsample_bytree':[1],
             'gamma':[0],
             'reg_lambda':[0.001],
             'reg_alpha':[1e-4],
             'n_estimators':[100],
             'objective':['multi:softmax'],
             'random_state':[42] }

### Training & Finding best Hyperparameters

In [None]:
# set Kfold
cv = StratifiedKFold(n_splits=5)

# create GridSearchCV object
gs_xgb = GridSearchCV(xgb, param_grid=param_grid, cv=cv, scoring='f1_macro', n_jobs=4, verbose=10)
gs_xgb2 = GridSearchCV(xgb2, param_grid=param_grid, cv=cv, scoring='f1_macro', n_jobs=4, verbose=10)
gs_xgb3 = GridSearchCV(xgb3, param_grid=param_grid, cv=cv, scoring='f1_macro', n_jobs=4, verbose=10)

In [None]:
df_p_train = df_y_train['label_역률평균']
df_a_train = df_y_train['label_전류고조파평균']
df_v_train = df_y_train['label_전압고조파평균']

In [None]:
start = time.time()
gs_xgb.fit(st_scaled_X_train, df_p_train)
end = time.time()

print('elapsed time for 1st fit : ', ((end - start)/60))

In [None]:
gs_xgb.best_params_

In [None]:
start2 = time.time()
gs_xgb2.fit(st_scaled_X_train, df_a_train, verbose=False)
end2 = time.time()

print('elapsed time for 2nd fit : ', ((end2 - start2)/60))

In [None]:
gs_xgb2.best_params_

In [None]:
start3 = time.time()
gs_xgb3.fit(st_scaled_X_train, df_v_train, verbose=False)
end3 = time.time()

print('elapsed time for 3rd fit : ', ((end3 - start3)/60))

In [None]:
gs_xgb3.best_params_

In [None]:
best_gs_xgb = gs_xgb.best_estimator_
best_gs_xgb2 = gs_xgb2.best_estimator_
best_gs_xgb3 = gs_xgb3.best_estimator_

In [None]:
X_train, X_val, y_train, y_val = train_test_split(st_scaled_X_train, 
                                                  df_y_train, 
                                                  test_size=0.05, 
                                                  shuffle=True, 
                                                  random_state=42)
p_train = y_train['label_역률평균']
a_train = y_train['label_전류고조파평균']
v_train = y_train['label_전압고조파평균']

p_val = y_val['label_역률평균']
a_val = y_val['label_전류고조파평균']
v_val = y_val['label_전압고조파평균']

In [None]:
# Refit the grid-search best estimator for 'label_역률평균' on the training
# split only (the validation rows are held out).
xgb_model = best_gs_xgb.fit(X_train, p_train)

In [None]:
# Same for 'label_전류고조파평균'.
xgb_model2 = best_gs_xgb2.fit(X_train, a_train)

In [None]:
# Same for 'label_전압고조파평균'.
xgb_model3 = best_gs_xgb3.fit(X_train, v_train)

In [None]:
# NOTE(review): this cell and the next two rebind xgb_model / xgb_model2 /
# xgb_model3 to the estimators named xgb / xgb2 / xgb3, replacing the
# grid-search models assigned above. Run only the group whose models should
# be evaluated and used for prediction below.
xgb_model = xgb.fit(X_train, p_train)

In [None]:
xgb_model2 = xgb2.fit(X_train, a_train)

In [None]:
xgb_model3 = xgb3.fit(X_train, v_train)

In [None]:
pred1 = xgb_model.predict(X_val)
print(classification_report(p_val, pred1))
print(f1_score(p_val, pred1, average='macro'))

In [None]:
pred2 = xgb_model2.predict(X_val)
print(classification_report(a_val, pred2))
print(f1_score(a_val, pred2, average='macro'))

In [None]:
pred3 = xgb_model3.predict(X_val)
print(classification_report(v_val, pred3))
print(f1_score(v_val, pred3, average='macro'))

#### Testing and Predicting

In [None]:
df_test = pd.read_csv('data/test.csv', encoding='utf-8')

In [None]:
df_test_zero = df_test.replace(np.nan, 0.0)
df_test_nan = df_test_zero.replace(0.0, np.nan)
df_test_clean = df_test_nan.dropna()

for i in range(24):
    if i == 0:
        continue
    df_test_zero.iloc[:, i].replace(0.0, df_test_clean.iloc[:, i].median(), inplace=True)

In [None]:
# Rebuild both test views from the raw test table: NaN written as 0.0, and
# every 0.0 written as NaN. This rebinds df_test_zero / df_test_nan.
df_test_zero = df_test.replace(to_replace=np.nan, value=0.0)
df_test_nan = df_test_zero.replace(to_replace=0.0, value=np.nan)

#### Imputed data version

In [None]:
# Test features for the imputed-data variant.
# Start from the NaN-marked frame so the fitted imputer has missing entries
# to fill; the zero-filled frame contains no NaN, so imputing it would be a
# no-op and the models would receive raw zeros.
noID_test = df_test_nan.drop('ID', axis=1)
imp_test = imp.transform(noID_test)
# Scale with statistics computed from the training features only.
train_scaler = st_scaler.fit(df_X_train)
scaled_test = train_scaler.transform(imp_test)
scaled_df_test = pd.DataFrame(scaled_test, columns=noID_test.columns)

#### Zero-fill version (NaN replaced with 0)

In [None]:
noID_test = df_test_zero.drop('ID', axis=1)
train_scaler = mm_scaler.fit(df_X_train)
scaled_test = train_scaler.transform(noID_test)
scaled_df_test = pd.DataFrame(scaled_test, columns=noID_test.columns)

In [None]:
noID_test = df_test_zero.drop('ID', axis=1)
train_scaler = st_scaler.fit(df_X_train)
scaled_test = train_scaler.transform(noID_test)
scaled_df_test = pd.DataFrame(scaled_test, columns=noID_test.columns)

In [None]:
# Predict each label on the scaled test features and reshape every result
# into a single column so the three can be concatenated side by side.
xgb_pred = np.reshape(xgb_model.predict(scaled_df_test), (-1, 1))
xgb_pred2 = np.reshape(xgb_model2.predict(scaled_df_test), (-1, 1))
xgb_pred3 = np.reshape(xgb_model3.predict(scaled_df_test), (-1, 1))

In [None]:
xgb_pred_total = np.concatenate([xgb_pred, xgb_pred2, xgb_pred3], axis=1)

df_ID = df_test['ID']
df_pred = pd.DataFrame(xgb_pred_total, columns=['label_역률평균', 'label_전류고조파평균', 'label_전압고조파평균'])

df_result = pd.concat([df_ID, df_pred], axis=1)

df_result['label_역률평균'].replace({0:'정상', 1:'주의', 2:'경고'}, inplace=True)
df_result['label_전류고조파평균'].replace({0:'정상', 1:'주의', 2:'경고'}, inplace=True)
df_result['label_전압고조파평균'].replace({0:'정상', 1:'주의', 2:'경고'}, inplace=True)

df_result.head(10)

In [None]:
df_result.to_csv('xgb__result.csv', index=False)

In [None]:
# Predict each label with the grid-search estimators and reshape every
# result into a single column for concatenation.
gs_xgb_pred = np.reshape(best_gs_xgb.predict(scaled_df_test), (-1, 1))
gs_xgb_pred2 = np.reshape(best_gs_xgb2.predict(scaled_df_test), (-1, 1))
gs_xgb_pred3 = np.reshape(best_gs_xgb3.predict(scaled_df_test), (-1, 1))

In [None]:
gs_xgb_pred_total = np.concatenate([gs_xgb_pred, gs_xgb_pred2, gs_xgb_pred3], axis=1)

df_ID = df_test['ID']
df_pred = pd.DataFrame(gs_xgb_pred_total, columns=['label_역률평균', 'label_전류고조파평균', 'label_전압고조파평균'])

df_result = pd.concat([df_ID, df_pred], axis=1)

df_result['label_역률평균'].replace({0:'정상', 1:'주의', 2:'경고'}, inplace=True)
df_result['label_전류고조파평균'].replace({0:'정상', 1:'주의', 2:'경고'}, inplace=True)
df_result['label_전압고조파평균'].replace({0:'정상', 1:'주의', 2:'경고'}, inplace=True)

df_result.head(10)

In [None]:
df_result.to_csv('gs_xgb_zero_st_result.csv', index=False)