In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts # sklearn 0.18.1
from sklearn.model_selection import GridSearchCV # sklearn 0.18.1
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Input
from keras.optimizers import Adam
from keras import initializers
from tqdm import tqdm

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import make_scorer, matthews_corrcoef

import os
from time import time
from __future__ import division
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [3]:
# Keras 가 Tensorflow 를 벡엔드로 사용할 수 있도록 설정합니다.
os.environ["KERAS_BACKEND"] = "tensorflow"

# 실험을 재현하고 동일한 결과를 얻을 수 있는지 확인하기 위해 seed 를 설정합니다.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)

# Data Preprocessing

In [4]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
secom = pd.read_csv(url, header=None, delim_whitespace=True)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_csv(url, header=None, usecols=[0], squeeze=True, delim_whitespace=True) 
# delim_whitespace = True : 빈 공간(' ')을 구분자로 인식하고 데이터 읽어옴
# squeeze 만약 컬럼 하나만 읽어오면 데이터 구조를 Series로 읽어옴

In [5]:
print(type(secom)) # 데이터 구조: DataFrame
print(secom.shape) # 1567개의 인스턴스 590개의 속성
secom.head() 

<class 'pandas.core.frame.DataFrame'>
(1567, 590)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [6]:
print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(y[y == -1].size, y[y == 1].size))
print('The dataset is imbalanced. The ratio of majority class to minority class is {%.2f}:1.' % (float(y[y == -1].size/y[y == 1].size)))

The dataset has 1567 observations/rows and 590 variables/columns.
The majority class has 1463 observations, minority class 104.
The dataset is imbalanced. The ratio of majority class to minority class is {14.07}:1.


##결측치 확인

In [7]:
# 결측치를 가지고 있는 컬럼이 몇 개 인지 확인하는 코드

nmv = secom.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = secom.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv

No. of columns after removing columns with missing data: 52
No. of rows after removing rows with missing data: 0


상수 값을 가지는 열 제거

In [8]:
dropthese = [i for i in range(590) if secom[i].std() == 0]
secom_categorical = secom.drop(dropthese, axis = 1)
print(secom_categorical.shape)
secom_categorical.head()

print('There are {} columns which have identical values recorded. We will drop these.'.format(len(dropthese)))
print('The data set now has {} columns.'.format(secom_categorical.shape[1]))

(1567, 474)
There are 116 columns which have identical values recorded. We will drop these.
The data set now has 474 columns.


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(secom_categorical, y, test_size = 0.2)

In [10]:
# ndarray 
print(X_train.shape) # (1253, 474)
print(X_test.shape) # (314, 474)
print(y_train.shape) # (1253,)
print(y_test.shape) # (314,)

(1253, 474)
(314, 474)
(1253,)
(314,)


In [11]:
# ndarray -> DataFrame
# 결측치 채워주는 함수에 적용하기 위하여
df_X_train = pd.DataFrame(X_train)
df_X_test = pd.DataFrame(X_test)
print(df_X_train.shape)
print(df_X_test.shape)

(1253, 474)
(314, 474)


In [12]:
def fill_NaN_by_Gaussian(df_X_train, df_X_test) :
      
    """결측치를 해당 열의 가우시안 분포를 따르는 난수로 대체하는 함수이다.
       단, test셋의 각 열은 train셋의 각 열의 가우시안 분포를 따른다고 가정한다.
       따라서, test셋은 train셋의 mean, std를 사용한다.                      """
    
    for column in df_X_train.columns.values : 
        
        mean = df_X_train[column].mean()
        std = df_X_train[column].std()
            
        X_train_NaN_size = sum(df_X_train[column].isnull())
        X_test_NaN_size = sum(df_X_test[column].isnull())
          
        df_X_train.loc[df_X_train[column].isnull(), column] = np.random.normal(mean, std, size = X_train_NaN_size)            
        df_X_test.loc[df_X_test[column].isnull(), column] = np.random.normal(mean, std, size = X_test_NaN_size)

    return (df_X_train, df_X_test)
                          
                          
# main
df_X_train, df_X_test = fill_NaN_by_Gaussian(df_X_train, df_X_test)
print(df_X_train.shape)       
print(df_X_test.shape)

(1253, 474)
(314, 474)


In [13]:
# 결측치를 채우고 난 후 정상적으로 결측치가 채워졌는지 재확인하는 코드

# df_X_train
nmv = df_X_train.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = df_X_train.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv

# df_X_test
nmv = df_X_test.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = df_X_test.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv

No. of columns after removing columns with missing data: 474
No. of rows after removing rows with missing data: 1253
No. of columns after removing columns with missing data: 474
No. of rows after removing rows with missing data: 314


In [14]:
# 표준 정규 분포로 Normalization 

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
std_scale_parameters = scaler.fit(df_X_train.values) # train 데이터의 평균과 표준편차로 파라미터를 피팅한 후 저장, test 데이터에도 똑같은 파라미터를 적용하기 위함

scaled_X_train= std_scale_parameters.transform(df_X_train) # [n_samples, n_features]의 크기로 반환, [1253,474]
scaled_X_test= std_scale_parameters.transform(df_X_test) # [n_samples, n_features]의 크기로 반환, [314,474]

scaled_df_X_train = pd.DataFrame(scaled_X_train, index = df_X_train.index, columns = df_X_train.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용
scaled_df_X_test = pd.DataFrame(scaled_X_test, index =df_X_test.index, columns = df_X_test.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용

# print(std_scale_parameters.mean_): 평균
# print(std_scale_parameters.var_): 분산
# print(std_scale_parameters.scale_): 표준편차

# print(std_scale_parameters.mean_.size): n_features, 474개
# print(std_scale_parameters.var_.size): n_features, 474개
# print(std_scale_parameters.scale_.size): n_features, 474개

In [15]:
print(scaled_df_X_train.shape)
scaled_df_X_train.head()    

(1253, 474)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,580,581,582,583,584,585,586,587,588,589
1089,0.602322,-0.400285,0.950564,4.886215,-0.045986,-2.190411,0.230697,-0.358783,1.121164,-0.689363,...,-0.232675,-0.945351,1.9119,-0.216286,-0.190221,-0.214762,-0.421097,0.088563,-0.130543,0.059218
368,-1.068769,0.979995,-0.43684,-0.98444,-0.052737,0.341591,0.000254,0.561953,0.303079,1.475667,...,-1.484362,0.613374,-0.59095,0.151672,0.052051,0.145533,-1.35682,0.181532,0.401566,2.980367
624,-0.347026,-0.096745,0.03967,-1.136098,-0.050593,0.846917,-0.091924,0.911996,-0.099366,0.215425,...,0.083727,0.438684,-2.004325,-0.153208,-0.165994,-0.13888,0.10323,-0.178722,-0.024121,-0.365486
968,-0.990243,-0.406249,-0.933659,-0.427573,-0.052785,0.030972,-0.080401,-0.753773,1.015605,-1.206386,...,1.348161,-0.836682,0.675197,-0.263595,-0.238675,-0.256171,0.038698,-0.887607,-0.946443,-0.624594
924,1.035909,-1.290776,3.905145,2.117196,-0.055901,-1.735562,-0.679553,2.009796,1.70174,1.346411,...,-1.195157,-0.784323,2.530251,-0.205773,-0.165994,-0.20987,-0.493696,-1.352451,-1.265708,-0.721971


In [16]:
print(scaled_df_X_test.shape)
scaled_df_X_test.head()

(314, 474)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,580,581,582,583,584,585,586,587,588,589
689,0.144543,-2.250101,-0.700481,-1.145942,-0.053239,0.911694,-0.068879,-0.529038,0.553782,-0.624736,...,-0.443273,-0.311094,-0.502615,-0.205773,-0.093312,-0.19605,-0.48563,-0.59708,-0.449808,-0.281106
1143,0.298488,-0.489371,-0.721166,-0.575257,-0.06076,0.399736,-0.161056,-0.554916,-0.224718,-0.226198,...,-0.02533,0.140341,0.262963,-0.300391,-0.262902,-0.290922,0.603358,-0.794639,-0.910969,-0.698212
1122,-1.069445,1.0205,2.591456,1.652048,-0.053957,-1.17558,-0.230189,-0.805531,-0.112561,-0.0754,...,-1.031704,-0.735681,-1.120966,0.172698,0.124732,0.169718,0.264562,-0.899229,-1.088338,-0.676655
756,-0.753581,0.116963,0.375143,0.385662,-0.057839,0.79366,0.03482,0.726759,-1.451846,1.0879,...,-2.475898,-0.075965,-0.649841,0.020258,-0.044858,0.023073,-0.364631,1.297155,1.004622,0.653497
1026,-0.014133,0.730131,0.885877,1.48739,-0.053222,-0.85011,0.219174,-0.285233,-0.033392,0.33391,...,0.186703,0.795215,-1.297638,-0.048077,-0.020631,-0.038939,-0.76796,-0.48087,-0.37886,0.028609
