# Import ~ Seed 선언

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Input
from keras.optimizers import Adam
from keras import initializers
from tqdm import tqdm

import os
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
# Keras 가 Tensorflow 를 벡엔드로 사용할 수 있도록 설정합니다.
os.environ["KERAS_BACKEND"] = "tensorflow"

# 실험을 재현하고 동일한 결과를 얻을 수 있는지 확인하기 위해 seed 를 설정합니다.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)

# Data Preprocessing

## 데이터셋 불러오기

In [3]:
# 데이터셋 로드
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
secom = pd.read_csv(url, header=None, delim_whitespace=True)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_csv(url, header=None, usecols=[0], squeeze=True, delim_whitespace=True) 
# delim_whitespace = True : 빈 공간(' ')을 구분자로 인식하고 데이터 읽어옴
# squeeze 만약 컬럼 하나만 읽어오면 데이터 구조를 Series로 읽어옴

In [4]:
print(type(secom)) # 데이터 구조: DataFrame
print(secom.shape) # 1567개의 인스턴스 590개의 속성
secom.head()

<class 'pandas.core.frame.DataFrame'>
(1567, 590)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [5]:
print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(y[y == -1].size, y[y == 1].size))
print('The dataset is imbalanced. The ratio of majority class to minority class is {%.2f}:1.' % (float(y[y == -1].size/y[y == 1].size)))

The dataset has 1567 observations/rows and 590 variables/columns.
The majority class has 1463 observations, minority class 104.
The dataset is imbalanced. The ratio of majority class to minority class is {14.07}:1.


## 결측치 확인

In [6]:
# 결측치를 가지고 있는 컬럼이 몇 개 인지 확인하는 코드

nmv = secom.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = secom.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv # nmv 변수는 오류 방지를 위해 확인 후 바로 삭제

No. of columns after removing columns with missing data: 52
No. of rows after removing rows with missing data: 0


In [9]:
# 결측치 파악을 위한 문구 출력
criteria_list = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 400, 500, 600, 700, 800, 900, 1000]

for criteria in criteria_list :
    columns_filtered = list(filter(lambda i: sum(secom[i].isnull()) > criteria, secom_categorical.columns))
    print('The number of columns with more than {:>4d} missing values(about {}%): {:>2d}'.format(criteria, int((criteria/1567)*100), len(columns_filtered)))           

The number of columns with more than    1 missing values(about 0%): 342
The number of columns with more than    5 missing values(about 0%): 226
The number of columns with more than   10 missing values(about 0%): 118
The number of columns with more than   20 missing values(about 1%): 98
The number of columns with more than   30 missing values(about 1%): 60
The number of columns with more than   40 missing values(about 2%): 60
The number of columns with more than   50 missing values(about 3%): 60
The number of columns with more than   60 missing values(about 3%): 52
The number of columns with more than   70 missing values(about 4%): 52
The number of columns with more than   80 missing values(about 5%): 52
The number of columns with more than   90 missing values(about 5%): 52
The number of columns with more than  100 missing values(about 6%): 52
The number of columns with more than  200 missing values(about 12%): 52
The number of columns with more than  400 missing values(about 25%): 32
T

## 상수 값을 가지는 열 제거

In [10]:
dropthese = [i for i in range(590) if secom[i].std() == 0]
secom_categorical = secom.drop(dropthese, axis = 1)
print(secom_categorical.shape)
secom_categorical.head()

print('There are {} columns which have identical values recorded. We will drop these.'.format(len(dropthese)))
print('The data set now has {} columns.'.format(secom_categorical.shape[1]))

(1567, 474)
There are 116 columns which have identical values recorded. We will drop these.
The data set now has 474 columns.


# 940개 이상의 결측치 열 제거

In [11]:
# 결측치가 criteria(200)을 초과하는 열을 추출해내는 함수
def get_columns_over_940NaN(df) :
    criteria = 940
    filtered_columns = list(filter(lambda i: sum(df[i].isnull()) > criteria, df.columns))
    return filtered_columns

# main
filtered_columns = get_columns_over_940NaN(secom_categorical)
cs = secom_categorical.drop(filtered_columns, 1)

print(cs.shape)
type(cs)
 

(1567, 450)


pandas.core.frame.DataFrame

# the mean heuristic and the nearest neighbor heuristic

In [12]:
def mhimputer(df):

    
    #case1 = np.isnan(df.loc[row-1,column]) == True
    #case2 = np.isnan(df.loc[row-1,column]) == False
    #case3 = np.isnan(df.loc[row+1,column]) == True
    #case4 = np.isnan(df.loc[row+1,column]) == False
   
    #a = df.loc[row,column] == df.loc[0, column] or df.loc[1566,column]
    
    for row in df.index:
        
        for column in df.columns:
            
            a = df.loc[row,column] == df.loc[0, column] or df.loc[row,column] == df.loc[1566,column]
            
            if row == 0 and np.isnan(df.loc[0,column]):
                for i in df.index:
                    if np.isnan(df.loc[i+1,column]) == False:
                        df.loc[0,column] = df.loc[i+1,column]
                        break
            
            elif row == 1566 and np.isnan(df.loc[1566,column]):
                for i in reversed(df.index):
                    if np.isnan(df.loc[i-1,column]) == False:
                        df.loc[1566,column]= df.loc[i-1,column]
                        break
                        
            
            elif not a:
                #print(row,column)
                
                case1 = np.isnan(df.loc[row-1,column]) == True
                case2 = np.isnan(df.loc[row-1,column]) == False
                case3 = np.isnan(df.loc[row+1,column]) == True
                case4 = np.isnan(df.loc[row+1,column]) == False
                
                if case2 and case4 and np.isnan(df.loc[row,column]):
                    df.loc[row,column] = (df.loc[row+1,column]+df.loc[row-1,column])/2
                
                elif case2 and np.isnan(df.loc[row,column]):
                    df.loc[row,column] = df.loc[row-1, column]
                   
    return pd.DataFrame(df)

In [13]:
mhimputer(cs)
cs.to_csv("C:/Users/juj11/Desktop/cs.csv",header=False, index = False)

## 데이터셋 분리

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cs, y, test_size = 0.2, stratify=y)

In [15]:
# ndarray 
print(X_train.shape) # (1253, 474)
print(X_test.shape) # (314, 474)
print(y_train.shape) # (1253,)
print(y_test.shape) # (314,)

(1253, 450)
(314, 450)
(1253,)
(314,)


# MinMax Scaler

In [16]:
# 표준 정규 분포로 Normalization 
# train 데이터의 각 열에 적용한 평균과 표준편차를 test 데이터의 각 열에 동일하게 적용함
from sklearn.preprocessing import MinMaxScaler

df_X_train = pd.DataFrame(X_train)
df_X_test = pd.DataFrame(X_test)

scaler = MinMaxScaler(feature_range=(0, 1))

mm_scale_parameters = scaler.fit(df_X_train.values) # train 데이터의 평균과 표준편차로 파라미터를 피팅한 후 저장, test 데이터에도 똑같은 파라미터를 적용하기 위함

scaled_X_train= mm_scale_parameters.transform(df_X_train) # [n_samples, n_features]의 크기로 반환, [1253,474]
scaled_X_test= mm_scale_parameters.transform(df_X_test) # [n_samples, n_features]의 크기로 반환, [314,474]

scaled_df_X_train = pd.DataFrame(scaled_X_train, index = df_X_train.index, columns = df_X_train.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용
scaled_df_X_test = pd.DataFrame(scaled_X_test, index =df_X_test.index, columns = df_X_test.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용

In [17]:
print(scaled_df_X_train.shape)
scaled_df_X_train.head()    

(1253, 450)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
340,0.536541,0.542221,0.472519,0.268771,0.000644,0.51357,0.945568,0.535024,0.521739,0.268487,...,0.00799,0.026229,0.739812,0.039966,0.036239,0.03801,0.12949,0.08605,0.094203,0.958659
732,0.460006,0.473353,0.590392,0.423271,0.000304,0.510977,0.950233,0.670606,0.570768,0.391354,...,0.009298,0.037452,0.510972,0.009884,0.010774,0.009734,0.421053,0.067797,0.072464,0.03411
378,0.340688,0.442161,0.574943,0.358185,0.000562,0.414818,0.945568,0.578642,0.47271,0.569966,...,0.008349,0.102416,0.589342,0.031586,0.029383,0.030468,0.203843,0.104302,0.112319,0.203911
787,0.565641,0.397476,0.425955,0.385426,0.000315,0.382913,0.973561,0.45982,0.325624,0.459613,...,0.00896,0.06235,0.733542,0.028792,0.026445,0.027402,0.452799,0.061278,0.072464,0.028563
1113,0.32071,0.386468,0.672785,0.523021,0.000534,0.312977,0.951788,0.841427,0.354302,0.43686,...,0.011738,0.229956,0.827586,0.009669,0.012733,0.009168,0.35589,0.482399,0.485507,0.212209


In [18]:
print(scaled_df_X_test.shape)
scaled_df_X_test.head()

(314, 450)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
1521,0.37025,0.470037,0.51101,0.950255,0.000108,0.037045,0.971229,0.280189,0.444958,0.312856,...,0.007402,0.078318,0.717868,0.016115,0.012733,0.015392,0.23726,0.143416,0.134058,0.166813
868,0.554932,0.529323,0.515854,0.561417,0.000441,0.271617,0.962675,0.54856,0.407956,0.340159,...,0.012172,0.106351,0.796238,0.019123,0.019589,0.018141,0.536341,0.323338,0.278986,0.080133
282,0.490417,0.529526,0.458206,0.293231,0.000564,0.403571,0.945568,0.628062,0.564292,0.410694,...,0.004828,0.099925,0.793103,0.019983,0.021548,0.018912,0.242272,0.178618,0.15942,0.189654
1118,0.570516,0.491122,0.624912,0.561628,0.000589,0.355656,0.959565,0.253116,0.533765,0.32992,...,0.010898,0.107022,0.683386,0.012462,0.008815,0.011953,0.269841,0.118644,0.101449,0.108422
609,0.413051,0.509634,0.205878,0.273664,0.000216,0.335071,0.98367,0.493984,0.194265,0.41752,...,0.006713,0.029107,0.680251,0.020198,0.013712,0.01936,0.351713,0.162973,0.130435,0.084616


# Import and Apply PCA

In [19]:
# 몇 개의 주성분을 사용해야 하는지 알아보기 위해 작성한 코드
from sklearn.decomposition import PCA

varianceList = [0.80, 0.85, 0.9, 0.95, 0.99, 1]

for ratio in varianceList :
    pca = PCA(ratio)
    pca.fit(scaled_X_train)
    if ratio == 1 :
        print("Choose {:3d} eigenvectors which explain {}% of the variance.".format(474, (pca.n_components)*100))
    else :
        print("Choose {:3d} eigenvectors which explain {:>3d}% of the variance.".format(pca.n_components_, int(pca.n_components*100)))
    del pca

Choose  68 eigenvectors which explain  80% of the variance.
Choose  83 eigenvectors which explain  85% of the variance.
Choose 104 eigenvectors which explain  90% of the variance.
Choose 135 eigenvectors which explain  95% of the variance.
Choose 198 eigenvectors which explain  99% of the variance.
Choose 474 eigenvectors which explain 100% of the variance.


In [22]:
# PCA 인스턴스 객체를 생성.
pca = PCA(0.90)

# 생성된 PCA 인스턴스 객체에 scaled_df_X_train를 Fitting.
# 주의: fit은 오직 scaled_df_X_train에만 적용.
# 동일한 파라미터를 scaled_df_X_test에 적용하기 위함.
pca.fit(scaled_df_X_train)

# transform하게 되면  ndarray로 반환하기 때문에 별도의 DataFrame 형변환 필요.
X_train_after_PCA = pca.transform(scaled_df_X_train)
X_test_after_PCA = pca.transform(scaled_df_X_test)

In [23]:
# PCA 수행 후의 X_train을 DataFrame으로 형변환 
df_X_train_after_PCA = pd.DataFrame(data=X_train_after_PCA, index=scaled_df_X_train.index)
print(df_X_train_after_PCA.shape)
df_X_train_after_PCA.head()

(1253, 104)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
340,0.169864,-0.087369,0.71983,-0.572293,0.568542,-0.414154,0.505739,0.226454,-0.643786,0.172362,...,0.057686,0.021313,0.117835,0.183323,-0.104596,-0.009324,0.229717,0.000151,-0.033039,-0.013611
732,-0.20469,-0.0031,-0.338688,-0.01288,-0.170542,0.21716,0.38263,-0.603935,0.274384,-0.107089,...,0.105954,0.288072,-0.372694,0.142691,0.031484,0.446973,-0.178323,-0.196046,-0.325473,0.095824
378,0.120871,0.165,0.844035,-0.675522,0.025772,-0.021095,0.61541,-0.120836,0.108491,-0.012067,...,0.036553,0.254908,-0.003305,-0.184018,-0.040299,0.045238,-0.127659,-0.047273,0.089174,0.037686
787,-0.210902,0.59432,-0.151037,-0.029691,-0.111073,-0.083939,-0.911433,0.180009,0.099216,0.038521,...,-0.061125,-0.049444,-0.012462,-0.050378,0.089586,-0.121068,-0.020311,0.153347,0.034938,0.049885
1113,-0.615036,0.257067,0.534452,0.006903,-0.165122,0.425646,-0.225973,0.231096,0.08688,-0.159935,...,0.006633,-0.20512,-0.016022,0.02215,0.029626,-0.122059,0.106268,-0.0151,-0.037243,-0.092334


In [24]:
# PCA 수행 후의 X_test을 DataFrame으로 형변환 
df_X_test_after_PCA = pd.DataFrame(data=X_test_after_PCA, index=scaled_df_X_test.index)
print(df_X_test_after_PCA.shape)
df_X_test_after_PCA.head()

(314, 104)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
1521,-0.616349,0.192142,0.075692,0.612043,0.469694,0.08669,0.094833,-0.69807,0.250145,0.441079,...,0.04758,0.006534,0.07138,0.132794,-0.006865,0.014109,0.154576,0.139828,-0.214379,-0.038229
868,-0.617224,-0.448706,0.349682,0.102128,-0.418104,-0.091262,0.2887,0.537486,-0.449576,-0.148715,...,0.041395,0.035041,-0.03777,0.011368,-0.047857,0.116487,0.004795,0.032867,0.053466,-0.012807
282,0.042186,0.152346,0.170462,-0.700766,-0.664811,-0.046194,-0.096133,0.411768,0.063113,-0.1771,...,-0.063451,-0.133584,0.05727,-0.037049,0.054972,-0.202862,0.074688,0.061221,-0.028015,-0.076556
1118,-0.68097,0.522752,0.576162,0.506556,-0.459303,-0.369791,-0.249794,-0.23495,0.222179,0.04001,...,0.268719,0.146278,-0.023201,-0.060457,-0.015017,0.08575,0.041264,-0.120564,-0.099609,0.166262
609,0.841786,-0.548992,-0.177217,-0.769191,-0.104494,0.103151,0.212182,0.322808,-0.443025,0.392955,...,-0.250863,-0.120884,-0.006373,0.24278,0.021123,-0.038552,-0.0701,0.141421,-0.061452,-0.140634
