# Import ~ Seed 선언

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Input
from keras.optimizers import Adam
from keras import initializers
from tqdm import tqdm

import os
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [3]:
# Keras 가 Tensorflow 를 벡엔드로 사용할 수 있도록 설정합니다.
os.environ["KERAS_BACKEND"] = "tensorflow"

# 실험을 재현하고 동일한 결과를 얻을 수 있는지 확인하기 위해 seed 를 설정합니다.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)

# Data Preprocessing

## 데이터셋 불러오기

In [4]:
# 데이터셋 로드
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
secom = pd.read_csv(url, header=None, delim_whitespace=True)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_csv(url, header=None, usecols=[0], squeeze=True, delim_whitespace=True) 
# delim_whitespace = True : 빈 공간(' ')을 구분자로 인식하고 데이터 읽어옴
# squeeze 만약 컬럼 하나만 읽어오면 데이터 구조를 Series로 읽어옴

In [5]:
print(type(secom)) # 데이터 구조: DataFrame
print(secom.shape) # 1567개의 인스턴스 590개의 속성
secom.head()

<class 'pandas.core.frame.DataFrame'>
(1567, 590)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [6]:
print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(y[y == -1].size, y[y == 1].size))
print('The dataset is imbalanced. The ratio of majority class to minority class is {%.2f}:1.' % (float(y[y == -1].size/y[y == 1].size)))

The dataset has 1567 observations/rows and 590 variables/columns.
The majority class has 1463 observations, minority class 104.
The dataset is imbalanced. The ratio of majority class to minority class is {14.07}:1.


## 결측치 확인

In [7]:
# 결측치를 가지고 있는 컬럼이 몇 개 인지 확인하는 코드

nmv = secom.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = secom.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv # nmv 변수는 오류 방지를 위해 확인 후 바로 삭제

No. of columns after removing columns with missing data: 52
No. of rows after removing rows with missing data: 0


## 상수 값을 가지는 열 제거

In [8]:
dropthese = [i for i in range(590) if secom[i].std() == 0]
secom_categorical = secom.drop(dropthese, axis = 1)
print(secom_categorical.shape)
secom_categorical.head()

print('There are {} columns which have identical values recorded. We will drop these.'.format(len(dropthese)))
print('The data set now has {} columns.'.format(secom_categorical.shape[1]))

(1567, 474)
There are 116 columns which have identical values recorded. We will drop these.
The data set now has 474 columns.


# 200개 이상의 결측치 열 제거

In [9]:
# 결측치가 criteria(200)을 초과하는 열을 추출해내는 함수
def get_columns_over_200NaN(df) :
    criteria = 200
    filtered_columns = list(filter(lambda i: sum(df[i].isnull()) > criteria, df.columns))
    return filtered_columns

# main
filtered_columns = get_columns_over_200NaN(secom_categorical)
secom_categorical = secom_categorical.drop(filtered_columns, 1)

print(secom_categorical.shape)
secom_categorical.head()

(1567, 422)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,97.6133,0.1242,1.5005,0.0162,-0.0034,...,1.6765,14.9509,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,...,1.1065,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,95.4878,0.1241,1.4436,0.0041,0.0013,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,104.2367,0.1217,1.4882,-0.0124,-0.0033,...,1.7585,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.3967,0.1235,1.5031,-0.0031,-0.0072,...,1.6597,10.9698,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


## 데이터셋 분리

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(secom_categorical, y, test_size = 0.2, stratify=y)

In [11]:
# ndarray 
print(X_train.shape) # (1253, 474)
print(X_test.shape) # (314, 474)
print(y_train.shape) # (1253,)
print(y_test.shape) # (314,)

(1253, 422)
(314, 422)
(1253,)
(314,)


In [33]:
# 결측치 파악을 위한 문구 출력
criteria_list = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 400, 500, 600, 700, 800, 900, 1000]

for criteria in criteria_list :
    columns_filtered = list(filter(lambda i: sum(secom_categorical[i].isnull()) > criteria, secom_categorical.columns))
    print('The number of columns with more than {:>4d} missing values(about {}%): {:>2d}'.format(criteria, int((criteria/1567)*100), len(columns_filtered)))           

The number of columns with more than    1 missing values(about 0%): 290
The number of columns with more than    5 missing values(about 0%): 174
The number of columns with more than   10 missing values(about 0%): 66
The number of columns with more than   20 missing values(about 1%): 46
The number of columns with more than   30 missing values(about 1%):  8
The number of columns with more than   40 missing values(about 2%):  8
The number of columns with more than   50 missing values(about 3%):  8
The number of columns with more than   60 missing values(about 3%):  0
The number of columns with more than   70 missing values(about 4%):  0
The number of columns with more than   80 missing values(about 5%):  0
The number of columns with more than   90 missing values(about 5%):  0
The number of columns with more than  100 missing values(about 6%):  0
The number of columns with more than  200 missing values(about 12%):  0
The number of columns with more than  400 missing values(about 25%):  0
Th

## 결측치 채우기 (가우시안 분포)

In [12]:
# fill_NaN_by_Gaussian 함수를 적용하기 위해 데이터 형변환 (ndarray -> DataFrame)
df_X_train = pd.DataFrame(X_train)
df_X_test = pd.DataFrame(X_test)
print(df_X_train.shape)
print(df_X_test.shape)

(1253, 422)
(314, 422)


In [13]:
def fill_NaN_by_Gaussian(df_X_train, df_X_test) :
      
    """결측치를 해당 열의 가우시안 분포를 따르는 난수로 대체하는 함수이다.
       단, test셋의 각 열은 train셋의 각 열의 가우시안 분포를 따른다고 가정한다.
       따라서, test셋은 train셋의 mean, std를 사용한다.                      """
    
    for column in df_X_train.columns.values : 
        
        mean = df_X_train[column].mean()
        std = df_X_train[column].std()
            
        X_train_NaN_size = sum(df_X_train[column].isnull())
        X_test_NaN_size = sum(df_X_test[column].isnull())
          
        df_X_train.loc[df_X_train[column].isnull(), column] = np.random.normal(mean, std, size = X_train_NaN_size)            
        df_X_test.loc[df_X_test[column].isnull(), column] = np.random.normal(mean, std, size = X_test_NaN_size)

    return (df_X_train, df_X_test)
                          
                          
# main
df_X_train, df_X_test = fill_NaN_by_Gaussian(df_X_train, df_X_test)
print(df_X_train.shape)       
print(df_X_test.shape)

(1253, 422)
(314, 422)


In [14]:
# 결측치를 채우고 난 후 정상적으로 결측치가 채워졌는지 재확인하는 코드

# df_X_train
nmv = df_X_train.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = df_X_train.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv  # nmv 변수는 오류 방지를 위해 확인 후 바로 삭제

# df_X_test
nmv = df_X_test.dropna(axis=1) # 열을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of columns after removing columns with missing data: {}'.format(nmv.shape[1]))

nmv = df_X_test.dropna(axis=0) # 행을 기준으로 NaN 값이 존재하면 그 열을 삭제하고 반환
print('No. of rows after removing rows with missing data: {}'.format(nmv.shape[0]))

del nmv  # nmv 변수는 오류 방지를 위해 확인 후 바로 삭제

No. of columns after removing columns with missing data: 422
No. of rows after removing rows with missing data: 1253
No. of columns after removing columns with missing data: 422
No. of rows after removing rows with missing data: 314


## Normalization 

-  std_scale = preprocessing.StandardScaler().fit(X_train)

   X_train = std_scale.transform(X_train)

   X_test = std_scale.transform(X_test)
   

-  from sklearn.preprocessing import StandardScaler
   
   scaled_features = StandardScaler().fit_transform(df.values)
   
   ~ (생략) ~
   
   scaled_features_df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
   
   
-  fit_transform(X, y=None, **fit_params)
  
   Fit to data, then transform it.

   Parameters:	
  
   X : numpy array of shape [n_samples, n_features] (Training set.)

   y : numpy array of shape [n_samples] (Target values.)

   Returns:	
  
   X_new : numpy array of shape [n_samples, n_features_new] (Transformed array.)


# MinMax Scaler

In [15]:
# 표준 정규 분포로 Normalization 
# train 데이터의 각 열에 적용한 평균과 표준편차를 test 데이터의 각 열에 동일하게 적용함
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))

mm_scale_parameters = scaler.fit(df_X_train.values) # train 데이터의 평균과 표준편차로 파라미터를 피팅한 후 저장, test 데이터에도 똑같은 파라미터를 적용하기 위함

scaled_X_train= mm_scale_parameters.transform(df_X_train) # [n_samples, n_features]의 크기로 반환, [1253,474]
scaled_X_test= mm_scale_parameters.transform(df_X_test) # [n_samples, n_features]의 크기로 반환, [314,474]

scaled_df_X_train = pd.DataFrame(scaled_X_train, index = df_X_train.index, columns = df_X_train.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용
scaled_df_X_test = pd.DataFrame(scaled_X_test, index =df_X_test.index, columns = df_X_test.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용

In [16]:
print(scaled_df_X_train.shape)
scaled_df_X_train.head()    

(1253, 422)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
1089,0.032245,-0.114485,0.314324,0.948694,-0.949477,-0.784008,0.879478,0.056296,0.084957,-0.351536,...,-0.973583,-0.7287,0.8,-0.977051,-0.974684,-0.978401,-0.591912,-0.656539,-0.729927,-0.71099
368,-0.371075,0.208597,-0.007655,-0.488985,-0.950122,-0.100144,0.849139,0.346798,-0.10834,0.105802,...,-0.96434,-0.769689,0.26875,-0.947301,-0.955209,-0.949275,-0.805147,-0.635403,-0.620438,0.055099
624,-0.196881,-0.043435,0.10293,-0.526126,-0.949917,0.036338,0.837004,0.457241,-0.203429,-0.16041,...,-0.992856,-0.754995,-0.03125,-0.971951,-0.972736,-0.972267,-0.472426,-0.717305,-0.708029,-0.822372
968,-0.352123,-0.115881,-0.122952,-0.352613,-0.950126,-0.184039,0.838521,-0.068328,0.060016,-0.460751,...,-0.980963,-0.74895,0.5375,-0.980875,-0.978578,-0.981748,-0.487132,-0.878468,-0.89781,-0.890324
924,0.136892,-0.322922,1.0,0.270582,-0.950424,-0.661158,0.75964,0.80361,0.222136,0.078498,...,-0.97859,-0.823422,0.93125,-0.976201,-0.972736,-0.978005,-0.608456,-0.984148,-0.963504,-0.915862


In [17]:
print(scaled_df_X_test.shape)
scaled_df_X_test.head()

(314, 422)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
689,-0.07824,-0.547471,-0.068838,-0.528536,-0.95017,0.053834,0.840038,0.002578,-0.049104,-0.337884,...,-0.97555,-0.734372,0.2875,-0.976201,-0.966894,-0.976888,-0.606618,-0.812417,-0.79562,-0.800242
1143,-0.041086,-0.135337,-0.073639,-0.38878,-0.950888,-0.08444,0.827902,-0.005587,-0.233048,-0.253697,...,-0.985499,-0.86707,0.45,-0.98385,-0.980526,-0.984557,-0.358456,-0.857332,-0.890511,-0.909631
1122,-0.371238,0.218078,0.695129,0.156671,-0.950238,-0.509914,0.818801,-0.084658,-0.206547,-0.221843,...,-0.973425,-0.712823,0.15625,-0.945601,-0.949367,-0.94732,-0.435662,-0.88111,-0.927007,-0.903978
756,-0.295004,0.006587,0.180784,-0.153458,-0.950609,0.021954,0.85369,0.398797,-0.522993,0.023891,...,-0.975045,-0.705983,0.25625,-0.957926,-0.962999,-0.959175,-0.579044,-0.38177,-0.49635,-0.555137
1026,-0.116537,0.150111,0.299311,0.116347,-0.950168,-0.422008,0.877961,0.079502,-0.187841,-0.135381,...,-0.981085,-0.923611,0.11875,-0.963451,-0.961052,-0.964188,-0.670956,-0.785997,-0.781022,-0.719018


# Import and Apply PCA

In [18]:
# 몇 개의 주성분을 사용해야 하는지 알아보기 위해 작성한 코드
from sklearn.decomposition import PCA

varianceList = [0.80, 0.85, 0.9, 0.95, 0.99, 1]

for ratio in varianceList :
    pca = PCA(ratio)
    pca.fit(scaled_X_train)
    if ratio == 1 :
        print("Choose {:3d} eigenvectors which explain {}% of the variance.".format(474, (pca.n_components)*100))
    else :
        print("Choose {:3d} eigenvectors which explain {:>3d}% of the variance.".format(pca.n_components_, int(pca.n_components*100)))
    del pca

Choose  65 eigenvectors which explain  80% of the variance.
Choose  78 eigenvectors which explain  85% of the variance.
Choose  98 eigenvectors which explain  90% of the variance.
Choose 128 eigenvectors which explain  95% of the variance.
Choose 188 eigenvectors which explain  99% of the variance.
Choose 474 eigenvectors which explain 100% of the variance.


In [19]:
# PCA 인스턴스 객체를 생성.
pca = PCA(0.8)

# 생성된 PCA 인스턴스 객체에 scaled_df_X_train를 Fitting.
# 주의: fit은 오직 scaled_df_X_train에만 적용.
# 동일한 파라미터를 scaled_df_X_test에 적용하기 위함.
pca.fit(scaled_df_X_train)

# transform하게 되면  ndarray로 반환하기 때문에 별도의 DataFrame 형변환 필요.
X_train_after_PCA = pca.transform(scaled_df_X_train)
X_test_after_PCA = pca.transform(scaled_df_X_test)

In [20]:
# PCA 수행 후의 X_train을 DataFrame으로 형변환 
df_X_train_after_PCA = pd.DataFrame(data=X_train_after_PCA, index=scaled_df_X_train.index)
print(df_X_train_after_PCA.shape)
df_X_train_after_PCA.head()

(1253, 65)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
1089,-0.903395,-0.247233,0.759519,0.816241,-0.425685,0.091098,0.053567,-0.558968,0.149118,-0.13834,...,-0.066945,-0.242118,0.079649,-0.280433,-0.149107,-0.406006,0.162388,-0.584658,-0.204708,-0.0056
368,0.839242,1.41721,0.367891,0.292192,-1.114996,0.465425,0.34217,-0.393334,0.057625,-1.370308,...,-0.314842,0.472295,-0.179106,-0.321275,-0.067517,-0.259126,0.337592,0.014285,-0.393288,-0.273621
624,1.137073,-1.384722,-0.879379,-0.30411,0.199366,-0.557952,-1.278298,0.372888,0.07836,-0.156254,...,-0.038746,0.35267,0.160692,-0.259189,-0.237391,-0.226352,-0.33265,-0.096655,0.011927,-0.150845
968,-0.910507,0.483764,-1.452042,-0.719074,0.224417,-0.410125,1.832303,-0.634627,0.965997,-0.775354,...,0.25947,0.164011,-0.27094,-0.324605,0.581865,0.087382,0.129002,0.18199,0.018207,0.10994
924,0.174344,0.474171,-1.881797,0.072734,0.508951,-0.695294,0.603328,0.498879,-0.423531,1.10895,...,0.219536,-0.032348,0.534001,0.581231,0.25426,-0.01977,-0.105475,-0.244919,0.087369,-0.095033


In [21]:
# PCA 수행 후의 X_test을 DataFrame으로 형변환 
df_X_test_after_PCA = pd.DataFrame(data=X_test_after_PCA, index=scaled_df_X_test.index)
print(df_X_test_after_PCA.shape)
df_X_test_after_PCA.head()

(314, 65)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
689,0.880095,0.208061,-0.505179,0.879564,-0.463139,-0.702884,-0.164634,0.442332,-0.666352,-0.615188,...,-0.411532,0.277539,-0.068409,-0.163422,-0.124531,1.564981,0.440899,0.333391,-0.396088,0.415174
1143,-1.257077,0.485972,-0.124221,0.034897,0.224455,-0.842593,-0.846801,0.336878,0.34674,-0.07355,...,-0.28619,0.071769,0.61505,0.263279,0.172013,0.048027,-0.132774,0.107658,0.022037,0.052704
1122,-0.784314,-0.753157,1.565528,-0.092707,-1.027178,-0.654934,0.047248,-0.09849,-0.492723,0.006055,...,-0.157482,-0.328976,0.606282,-0.347548,-0.229952,-0.188944,-0.282449,-0.092019,0.560609,-0.070362
756,0.33492,-0.239147,1.87414,-2.112174,-0.376863,1.47401,-0.446107,-0.578506,-0.031759,0.139953,...,0.491603,-0.044109,0.035412,-0.177368,-0.01045,-0.116838,0.243663,0.155863,0.117827,-0.103362
1026,-1.424533,0.673877,-0.073256,0.584817,0.55691,-0.355742,-0.301483,-0.208224,-0.955554,-0.454921,...,-0.178543,-0.644358,-0.249644,-0.167087,-0.495041,-0.525178,0.016739,0.545592,-0.243223,-0.088396


## 소수 클래스 추출

In [22]:
# 소수 클래스 데이터 프레임을 만들기 위해 y_train을 데이터 프레임으로 변환
df_y_train = y_train.to_frame()
df_y_train.columns = [590]
df_y_train.head()

Unnamed: 0,590
1089,-1
368,1
624,-1
968,-1
924,1


In [23]:
# df_train_after_preprocessing: 전처리가 끝난 train data
df_train_after_preprocessing = scaled_df_X_train.join(df_y_train, how ='left')
print(df_train_after_preprocessing.shape)
df_train_after_preprocessing.head()

(1253, 423)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,577,582,583,584,585,586,587,588,589,590
1089,0.032245,-0.114485,0.314324,0.948694,-0.949477,-0.784008,0.879478,0.056296,0.084957,-0.351536,...,-0.7287,0.8,-0.977051,-0.974684,-0.978401,-0.591912,-0.656539,-0.729927,-0.71099,-1
368,-0.371075,0.208597,-0.007655,-0.488985,-0.950122,-0.100144,0.849139,0.346798,-0.10834,0.105802,...,-0.769689,0.26875,-0.947301,-0.955209,-0.949275,-0.805147,-0.635403,-0.620438,0.055099,1
624,-0.196881,-0.043435,0.10293,-0.526126,-0.949917,0.036338,0.837004,0.457241,-0.203429,-0.16041,...,-0.754995,-0.03125,-0.971951,-0.972736,-0.972267,-0.472426,-0.717305,-0.708029,-0.822372,-1
968,-0.352123,-0.115881,-0.122952,-0.352613,-0.950126,-0.184039,0.838521,-0.068328,0.060016,-0.460751,...,-0.74895,0.5375,-0.980875,-0.978578,-0.981748,-0.487132,-0.878468,-0.89781,-0.890324,-1
924,0.136892,-0.322922,1.0,0.270582,-0.950424,-0.661158,0.75964,0.80361,0.222136,0.078498,...,-0.823422,0.93125,-0.976201,-0.972736,-0.978005,-0.608456,-0.984148,-0.963504,-0.915862,1


In [24]:
# df_train_after_preprocessing에서 소수 클래스만 추출하여 데이터 프레임화
df_minority_class = df_train_after_preprocessing.loc[df_train_after_preprocessing[590]==1]
print(df_minority_class.shape)
df_minority_class.head()

(79, 423)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,577,582,583,584,585,586,587,588,589,590
368,-0.371075,0.208597,-0.007655,-0.488985,-0.950122,-0.100144,0.849139,0.346798,-0.10834,0.105802,...,-0.769689,0.26875,-0.947301,-0.955209,-0.949275,-0.805147,-0.635403,-0.620438,0.055099,1
924,0.136892,-0.322922,1.0,0.270582,-0.950424,-0.661158,0.75964,0.80361,0.222136,0.078498,...,-0.823422,0.93125,-0.976201,-0.972736,-0.978005,-0.608456,-0.984148,-0.963504,-0.915862,1
58,-0.149092,-0.331123,0.281506,-0.19079,-0.950346,0.18541,0.837004,0.087667,-0.443492,0.392491,...,-0.8505,0.69375,-0.974076,-0.966894,-0.97541,-0.509191,-0.31572,-0.386861,-0.604956,1
795,0.063659,-0.157702,-0.103751,-0.350559,-0.949907,-0.060671,0.890096,0.058015,-0.240842,-0.542662,...,-0.604076,0.35,-0.983425,-0.980526,-0.98414,-0.143382,-0.402906,-0.423358,-0.820512,1
327,-0.311314,0.164827,0.397327,-0.106797,-0.950819,-0.270862,0.826385,-0.06489,-0.303196,-0.23777,...,-0.840737,0.2125,-0.915002,-0.925998,-0.917652,-0.474265,-0.830911,-0.839416,-0.872711,1


In [25]:
# 소수 클래스 y

df_minority_class_y = df_minority_class[590]
df_minority_class_y = df_minority_class_y.to_frame()
df_minority_class_y.head()


Unnamed: 0,590
368,1
924,1
58,1
795,1
327,1


In [26]:
# 소수 클래스 X

df_minority_class_X = df_minority_class.drop(590,1)
df_minority_class_X.head()


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
368,-0.371075,0.208597,-0.007655,-0.488985,-0.950122,-0.100144,0.849139,0.346798,-0.10834,0.105802,...,-0.96434,-0.769689,0.26875,-0.947301,-0.955209,-0.949275,-0.805147,-0.635403,-0.620438,0.055099
924,0.136892,-0.322922,1.0,0.270582,-0.950424,-0.661158,0.75964,0.80361,0.222136,0.078498,...,-0.97859,-0.823422,0.93125,-0.976201,-0.972736,-0.978005,-0.608456,-0.984148,-0.963504,-0.915862
58,-0.149092,-0.331123,0.281506,-0.19079,-0.950346,0.18541,0.837004,0.087667,-0.443492,0.392491,...,-0.980472,-0.8505,0.69375,-0.974076,-0.966894,-0.97541,-0.509191,-0.31572,-0.386861,-0.604956
795,0.063659,-0.157702,-0.103751,-0.350559,-0.949907,-0.060671,0.890096,0.058015,-0.240842,-0.542662,...,-0.97539,-0.604076,0.35,-0.983425,-0.980526,-0.98414,-0.143382,-0.402906,-0.423358,-0.820512
327,-0.311314,0.164827,0.397327,-0.106797,-0.950819,-0.270862,0.826385,-0.06489,-0.303196,-0.23777,...,-0.969173,-0.840737,0.2125,-0.915002,-0.925998,-0.917652,-0.474265,-0.830911,-0.839416,-0.872711


## 중간 정리 (변수)

(secom, y): 세콤 데이터셋을 처음 불러올 때 

secom_categorical: secom에서 상수값을 제거 

(X_train, X_test, y_train, y_test): secom_categorical과 y를 훈련셋과 테스트셋으로 8:2로 분리

df_X_train: X_train을 데이터 프레임으로 변환 후 결측치를 가우시안 분포를 따르는 난수로 채움 

df_X_test: X_test을 데이터 프레임으로 변환 후 결측치를 가우시안 분포를 따르는 난수로 채움

scaled_df_X_train: df_X_train에서 각 열의 데이터가 표준 정규분포를 따르도록 Normalization 수행

scaled_df_X_test: df_X_test에서  각 열의 데이터가 표준 정규분포를 따르도록 Normalization 수행

df_y_train:y_train을 데이터 프레임으로 변환

df_train_after_preprocessing: scaled_df_X_train + df_y_train 

df_minority_class: df_train_after_preprocessing에서 소수 클래스만 추출하여 데이터 프레임으로 변환

# SMOTE

In [67]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=0, ratio = 0.5, k_neighbors=10)
X_smo, y_smo = sm.fit_resample(X_train_after_PCA, y_train)

df_X_smo = pd.DataFrame(X_smo)
df_y_smo = pd.DataFrame(y_smo)

print(X_smo.shape)
print(y_smo.shape)

SMOTE_secom = pd.concat([df_X_smo,df_y_smo], axis=1)

#SMOTE_secom.drop(SMOTE_secom.columns[[1]], axis = 1, inplace = True)

print(SMOTE_secom.shape)

SMOTE_secom.to_csv("C:/Users/juj11/Desktop/SMOTE_secom.csv",sep=',',index = False)

(1755, 65)
(1755,)
(1755, 66)


In [28]:
smo_file = pd.read_csv("C:/Users/juj11/Desktop/SMOTE_secom.csv")

print(smo_file)

             0         1         2         3         4         5         6  \
0    -0.903395 -0.247233  0.759519  0.816241 -0.425685  0.091098  0.053567   
1     0.839242  1.417210  0.367891  0.292192 -1.114996  0.465425  0.342170   
2     1.137073 -1.384722 -0.879379 -0.304110  0.199366 -0.557952 -1.278298   
3    -0.910507  0.483764 -1.452042 -0.719074  0.224417 -0.410125  1.832303   
4     0.174344  0.474171 -1.881797  0.072734  0.508951 -0.695294  0.603328   
5    -0.353106 -0.227973  0.079937 -0.576316  0.786091 -0.094164  0.281762   
6     0.515999 -1.849574 -1.480031  0.884675  0.856621  0.623028  1.022031   
7     0.639542 -2.263681 -1.247147  0.153142  0.535746 -0.851404 -0.638253   
8    -0.424098 -0.722374 -0.910452 -0.678949  1.008854 -0.972776 -0.533744   
9     1.558148  0.249647 -1.873322 -1.025119  0.531289  0.004042 -0.402189   
10   -1.428988 -0.627282  0.805627 -1.259534  0.875070 -0.425056 -0.403914   
11    0.508115  2.535816  1.009491 -0.910398 -0.449616  0.162985

[1761 rows x 66 columns]


# ADASYN

In [29]:
from collections import Counter
from imblearn.over_sampling import ADASYN

ada = ADASYN(random_state=0, ratio = 0.5)

X_ada, y_ada = ada.fit_resample(X_train_after_PCA, y_train)

df_X_ada = pd.DataFrame(X_ada)
df_y_ada = pd.DataFrame(y_ada)

print(X_ada.shape)
print(y_ada.shape)

ADASYN_secom = pd.concat([df_X_ada,df_y_ada], axis=1)
print(ADASYN_secom.shape)

ADASYN_secom.to_csv("C:/Users/juj11/Desktop/ADASYN_secom.csv",sep=',', index = False)

(1759, 65)
(1759,)
(1759, 66)


In [30]:
ada_file = pd.read_csv("C:/Users/juj11/Desktop/ADASYN_secom.csv")


#  Naive random over-sampling

In [31]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state = 0)
nro_X, nro_y = ros.fit_resample(X_train_after_PCA, y_train)

from collections import Counter
print(sorted(Counter(nro_y).items()))

[(-1, 1174), (1, 1174)]


In [36]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(nro_X,nro_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [37]:
df_nro_X = pd.DataFrame(nro_X)
df_nro_y = pd.DataFrame(nro_y)

In [38]:
df_X_nro = pd.DataFrame(nro_X)
df_y_nro = pd.DataFrame(nro_y)

print(df_X_nro.shape)
print(df_y_nro.shape)

NOS_secom = pd.concat([df_X_nro,df_y_nro], axis=1)
print(NOS_secom.shape)

NOS_secom.to_csv("C:/Users/juj11/Desktop/NOS_secom.csv",sep=',', index = False)

(2348, 65)
(2348, 1)
(2348, 66)


# Borderline SMOTE

In [39]:
from collections import Counter
from imblearn.over_sampling import BorderlineSMOTE
bsm = BorderlineSMOTE(random_state=0)
X_bsmo, y_bsmo = bsm.fit_resample(X_train_after_PCA, y_train)

df_X_bsmo = pd.DataFrame(X_bsmo)
df_y_bsmo = pd.DataFrame(y_bsmo)

Borderline_SMOTE_secom = pd.concat([df_X_bsmo,df_y_bsmo], axis=1)

print(Borderline_SMOTE_secom)

Borderline_SMOTE_secom.to_csv("C:/Users/juj11/Desktop/Borderline_SMOTE_secom.csv",sep=',', index = False)

            0         1         2         3         4         5         6   \
0    -0.903395 -0.247233  0.759519  0.816241 -0.425685  0.091098  0.053567   
1     0.839242  1.417210  0.367891  0.292192 -1.114996  0.465425  0.342170   
2     1.137073 -1.384722 -0.879379 -0.304110  0.199366 -0.557952 -1.278298   
3    -0.910507  0.483764 -1.452042 -0.719074  0.224417 -0.410125  1.832303   
4     0.174344  0.474171 -1.881797  0.072734  0.508951 -0.695294  0.603328   
5    -0.353106 -0.227973  0.079937 -0.576316  0.786091 -0.094164  0.281762   
6     0.515999 -1.849574 -1.480031  0.884675  0.856621  0.623028  1.022031   
7     0.639542 -2.263681 -1.247147  0.153142  0.535746 -0.851404 -0.638253   
8    -0.424098 -0.722374 -0.910452 -0.678949  1.008854 -0.972776 -0.533744   
9     1.558148  0.249647 -1.873322 -1.025119  0.531289  0.004042 -0.402189   
10   -1.428988 -0.627282  0.805627 -1.259534  0.875070 -0.425056 -0.403914   
11    0.508115  2.535816  1.009491 -0.910398 -0.449616  0.162985

[2340 rows x 66 columns]
