# Import ~ Seed 선언

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Input
from keras.optimizers import Adam
from keras import initializers
from tqdm import tqdm

import os
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
# Keras 가 Tensorflow 를 벡엔드로 사용할 수 있도록 설정합니다.
os.environ["KERAS_BACKEND"] = "tensorflow"

# 실험을 재현하고 동일한 결과를 얻을 수 있는지 확인하기 위해 seed 를 설정합니다.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)

# Data Preprocessing

## 데이터셋 불러오기

In [3]:
# 데이터셋 로드
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
secom = pd.read_csv(url, header=None, delim_whitespace=True)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_csv(url, header=None, usecols=[0], squeeze=True, delim_whitespace=True) 
# delim_whitespace = True : 빈 공간(' ')을 구분자로 인식하고 데이터 읽어옴
# squeeze 만약 컬럼 하나만 읽어오면 데이터 구조를 Series로 읽어옴

In [4]:
print(type(secom)) # 데이터 구조: DataFrame
print(secom.shape) # 1567개의 인스턴스 590개의 속성
secom.head()

<class 'pandas.core.frame.DataFrame'>
(1567, 590)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [5]:
print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(y[y == -1].size, y[y == 1].size))
print('The dataset is imbalanced. The ratio of majority class to minority class is {%.2f}:1.' % (float(y[y == -1].size/y[y == 1].size)))

The dataset has 1567 observations/rows and 590 variables/columns.
The majority class has 1463 observations, minority class 104.
The dataset is imbalanced. The ratio of majority class to minority class is {14.07}:1.


## 상수 값을 가지는 열 제거

In [6]:
dropthese = [i for i in range(590) if secom[i].std() == 0]
secom_categorical = secom.drop(dropthese, axis = 1)
print(secom_categorical.shape)
secom_categorical.head()

print('There are {} columns which have identical values recorded. We will drop these.'.format(len(dropthese)))
print('The data set now has {} columns.'.format(secom_categorical.shape[1]))

(1567, 474)
There are 116 columns which have identical values recorded. We will drop these.
The data set now has 474 columns.


## 940개 이상의 결측치 열 제거

In [7]:
# 결측치가 criteria(940)을 초과하는 열을 추출해내는 함수
def get_columns_over_940NaN(df) :
    criteria = 940
    filtered_columns = list(filter(lambda i: sum(df[i].isnull()) > criteria, df.columns))
    return filtered_columns

# main
filtered_columns = get_columns_over_940NaN(secom_categorical)
cs = secom_categorical.drop(filtered_columns, 1)

print(cs.shape)
type(cs)
 

(1567, 450)


pandas.core.frame.DataFrame

### the mean heuristic and the nearest neighbor heuristic

In [8]:
def mhimputer(df):

    for row in df.index:
        
        for column in df.columns:
            
            a = df.loc[row,column] == df.loc[0, column] or df.loc[row,column] == df.loc[1566,column]
            
            # 첫번째 행의 값이 NA인 경우 가장 가까운 행의 값으로 채워 넣음.
            if row == 0 and np.isnan(df.loc[0,column]):
                for i in df.index:
                    if np.isnan(df.loc[i+1,column]) == False:
                        df.loc[0,column] = df.loc[i+1,column]
                        break
            
            # 마지막 행의 값이 NA인 경우 가장 가까운 행의 값으로 채워 넣음.
            elif row == 1566 and np.isnan(df.loc[1566,column]):
                for i in reversed(df.index):
                    if np.isnan(df.loc[i-1,column]) == False:
                        df.loc[1566,column]= df.loc[i-1,column]
                        break
                        
            # 첫번째 행이나 마지막행이 접근하지 못하도록 막음.
            elif not a:
                
                case1 = np.isnan(df.loc[row-1,column]) == True
                case2 = np.isnan(df.loc[row-1,column]) == False
                case3 = np.isnan(df.loc[row+1,column]) == True
                case4 = np.isnan(df.loc[row+1,column]) == False
                
                # 행의 값이 NA일 때, 위 아래 값이 있다면 평균을 내어 값을 채워 넣음.
                if case2 and case4 and np.isnan(df.loc[row,column]):
                    df.loc[row,column] = (df.loc[row+1,column]+df.loc[row-1,column])/2
                
                # 행의 아랫 값이 없을 경우에는 위에 값을 복사하여 채워 넣음.
                elif case2 and np.isnan(df.loc[row,column]):
                    df.loc[row,column] = df.loc[row-1, column]
                   
    return pd.DataFrame(df)

In [9]:
mhimputer(cs)
cs.to_csv("C:/Users/juj11/Desktop/cs.csv",header=False, index = False)

## 데이터셋 분리

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cs, y, test_size = 0.2, stratify=y)

In [11]:
# ndarray 
print(X_train.shape) # (1253, 474)
print(X_test.shape) # (314, 474)
print(y_train.shape) # (1253,)
print(y_test.shape) # (314,)

(1253, 450)
(314, 450)
(1253,)
(314,)


## MinMax Scaler

In [12]:
# 표준 정규 분포로 Normalization 
# train 데이터의 각 열에 적용한 평균과 표준편차를 test 데이터의 각 열에 동일하게 적용함
from sklearn.preprocessing import MinMaxScaler

df_X_train = pd.DataFrame(X_train)
df_X_test = pd.DataFrame(X_test)

scaler = MinMaxScaler(feature_range=(0, 1))

mm_scale_parameters = scaler.fit(df_X_train.values) # train 데이터의 평균과 표준편차로 파라미터를 피팅한 후 저장, test 데이터에도 똑같은 파라미터를 적용하기 위함

scaled_X_train= mm_scale_parameters.transform(df_X_train)
scaled_X_test= mm_scale_parameters.transform(df_X_test)

scaled_df_X_train = pd.DataFrame(scaled_X_train, index = df_X_train.index, columns = df_X_train.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용
scaled_df_X_test = pd.DataFrame(scaled_X_test, index =df_X_test.index, columns = df_X_test.columns) # 데이터 프레임으로 변환, 기존의 index, columns 사용

In [13]:
print(scaled_df_X_train.shape)
scaled_df_X_train.head()    

(1253, 450)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
340,0.536541,0.542221,0.472519,0.268771,0.000644,0.51357,0.945568,0.535024,0.521739,0.268487,...,0.00799,0.026229,0.739812,0.039966,0.036239,0.03801,0.12949,0.08605,0.094203,0.958659
732,0.460006,0.473353,0.590392,0.423271,0.000304,0.510977,0.950233,0.670606,0.570768,0.391354,...,0.009298,0.037452,0.510972,0.009884,0.010774,0.009734,0.421053,0.067797,0.072464,0.03411
378,0.340688,0.442161,0.574943,0.358185,0.000562,0.414818,0.945568,0.578642,0.47271,0.569966,...,0.008349,0.102416,0.589342,0.031586,0.029383,0.030468,0.203843,0.104302,0.112319,0.203911
787,0.565641,0.397476,0.425955,0.385426,0.000315,0.382913,0.973561,0.45982,0.325624,0.459613,...,0.00896,0.06235,0.733542,0.028792,0.026445,0.027402,0.452799,0.061278,0.072464,0.028563
1113,0.32071,0.386468,0.672785,0.523021,0.000534,0.312977,0.951788,0.841427,0.354302,0.43686,...,0.011738,0.229956,0.827586,0.009669,0.012733,0.009168,0.35589,0.482399,0.485507,0.212209


In [14]:
print(scaled_df_X_test.shape)
scaled_df_X_test.head()

(314, 450)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
1521,0.37025,0.470037,0.51101,0.950255,0.000108,0.037045,0.971229,0.280189,0.444958,0.312856,...,0.007402,0.078318,0.717868,0.016115,0.012733,0.015392,0.23726,0.143416,0.134058,0.166813
868,0.554932,0.529323,0.515854,0.561417,0.000441,0.271617,0.962675,0.54856,0.407956,0.340159,...,0.012172,0.106351,0.796238,0.019123,0.019589,0.018141,0.536341,0.323338,0.278986,0.080133
282,0.490417,0.529526,0.458206,0.293231,0.000564,0.403571,0.945568,0.628062,0.564292,0.410694,...,0.004828,0.099925,0.793103,0.019983,0.021548,0.018912,0.242272,0.178618,0.15942,0.189654
1118,0.570516,0.491122,0.624912,0.561628,0.000589,0.355656,0.959565,0.253116,0.533765,0.32992,...,0.010898,0.107022,0.683386,0.012462,0.008815,0.011953,0.269841,0.118644,0.101449,0.108422
609,0.413051,0.509634,0.205878,0.273664,0.000216,0.335071,0.98367,0.493984,0.194265,0.41752,...,0.006713,0.029107,0.680251,0.020198,0.013712,0.01936,0.351713,0.162973,0.130435,0.084616


# Import and Apply PCA

In [15]:
from sklearn.decomposition import PCA


# PCA 인스턴스 객체를 생성.
pca = PCA(0.90)

# 생성된 PCA 인스턴스 객체에 scaled_df_X_train를 Fitting.
# 주의: fit은 오직 scaled_df_X_train에만 적용.
# 동일한 파라미터를 scaled_df_X_test에 적용하기 위함.
pca.fit(scaled_df_X_train)

# transform하게 되면  ndarray로 반환하기 때문에 별도의 DataFrame 형변환 필요.
X_train_after_PCA = pca.transform(scaled_df_X_train)
X_test_after_PCA = pca.transform(scaled_df_X_test)

In [16]:
# PCA 수행 후의 X_train을 DataFrame으로 형변환 
df_X_train_after_PCA = pd.DataFrame(data=X_train_after_PCA, index=scaled_df_X_train.index)
print(df_X_train_after_PCA.shape)
df_X_train_after_PCA.head()

(1253, 104)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
340,0.169864,-0.087369,0.71983,-0.572293,0.568542,-0.414154,0.505739,0.226454,-0.643786,0.172362,...,0.057686,0.021313,0.117835,0.183323,-0.104596,-0.009324,0.229717,0.000151,-0.033039,-0.013611
732,-0.20469,-0.0031,-0.338688,-0.01288,-0.170542,0.21716,0.38263,-0.603935,0.274384,-0.107089,...,0.105954,0.288072,-0.372694,0.142691,0.031484,0.446973,-0.178323,-0.196046,-0.325473,0.095824
378,0.120871,0.165,0.844035,-0.675522,0.025772,-0.021095,0.61541,-0.120836,0.108491,-0.012067,...,0.036553,0.254908,-0.003305,-0.184018,-0.040299,0.045238,-0.127659,-0.047273,0.089174,0.037686
787,-0.210902,0.59432,-0.151037,-0.029691,-0.111073,-0.083939,-0.911433,0.180009,0.099216,0.038521,...,-0.061125,-0.049444,-0.012462,-0.050378,0.089586,-0.121068,-0.020311,0.153347,0.034938,0.049885
1113,-0.615036,0.257067,0.534452,0.006903,-0.165122,0.425646,-0.225973,0.231096,0.08688,-0.159935,...,0.006633,-0.20512,-0.016022,0.02215,0.029626,-0.122059,0.106268,-0.0151,-0.037243,-0.092334


In [17]:
# PCA 수행 후의 X_test을 DataFrame으로 형변환 
df_X_test_after_PCA = pd.DataFrame(data=X_test_after_PCA, index=scaled_df_X_test.index)
print(df_X_test_after_PCA.shape)
df_X_test_after_PCA.head()

(314, 104)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
1521,-0.616349,0.192142,0.075692,0.612043,0.469694,0.08669,0.094833,-0.69807,0.250145,0.441079,...,0.04758,0.006534,0.07138,0.132794,-0.006865,0.014109,0.154576,0.139828,-0.214379,-0.038229
868,-0.617224,-0.448706,0.349682,0.102128,-0.418104,-0.091262,0.2887,0.537486,-0.449576,-0.148715,...,0.041395,0.035041,-0.03777,0.011368,-0.047857,0.116487,0.004795,0.032867,0.053466,-0.012807
282,0.042186,0.152346,0.170462,-0.700766,-0.664811,-0.046194,-0.096133,0.411768,0.063113,-0.1771,...,-0.063451,-0.133584,0.05727,-0.037049,0.054972,-0.202862,0.074688,0.061221,-0.028015,-0.076556
1118,-0.68097,0.522752,0.576162,0.506556,-0.459303,-0.369791,-0.249794,-0.23495,0.222179,0.04001,...,0.268719,0.146278,-0.023201,-0.060457,-0.015017,0.08575,0.041264,-0.120564,-0.099609,0.166262
609,0.841786,-0.548992,-0.177217,-0.769191,-0.104494,0.103151,0.212182,0.322808,-0.443025,0.392955,...,-0.250863,-0.120884,-0.006373,0.24278,0.021123,-0.038552,-0.0701,0.141421,-0.061452,-0.140634


# Random Over-Sampling

In [18]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Random Oversampling을 활용하여 Train Dataset의 소수 클래스를 증폭시킴.
ros = RandomOverSampler(random_state = 0,sampling_strategy='minority')
X_ro, y_ro = ros.fit_resample(X_train_after_PCA, y_train)

# 오버샘플링한 데이터의 y_label 값의 균형을 확인함.
print(sorted(Counter(y_ro).items()))

[(-1, 1170), (1, 1170)]


# Borderline Smote 

In [19]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

bsm = BorderlineSMOTE(random_state=0, k_neighbors=1)
X_bsmo, y_bsmo = bsm.fit_resample(X_train_after_PCA, y_train)

df_X_smo = pd.DataFrame(X_bsmo)
df_y_smo = pd.DataFrame(y_bsmo)

print(X_bsmo.shape)
print(y_bsmo.shape)


(1253, 104)
(1253,)


# K-fold Cross Validation (5)

### -----------------------------------------------------------------------------------------

In [21]:
# y_train의 값이 DataFrame 형이라서 k-fold validation하는데 오류가 생겨 ndarray형으로 변환
y_train = y_train.values 

In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score



epochs = 1000
k_fold = 5 
finer_hyperparameters_list = []

skf = StratifiedKFold(n_splits = k_fold, shuffle = True, random_state=seed) # 데이터셋을 5개로 나누고 shuffle 옵션 값을 True로 설정

#각 지표 점수와 파라미터 사용값을 저장하기 위한 리스트
total_recall,total_precision, total_f_measure, total_fpr = [],[],[],[]
total_n_estimators, total_max_features, total_max_depth, total_min_samples_split, total_min_samples_leaf = [],[],[],[],[]



for epoch in range(1, epochs+1) :
    
    # 각 파라미터 값을 랜덤으로 찍어 내어 가장 좋은 지표 값을 확인
    
    n_estimators = np.random.randint(low=30, high=500)
    max_features = np.random.uniform(low=0.1, high=1.0)
    max_depth = np.random.randint(low=2, high=130)
    min_samples_split = np.random.uniform(low=0.160, high=0.247)
    min_samples_leaf = np.random.uniform(low=0.043, high=0.061)
    
    
    # 성능 평가 모델로는 RandomforestClassifer를 사용함.
    
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_features=max_features,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=seed,
                                   class_weight="balanced",
                                   n_jobs=-1)
    recall_list = []
    precision_list = []
    f_measure_list = []
    fpr_list = []

    
    count = 0
    
        
    # Train 데이터 내에서 또한 트레인셋과 검증 셋으로 나눈다.
    
    for train_index, validation_index in skf.split(X_train_after_PCA, y_train) :
        
        count = count + 1
    
        # train dataset 내에서 k-fold를 위해 또 분리해낸 train[train_index] set을 Random Over-sampler를 사용해 오버샘플링함.
        #X_samp, y_samp = RandomOverSampler(sampling_strategy='minority', random_state=seed).fit_sample(X_train_after_PCA[train_index], y_train[train_index])
        
        bsm = BorderlineSMOTE(sampling_strategy='minority',random_state=seed, k_neighbors=1)
        X_bsmo, y_bsmo = bsm.fit_sample(X_train_after_PCA[train_index], y_train[train_index])
        
        
        model.fit(X_bsmo, y_bsmo)
        predictions = model.predict(X_train_after_PCA[validation_index])
        table = confusion_matrix(y_train[validation_index], predictions, labels=[1, -1])
        
        print(table)
        
        tp = table[0][0]
        fn = table[0][1]
        fp = table[1][0]
        tn = table[1][1]

        recall_value = tp/(tp+fn)
        precision_value = tp/(tp+fp)
        f_measure_value = (2*tp)/((2*tp)+fp+fn)
        fpr_value = fp/(fp+tn)

        recall_list.append(recall_value)
        precision_list.append(precision_value) 
        f_measure_list.append(f_measure_value) 
        fpr_list.append(fpr_value)
    
        if count == k_fold :
        
            recall = sum(recall_list)/k_fold
            precision = sum(precision_list)/k_fold        
            f_measure = sum(f_measure_list)/k_fold
            fpr = sum(fpr_list)/k_fold
           
            print(recall_list)
            print(precision_list) 
            print(f_measure_list)
            print(fpr_list)
        
            
            print("epoch={:5d}, n_estimators={:4d}, max_depth={:3d}, max_features={:.5f}, min_samples_split={:.5f},min_samples_leaf={:.5f}\nrecall={:.5f}, precision={:.5f}, f_measure={:.5f}, fpr={:.5f}".format(epoch, n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf, recall, precision, f_measure, fpr))
            
            total_n_estimators.append(n_estimators)
            total_max_depth.append(max_depth)
            total_max_features.append(max_features)
            total_min_samples_split.append(min_samples_split)
            total_min_samples_leaf.append(min_samples_leaf)
            total_recall.append(recall)
            total_precision.append(precision)
            total_f_measure.append(f_measure)
            total_fpr.append(fpr)
    

[[  3  14]
 [ 10 224]]
[[  1  16]
 [ 19 215]]
[[  1  16]
 [ 29 205]]
[[  2  14]
 [ 10 224]]
[[  1  15]
 [ 27 207]]
[0.17647058823529413, 0.058823529411764705, 0.058823529411764705, 0.125, 0.0625]
[0.23076923076923078, 0.05, 0.03333333333333333, 0.16666666666666666, 0.03571428571428571]
[0.2, 0.05405405405405406, 0.0425531914893617, 0.14285714285714285, 0.045454545454545456]
[0.042735042735042736, 0.0811965811965812, 0.12393162393162394, 0.042735042735042736, 0.11538461538461539]
epoch=    1, n_estimators=  85, max_depth= 56, max_features=0.65313, min_samples_split=0.16155,min_samples_leaf=0.04330
recall=0.09632, precision=0.10330, f_measure=0.09698, fpr=0.08120
[[  0  17]
 [  2 232]]
[[  1  16]
 [  8 226]]
[[  0  17]
 [  3 231]]
[[  0  16]
 [  4 230]]
[[  0  16]
 [ 11 223]]
[0.0, 0.058823529411764705, 0.0, 0.0, 0.0]
[0.0, 0.1111111111111111, 0.0, 0.0, 0.0]
[0.0, 0.07692307692307693, 0.0, 0.0, 0.0]
[0.008547008547008548, 0.03418803418803419, 0.01282051282051282, 0.017094017094017096, 0.

recall=0.21471, precision=0.10620, f_measure=0.13667, fpr=0.13932
[[  1  16]
 [  5 229]]
[[  2  15]
 [ 24 210]]
[[  1  16]
 [ 16 218]]
[[  1  15]
 [ 10 224]]
[[  1  15]
 [ 17 217]]
[0.058823529411764705, 0.11764705882352941, 0.058823529411764705, 0.0625, 0.0625]
[0.16666666666666666, 0.07692307692307693, 0.058823529411764705, 0.09090909090909091, 0.05555555555555555]
[0.08695652173913043, 0.09302325581395349, 0.058823529411764705, 0.07407407407407407, 0.058823529411764705]
[0.021367521367521368, 0.10256410256410256, 0.06837606837606838, 0.042735042735042736, 0.07264957264957266]
epoch=   27, n_estimators= 361, max_depth=  4, max_features=0.24463, min_samples_split=0.24139,min_samples_leaf=0.04376
recall=0.07206, precision=0.08978, f_measure=0.07434, fpr=0.06154
[[  2  15]
 [ 10 224]]
[[  7  10]
 [ 39 195]]
[[  3  14]
 [ 34 200]]
[[  2  14]
 [ 15 219]]
[[  2  14]
 [ 37 197]]
[0.11764705882352941, 0.4117647058823529, 0.17647058823529413, 0.125, 0.125]
[0.16666666666666666, 0.152173913043

[0.021367521367521368, 0.05555555555555555, 0.03418803418803419, 0.03418803418803419, 0.0641025641025641]
epoch=   51, n_estimators= 383, max_depth= 69, max_features=0.31103, min_samples_split=0.18012,min_samples_leaf=0.05708
recall=0.03603, precision=0.06012, f_measure=0.04279, fpr=0.04188
[[  2  15]
 [ 10 224]]
[[  2  15]
 [ 22 212]]
[[  1  16]
 [ 22 212]]
[[  2  14]
 [ 11 223]]
[[  0  16]
 [ 24 210]]
[0.11764705882352941, 0.11764705882352941, 0.058823529411764705, 0.125, 0.0]
[0.16666666666666666, 0.08333333333333333, 0.043478260869565216, 0.15384615384615385, 0.0]
[0.13793103448275862, 0.0975609756097561, 0.05, 0.13793103448275862, 0.0]
[0.042735042735042736, 0.09401709401709402, 0.09401709401709402, 0.04700854700854701, 0.10256410256410256]
epoch=   52, n_estimators= 151, max_depth= 62, max_features=0.42913, min_samples_split=0.18852,min_samples_leaf=0.04980
recall=0.08382, precision=0.08946, f_measure=0.08468, fpr=0.07607
[[  2  15]
 [  2 232]]
[[  0  17]
 [  7 227]]
[[  0  17]
 

[0.16666666666666666, 0.2535211267605634, 0.14285714285714285, 0.13043478260869565, 0.0625]
[0.06837606837606838, 0.19230769230769232, 0.20512820512820512, 0.11538461538461539, 0.19658119658119658]
epoch=   76, n_estimators= 137, max_depth= 27, max_features=0.74609, min_samples_split=0.24124,min_samples_leaf=0.05939
recall=0.26250, precision=0.11211, f_measure=0.15120, fpr=0.15556
[[  3  14]
 [ 15 219]]
[[  5  12]
 [ 42 192]]
[[  5  12]
 [ 41 193]]
[[  3  13]
 [ 22 212]]
[[  3  13]
 [ 41 193]]
[0.17647058823529413, 0.29411764705882354, 0.29411764705882354, 0.1875, 0.1875]
[0.16666666666666666, 0.10638297872340426, 0.10869565217391304, 0.12, 0.06818181818181818]
[0.17142857142857143, 0.15625, 0.15873015873015872, 0.14634146341463414, 0.1]
[0.0641025641025641, 0.1794871794871795, 0.1752136752136752, 0.09401709401709402, 0.1752136752136752]
epoch=   77, n_estimators= 452, max_depth= 66, max_features=0.79267, min_samples_split=0.22534,min_samples_leaf=0.05168
recall=0.22794, precision=0.11

recall=0.15735, precision=0.10786, f_measure=0.12269, fpr=0.10684
[[  0  17]
 [  4 230]]
[[  1  16]
 [ 11 223]]
[[  0  17]
 [  6 228]]
[[  0  16]
 [  6 228]]
[[  0  16]
 [ 13 221]]
[0.0, 0.058823529411764705, 0.0, 0.0, 0.0]
[0.0, 0.08333333333333333, 0.0, 0.0, 0.0]
[0.0, 0.06896551724137931, 0.0, 0.0, 0.0]
[0.017094017094017096, 0.04700854700854701, 0.02564102564102564, 0.02564102564102564, 0.05555555555555555]
epoch=   89, n_estimators= 212, max_depth= 35, max_features=0.26214, min_samples_split=0.16588,min_samples_leaf=0.05677
recall=0.01176, precision=0.01667, f_measure=0.01379, fpr=0.03419
[[  0  17]
 [  7 227]]
[[  3  14]
 [ 13 221]]
[[  1  16]
 [ 27 207]]
[[  2  14]
 [ 16 218]]
[[  2  14]
 [ 29 205]]
[0.0, 0.17647058823529413, 0.058823529411764705, 0.125, 0.125]
[0.0, 0.1875, 0.03571428571428571, 0.1111111111111111, 0.06451612903225806]
[0.0, 0.18181818181818182, 0.044444444444444446, 0.11764705882352941, 0.0851063829787234]
[0.029914529914529916, 0.05555555555555555, 0.115384615

 [ 12 222]]
[[  0  16]
 [ 27 207]]
[0.11764705882352941, 0.11764705882352941, 0.11764705882352941, 0.0625, 0.0]
[0.25, 0.06896551724137931, 0.05714285714285714, 0.07692307692307693, 0.0]
[0.16, 0.08695652173913043, 0.07692307692307693, 0.06896551724137931, 0.0]
[0.02564102564102564, 0.11538461538461539, 0.14102564102564102, 0.05128205128205128, 0.11538461538461539]
epoch=  102, n_estimators=  97, max_depth= 30, max_features=0.35971, min_samples_split=0.21149,min_samples_leaf=0.05165
recall=0.08309, precision=0.09061, f_measure=0.07857, fpr=0.08974
[[  0  17]
 [  4 230]]
[[  0  17]
 [  3 231]]
[[  0  17]
 [  3 231]]
[[  0  16]
 [  1 233]]
[[  0  16]
 [  4 230]]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.017094017094017096, 0.01282051282051282, 0.01282051282051282, 0.004273504273504274, 0.017094017094017096]
epoch=  103, n_estimators= 251, max_depth= 23, max_features=0.10553, min_samples_split=0.18902,min_samples_leaf=0.05824
recall=0.00000, precisio

[0.058823529411764705, 0.058823529411764705, 0.058823529411764705, 0.0625, 0.0625]
[0.2, 0.09090909090909091, 0.058823529411764705, 0.125, 0.05263157894736842]
[0.09090909090909091, 0.07142857142857142, 0.058823529411764705, 0.08333333333333333, 0.05714285714285714]
[0.017094017094017096, 0.042735042735042736, 0.06837606837606838, 0.029914529914529916, 0.07692307692307693]
epoch=  127, n_estimators= 366, max_depth=121, max_features=0.38578, min_samples_split=0.16969,min_samples_leaf=0.04456
recall=0.06029, precision=0.10547, f_measure=0.07233, fpr=0.04701
[[  2  15]
 [ 13 221]]
[[  5  12]
 [ 28 206]]
[[  1  16]
 [ 36 198]]
[[  2  14]
 [ 16 218]]
[[  2  14]
 [ 33 201]]
[0.11764705882352941, 0.29411764705882354, 0.058823529411764705, 0.125, 0.125]
[0.13333333333333333, 0.15151515151515152, 0.02702702702702703, 0.1111111111111111, 0.05714285714285714]
[0.125, 0.2, 0.037037037037037035, 0.11764705882352941, 0.0784313725490196]
[0.05555555555555555, 0.11965811965811966, 0.15384615384615385,

recall=0.09559, precision=0.08019, f_measure=0.08362, fpr=0.09060
[[  1  16]
 [  1 233]]
[[  1  16]
 [  5 229]]
[[  0  17]
 [  3 231]]
[[  0  16]
 [  2 232]]
[[  0  16]
 [  4 230]]
[0.058823529411764705, 0.058823529411764705, 0.0, 0.0, 0.0]
[0.5, 0.16666666666666666, 0.0, 0.0, 0.0]
[0.10526315789473684, 0.08695652173913043, 0.0, 0.0, 0.0]
[0.004273504273504274, 0.021367521367521368, 0.01282051282051282, 0.008547008547008548, 0.017094017094017096]
epoch=  152, n_estimators= 232, max_depth= 35, max_features=0.12507, min_samples_split=0.18544,min_samples_leaf=0.04966
recall=0.02353, precision=0.13333, f_measure=0.03844, fpr=0.01282
[[  2  15]
 [ 22 212]]
[[  8   9]
 [ 63 171]]
[[  3  14]
 [ 45 189]]
[[  3  13]
 [ 30 204]]
[[  2  14]
 [ 35 199]]
[0.11764705882352941, 0.47058823529411764, 0.17647058823529413, 0.1875, 0.125]
[0.08333333333333333, 0.11267605633802817, 0.0625, 0.09090909090909091, 0.05405405405405406]
[0.0975609756097561, 0.18181818181818182, 0.09230769230769231, 0.12244897959

[0.03418803418803419, 0.0811965811965812, 0.08974358974358974, 0.038461538461538464, 0.1111111111111111]
epoch=  176, n_estimators= 482, max_depth=  4, max_features=0.43467, min_samples_split=0.18155,min_samples_leaf=0.06035
recall=0.09706, precision=0.10101, f_measure=0.09451, fpr=0.07094
[[  4  13]
 [ 17 217]]
[[  8   9]
 [ 44 190]]
[[  5  12]
 [ 45 189]]
[[  4  12]
 [ 25 209]]
[[  2  14]
 [ 43 191]]
[0.23529411764705882, 0.47058823529411764, 0.29411764705882354, 0.25, 0.125]
[0.19047619047619047, 0.15384615384615385, 0.1, 0.13793103448275862, 0.044444444444444446]
[0.21052631578947367, 0.2318840579710145, 0.14925373134328357, 0.17777777777777778, 0.06557377049180328]
[0.07264957264957266, 0.18803418803418803, 0.19230769230769232, 0.10683760683760683, 0.18376068376068377]
epoch=  177, n_estimators= 101, max_depth=119, max_features=0.73393, min_samples_split=0.24277,min_samples_leaf=0.05950
recall=0.27500, precision=0.12534, f_measure=0.16700, fpr=0.14872
[[  2  15]
 [ 11 223]]
[[  6 

recall=0.16838, precision=0.14464, f_measure=0.15137, fpr=0.08889
[[  3  14]
 [ 12 222]]
[[  3  14]
 [ 28 206]]
[[  2  15]
 [ 34 200]]
[[  2  14]
 [ 16 218]]
[[  2  14]
 [ 34 200]]
[0.17647058823529413, 0.17647058823529413, 0.11764705882352941, 0.125, 0.125]
[0.2, 0.0967741935483871, 0.05555555555555555, 0.1111111111111111, 0.05555555555555555]
[0.1875, 0.125, 0.07547169811320754, 0.11764705882352941, 0.07692307692307693]
[0.05128205128205128, 0.11965811965811966, 0.1452991452991453, 0.06837606837606838, 0.1452991452991453]
epoch=  202, n_estimators= 397, max_depth= 41, max_features=0.57734, min_samples_split=0.21165,min_samples_leaf=0.04814
recall=0.14412, precision=0.10380, f_measure=0.11651, fpr=0.10598
[[  3  14]
 [ 16 218]]
[[  6  11]
 [ 44 190]]
[[  4  13]
 [ 41 193]]
[[  3  13]
 [ 23 211]]
[[  2  14]
 [ 42 192]]
[0.17647058823529413, 0.35294117647058826, 0.23529411764705882, 0.1875, 0.125]
[0.15789473684210525, 0.12, 0.08888888888888889, 0.11538461538461539, 0.045454545454545456

recall=0.16765, precision=0.10269, f_measure=0.12287, fpr=0.11624
[[  1  16]
 [  3 231]]
[[  1  16]
 [ 13 221]]
[[  0  17]
 [ 11 223]]
[[  0  16]
 [  9 225]]
[[  1  15]
 [ 13 221]]
[0.058823529411764705, 0.058823529411764705, 0.0, 0.0, 0.0625]
[0.25, 0.07142857142857142, 0.0, 0.0, 0.07142857142857142]
[0.09523809523809523, 0.06451612903225806, 0.0, 0.0, 0.06666666666666667]
[0.01282051282051282, 0.05555555555555555, 0.04700854700854701, 0.038461538461538464, 0.05555555555555555]
epoch=  227, n_estimators= 325, max_depth= 57, max_features=0.24702, min_samples_split=0.19190,min_samples_leaf=0.05264
recall=0.03603, precision=0.07857, f_measure=0.04528, fpr=0.04188
[[  1  16]
 [  6 228]]
[[  3  14]
 [ 22 212]]
[[  1  16]
 [ 29 205]]
[[  1  15]
 [ 14 220]]
[[  1  15]
 [ 28 206]]
[0.058823529411764705, 0.17647058823529413, 0.058823529411764705, 0.0625, 0.0625]
[0.14285714285714285, 0.12, 0.03333333333333333, 0.06666666666666667, 0.034482758620689655]
[0.08333333333333333, 0.14285714285714285

recall=0.17941, precision=0.09834, f_measure=0.12279, fpr=0.12393
[[  3  14]
 [ 11 223]]
[[  5  12]
 [ 33 201]]
[[  3  14]
 [ 36 198]]
[[  2  14]
 [ 16 218]]
[[  2  14]
 [ 36 198]]
[0.17647058823529413, 0.29411764705882354, 0.17647058823529413, 0.125, 0.125]
[0.21428571428571427, 0.13157894736842105, 0.07692307692307693, 0.1111111111111111, 0.05263157894736842]
[0.1935483870967742, 0.18181818181818182, 0.10714285714285714, 0.11764705882352941, 0.07407407407407407]
[0.04700854700854701, 0.14102564102564102, 0.15384615384615385, 0.06837606837606838, 0.15384615384615385]
epoch=  252, n_estimators= 395, max_depth=  8, max_features=0.74808, min_samples_split=0.20148,min_samples_leaf=0.04523
recall=0.17941, precision=0.11731, f_measure=0.13485, fpr=0.11282
[[  3  14]
 [  8 226]]
[[  5  12]
 [ 31 203]]
[[  3  14]
 [ 39 195]]
[[  2  14]
 [ 13 221]]
[[  2  14]
 [ 38 196]]
[0.17647058823529413, 0.29411764705882354, 0.17647058823529413, 0.125, 0.125]
[0.2727272727272727, 0.1388888888888889, 0.071

recall=0.14265, precision=0.10286, f_measure=0.11704, fpr=0.09487
[[  1  16]
 [ 12 222]]
[[  5  12]
 [ 34 200]]
[[  1  16]
 [ 19 215]]
[[  2  14]
 [ 11 223]]
[[  1  15]
 [ 24 210]]
[0.058823529411764705, 0.29411764705882354, 0.058823529411764705, 0.125, 0.0625]
[0.07692307692307693, 0.1282051282051282, 0.05, 0.15384615384615385, 0.04]
[0.06666666666666667, 0.17857142857142858, 0.05405405405405406, 0.13793103448275862, 0.04878048780487805]
[0.05128205128205128, 0.1452991452991453, 0.0811965811965812, 0.04700854700854701, 0.10256410256410256]
epoch=  277, n_estimators= 123, max_depth= 28, max_features=0.29704, min_samples_split=0.23622,min_samples_leaf=0.05892
recall=0.11985, precision=0.08979, f_measure=0.09720, fpr=0.08547
[[  1  16]
 [  6 228]]
[[  1  16]
 [ 15 219]]
[[  1  16]
 [ 18 216]]
[[  1  15]
 [  9 225]]
[[  1  15]
 [ 19 215]]
[0.058823529411764705, 0.058823529411764705, 0.058823529411764705, 0.0625, 0.0625]
[0.14285714285714285, 0.0625, 0.05263157894736842, 0.1, 0.05]
[0.0833

[0.058823529411764705, 0.0, 0.0, 0.0, 0.0]
[0.5, 0.0, 0.0, 0.0, 0.0]
[0.10526315789473684, 0.0, 0.0, 0.0, 0.0]
[0.004273504273504274, 0.017094017094017096, 0.017094017094017096, 0.01282051282051282, 0.02564102564102564]
epoch=  302, n_estimators= 355, max_depth= 66, max_features=0.12940, min_samples_split=0.18484,min_samples_leaf=0.05638
recall=0.01176, precision=0.10000, f_measure=0.02105, fpr=0.01538
[[  2  15]
 [  8 226]]
[[  3  14]
 [ 24 210]]
[[  1  16]
 [ 27 207]]
[[  2  14]
 [ 14 220]]
[[  2  14]
 [ 31 203]]
[0.11764705882352941, 0.17647058823529413, 0.058823529411764705, 0.125, 0.125]
[0.2, 0.1111111111111111, 0.03571428571428571, 0.125, 0.06060606060606061]
[0.14814814814814814, 0.13636363636363635, 0.044444444444444446, 0.125, 0.08163265306122448]
[0.03418803418803419, 0.10256410256410256, 0.11538461538461539, 0.05982905982905983, 0.13247863247863248]
epoch=  303, n_estimators= 368, max_depth= 15, max_features=0.78157, min_samples_split=0.16359,min_samples_leaf=0.05687
recall

 [ 14 220]]
[[  2  14]
 [ 35 199]]
[0.11764705882352941, 0.35294117647058826, 0.23529411764705882, 0.0625, 0.125]
[0.16666666666666666, 0.15789473684210525, 0.10256410256410256, 0.06666666666666667, 0.05405405405405406]
[0.13793103448275862, 0.21818181818181817, 0.14285714285714285, 0.06451612903225806, 0.07547169811320754]
[0.042735042735042736, 0.13675213675213677, 0.14957264957264957, 0.05982905982905983, 0.14957264957264957]
epoch=  315, n_estimators= 146, max_depth= 62, max_features=0.78005, min_samples_split=0.17983,min_samples_leaf=0.04497
recall=0.17868, precision=0.10957, f_measure=0.12779, fpr=0.10769
[[  1  16]
 [ 11 223]]
[[  3  14]
 [ 20 214]]
[[  2  15]
 [ 31 203]]
[[  1  15]
 [ 14 220]]
[[  3  13]
 [ 30 204]]
[0.058823529411764705, 0.17647058823529413, 0.11764705882352941, 0.0625, 0.1875]
[0.08333333333333333, 0.13043478260869565, 0.06060606060606061, 0.06666666666666667, 0.09090909090909091]
[0.06896551724137931, 0.15, 0.08, 0.06451612903225806, 0.12244897959183673]
[0.

recall=0.00000, precision=0.00000, f_measure=0.00000, fpr=0.01368
[[  1  16]
 [  7 227]]
[[  3  14]
 [ 17 217]]
[[  1  16]
 [ 23 211]]
[[  0  16]
 [ 10 224]]
[[  2  14]
 [ 24 210]]
[0.058823529411764705, 0.17647058823529413, 0.058823529411764705, 0.0, 0.125]
[0.125, 0.15, 0.041666666666666664, 0.0, 0.07692307692307693]
[0.08, 0.16216216216216217, 0.04878048780487805, 0.0, 0.09523809523809523]
[0.029914529914529916, 0.07264957264957266, 0.09829059829059829, 0.042735042735042736, 0.10256410256410256]
epoch=  341, n_estimators= 379, max_depth=129, max_features=0.51427, min_samples_split=0.18468,min_samples_leaf=0.04939
recall=0.08382, precision=0.07872, f_measure=0.07724, fpr=0.06923
[[  1  16]
 [  8 226]]
[[  1  16]
 [ 27 207]]
[[  1  16]
 [ 17 217]]
[[  1  15]
 [  8 226]]
[[  0  16]
 [ 14 220]]
[0.058823529411764705, 0.058823529411764705, 0.058823529411764705, 0.0625, 0.0]
[0.1111111111111111, 0.03571428571428571, 0.05555555555555555, 0.1111111111111111, 0.0]
[0.07692307692307693, 0.044

[0.11764705882352941, 0.17647058823529413, 0.058823529411764705, 0.125, 0.125]
[0.15384615384615385, 0.09375, 0.03225806451612903, 0.16666666666666666, 0.058823529411764705]
[0.13333333333333333, 0.12244897959183673, 0.041666666666666664, 0.14285714285714285, 0.08]
[0.04700854700854701, 0.12393162393162394, 0.1282051282051282, 0.042735042735042736, 0.13675213675213677]
epoch=  366, n_estimators= 106, max_depth= 93, max_features=0.69416, min_samples_split=0.16166,min_samples_leaf=0.05317
recall=0.12059, precision=0.10107, f_measure=0.10406, fpr=0.09573
[[  3  14]
 [ 10 224]]
[[  4  13]
 [ 26 208]]
[[  2  15]
 [ 28 206]]
[[  3  13]
 [ 14 220]]
[[  2  14]
 [ 34 200]]
[0.17647058823529413, 0.23529411764705882, 0.11764705882352941, 0.1875, 0.125]
[0.23076923076923078, 0.13333333333333333, 0.06666666666666667, 0.17647058823529413, 0.05555555555555555]
[0.2, 0.1702127659574468, 0.0851063829787234, 0.18181818181818182, 0.07692307692307693]
[0.042735042735042736, 0.1111111111111111, 0.119658119

KeyboardInterrupt: 

In [27]:
hyper_tuning = pd.DataFrame({
    'n_estimators':total_n_estimators,
    'max_depth':total_max_depth,
    'max_features':total_max_features,
    'min_samples_split': total_min_samples_split,
    'min_samples_leaf':total_min_samples_leaf,
    'recall':total_recall,
    'precision':total_precision,
    'f_measure':total_f_measure,
    'fpr':total_fpr   
})

hyper_tuning
hyper_tuning.to_csv("C:/Users/juj11/Desktop/hyper_tuning7.csv",header = True, index = False)

## Random Forest (SMOTE) (hyperparameter_tuning)

In [115]:
def rfc_classweights1():
  
    rfc_cw = RandomForestClassifier(n_estimators= 110,
                                    max_depth = 50,
                                    max_features = 0.3,
                                    min_samples_split = 0.145,
                                    min_samples_leaf = 0.1,
                                    class_weight = 'balanced',
                                    n_jobs = -1,
                                    random_state= seed
                                    )
    rfc_cw.fit(X_ro, y_ro)
    y_pred = rfc_cw.predict(X_test_after_PCA)
    #rfc_cw.n_classes_ = 2
        
    table = confusion_matrix(y_test, y_pred, labels = [1,-1]) #labels=[1, -1]
        
    print(table)
        
    tp = table[0][0]
    fn = table[0][1]
    fp = table[1][0]
    tn = table[1][1]

    recall_value = tp/(tp+fn)
    precision_value = tp/(tp+fp)
    f_measure_value = (2*tp)/((2*tp)+fp+fn)
    fpr_value = fp/(fp+tn)
        
    print("recall: ",recall_value)
    print("precision: ",precision_value)
    print("f_measure: ", f_measure_value)
    print("FPR: ",fpr_value)


In [116]:
rfc_classweights1()

[[  9  12]
 [ 73 220]]
recall:  0.42857142857142855
precision:  0.10975609756097561
f_measure:  0.17475728155339806
FPR:  0.24914675767918087


## Random Forest (SMOTE) (Research_parameter)

In [112]:
def rfc_classweights2():
  
    rfc_cw2 = RandomForestClassifier(n_estimators= 71,
                                    max_depth = 100,
                                    max_features = 0.333931,
                                    min_samples_split = 0.44277,
                                    min_samples_leaf = 0.00049,
                                    class_weight = 'balanced',
                                    n_jobs = -1,
                                    random_state= seed
                                    )
    rfc_cw2.fit(X_bsmo, y_bsmo)
    y_pred = rfc_cw2.predict(X_test_after_PCA)
    rfc_cw2.n_classes_ = 2
        
    table = confusion_matrix(y_test, y_pred, labels = [1,-1]) #labels=[1, -1]
        
    print(table)
        
    tp = table[0][0]
    fn = table[0][1]
    fp = table[1][0]
    tn = table[1][1]

    recall_value = tp/(tp+fn)
    precision_value = tp/(tp+fp)
    f_measure_value = (2*tp)/((2*tp)+fp+fn)
    fpr_value = fp/(fp+tn)
        
    print("recall: ",recall_value)
    print("precision: ",precision_value)
    print("f_measure: ", f_measure_value)
    print("FPR: ",fpr_value)


In [113]:
rfc_classweights2()

[[  9  12]
 [ 52 241]]
recall:  0.42857142857142855
precision:  0.14754098360655737
f_measure:  0.21951219512195122
FPR:  0.17747440273037543
