### 0 - Import libraries

In [1]:
# import pyreadr as py
import missingno as msno
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
import numpy as np
import tensorflow as tf
from numpy import mean
from numpy import std
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import model_from_json

In [3]:
import platform
import sys

# Detect the host operating system with platform.system() rather than by
# matching one exact platform.platform() string (which embeds the kernel and
# glibc version): with the exact match, any other machine -- including other
# Linux hosts -- would be flagged as Windows and pick up the Windows data path.
# Note: os_ubuntu is True on any Linux host; the name is kept for the cells
# that read it.
os_windows = platform.system() == "Windows"
os_ubuntu = platform.system() == "Linux"
print("os_windows: " + str(os_windows))
print("os_ubuntu: " + str(os_ubuntu))

print("sys.version: " + str(sys.version))

os_windows: True
os_ubuntu: False
sys.version: 3.7.1 (default, Oct 28 2018, 08:39:03) [MSC v.1912 64 bit (AMD64)]


In [4]:
import math

In [5]:
import sklearn

print(sklearn.__version__)
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

0.23.2


### 1 - Define helper functions

In [6]:
def normalization_data(data, eta=2):
    """Center every column of ``data`` and scale it by ``eta`` times its std.

    The mean and the (population) standard deviation are computed along
    axis 0 of ``data`` itself, so each column is normalized with its own
    statistics: ``(data - mean) / (std * eta)``.

    Args:
        data: Numeric array-like; statistics are taken per column (axis 0).
        eta: Extra factor multiplied onto the standard deviation (default 2).

    Returns:
        A new floating-point ndarray with the same shape as ``data``. The
        input is never modified. Columns whose standard deviation is zero
        produce nan/inf entries, because the division is not guarded.
    """
    values = np.asarray(data)
    # Integer (or other non-float) input must be promoted first; otherwise
    # the normalized values would be truncated when stored back.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(float)

    column_mean = values.mean(axis=0)
    column_std = values.std(axis=0)
    # Broadcasting applies the per-column statistics to every row at once,
    # replacing the element-by-element Python loops.
    return (values - column_mean) / (column_std * eta)

In [7]:
def mix_shuffle_data(fault_free_data, faulty_data, random_state=None):
    """Label, stack and row-shuffle fault-free and faulty samples.

    A label column is prepended to each input (0 for ``fault_free_data``,
    1 for ``faulty_data``), the two labelled blocks are stacked vertically,
    and the rows of the result are shuffled.

    Args:
        fault_free_data: 2-D ndarray of fault-free samples (one row each).
        faulty_data: 2-D ndarray of faulty samples with the same number of
            columns as ``fault_free_data``.
        random_state: Optional seed for a private ``np.random.RandomState``
            used for the shuffle, making the row order reproducible. When
            ``None`` (default) the global NumPy random state is used.

    Returns:
        A new ndarray of shape ``(n_fault_free + n_faulty, n_columns + 1)``
        whose column 0 holds the label. The inputs are not modified.
    """
    fault_free_label = np.zeros((fault_free_data.shape[0], 1))
    faulty_label = np.ones((faulty_data.shape[0], 1))

    labelled_fault_free = np.hstack((fault_free_label, fault_free_data))
    labelled_faulty = np.hstack((faulty_label, faulty_data))
    mixed_data = np.vstack((labelled_fault_free, labelled_faulty))

    # Both shuffles permute whole rows in place, so labels stay attached
    # to their samples.
    if random_state is None:
        np.random.shuffle(mixed_data)
    else:
        np.random.RandomState(random_state).shuffle(mixed_data)
    return mixed_data

In [8]:
def pred_error_RF(prediction, label):
    """Print and return the share of positive entries in ``prediction``.

    The value is ``prediction.sum()`` divided by the length of the first
    axis; for a vector of 0/1 predictions this is the fraction of entries
    equal to 1.

    Args:
        prediction: ndarray of predicted labels.
        label: Text printed directly in front of the computed value.

    Returns:
        The computed rate.
    """
    sample_count = prediction.shape[0]
    detection_rate = prediction.sum() / sample_count

    print(label + str(detection_rate))
    return detection_rate

### 2 - read data

In [9]:
# Prefix for every data path: a fixed local directory on the Windows machine,
# otherwise an empty prefix so that the data paths stay relative.
root_folder = "C:/Users/liang/Masterarbeit_Jupyter_Lab/" if os_windows else ""

In [10]:
# Scaling factor handed to normalization_data together with each data set.
eta = 2
print("eta:", eta)

eta: 2


In [11]:
# Switches for this cell: the CSV files are loaded and normalized only when
# both flags are True.
read_data, normalize_data = True, True

# fault_free_train_samplesizeX: X rows taken from TEP_FaultFree_Training.RData
if read_data and normalize_data:
    # Load the four fault-free training subsets, smallest first.
    (
        fault_free_train_samplesize20000,
        fault_free_train_samplesize40000,
        fault_free_train_samplesize100000,
        fault_free_train_samplesize250000,
    ) = (
        pd.read_csv(root_folder + csv_name, sep=",", header=None).to_numpy()
        for csv_name in (
            "data_set_csv/fault_free_train_sample_size=20000",
            "data_set_csv/fault_free_train_sample_size=40000",
            "data_set_csv/fault_free_train_sample_size=100000",
            "data_set_csv/fault_free_train_sample_size=250000",
        )
    )

    # Normalize every subset separately, in the same order as it was loaded.
    (
        fault_free_train_samplesize20000_normalized,
        fault_free_train_samplesize40000_normalized,
        fault_free_train_samplesize100000_normalized,
        fault_free_train_samplesize250000_normalized,
    ) = (
        normalization_data(raw_subset, eta)
        for raw_subset in (
            fault_free_train_samplesize20000,
            fault_free_train_samplesize40000,
            fault_free_train_samplesize100000,
            fault_free_train_samplesize250000,
        )
    )

In [12]:
# fault_free_test1: the first 20000 rows of TEP_FaultFree_Testing.RData
fault_free_test1 = pd.read_csv(
    root_folder + "data_set_csv/fault_free_test1=20000",
    header=None,
    sep=",",
).to_numpy()
fault_free_test1_normalized = normalization_data(data=fault_free_test1, eta=eta)
fault_free_test1_normalized.shape

(20000, 52)

In [13]:
# fault_free_test2: the second 20000 rows of TEP_FaultFree_Testing.RData
fault_free_test2 = pd.read_csv(
    root_folder + "data_set_csv/fault_free_test2=20000",
    header=None,
    sep=",",
).to_numpy()
fault_free_test2_normalized = normalization_data(data=fault_free_test2, eta=eta)
fault_free_test2_normalized.shape

(20000, 52)

In [25]:
# faulty_train: 10% of the data (470400 rows) from TEP_Faulty_Training.RData
faulty_train = pd.read_csv(
    root_folder + "data_set_csv/faulty_train=10%",
    header=None,
    sep=",",
).to_numpy()
faulty_train_normalized = normalization_data(data=faulty_train, eta=eta)
faulty_train_normalized.shape

(470400, 52)

In [26]:
# faulty_test: 10% of the data (784000 rows) from TEP_Faulty_Testing.RData
faulty_test = pd.read_csv(
    root_folder + "data_set_csv/faulty_test=10%",
    header=None,
    sep=",",
).to_numpy()
faulty_test_normalized = normalization_data(data=faulty_test, eta=eta)
faulty_test_normalized.shape

(784000, 52)

In [27]:
# Training pool: the 250000-row fault-free set plus the first
# `faulty_train_sample_size` rows of the faulty training set.
faulty_train_sample_size = 400000
fault_free_train, fault_free_train_normalized = (
    fault_free_train_samplesize250000,
    fault_free_train_samplesize250000_normalized,
)

# The raw and the normalized pool are shuffled by two separate calls, so
# their row orders are not guaranteed to match each other.
train_data = mix_shuffle_data(
    fault_free_data=fault_free_train,
    faulty_data=faulty_train[:faulty_train_sample_size],
)
train_data_normalized = mix_shuffle_data(
    fault_free_data=fault_free_train_normalized,
    faulty_data=faulty_train_normalized[:faulty_train_sample_size],
)

# Only the last expression of a cell is displayed.
train_data_normalized.shape

(650000, 53)

### 3 - set model parameters using grid search

In [28]:
# Row window of the shuffled training pool that is used to fit the models.
sample_size_start, sample_size_end = 0, 5000

# set_normalized: fit on the normalized features (author's note:
#                 normalization improves performance).
# set_grid_search: run the hyper-parameter grid search cell.
set_normalized = True
set_grid_search = True

# Column 0 carries the 0/1 label prepended by mix_shuffle_data; all
# remaining columns are features.
if set_normalized:
    X, y = (
        train_data_normalized[sample_size_start:sample_size_end, 1:],
        train_data_normalized[sample_size_start:sample_size_end, 0],
    )
else:
    X, y = (
        train_data[sample_size_start:sample_size_end, 1:],
        train_data[sample_size_start:sample_size_end, 0],
    )

In [29]:
# Display the shuffled, normalized training pool (column 0 is the 0/1 label).
train_data_normalized

array([[ 1.        ,  0.00419509, -0.10971104, ..., -0.70422291,
        -0.07203067, -0.35302938],
       [ 0.        ,  0.22750791, -0.6937791 , ...,  0.17309119,
        -0.6335835 , -0.20452161],
       [ 1.        , -0.07479211, -0.32015851, ...,  0.30704842,
         0.00597616, -0.14314217],
       ...,
       [ 1.        ,  1.94557093, -0.4168506 , ...,  0.29437501,
        -0.08823172,  0.15740965],
       [ 0.        , -0.39390043, -0.042429  , ..., -0.12121977,
         0.41020926,  0.89442484],
       [ 0.        ,  0.56744434, -0.09389007, ...,  0.54360275,
         0.14764315,  0.42198376]])

In [30]:
# Display the feature matrix selected for fitting (label column removed).
X


array([[ 4.19509221e-03, -1.09711039e-01,  7.29694046e-01, ...,
        -7.04222915e-01, -7.20306737e-02, -3.53029377e-01],
       [ 2.27507905e-01, -6.93779098e-01,  6.06652999e-01, ...,
         1.73091193e-01, -6.33583496e-01, -2.04521610e-01],
       [-7.47921134e-02, -3.20158514e-01, -7.06274079e-02, ...,
         3.07048421e-01,  5.97616421e-03, -1.43142173e-01],
       ...,
       [ 3.34557921e-01, -1.35058922e-01, -1.93774977e-02, ...,
        -4.59889051e-01, -8.27734497e-01,  2.16336961e-01],
       [ 1.20035124e+00, -1.08593230e-01, -1.02253225e-01, ...,
        -5.94988644e-01,  1.05922832e+00,  7.26696299e-01],
       [-6.34157104e-01,  8.16257849e-01, -2.54833171e-02, ...,
         1.37368551e+00, -6.97700413e-04,  5.95362245e-02]])

In [31]:
# Display the label vector selected for fitting (column 0 of the training pool).
y

array([1., 0., 1., ..., 0., 0., 1.])

In [32]:
if set_grid_search:
    # Hyper-parameter tuning: grid search over forest size and tree depth,
    # scored by accuracy with 3-fold cross-validation on (X, y).
    prams = dict(
        n_estimators=[100, 200, 500],
        max_depth=[15, 20, 25, 30, 35],
    )
    test_RF_classifier = RandomForestClassifier()
    model = GridSearchCV(
        estimator=test_RF_classifier,
        param_grid=prams,
        scoring="accuracy",
        cv=3,
        n_jobs=-1,
        verbose=10,
    )
    model.fit(X, y)
    print("Best estimator is", model.best_params_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done  35 out of  45 | elapsed:   39.5s remaining:   11.2s
[Parallel(n_jobs=-1)]: Done  40 out of  45 | elapsed:   46.7s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.0min finished


Best estimator is {'max_depth': 30, 'n_estimators': 500}


### 4 - model prediction

In [33]:
# Rebuild a forest with the best grid-search parameters and fit it on (X, y).
n_estimators = model.best_params_.get("n_estimators")
max_depth = model.best_params_.get("max_depth")

RF_classifier = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    n_jobs=-1,
)
RF_classifier.fit(X, y)

RandomForestClassifier(max_depth=30, n_estimators=500, n_jobs=-1)

In [34]:
# Predict every data set with the fitted forest, then report for each one the
# fraction of rows predicted as class 1 (printed by pred_error_RF).
if set_normalized:
    (
        fault_free_train_samplesize250000_normalized_pred,
        fault_free_test1_normalized_pred,
        fault_free_test2_normalized_pred,
        faulty_train_normalized_pred,
        faulty_test_normalized_pred,
    ) = (
        RF_classifier.predict(features)
        for features in (
            fault_free_train_samplesize250000_normalized,
            fault_free_test1_normalized,
            fault_free_test2_normalized,
            faulty_train_normalized,
            faulty_test_normalized,
        )
    )
    pred_error_RF(
        prediction=fault_free_train_samplesize250000_normalized_pred,
        label="fault_free_train_samplesize250000_normalized_pred: ",
    )
    pred_error_RF(prediction=fault_free_test1_normalized_pred, label="fault_free_test1_normalized_pred: ")
    pred_error_RF(prediction=fault_free_test2_normalized_pred, label="fault_free_test2_normalized_pred: ")
    pred_error_RF(prediction=faulty_train_normalized_pred, label="faulty_train_normalized_pred: ")
    pred_error_RF(prediction=faulty_test_normalized_pred, label="faulty_test_normalized_pred: ")
else:
    (
        fault_free_train_samplesize250000_pred,
        fault_free_test1_pred,
        fault_free_test2_pred,
        faulty_train_pred,
        faulty_test_pred,
    ) = (
        RF_classifier.predict(features)
        for features in (
            fault_free_train_samplesize250000,
            fault_free_test1,
            fault_free_test2,
            faulty_train,
            faulty_test,
        )
    )
    pred_error_RF(prediction=fault_free_train_samplesize250000_pred, label="fault_free_train_samplesize250000: ")
    pred_error_RF(prediction=fault_free_test1_pred, label="fault_free_test1: ")
    pred_error_RF(prediction=fault_free_test2_pred, label="fault_free_test2: ")
    pred_error_RF(prediction=faulty_train_pred, label="faulty_train: ")
    pred_error_RF(prediction=faulty_test_pred, label="faulty_test: ")

fault_free_train_samplesize250000_normalized_pred: 0.000948
fault_free_test1_normalized_pred: 0.00055
fault_free_test2_normalized_pred: 0.0008
faulty_train_normalized_pred: 0.993218537414966
faulty_test_normalized_pred: 0.9968686224489796
