### 0 - import libraries

In [30]:
# import pyreadr as py
import missingno as msno
import pandas as pd
from matplotlib import pyplot as plt

In [31]:
import numpy as np
import tensorflow as tf
from numpy import mean
from numpy import std
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import model_from_json

In [32]:
import platform
import sys


# Detect the host operating system from the generic system name instead of
# comparing against one exact kernel/glibc build string: with the exact-match
# test every other Linux (or macOS) machine was classified as Windows.
# Exactly one of the two flags is True; `os_ubuntu` covers every non-Windows host.
os_windows = platform.system() == "Windows"
os_ubuntu = not os_windows
print("os_windows: " + str(os_windows))
print("os_ubuntu: " + str(os_ubuntu))

print("sys.version: " + str(sys.version))

os_windows: True
os_ubuntu: False
sys.version: 3.7.1 (default, Oct 28 2018, 08:39:03) [MSC v.1912 64 bit (AMD64)]


In [33]:
import math

In [34]:
import sklearn


# Record the scikit-learn version in the notebook output.
print(f"sklearn.__version__: {sklearn.__version__}")
from sklearn import datasets
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

sklearn.__version__: 0.23.2


### 1 - define helper functions

In [35]:
def normalization_data(data, eta=2):
    """Standardize every column of ``data`` and shrink it by the factor ``eta``.

    Each column is centred on its own mean and divided by ``eta`` times its own
    population standard deviation. Both statistics are computed from ``data``
    itself, so every array passed in is scaled independently.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Values to normalize. Non-floating input (e.g. integers) is converted
        to ``float64`` first so the result is not truncated.
    eta : scalar, default 2
        Extra divisor applied to the standard deviation.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape as ``data``; the input is not modified.
        Columns with zero standard deviation come back as all zeros instead of
        NaN.
    """
    values = np.asarray(data)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)

    column_mean = np.mean(values, axis=0)
    column_std = np.std(values, axis=0)
    # A constant column has std == 0 and its centred values are exactly 0;
    # dividing by 1 instead of 0 keeps those zeros rather than producing NaN.
    safe_std = np.where(column_std == 0, 1.0, column_std)

    # Broadcasting applies the per-column statistics to every row at once,
    # replacing the element-by-element Python double loop.
    return (values - column_mean) / (safe_std * eta)

In [36]:
# mix the fault free data and faulty data
def mix_shuffle_data(fault_free_data, faulty_data):
    """Label, stack and row-shuffle fault-free and faulty samples.

    A label column is prepended to each input (0 for ``fault_free_data``,
    1 for ``faulty_data``), the two labelled matrices are stacked on top of
    each other, and the rows of the result are shuffled in place with NumPy's
    global random generator.

    Returns the shuffled matrix; column 0 holds the label and the remaining
    columns hold the original features.
    """
    n_fault_free = fault_free_data.shape[0]
    n_faulty = faulty_data.shape[0]

    labelled_fault_free = np.concatenate((np.zeros((n_fault_free, 1)), fault_free_data), axis=1)
    labelled_faulty = np.concatenate((np.ones((n_faulty, 1)), faulty_data), axis=1)

    mixed_data = np.concatenate((labelled_fault_free, labelled_faulty), axis=0)
    np.random.shuffle(mixed_data)
    return mixed_data

In [49]:
def pred_error_classifier(prediction, label):
    """Print and return the mean of ``prediction`` along its first axis length.

    The value is ``prediction.sum()`` divided by ``prediction.shape[0]``; for
    0/1 predictions this is the fraction of samples predicted as 1.
    The line printed is ``label`` immediately followed by that value.
    """
    total_flagged = prediction.sum()
    n_samples = prediction.shape[0]
    detection_rate = total_flagged / n_samples

    print(label + str(detection_rate))
    return detection_rate

### 2 - read data

In [38]:
# Data root: an absolute project folder on the Windows machine, otherwise the
# current working directory (empty prefix).
root_folder = "C:/Users/liang/Masterarbeit_Jupyter_Lab/" if os_windows else ""

In [39]:
# Divisor applied on top of the standard deviation in normalization_data.
eta = 2
print(f"eta: {eta}")

eta: 2


In [40]:
read_data, normalize_data = True, True

if read_data and normalize_data:
    # Each fault-free training sample is read from a headerless,
    # comma-separated file and normalized right after loading.

    # 20,000-row sample
    fault_free_train_samplesize20000 = pd.read_csv(
        root_folder + "data_set_csv/fault_free_train_sample_size=20000",
        sep=",",
        header=None,
    ).to_numpy()
    fault_free_train_samplesize20000_normalized = normalization_data(
        fault_free_train_samplesize20000, eta
    )

    # 40,000-row sample
    fault_free_train_samplesize40000 = pd.read_csv(
        root_folder + "data_set_csv/fault_free_train_sample_size=40000",
        sep=",",
        header=None,
    ).to_numpy()
    fault_free_train_samplesize40000_normalized = normalization_data(
        fault_free_train_samplesize40000, eta
    )

    # 100,000-row sample
    fault_free_train_samplesize100000 = pd.read_csv(
        root_folder + "data_set_csv/fault_free_train_sample_size=100000",
        sep=",",
        header=None,
    ).to_numpy()
    fault_free_train_samplesize100000_normalized = normalization_data(
        fault_free_train_samplesize100000, eta
    )

    # 250,000-row sample
    fault_free_train_samplesize250000 = pd.read_csv(
        root_folder + "data_set_csv/fault_free_train_sample_size=250000",
        sep=",",
        header=None,
    ).to_numpy()
    fault_free_train_samplesize250000_normalized = normalization_data(
        fault_free_train_samplesize250000, eta
    )

In [41]:
# First fault-free test set: headerless CSV -> NumPy matrix, then scaled with
# its own column statistics by normalization_data.
fault_free_test1 = pd.read_csv(
    root_folder + "data_set_csv/fault_free_test1=20000",
    sep=",",
    header=None,
).to_numpy()
fault_free_test1_normalized = normalization_data(fault_free_test1, eta)
np.shape(fault_free_test1_normalized)

(20000, 52)

In [42]:
# Second fault-free test set: headerless CSV -> NumPy matrix, then scaled with
# its own column statistics by normalization_data.
fault_free_test2 = pd.read_csv(
    root_folder + "data_set_csv/fault_free_test2=20000",
    sep=",",
    header=None,
).to_numpy()
fault_free_test2_normalized = normalization_data(fault_free_test2, eta)
np.shape(fault_free_test2_normalized)

(20000, 52)

In [43]:
# Faulty training data: headerless CSV -> NumPy matrix, then scaled with its
# own column statistics by normalization_data.
faulty_train = pd.read_csv(
    root_folder + "data_set_csv/faulty_train=10%",
    sep=",",
    header=None,
).to_numpy()
faulty_train_normalized = normalization_data(faulty_train, eta)
np.shape(faulty_train_normalized)

(470400, 52)

In [67]:
# Faulty test data: headerless CSV -> NumPy matrix, then scaled with its own
# column statistics by normalization_data.
faulty_test = pd.read_csv(
    root_folder + "data_set_csv/faulty_test=10%",
    sep=",",
    header=None,
).to_numpy()
faulty_test_normalized = normalization_data(faulty_test, eta)
np.shape(faulty_test_normalized)

(784000, 52)

In [73]:
# Training-set composition: the 250,000-row fault-free sample plus the first
# `faulty_train_sample_size` rows of the faulty training data.
fault_free_train = fault_free_train_samplesize250000
fault_free_train_normalized = fault_free_train_samplesize250000_normalized
faulty_train_sample_size = 400000

# mix_shuffle_data shuffles with NumPy's global RNG. Seeding it makes the
# row order (and therefore the row window later used for fitting) reproducible
# across kernel restarts. Re-seeding before the second call is intended to give
# the raw and the normalized matrix the same row permutation, since both have
# the same number of rows.
shuffle_seed = 0

np.random.seed(shuffle_seed)
train_data = mix_shuffle_data(fault_free_train, faulty_train[:faulty_train_sample_size, :])

np.random.seed(shuffle_seed)
train_data_normalized = mix_shuffle_data(
    fault_free_train_normalized, faulty_train_normalized[:faulty_train_sample_size, :]
)

# Sanity check: both variants must describe the same samples.
assert train_data.shape == train_data_normalized.shape
train_data_normalized.shape

(650000, 53)

### 3 - set model parameters using grid search

In [74]:
# define the model
# model = RandomForestClassifier()
# evaluate the model
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Row window of the shuffled training matrix that is used for fitting.
sample_size_start = 0
sample_size_end = 5000

set_normalized = True
set_grid_search = True

# Column 0 of the mixed matrices is the label written by mix_shuffle_data;
# every other column is a feature.
X, y = (
    (
        train_data_normalized[sample_size_start:sample_size_end, 1:],
        train_data_normalized[sample_size_start:sample_size_end, 0],
    )
    if set_normalized
    else (
        train_data[sample_size_start:sample_size_end, 1:],
        train_data[sample_size_start:sample_size_end, 0],
    )
)

In [75]:
# RBF-kernel support vector classifier with C=1, trained on the selected window.
svc_classifier = svm.SVC(kernel="rbf", C=1)
svc_classifier = svc_classifier.fit(X, y)  # keep the object returned by fit, as before

In [76]:
# if set_grid_search:
#     # model fitting and hyperparameter tuning using grid search
#     test_RF_classifier = RandomForestClassifier()
#     # weights = np.linspace(0.05, 0.95, 20)
#     prams = {
#         "n_estimators": [100, 200, 500],
#         "max_depth": [15, 20, 25, 30, 35],  # ,'class_weight': [{0: x, 1: 1.0-x} for x in weights]
#     }
#     model = GridSearchCV(test_RF_classifier, param_grid=prams, verbose=10, n_jobs=-1, scoring="accuracy", cv=3)
#     model.fit(X, y)
#     print("Best estimator is", model.best_params_)

### 4 - model prediction

In [77]:
# max_depth=model.best_params_.get('max_depth')
# n_estimators=model.best_params_.get('n_estimators')

# RF_classifier = RandomForestClassifier(n_jobs=-1, max_depth=max_depth, n_estimators=n_estimators)
# RF_classifier.fit(X, y)

In [78]:
# Predict every data set with the fitted classifier, then print the share of
# samples predicted as 1 for each of them (all predictions are computed before
# the first line is printed).
classifier = svc_classifier
if set_normalized:
    (
        fault_free_train_samplesize20000_normalized_pred,
        fault_free_test1_normalized_pred,
        fault_free_test2_normalized_pred,
        faulty_train_normalized_pred,
        faulty_test_normalized_pred,
    ) = [
        classifier.predict(features)
        for features in (
            fault_free_train_samplesize20000_normalized,
            fault_free_test1_normalized,
            fault_free_test2_normalized,
            faulty_train_normalized,
            faulty_test_normalized,
        )
    ]
    for predicted, report_label in (
        (fault_free_train_samplesize20000_normalized_pred, "fault_free_train_samplesize20000_normalized_pred: "),
        (fault_free_test1_normalized_pred, "fault_free_test1_normalized_pred: "),
        (fault_free_test2_normalized_pred, "fault_free_test2_normalized_pred: "),
        (faulty_train_normalized_pred, "faulty_train_normalized_pred: "),
        (faulty_test_normalized_pred, "faulty_test_normalized_pred: "),
    ):
        pred_error_classifier(predicted, report_label)
else:
    (
        fault_free_train_samplesize20000_pred,
        fault_free_test1_pred,
        fault_free_test2_pred,
        faulty_train_pred,
        faulty_test_pred,
    ) = [
        classifier.predict(features)
        for features in (
            fault_free_train_samplesize20000,
            fault_free_test1,
            fault_free_test2,
            faulty_train,
            faulty_test,
        )
    ]
    for predicted, report_label in (
        (fault_free_train_samplesize20000_pred, "fault_free_train_samplesize20000: "),
        (fault_free_test1_pred, "fault_free_test1: "),
        (fault_free_test2_pred, "fault_free_test2: "),
        (faulty_train_pred, "faulty_train: "),
        (faulty_test_pred, "faulty_test: "),
    ):
        pred_error_classifier(predicted, report_label)

fault_free_train_samplesize20000_normalized_pred: 0.0191
fault_free_test1_normalized_pred: 0.01635
fault_free_test2_normalized_pred: 0.01615
faulty_train_normalized_pred: 0.9668771258503401
faulty_test_normalized_pred: 0.9543686224489796


array([1., 1., 0., ..., 0., 0., 0.])