<a href="https://colab.research.google.com/drive/1mVGcYKkreXHHjT2qTKhZ-jPgm_k-V5h-?usp=sharing">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Import Packages

In [None]:
!pip install skl2onnx
!pip install tf2onnx

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from google.colab import drive
import matplotlib.pyplot as plt

import sklearn
from imblearn.combine import SMOTETomek
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

## Settings

In [None]:
# connect to google drive
drive.mount('/gdrive')

In [None]:
train_ds = '/gdrive/MyDrive/PIJ/dataset/train.xlsx'
test_ds = '/gdrive/MyDrive/PIJ/dataset/test.csv'

## Read Dataset

In [None]:
train_ds = pd.read_excel(train_ds)
test_ds = pd.read_csv(test_ds)

In [None]:
train_ds_default = train_ds.copy()
test_ds_default = test_ds.copy()

In [None]:
# restore dataset without reading the file
train_ds = train_ds_default.copy()
test_ds = test_ds_default.copy()

## Explore and Preprocess Dataset

In [None]:
train_ds.shape

In [None]:
train_ds.head()

In [None]:
train_ds.info()

In [None]:
# Collect (and print) every feature whose share of non-null values is < 80%.
# The collected names in `drop_feature` are the columns to be dropped later.
drop_feature = []

# Count the non-null values of all columns once, instead of recomputing the
# full per-column count on every loop iteration.
non_null_counts = train_ds.count(axis=0)
n_rows = train_ds.shape[0]

for col, num in non_null_counts.items():
    percentage = round(num / n_rows, 4) * 100.0

    if percentage < 80:
        print(f'{col}: {percentage}%')
        drop_feature.append(col)

In [None]:
# drop the features with too many null values in train_ds and test_ds
train_ds.drop(labels=drop_feature, axis=1, inplace=True)
test_ds.drop(labels=drop_feature, axis=1, inplace=True)

In [None]:
# Remove every training row that still contains a null value and renumber
# the remaining rows 0..n-1 (the old index is discarded).
train_ds = train_ds.dropna().reset_index(drop=True)

In [None]:
train_ds

In [None]:
# Convert the string-valued categorical features to integer codes.
# The label -> code mapping is derived from train_ds only and then reused for
# test_ds. Encoding each frame independently would give the same label
# different codes whenever the two frames do not contain the same set of values.
for col in ('Joint', 'SEX'):
    categories = train_ds[col].astype('category').cat.categories
    train_ds[col] = pd.Categorical(train_ds[col], categories=categories).codes
    # NOTE: labels that occur only in test_ds (and missing values) become -1.
    test_ds[col] = pd.Categorical(test_ds[col], categories=categories).codes

## Deep Neural Network

In [None]:
# scaling numerical feature
# Max-scaling: every column except the first one (presumably the 'outcome'
# label — confirm) is divided by its maximum value in train_ds. The same
# training maximum is applied to test_ds so both frames share one scale.
# NOTE(review): a column whose training maximum is 0 would cause a division
# by zero here — confirm that no such column reaches this cell.
for col in train_ds.columns[1:]:
    max_value = train_ds[col].max()
    train_ds[col] = train_ds[col] / max_value
    test_ds[col] = test_ds[col] / max_value

In [None]:
train_ds

In [None]:
# Keep the 'outcome' label plus every feature column in which more than 10%
# of the rows hold a non-zero value; all other columns are discarded.
features = ['outcome'] + [
    col
    for col in train_ds.columns[1:]
    if (train_ds[col] != 0.0).sum() / len(train_ds) > 0.1
]

train_ds = train_ds[features]

In [None]:
train_ds

In [None]:
# Drop outlier samples from the dataframe. Only normal samples (outcome == 0)
# are candidates for removal; abnormal samples must never be dropped.
# Rows are selected by their 'outcome' label instead of a hard-coded row
# count, so the filter stays correct if the number or order of abnormal rows
# changes.
is_normal = (train_ds['outcome'] == 0).values
z_scores = np.abs(stats.zscore(train_ds.loc[is_normal, train_ds.columns[1:]].values))

condition = np.ones(train_ds.shape[0], dtype=bool)
# a normal row is kept only if every feature has |z-score| < 2
condition[is_normal] = (z_scores < 2).all(axis=1)
train_ds = train_ds[condition]
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# Draw 32000 distinct 'normal' rows (outcome == 0) from train_ds; they are the
# training set of the autoencoder. Rows with another outcome get probability 0
# and therefore can never be drawn.
is_normal = (train_ds['outcome'] == 0).values.astype('int8')
prob = is_normal / is_normal.sum()
idx = np.random.choice(
    np.arange(train_ds.shape[0]),
    size=32000,
    p=prob,
    replace=False,
)
train_ds_normal = train_ds.iloc[idx]

# Every row that was not drawn becomes the validation set.
val_ds = train_ds.drop(idx)

In [None]:
# create model
class AnomalyDetector(Model):
    """1-D convolutional autoencoder over inputs of shape (16, 1).

    The encoder applies three Conv1D stages (64, 32 and 16 filters), each
    followed by batch normalisation and a max-pooling of size 2. The decoder
    mirrors it with three Conv1D + up-sampling stages and ends in a
    single-filter Conv1D with a sigmoid activation.
    """

    def __init__(self):
        super().__init__()

        encoder_layers = [
            layers.Input(shape=(16, 1)),
            # stage 1: 64 filters
            layers.Conv1D(64, 3, strides=1, padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling1D(pool_size=2),
            # stage 2: 32 filters, with dropout
            layers.Conv1D(32, 3, strides=1, padding='same', activation='relu'),
            layers.Dropout(rate=0.2),
            layers.BatchNormalization(),
            layers.MaxPooling1D(pool_size=2),
            # stage 3: 16 filters
            layers.Conv1D(16, 3, strides=1, padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling1D(pool_size=2),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        decoder_layers = [
            # stage 1: 16 filters
            layers.Conv1D(16, 3, strides=1, padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.UpSampling1D(size=2),
            # stage 2: 32 filters, with dropout
            layers.Conv1D(32, 3, strides=1, padding='same', activation='relu'),
            layers.Dropout(rate=0.2),
            layers.BatchNormalization(),
            layers.UpSampling1D(size=2),
            # stage 3: 64 filters
            layers.Conv1D(64, 3, strides=1, padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.UpSampling1D(size=2),
            # output: one channel, sigmoid activation
            layers.Conv1D(1, 3, strides=1, padding='same', activation='sigmoid'),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Return the decoder's reconstruction of the encoded input `x`."""
        return self.decoder(self.encoder(x))


autoencoder = AnomalyDetector()

In [None]:
# compile model
autoencoder.compile(optimizer=Adam(learning_rate=0.005), loss='mae')

In [None]:
# Build the autoencoder training tensor from the normal samples: take every
# column except the first one, append three zero columns to each row, then
# add a trailing axis of size 1 -> shape (samples, columns + 3, 1).
X = train_ds_normal.iloc[:, 1:].values
padding = np.zeros((len(train_ds_normal), 3))
X = np.concatenate((X, padding), axis=1)
X = np.expand_dims(X, axis=-1)

In [None]:
history = autoencoder.fit(
    X, X, 
    epochs=120, 
    batch_size=64,
    shuffle=True
)

In [None]:
plt.plot(history.history['loss'])
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

In [None]:
# Build the validation tensor the same way as the training tensor: drop the
# first column, append three zero columns, add a trailing axis of size 1.
val_X = val_ds.iloc[:, 1:].values
padding = np.zeros((len(val_ds), 3))
val_X = np.concatenate((val_X, padding), axis=1)
val_X = np.expand_dims(val_X, axis=-1)

In [None]:
# Store the reconstruction error (mean absolute error per sample) of the
# normal and abnormal rows of the validation dataset.
# The whole validation set is reconstructed in one batched predict() call
# instead of one call per row, and rows are grouped by their 'outcome' label
# (0 = normal) rather than by a hard-coded row position.
reconstructed = autoencoder.predict(val_X)
errors = np.mean(np.abs(reconstructed - val_X), axis=(1, 2))

# val_X was built row by row from val_ds, so the label order matches.
is_normal = (val_ds['outcome'] == 0).values
errors_normal = errors[is_normal].tolist()
errors_abnormal = errors[~is_normal].tolist()

In [None]:
# plot the error distribution of validation data
# (the edges are named `bins` so the builtin bin() is not shadowed)
bins = np.linspace(0, 0.1, 100)
plt.figure(figsize=(10, 8))
plt.hist(errors_normal, bins, alpha=0.5, label='normal')
plt.hist(errors_abnormal, bins, alpha=0.5, label='abnormal')
plt.legend()
plt.show()

In [None]:
# plot the error distribution of validation data (zoom in on errors <= 0.01)
# (the edges are named `bins` so the builtin bin() is not shadowed)
bins = np.linspace(0, 0.01, 100)
plt.figure(figsize=(10, 8))
plt.hist(errors_normal, bins, alpha=0.5, label='normal')
plt.hist(errors_abnormal, bins, alpha=0.5, label='abnormal')
plt.legend()
plt.show()

In [None]:
# calculate threshold
threshold = np.mean(errors_normal) + 1 * np.std(errors_normal)
print(f'threshold: {threshold}')

In [None]:
# Restrict test_ds to the selected feature columns (everything in `features`
# except its leading 'outcome' entry), then append three zero columns and add
# a trailing axis of size 1 -> shape (samples, columns + 3, 1).
test_ds = test_ds[features[1:]]
test_X = test_ds.values
padding = np.zeros((len(test_ds), 3))
test_X = np.concatenate((test_X, padding), axis=1)
test_X = np.expand_dims(test_X, axis=-1)

In [None]:
# autoencoder predicts on testing data
# All rows are reconstructed in one batched predict() call instead of one
# call per row; the per-sample error is the mean absolute reconstruction error.
reconstructed = autoencoder.predict(test_X)
errors = np.mean(np.abs(reconstructed - test_X), axis=(1, 2))

# 0 when the error is below the threshold, 1 otherwise
result = [0 if e < threshold else 1 for e in errors]

In [None]:
np.array(result)

In [None]:
# save processed training and testing dataset
pd.DataFrame(X.reshape(-1, 16)).to_excel('/gdrive/MyDrive/PIJ/dataset/processed/cnn/train.xlsx')
# Bug fix: test.xlsx must hold the processed test data (test_X); it was
# previously written from X, i.e. a second copy of the training data.
pd.DataFrame(test_X.reshape(-1, 16)).to_excel('/gdrive/MyDrive/PIJ/dataset/processed/cnn/test.xlsx')

In [None]:
# Serialize model
autoencoder.save("/gdrive/MyDrive/PIJ/dataset/processed/cnn/model", save_format='tf')
!python3 -m tf2onnx.convert --saved-model /gdrive/MyDrive/PIJ/dataset/processed/cnn/model --output /gdrive/MyDrive/PIJ/dataset/processed/cnn/model.onnx

## Decision Tree (C4.5)

In [None]:
# the shape of dataset
print('train_ds shape: ', train_ds.shape)
print('test_ds shape: ', test_ds.shape)

In [None]:
# scaling numerical feature
for col in train_ds.columns[1:]:
    max_value = train_ds[col].max()
    train_ds[col] = train_ds[col] / max_value
    test_ds[col] = test_ds[col] / max_value

In [None]:
train_ds

In [None]:
# drop the column with too many zero
features = ['outcome']

for col in train_ds.columns[1:]:
    mask = train_ds[col] != 0.0
    val = mask.sum() / len(mask)
    if val > 0.1:
        features.append(col)

train_ds = train_ds[features]

In [None]:
features

In [None]:
train_ds

In [None]:
# Drop outlier samples from the dataframe. Only normal samples (outcome == 0)
# are candidates for removal; abnormal samples must never be dropped.
# Rows are selected by their 'outcome' label instead of a hard-coded row
# count, so the filter stays correct if the number or order of abnormal rows
# changes.
is_normal = (train_ds['outcome'] == 0).values
z_scores = np.abs(stats.zscore(train_ds.loc[is_normal, train_ds.columns[1:]].values))

condition = np.ones(train_ds.shape[0], dtype=bool)
# a normal row is kept only if every feature has |z-score| < 1.5
condition[is_normal] = (z_scores < 1.5).all(axis=1)
train_ds = train_ds[condition]
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# look class distribution of train_ds
series = train_ds['outcome'].value_counts()
print(f'class 0: {round(series[0] / (series[0] + series[1]), 2)}')
print(f'class 1: {round(series[1] / (series[0] + series[1]), 2)}')
plt.bar(x=series.index, height=series.values)

In [None]:
# split val_ds from train_ds with same class distribution => 0.95:0.05 => 950:50
# Rows are sampled WITHOUT replacement: sampling with replacement (the numpy
# default) could put the same row into val_ds several times and would remove
# fewer than 1000 rows from train_ds.
prob = (train_ds['outcome'] == 1.0).values.astype('int')
prob = prob / prob.sum()
idx1 = np.random.choice(np.arange(train_ds.shape[0]), size=50, p=prob, replace=False)

prob = (train_ds['outcome'] == 0.0).values.astype('int')
prob = prob / prob.sum()
idx2 = np.random.choice(np.arange(train_ds.shape[0]), size=950, p=prob, replace=False)

val_ds = pd.concat([train_ds.iloc[idx1], train_ds.iloc[idx2]])
val_ds.reset_index(inplace=True, drop=True)

# drop the row in train_ds
train_ds = train_ds.drop(idx1)
train_ds = train_ds.drop(idx2)
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# over-sampling train_ds
smt = SMOTETomek()
X = train_ds[train_ds.columns[1:]]
y = train_ds[[train_ds.columns[0]]]
X_res, y_res = smt.fit_resample(X, y)
train_ds = pd.concat([X_res, y_res], axis=1)

In [None]:
# look class distribution
print(f'train_ds shape: {train_ds.shape}')
series = train_ds['outcome'].value_counts()
plt.bar(x=series.index, height=series.values)

In [None]:
# train and validate model
# train_ds is split as "all columns but the last" / "last column", while
# val_ds is split as "all columns but the first" / "first column".
X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)

val_X = val_ds[val_ds.columns[1:]].values
val_y = val_ds[val_ds.columns[0]].values
val_X, val_y = sklearn.utils.shuffle(val_X, val_y)

# rows of the validation set that belong to class 1 (loop-invariant)
positive_rows = val_y == 1.0

for i in range(10, 30):
    print(f'depth: {i}')
    clf = DecisionTreeClassifier(max_depth=i, criterion='entropy')
    clf.fit(X, y)
    print(f'train acc: {(clf.predict(X) == y).sum() / y.shape[0]}')
    # predict the validation set once and reuse the result for both metrics
    val_correct = clf.predict(val_X) == val_y
    print(f'val acc: {val_correct.sum() / val_correct.shape[0]}')
    res = val_correct[positive_rows]
    print(f'val acc (class = 1): {res.sum() / res.shape[0]}')
    print()

In [None]:
# train model on whole dataset (train_ds + val_ds)
columns = val_ds.columns.tolist()[1:] + val_ds.columns.tolist()[0:1]
val_ds = val_ds[columns]
train_ds = pd.concat([train_ds, val_ds])

X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)

clf = DecisionTreeClassifier(max_depth=14, criterion='entropy')
clf.fit(X, y)

In [None]:
# model predict
test_X = test_ds[columns[:-1]].values
clf.predict(test_X)

In [None]:
# save processed training and testing dataset
# NOTE(review): train_ds and test_ds are rebuilt here from raw arrays, so
# their columns become plain integer labels instead of feature names. Later
# cells that select columns by name (e.g. train_ds['outcome']) presumably
# require re-running the "restore dataset" and preprocessing cells first —
# confirm.
train_ds = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
train_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/decision_tree/train.xlsx')

test_ds = pd.DataFrame(test_X)
test_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/decision_tree/test.xlsx')

In [None]:
# Serialize model
initial_type = [('float_input', FloatTensorType([None, test_ds.shape[1]]))]
onx = convert_sklearn(clf, initial_types=initial_type)
with open('/gdrive/MyDrive/PIJ/dataset/processed/decision_tree/decision_tree' + ".onnx", "wb") as f:
    f.write(onx.SerializeToString())

## Random Forest

In [None]:
# the shape of dataset
print('train_ds shape: ', train_ds.shape)
print('test_ds shape: ', test_ds.shape)

In [None]:
# scaling numerical feature
for col in train_ds.columns[1:]:
    max_value = train_ds[col].max()
    train_ds[col] = train_ds[col] / max_value
    test_ds[col] = test_ds[col] / max_value

In [None]:
train_ds

In [None]:
# drop the column with too many zero
features = ['outcome']

for col in train_ds.columns[1:]:
    mask = train_ds[col] != 0.0
    val = mask.sum() / len(mask)
    if val > 0.1:
        features.append(col)

train_ds = train_ds[features]

In [None]:
train_ds

In [None]:
# Drop outlier samples from the dataframe. Only normal samples (outcome == 0)
# are candidates for removal; abnormal samples must never be dropped.
# Rows are selected by their 'outcome' label instead of a hard-coded row
# count, so the filter stays correct if the number or order of abnormal rows
# changes.
is_normal = (train_ds['outcome'] == 0).values
z_scores = np.abs(stats.zscore(train_ds.loc[is_normal, train_ds.columns[1:]].values))

condition = np.ones(train_ds.shape[0], dtype=bool)
# a normal row is kept only if every feature has |z-score| < 1.5
condition[is_normal] = (z_scores < 1.5).all(axis=1)
train_ds = train_ds[condition]
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# look class distribution of train_ds
series = train_ds['outcome'].value_counts()
print(f'class 0: {round(series[0] / (series[0] + series[1]), 2)}')
print(f'class 1: {round(series[1] / (series[0] + series[1]), 2)}')
plt.bar(x=series.index, height=series.values)

In [None]:
# split val_ds from train_ds: 20 rows of class 1 and 980 rows of class 0
# (0.98:0.02 => 980:20)
# Rows are sampled WITHOUT replacement: sampling with replacement (the numpy
# default) could put the same row into val_ds several times and would remove
# fewer than 1000 rows from train_ds.
prob = (train_ds['outcome'] == 1.0).values.astype('int')
prob = prob / prob.sum()
idx1 = np.random.choice(np.arange(train_ds.shape[0]), size=20, p=prob, replace=False)

prob = (train_ds['outcome'] == 0.0).values.astype('int')
prob = prob / prob.sum()
idx2 = np.random.choice(np.arange(train_ds.shape[0]), size=980, p=prob, replace=False)

val_ds = pd.concat([train_ds.iloc[idx1], train_ds.iloc[idx2]])
val_ds.reset_index(inplace=True, drop=True)

# drop the row in train_ds
train_ds = train_ds.drop(idx1)
train_ds = train_ds.drop(idx2)
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# over-sampling train_ds
smt = SMOTETomek()
X = train_ds[train_ds.columns[1:]]
y = train_ds[[train_ds.columns[0]]]
X_res, y_res = smt.fit_resample(X, y)
train_ds = pd.concat([X_res, y_res], axis=1)

In [None]:
# look class distribution
print(f'train_ds shape: {train_ds.shape}')
series = train_ds['outcome'].value_counts()
plt.bar(x=series.index, height=series.values)

In [None]:
# train and validate model
# train_ds is split as "all columns but the last" / "last column", while
# val_ds is split as "all columns but the first" / "first column".
X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)

val_X = val_ds[val_ds.columns[1:]].values
val_y = val_ds[val_ds.columns[0]].values
val_X, val_y = sklearn.utils.shuffle(val_X, val_y)

# rows of the validation set that belong to class 1 (loop-invariant)
positive_rows = val_y == 1.0

for depth in range(10, 30):

    print(f'depth: {depth}')

    clf = RandomForestClassifier(n_estimators=10, max_depth=depth)
    clf.fit(X, y)
    print(f'train acc: {(clf.predict(X) == y).sum() / y.shape[0]}')

    # predict the validation set once and reuse the result for both metrics
    val_correct = clf.predict(val_X) == val_y
    val_acc_total = val_correct.sum() / val_correct.shape[0]
    print(f'val acc: {val_acc_total}')

    res = val_correct[positive_rows]
    val_acc_part = res.sum() / res.shape[0]
    print(f'val acc (class = 1): {val_acc_part}')

    print()

In [None]:
# train model on whole dataset (train_ds + val_ds)
columns = val_ds.columns.tolist()[1:] + val_ds.columns.tolist()[0:1]
val_ds = val_ds[columns]
train_ds = pd.concat([train_ds, val_ds])

X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)


clf = RandomForestClassifier(n_estimators=10, max_depth=25)
clf.fit(X, y)

In [None]:
# model predict
test_X = test_ds[columns[:-1]].values
clf.predict(test_X)

In [None]:
# save processed training and testing dataset
train_ds = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
train_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/random_forest/train.xlsx')

test_ds = pd.DataFrame(test_X)
test_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/random_forest/test.xlsx')

In [None]:
# Serialize model
initial_type = [('float_input', FloatTensorType([None, test_ds.shape[1]]))]
onx = convert_sklearn(clf, initial_types=initial_type)
with open('/gdrive/MyDrive/PIJ/dataset/processed/random_forest/random_forest' + ".onnx", "wb") as f:
    f.write(onx.SerializeToString())

## Logistic Regression

In [None]:
# the shape of dataset
print('train_ds shape: ', train_ds.shape)
print('test_ds shape: ', test_ds.shape)

In [None]:
# scaling numerical feature
for col in train_ds.columns[1:]:
    max_value = train_ds[col].max()
    train_ds[col] = train_ds[col] / max_value
    test_ds[col] = test_ds[col] / max_value

In [None]:
train_ds

In [None]:
# drop the column with too many zero
features = ['outcome']

for col in train_ds.columns[1:]:
    mask = train_ds[col] != 0.0
    val = mask.sum() / len(mask)
    if val > 0.1:
        features.append(col)

train_ds = train_ds[features]

In [None]:
train_ds

In [None]:
# Drop outlier samples from the dataframe. Only normal samples (outcome == 0)
# are candidates for removal; abnormal samples must never be dropped.
# Rows are selected by their 'outcome' label instead of a hard-coded row
# count, so the filter stays correct if the number or order of abnormal rows
# changes.
is_normal = (train_ds['outcome'] == 0).values
z_scores = np.abs(stats.zscore(train_ds.loc[is_normal, train_ds.columns[1:]].values))

condition = np.ones(train_ds.shape[0], dtype=bool)
# a normal row is kept only if every feature has |z-score| < 1.5
condition[is_normal] = (z_scores < 1.5).all(axis=1)
train_ds = train_ds[condition]
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# look class distribution of train_ds
series = train_ds['outcome'].value_counts()
print(f'class 0: {round(series[0] / (series[0] + series[1]), 2)}')
print(f'class 1: {round(series[1] / (series[0] + series[1]), 2)}')
plt.bar(x=series.index, height=series.values)

In [None]:
# split val_ds from train_ds: 20 rows of class 1 and 980 rows of class 0
# (0.98:0.02 => 980:20)
# Rows are sampled WITHOUT replacement: sampling with replacement (the numpy
# default) could put the same row into val_ds several times and would remove
# fewer than 1000 rows from train_ds.
prob = (train_ds['outcome'] == 1.0).values.astype('int')
prob = prob / prob.sum()
idx1 = np.random.choice(np.arange(train_ds.shape[0]), size=20, p=prob, replace=False)

prob = (train_ds['outcome'] == 0.0).values.astype('int')
prob = prob / prob.sum()
idx2 = np.random.choice(np.arange(train_ds.shape[0]), size=980, p=prob, replace=False)

val_ds = pd.concat([train_ds.iloc[idx1], train_ds.iloc[idx2]])
val_ds.reset_index(inplace=True, drop=True)

# drop the row in train_ds
train_ds = train_ds.drop(idx1)
train_ds = train_ds.drop(idx2)
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# over-sampling train_ds
smt = SMOTETomek()
X = train_ds[train_ds.columns[1:]]
y = train_ds[[train_ds.columns[0]]]
X_res, y_res = smt.fit_resample(X, y)
train_ds = pd.concat([X_res, y_res], axis=1)

In [None]:
# look class distribution
print(f'train_ds shape: {train_ds.shape}')
series = train_ds['outcome'].value_counts()
plt.bar(x=series.index, height=series.values)

In [None]:
# train and validate model
X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)

val_X = val_ds[val_ds.columns[1:]].values
val_y = val_ds[val_ds.columns[0]].values
val_X, val_y = sklearn.utils.shuffle(val_X, val_y)

In [None]:
# Fit a logistic regression on the training data and report its accuracy on
# the training set, the validation set, and the class-1 validation rows.
lr = LogisticRegression(max_iter=400)
lr.fit(X, y)
print(f'train acc: {(lr.predict(X) == y).sum() / y.shape[0]}')

# predict the validation set once and reuse the result for both metrics
val_correct = lr.predict(val_X) == val_y
val_acc_total = val_correct.sum() / val_correct.shape[0]
print(f'val acc: {val_acc_total}')

res = val_correct[val_y == 1.0]
val_acc_part = res.sum() / res.shape[0]
print(f'val acc (class = 1): {val_acc_part}')

In [None]:
# train model on whole dataset (train_ds + val_ds)
columns = val_ds.columns.tolist()[1:] + val_ds.columns.tolist()[0:1]
val_ds = val_ds[columns]
train_ds = pd.concat([train_ds, val_ds])

X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)


lr = LogisticRegression(max_iter=400)
lr.fit(X, y)

In [None]:
# model predict
test_X = test_ds[columns[:-1]].values
lr.predict(test_X)

In [None]:
lr.intercept_

In [None]:
# save processed training and testing dataset
train_ds = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
train_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/logistic_regression/train.xlsx')

test_ds = pd.DataFrame(test_X)
test_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/logistic_regression/test.xlsx')

In [None]:
# Serialize model
initial_type = [('float_input', FloatTensorType([None, test_ds.shape[1]]))]
onx = convert_sklearn(lr, initial_types=initial_type)
with open('/gdrive/MyDrive/PIJ/dataset/processed/logistic_regression/logistic_regression' + ".onnx", "wb") as f:
    f.write(onx.SerializeToString())

## Support Vector Machine

In [None]:
# the shape of dataset
print('train_ds shape: ', train_ds.shape)
print('test_ds shape: ', test_ds.shape)

In [None]:
# scaling numerical feature
for col in train_ds.columns[1:]:
    max_value = train_ds[col].max()
    train_ds[col] = train_ds[col] / max_value
    test_ds[col] = test_ds[col] / max_value

In [None]:
train_ds

In [None]:
# drop the column with too many zero
features = ['outcome']

for col in train_ds.columns[1:]:
    mask = train_ds[col] != 0.0
    val = mask.sum() / len(mask)
    if val > 0.1:
        features.append(col)

train_ds = train_ds[features]

In [None]:
train_ds

In [None]:
# Drop outlier samples from the dataframe. Only normal samples (outcome == 0)
# are candidates for removal; abnormal samples must never be dropped.
# Rows are selected by their 'outcome' label instead of a hard-coded row
# count, so the filter stays correct if the number or order of abnormal rows
# changes.
is_normal = (train_ds['outcome'] == 0).values
z_scores = np.abs(stats.zscore(train_ds.loc[is_normal, train_ds.columns[1:]].values))

condition = np.ones(train_ds.shape[0], dtype=bool)
# a normal row is kept only if every feature has |z-score| < 1.5
condition[is_normal] = (z_scores < 1.5).all(axis=1)
train_ds = train_ds[condition]
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# look class distribution of train_ds
series = train_ds['outcome'].value_counts()
print(f'class 0: {round(series[0] / (series[0] + series[1]), 2)}')
print(f'class 1: {round(series[1] / (series[0] + series[1]), 2)}')
plt.bar(x=series.index, height=series.values)

In [None]:
# split val_ds from train_ds: 20 rows of class 1 and 980 rows of class 0
# (0.98:0.02 => 980:20)
# Rows are sampled WITHOUT replacement: sampling with replacement (the numpy
# default) could put the same row into val_ds several times and would remove
# fewer than 1000 rows from train_ds.
prob = (train_ds['outcome'] == 1.0).values.astype('int')
prob = prob / prob.sum()
idx1 = np.random.choice(np.arange(train_ds.shape[0]), size=20, p=prob, replace=False)

prob = (train_ds['outcome'] == 0.0).values.astype('int')
prob = prob / prob.sum()
idx2 = np.random.choice(np.arange(train_ds.shape[0]), size=980, p=prob, replace=False)

val_ds = pd.concat([train_ds.iloc[idx1], train_ds.iloc[idx2]])
val_ds.reset_index(inplace=True, drop=True)

# drop the row in train_ds
train_ds = train_ds.drop(idx1)
train_ds = train_ds.drop(idx2)
train_ds.reset_index(inplace=True, drop=True)

In [None]:
train_ds

In [None]:
# over-sampling train_ds
smt = SMOTETomek()
X = train_ds[train_ds.columns[1:]]
y = train_ds[[train_ds.columns[0]]]
X_res, y_res = smt.fit_resample(X, y)
train_ds = pd.concat([X_res, y_res], axis=1)

In [None]:
# look class distribution
print(f'train_ds shape: {train_ds.shape}')
series = train_ds['outcome'].value_counts()
plt.bar(x=series.index, height=series.values)

In [None]:
# train and validate model
X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)

val_X = val_ds[val_ds.columns[1:]].values
val_y = val_ds[val_ds.columns[0]].values
val_X, val_y = sklearn.utils.shuffle(val_X, val_y)

In [None]:
# tune hyper-parameter for model
# Fit a polynomial-kernel SVM and report its accuracy on the training set,
# the validation set, and the class-1 validation rows.
svm = SVC(kernel='poly', gamma=0.3, C=1.0)
svm.fit(X, y)
print(f'train acc: {(svm.predict(X) == y).sum() / y.shape[0]}')

# predict the validation set once and reuse the result for both metrics
val_correct = svm.predict(val_X) == val_y
val_acc_total = val_correct.sum() / val_correct.shape[0]
print(f'val acc: {val_acc_total}')

res = val_correct[val_y == 1.0]
val_acc_part = res.sum() / res.shape[0]
print(f'val acc (class = 1): {val_acc_part}')

In [None]:
# train model on whole dataset (train_ds + val_ds)
columns = val_ds.columns.tolist()[1:] + val_ds.columns.tolist()[0:1]
val_ds = val_ds[columns]
train_ds = pd.concat([train_ds, val_ds])

X = train_ds[train_ds.columns[:-1]].values
y = train_ds[train_ds.columns[-1]].values
X, y = sklearn.utils.shuffle(X, y)


svm = SVC(kernel='poly', gamma=0.3, C=1.0)
svm.fit(X, y)

In [None]:
# model predict
test_X = test_ds[columns[:-1]].values
svm.predict(test_X)

In [None]:
# save processed training and testing dataset
train_ds = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
train_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/support_vector_machine/train.xlsx')

test_ds = pd.DataFrame(test_X)
test_ds.to_excel('/gdrive/MyDrive/PIJ/dataset/processed/support_vector_machine/test.xlsx')

In [None]:
# Serialize model
initial_type = [('float_input', FloatTensorType([None, test_ds.shape[1]]))]
onx = convert_sklearn(svm, initial_types=initial_type)
with open('/gdrive/MyDrive/PIJ/dataset/processed/support_vector_machine/support_vector_machine' + ".onnx", "wb") as f:
    f.write(onx.SerializeToString())