## Models with POJ Dataset (-O3 and -Oz)

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

# Suppress every Python warning for the remainder of the kernel session.
# NOTE(review): this is a global filter, so any warning raised while the
# models below are trained or evaluated is hidden as well.
warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
# Read the three POJ feature tables in one pass:
#   df    -> features extracted without optimization (O0 file)
#   df_O3 -> features extracted from the O3 file
#   df_Oz -> features extracted from the Oz file
# `skipinitialspace=True` strips the blanks that follow each delimiter.
df, df_O3, df_Oz = (
    pd.read_csv(csv_path, skipinitialspace=True)
    for csv_path in (
        "./../../../features/POJ/poj_features_O0.csv",
        "./../../../features/POJ/poj_features_O3.csv",
        "./../../../features/POJ/poj_features_Oz.csv",
    )
)

#### ~> Split the datasets into a training and test set

In [4]:
# Split every dataset 80/20 into training and test partitions.
#
# A seeded RandomState is created inside this cell so that re-running the
# cell (or the whole notebook) always yields the same partitions; without a
# seed the reported metrics change on every run. The same generator instance
# is passed to all three calls, so each call consumes fresh random numbers
# and the three splits remain independent of one another.
SEED = 42
split_rng = np.random.RandomState(SEED)

# Without Optimization
train, test = train_test_split(df, test_size=0.2, random_state=split_rng)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=split_rng)
train_Oz, test_Oz = train_test_split(df_Oz, test_size=0.2, random_state=split_rng)

### Case Studies

#### ~> Optimizations

In [5]:
# Baseline arrays built from the unoptimized split.
# Features: every column except the first and the last.
# Target:   the last column.
# NOTE(review): column 0 is dropped here — presumably an identifier; confirm
# against the CSV header.
x_train = train.iloc[:, 1:-1].to_numpy()
y_train = train.iloc[:, -1].to_numpy()
x_test = test.iloc[:, 1:-1].to_numpy()
y_test = test.iloc[:, -1].to_numpy()

# Number of baseline training rows; later cells take half of it when mixing
# baseline and optimized samples.
size = x_train.shape[0]

##### **O3**


In [6]:
# -O3 arrays: same column layout as the baseline (skip column 0, last column
# is the target).
x_train_O3 = train_O3.iloc[:, 1:-1].to_numpy()
y_train_O3 = train_O3.iloc[:, -1].to_numpy()
x_test_O3 = test_O3.iloc[:, 1:-1].to_numpy()
y_test_O3 = test_O3.iloc[:, -1].to_numpy()

# Mixed training set (baseline + O3): the first half of the baseline training
# rows followed by the same number of leading rows from the O3 training set.
half = size // 2
x_train_O3_all = np.concatenate((x_train[:half], x_train_O3[:half]))
y_train_O3_all = np.concatenate((y_train[:half], y_train_O3[:half]))

Using -O3 only in the testing phase

In [7]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Evaluation order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
classifiers = (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression)

# Train on the unoptimized training set only and evaluate on the O3 test set.
for evaluate in classifiers:
    evaluate(x_train, y_train, x_test_O3, y_test_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0553, 0.0713, 0.0520, 0.0325, 0.0083
Random Forest, 0.2083, 0.2676, 0.2068, 0.1835, 3.4152
SVM, 0.0710, 0.1165, 0.0712, 0.0553, 707.7482
SGD, 0.0685, 0.1241, 0.0694, 0.0560, 19.0674
MLP, 0.0436, 0.0457, 0.0421, 0.0337, 160.3469
Regressão Logística, 0.0697, 0.1073, 0.0675, 0.0492, 44.7124


Using -O3 in both the training and testing phases

In [8]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Evaluation order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
classifiers = (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression)

# Train on the mixed (unoptimized + O3) training set and evaluate on the O3
# test set.
for evaluate in classifiers:
    evaluate(x_train_O3_all, y_train_O3_all, x_test_O3, y_test_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6637, 0.6758, 0.6629, 0.6601, 0.0063
Random Forest, 0.8224, 0.8222, 0.8213, 0.8197, 2.4759
SVM, 0.4939, 0.5651, 0.4900, 0.4669, 656.6656
SGD, 0.3228, 0.4683, 0.3206, 0.3232, 10.2400
MLP, 0.4736, 0.4667, 0.4735, 0.4589, 292.1415
Regressão Logística, 0.5728, 0.5743, 0.5697, 0.5636, 36.4848


##### **Oz**


In [9]:
# -Oz arrays: the last column is the target.
# NOTE(review): the feature slice here starts at column 0 (`0:-1`), whereas
# the unoptimized and O3 cells start at column 1 (`1:-1`). If the Oz CSV has
# the same layout as the other two, this keeps one extra column and the
# concatenation below would see mismatched widths — confirm the Oz schema and
# align the slice if needed.
x_train_Oz = train_Oz.iloc[:, 0:-1].to_numpy()
y_train_Oz = train_Oz.iloc[:, -1].to_numpy()
x_test_Oz = test_Oz.iloc[:, 0:-1].to_numpy()
y_test_Oz = test_Oz.iloc[:, -1].to_numpy()

# Mixed training set (baseline + Oz): the first half of the baseline training
# rows followed by the same number of leading rows from the Oz training set.
half = size // 2
x_train_Oz_all = np.concatenate((x_train[:half], x_train_Oz[:half]))
y_train_Oz_all = np.concatenate((y_train[:half], y_train_Oz[:half]))

Using -Oz only in the testing phase

In [12]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Evaluation order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
classifiers = (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression)

# Train on the unoptimized training set only and evaluate on the Oz test set.
for evaluate in classifiers:
    evaluate(x_train, y_train, x_test_Oz, y_test_Oz)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0456, 0.0683, 0.0439, 0.0248, 0.0079
Random Forest, 0.2206, 0.2903, 0.2203, 0.1932, 2.6743
SVM, 0.0937, 0.1476, 0.0898, 0.0719, 840.6693
SGD, 0.0884, 0.1528, 0.0850, 0.0693, 21.9135
MLP, 0.0617, 0.0974, 0.0594, 0.0504, 595.2360
Regressão Logística, 0.0811, 0.1184, 0.0762, 0.0576, 54.9512


Using -Oz in both the training and testing phases

In [11]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Evaluation order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
classifiers = (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression)

# Train on the mixed (unoptimized + Oz) training set and evaluate on the Oz
# test set.
for evaluate in classifiers:
    evaluate(x_train_Oz_all, y_train_Oz_all, x_test_Oz, y_test_Oz)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7036, 0.7179, 0.7023, 0.6996, 0.0067
Random Forest, 0.8430, 0.8421, 0.8426, 0.8407, 2.4940
SVM, 0.5412, 0.6475, 0.5382, 0.5212, 595.5683
SGD, 0.3882, 0.5676, 0.3856, 0.3842, 14.5677
MLP, 0.5339, 0.5281, 0.5313, 0.5178, 434.3872
Regressão Logística, 0.5791, 0.5801, 0.5778, 0.5696, 44.6301
