## Models with OJClone Dataset (-O0 and -O3)

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
# No obfuscation
df = pd.read_csv("./../../../features/OJClone/ojclone_features_O0.csv", skipinitialspace=True)

# Optimizations
df_O3 = pd.read_csv("./../../../features/OJClone/ojclone_features_O3.csv", skipinitialspace=True)

#### ~> Split the datasets into a training and test set

In [4]:
# No obfuscation
train, test = train_test_split(df, test_size=0.2)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2)

### Case of Study

#### ~> No Obfuscation (-O0)


In [7]:
# without obfuscation
x_train, y_train = train.iloc[:,1:-1].to_numpy(), train.iloc[:,-1].to_numpy()
x_test, y_test = test.iloc[:,1:-1].to_numpy(), test.iloc[:,-1].to_numpy()  

In [6]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train, y_train, x_test, y_test)
# Random Forest
model_rf(x_train, y_train, x_test, y_test)
# SVM
model_svm(x_train, y_train, x_test, y_test)
# Stochastic Gradient Descent
model_SGD(x_train, y_train, x_test, y_test)
# Multi-layer Perceptron
model_MLP(x_train, y_train, x_test, y_test)
# Regressão Logística
model_regRegression(x_train, y_train, x_test, y_test)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6439, 0.6534, 0.6424, 0.6356, 0.0721
Random Forest, 0.8101, 0.8054, 0.8077, 0.8041, 3.1203
SVM, 0.3621, 0.5813, 0.3597, 0.3685, 576.7534
SGD, 0.3523, 0.4653, 0.3452, 0.3382, 13.4333
MLP, 0.5416, 0.5269, 0.5383, 0.5210, 209.2756
Regressão Logística, 0.5676, 0.5560, 0.5666, 0.5527, 32.4680


#### ~> Obfuscation (-O3)

In [5]:
# without obfuscation
x_train_O3, y_train_O3 = train_O3.iloc[:,1:-1].to_numpy(), train_O3.iloc[:,-1].to_numpy()
x_test_O3, y_test_O3 = test_O3.iloc[:,1:-1].to_numpy(), test_O3.iloc[:,-1].to_numpy() 

Using O3 only in the testing phase

In [8]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train, y_train, x_test, y_test)
# Random Forest
model_rf(x_train, y_train, x_test, y_test)
# SVM
model_svm(x_train, y_train, x_test, y_test)
# Stochastic Gradient Descent
model_SGD(x_train, y_train, x_test, y_test)
# Multi-layer Perceptron
model_MLP(x_train, y_train, x_test, y_test)
# Regressão Logística
model_regRegression(x_train, y_train, x_test, y_test)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6391, 0.6527, 0.6382, 0.6329, 0.2688
Random Forest, 0.8063, 0.8036, 0.8054, 0.8012, 2.2284
SVM, 0.2057, 0.3539, 0.2024, 0.2082, 552.6584
SGD, 0.3189, 0.4101, 0.3166, 0.2928, 15.3395
MLP, 0.5370, 0.5422, 0.5385, 0.5212, 312.6559
Regressão Logística, 0.5554, 0.5413, 0.5551, 0.5390, 35.4521


Using O3 in the testing and training phase

In [9]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train_O3, y_train_O3, x_test_O3, y_test_O3)
# Random Forest
model_rf(x_train_O3, y_train_O3, x_test_O3, y_test_O3)
# SVM
model_svm(x_train_O3, y_train_O3, x_test_O3, y_test_O3)
# Stochastic Gradient Descent
model_SGD(x_train_O3, y_train_O3, x_test_O3, y_test_O3)
# Multi-layer Perceptron
model_MLP(x_train_O3, y_train_O3, x_test_O3, y_test_O3)
# Regressão Logística
model_regRegression(x_train_O3, y_train_O3, x_test_O3, y_test_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7144, 0.7214, 0.7139, 0.7120, 0.0067
Random Forest, 0.8625, 0.8623, 0.8627, 0.8612, 2.4515
SVM, 0.5498, 0.6439, 0.5480, 0.5385, 545.9361
SGD, 0.4041, 0.5204, 0.4020, 0.3969, 10.6054
MLP, 0.5623, 0.5708, 0.5602, 0.5508, 364.8793
Regressão Logística, 0.6506, 0.6429, 0.6480, 0.6393, 40.6874
