## Models with POJ Dataset
Models trained using POJ104 variations and tested with CloneGen

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings

# Suppress every warning issued through the `warnings` module for the rest of
# the session. NOTE(review): this also hides warnings coming from imported
# libraries, which can mask real problems -- consider narrowing the filter.
warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
def _read_features(csv_path):
    """Read one feature CSV into a DataFrame, skipping spaces after delimiters."""
    return pd.read_csv(csv_path, skipinitialspace=True)


# Original POJ programs, compiled at O0 (no obfuscation, no optimization)
df = _read_features("./../../../features/POJ/poj_features_O0.csv")

# Original POJ programs, compiled at O3
df_O3 = _read_features("./../../../features/POJ/poj_features_O3.csv")

# CloneGen variants: one O0 and one O3 feature file per generator
df_drlsg = _read_features("./../../../features/POJ/DRLSG_features_O0.csv")
df_drlsg_O3 = _read_features("./../../../features/POJ/DRLSG_features_O3.csv")

df_ga = _read_features("./../../../features/POJ/GA_features_O0.csv")
df_ga_O3 = _read_features("./../../../features/POJ/GA_features_O3.csv")

df_mcmc = _read_features("./../../../features/POJ/MCMC_features_O0.csv")
df_mcmc_O3 = _read_features("./../../../features/POJ/MCMC_features_O3.csv")

df_ojclone = _read_features("./../../../features/POJ/OJClone_features_O0.csv")
df_ojclone_O3 = _read_features("./../../../features/POJ/OJClone_features_O3.csv")

df_rs = _read_features("./../../../features/POJ/RS_features_O0.csv")
df_rs_O3 = _read_features("./../../../features/POJ/RS_features_O3.csv")

#### ~> Split the datasets into a training and test set

In [4]:
# Fixed seed so that every 80/20 partition below is the same on each
# "Restart & Run All"; without it train_test_split reshuffles on every run.
SPLIT_SEED = 42

# Without obfuscation
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=SPLIT_SEED)

# CloneGen
train_drlsg, test_drlsg = train_test_split(df_drlsg, test_size=0.2, random_state=SPLIT_SEED)
train_drlsg_O3, test_drlsg_O3 = train_test_split(df_drlsg_O3, test_size=0.2, random_state=SPLIT_SEED)
train_ga, test_ga = train_test_split(df_ga, test_size=0.2, random_state=SPLIT_SEED)
train_ga_O3, test_ga_O3 = train_test_split(df_ga_O3, test_size=0.2, random_state=SPLIT_SEED)
train_mcmc, test_mcmc = train_test_split(df_mcmc, test_size=0.2, random_state=SPLIT_SEED)
train_mcmc_O3, test_mcmc_O3 = train_test_split(df_mcmc_O3, test_size=0.2, random_state=SPLIT_SEED)
train_ojclone, test_ojclone = train_test_split(df_ojclone, test_size=0.2, random_state=SPLIT_SEED)
train_ojclone_O3, test_ojclone_O3 = train_test_split(df_ojclone_O3, test_size=0.2, random_state=SPLIT_SEED)
train_rs, test_rs = train_test_split(df_rs, test_size=0.2, random_state=SPLIT_SEED)
train_rs_O3, test_rs_O3 = train_test_split(df_rs_O3, test_size=0.2, random_state=SPLIT_SEED)

### Case Studies

#### ~> CloneGen

In [5]:
# X = every column except the first and the last; y = the last column.

# Training and testing arrays without obfuscation (O0)
x_train = train.iloc[:, 1:-1].to_numpy()
y_train = train.iloc[:, -1].to_numpy()
x_test = test.iloc[:, 1:-1].to_numpy()
y_test = test.iloc[:, -1].to_numpy()

# Training and testing arrays for the O3 build
x_train_O3 = train_O3.iloc[:, 1:-1].to_numpy()
y_train_O3 = train_O3.iloc[:, -1].to_numpy()
x_test_O3 = test_O3.iloc[:, 1:-1].to_numpy()
y_test_O3 = test_O3.iloc[:, -1].to_numpy()

# Number of rows in the O0 training set
size = len(x_train)

##### **DRLSG**

In [6]:
# X = every column except the first and the last; y = the last column.

# DRLSG without O3
x_train_drlsg = train_drlsg.iloc[:, 1:-1].to_numpy()
y_train_drlsg = train_drlsg.iloc[:, -1].to_numpy()
x_test_drlsg = test_drlsg.iloc[:, 1:-1].to_numpy()
y_test_drlsg = test_drlsg.iloc[:, -1].to_numpy()

# DRLSG with O3
x_train_drlsg_O3 = train_drlsg_O3.iloc[:, 1:-1].to_numpy()
y_train_drlsg_O3 = train_drlsg_O3.iloc[:, -1].to_numpy()
x_test_drlsg_O3 = test_drlsg_O3.iloc[:, 1:-1].to_numpy()
y_test_drlsg_O3 = test_drlsg_O3.iloc[:, -1].to_numpy()

Using DRLSG only in the testing phase in a model trained with O0

In [7]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: DRLSG without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0027, 0.0027, 0.0028, 0.0027, 0.0766
Random Forest, 0.0007, 0.0007, 0.0008, 0.0007, 3.3080
SVM, 0.0081, 0.0012, 0.0084, 0.0012, 530.4946
SGD, 0.0076, 0.0063, 0.0081, 0.0016, 17.4316
MLP, 0.0040, 0.0044, 0.0039, 0.0039, 356.0141
Regressão Logística, 0.0059, 0.0035, 0.0062, 0.0040, 58.6571


Using DRLSG only in the testing phase in a model trained with O3

In [8]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: DRLSG without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0068, 0.0010, 0.0060, 0.0013, 0.0292
Random Forest, 0.0112, 0.0078, 0.0100, 0.0035, 3.7634
SVM, 0.0094, 0.0016, 0.0091, 0.0021, 525.7817
SGD, 0.0077, 0.0023, 0.0085, 0.0028, 12.2617
MLP, 0.0115, 0.0123, 0.0119, 0.0076, 374.9362
Regressão Logística, 0.0066, 0.0027, 0.0059, 0.0019, 61.4750


Using DRLSG+O3 only in the testing phase in a model trained with O0

In [9]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: DRLSG with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_drlsg_O3, y_test_drlsg_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0107, 0.0048, 0.0107, 0.0043, 0.0213
Random Forest, 0.0072, 0.0077, 0.0073, 0.0056, 3.7648
SVM, 0.0061, 0.0009, 0.0062, 0.0014, 481.6119
SGD, 0.0058, 0.0011, 0.0062, 0.0017, 16.4608
MLP, 0.0042, 0.0042, 0.0041, 0.0035, 350.2362
Regressão Logística, 0.0064, 0.0014, 0.0060, 0.0013, 59.9822


Using DRLSG+O3 only in the testing phase in a model trained with O3

In [10]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: DRLSG with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_drlsg_O3, y_test_drlsg_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0022, 0.0020, 0.0022, 0.0019, 0.2559
Random Forest, 0.0010, 0.0011, 0.0011, 0.0011, 4.1311
SVM, 0.0015, 0.0017, 0.0015, 0.0014, 513.3512
SGD, 0.0045, 0.0025, 0.0047, 0.0026, 12.2250
MLP, 0.0033, 0.0035, 0.0035, 0.0031, 331.0829
Regressão Logística, 0.0038, 0.0040, 0.0038, 0.0035, 95.1594


##### **GA**

In [11]:
# X = every column except the first and the last; y = the last column.

# GA without O3
x_train_ga = train_ga.iloc[:, 1:-1].to_numpy()
y_train_ga = train_ga.iloc[:, -1].to_numpy()
x_test_ga = test_ga.iloc[:, 1:-1].to_numpy()
y_test_ga = test_ga.iloc[:, -1].to_numpy()

# GA with O3
x_train_ga_O3 = train_ga_O3.iloc[:, 1:-1].to_numpy()
y_train_ga_O3 = train_ga_O3.iloc[:, -1].to_numpy()
x_test_ga_O3 = test_ga_O3.iloc[:, 1:-1].to_numpy()
y_test_ga_O3 = test_ga_O3.iloc[:, -1].to_numpy()

Using GA only in the testing phase in a model trained with O0

In [12]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: GA without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0059, 0.0056, 0.0059, 0.0050, 0.0068
Random Forest, 0.0081, 0.0023, 0.0077, 0.0024, 3.5211
SVM, 0.0083, 0.0019, 0.0088, 0.0027, 466.4460
SGD, 0.0107, 0.0067, 0.0104, 0.0025, 17.2417
MLP, 0.0047, 0.0113, 0.0062, 0.0035, 332.9040
Regressão Logística, 0.0055, 0.0062, 0.0061, 0.0031, 63.0824


Using GA only in the testing phase in a model trained with O3

In [13]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: GA without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0091, 0.0107, 0.0089, 0.0015, 0.0799
Random Forest, 0.0095, 0.0010, 0.0086, 0.0013, 3.9344
SVM, 0.0119, 0.0013, 0.0134, 0.0020, 514.0499
SGD, 0.0036, 0.0005, 0.0033, 0.0005, 12.7895
MLP, 0.0091, 0.0155, 0.0129, 0.0078, 348.9801
Regressão Logística, 0.0091, 0.0004, 0.0087, 0.0008, 61.4774


Using GA+O3 only in the testing phase in a model trained with O0

In [14]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: GA with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_ga_O3, y_test_ga_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0103, 0.0039, 0.0106, 0.0039, 0.0395
Random Forest, 0.0030, 0.0026, 0.0029, 0.0024, 3.6602
SVM, 0.0065, 0.0011, 0.0061, 0.0016, 474.4402
SGD, 0.0069, 0.0033, 0.0062, 0.0018, 16.0502
MLP, 0.0047, 0.0037, 0.0037, 0.0029, 313.8272
Regressão Logística, 0.0065, 0.0012, 0.0079, 0.0014, 56.9964


Using GA+O3 only in the testing phase in a model trained with O3

In [15]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: GA with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_ga_O3, y_test_ga_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0018, 0.0018, 0.0022, 0.0017, 0.0199
Random Forest, 0.0002, 0.0003, 0.0003, 0.0003, 3.7777
SVM, 0.0042, 0.0024, 0.0037, 0.0013, 511.2106
SGD, 0.0049, 0.0071, 0.0051, 0.0038, 12.4390
MLP, 0.0042, 0.0056, 0.0040, 0.0036, 342.8504
Regressão Logística, 0.0018, 0.0024, 0.0019, 0.0019, 52.9009


##### **MCMC**

In [16]:
# X = every column except the first and the last; y = the last column.

# MCMC without O3
x_train_mcmc = train_mcmc.iloc[:, 1:-1].to_numpy()
y_train_mcmc = train_mcmc.iloc[:, -1].to_numpy()
x_test_mcmc = test_mcmc.iloc[:, 1:-1].to_numpy()
y_test_mcmc = test_mcmc.iloc[:, -1].to_numpy()

# MCMC with O3
x_train_mcmc_O3 = train_mcmc_O3.iloc[:, 1:-1].to_numpy()
y_train_mcmc_O3 = train_mcmc_O3.iloc[:, -1].to_numpy()
x_test_mcmc_O3 = test_mcmc_O3.iloc[:, 1:-1].to_numpy()
y_test_mcmc_O3 = test_mcmc_O3.iloc[:, -1].to_numpy()

Using MCMC only in the testing phase in a model trained with O0

In [17]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: MCMC without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0052, 0.0043, 0.0049, 0.0038, 0.0045
Random Forest, 0.0063, 0.0015, 0.0068, 0.0019, 3.4793
SVM, 0.0088, 0.0019, 0.0092, 0.0024, 466.9386
SGD, 0.0052, 0.0034, 0.0058, 0.0020, 16.3040
MLP, 0.0046, 0.0095, 0.0053, 0.0035, 330.5752
Regressão Logística, 0.0053, 0.0061, 0.0056, 0.0038, 62.7063


Using MCMC only in the testing phase in a model trained with O3

In [18]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: MCMC without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0077, 0.0019, 0.0069, 0.0013, 0.0431
Random Forest, 0.0141, 0.0110, 0.0135, 0.0030, 3.8145
SVM, 0.0094, 0.0055, 0.0123, 0.0013, 506.2153
SGD, 0.0053, 0.0009, 0.0048, 0.0011, 12.3887
MLP, 0.0078, 0.0172, 0.0081, 0.0053, 366.6095
Regressão Logística, 0.0064, 0.0014, 0.0063, 0.0010, 49.7117


Using MCMC+O3 only in the testing phase in a model trained with O0

In [19]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: MCMC with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_mcmc_O3, y_test_mcmc_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0121, 0.0069, 0.0108, 0.0051, 0.0068
Random Forest, 0.0068, 0.0043, 0.0063, 0.0035, 3.7495
SVM, 0.0118, 0.0019, 0.0103, 0.0028, 469.0862
SGD, 0.0068, 0.0017, 0.0058, 0.0022, 16.1832
MLP, 0.0055, 0.0036, 0.0041, 0.0035, 344.0198
Regressão Logística, 0.0126, 0.0031, 0.0110, 0.0021, 55.1880


Using MCMC+O3 only in the testing phase in a model trained with O3

In [20]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: MCMC with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_mcmc_O3, y_test_mcmc_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0017, 0.0019, 0.0018, 0.0018, 0.0301
Random Forest, 0.0006, 0.0006, 0.0007, 0.0007, 4.1988
SVM, 0.0053, 0.0029, 0.0051, 0.0032, 515.1199
SGD, 0.0077, 0.0087, 0.0080, 0.0043, 11.7058
MLP, 0.0050, 0.0058, 0.0053, 0.0049, 354.4169
Regressão Logística, 0.0031, 0.0034, 0.0035, 0.0033, 63.4789


##### **OJClone**

In [21]:
# X = every column except the first and the last; y = the last column.

# OJClone without O3
x_train_ojclone = train_ojclone.iloc[:, 1:-1].to_numpy()
y_train_ojclone = train_ojclone.iloc[:, -1].to_numpy()
x_test_ojclone = test_ojclone.iloc[:, 1:-1].to_numpy()
y_test_ojclone = test_ojclone.iloc[:, -1].to_numpy()

# OJClone with O3
x_train_ojclone_O3 = train_ojclone_O3.iloc[:, 1:-1].to_numpy()
y_train_ojclone_O3 = train_ojclone_O3.iloc[:, -1].to_numpy()
x_test_ojclone_O3 = test_ojclone_O3.iloc[:, 1:-1].to_numpy()
y_test_ojclone_O3 = test_ojclone_O3.iloc[:, -1].to_numpy()

Using OJClone only in the testing phase in a model trained with O0

In [22]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: OJClone without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_ojclone, y_test_ojclone)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0027, 0.0028, 0.0027, 0.0027, 0.0910
Random Forest, 0.0003, 0.0003, 0.0003, 0.0003, 4.2307
SVM, 0.0089, 0.0009, 0.0093, 0.0011, 473.2594
SGD, 0.0026, 0.0011, 0.0025, 0.0014, 16.3037
MLP, 0.0041, 0.0056, 0.0043, 0.0046, 341.7382
Regressão Logística, 0.0036, 0.0039, 0.0035, 0.0036, 64.0104


Using OJClone only in the testing phase in a model trained with O3

In [23]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: OJClone without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_ojclone, y_test_ojclone)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0056, 0.0009, 0.0052, 0.0011, 0.0048
Random Forest, 0.0100, 0.0015, 0.0094, 0.0023, 4.0824
SVM, 0.0083, 0.0010, 0.0083, 0.0012, 498.5406
SGD, 0.0036, 0.0014, 0.0036, 0.0008, 11.8458
MLP, 0.0119, 0.0110, 0.0120, 0.0076, 339.7321
Regressão Logística, 0.0057, 0.0009, 0.0056, 0.0013, 49.5471


Using OJClone+O3 only in the testing phase in a model trained with O0

In [24]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: OJClone with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_ojclone_O3, y_test_ojclone_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0084, 0.0028, 0.0092, 0.0034, 0.0170
Random Forest, 0.0052, 0.0069, 0.0051, 0.0037, 3.6515
SVM, 0.0095, 0.0107, 0.0100, 0.0021, 478.0379
SGD, 0.0062, 0.0020, 0.0069, 0.0026, 15.9718
MLP, 0.0046, 0.0038, 0.0048, 0.0036, 331.9484
Regressão Logística, 0.0088, 0.0031, 0.0098, 0.0014, 63.9753


Using OJClone+O3 only in the testing phase in a model trained with O3

In [25]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: OJClone with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_ojclone_O3, y_test_ojclone_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0026, 0.0025, 0.0025, 0.0025, 0.3603
Random Forest, 0.0002, 0.0002, 0.0002, 0.0002, 3.9104
SVM, 0.0038, 0.0033, 0.0037, 0.0031, 505.6112
SGD, 0.0078, 0.0066, 0.0078, 0.0043, 11.9691
MLP, 0.0028, 0.0031, 0.0028, 0.0027, 352.3383
Regressão Logística, 0.0027, 0.0032, 0.0027, 0.0029, 81.3427


##### **RS**

In [26]:
# X = every column except the first and the last; y = the last column.

# RS without O3
x_train_rs = train_rs.iloc[:, 1:-1].to_numpy()
y_train_rs = train_rs.iloc[:, -1].to_numpy()
x_test_rs = test_rs.iloc[:, 1:-1].to_numpy()
y_test_rs = test_rs.iloc[:, -1].to_numpy()

# RS with O3
x_train_rs_O3 = train_rs_O3.iloc[:, 1:-1].to_numpy()
y_train_rs_O3 = train_rs_O3.iloc[:, -1].to_numpy()
x_test_rs_O3 = test_rs_O3.iloc[:, 1:-1].to_numpy()
y_test_rs_O3 = test_rs_O3.iloc[:, -1].to_numpy()

Using RS only in the testing phase in a model trained with O0

In [27]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: RS without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0052, 0.0050, 0.0052, 0.0042, 0.0051
Random Forest, 0.0077, 0.0028, 0.0078, 0.0029, 3.5360
SVM, 0.0080, 0.0025, 0.0090, 0.0026, 469.6057
SGD, 0.0047, 0.0014, 0.0043, 0.0017, 15.8814
MLP, 0.0032, 0.0037, 0.0036, 0.0023, 371.0388
Regressão Logística, 0.0061, 0.0057, 0.0066, 0.0044, 60.9023


Using RS only in the testing phase in a model trained with O3

In [28]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: RS without O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0067, 0.0006, 0.0061, 0.0009, 0.3222
Random Forest, 0.0133, 0.0065, 0.0121, 0.0027, 4.3610
SVM, 0.0080, 0.0028, 0.0093, 0.0020, 500.4905
SGD, 0.0056, 0.0012, 0.0048, 0.0013, 11.9016
MLP, 0.0068, 0.0061, 0.0073, 0.0049, 343.0874
Regressão Logística, 0.0085, 0.0018, 0.0073, 0.0011, 61.3705


Using RS+O3 only in the testing phase in a model trained with O0

In [29]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O0; test arrays: RS with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_rs_O3, y_test_rs_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0114, 0.0047, 0.0108, 0.0043, 0.0046
Random Forest, 0.0062, 0.0089, 0.0062, 0.0042, 3.7278
SVM, 0.0102, 0.0034, 0.0103, 0.0031, 464.9887
SGD, 0.0073, 0.0110, 0.0072, 0.0023, 15.7848
MLP, 0.0033, 0.0033, 0.0032, 0.0025, 323.3025
Regressão Logística, 0.0095, 0.0019, 0.0099, 0.0015, 57.7480


Using RS+O3 only in the testing phase in a model trained with O3

In [30]:
# Header for the metric rows the model helpers are expected to print.
print("model, accuracy, precision, recall, fscore, time (s)")

# Train arrays: POJ O3; test arrays: RS with O3.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_rs_O3, y_test_rs_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0012, 0.0012, 0.0012, 0.0012, 0.1199
Random Forest, 0.0008, 0.0008, 0.0007, 0.0008, 4.0879
SVM, 0.0058, 0.0055, 0.0060, 0.0032, 509.3715
SGD, 0.0065, 0.0047, 0.0077, 0.0040, 11.8008
MLP, 0.0047, 0.0046, 0.0046, 0.0041, 346.0376
Regressão Logística, 0.0024, 0.0021, 0.0023, 0.0022, 63.1355
