## Models with POJ Dataset
Models trained using POJ104 variations and tested with CloneGen

---

### Setup

#### ~> Installing Libraries

In [1]:
# Install into the environment of the running kernel (%pip targets the
# kernel's interpreter, whereas !pip3 may resolve to a different Python).
%pip install -q seaborn
# The PyPI distribution is named "scikit-learn"; "sklearn" is only the import
# name, and the PyPI package of that name is a deprecated placeholder.
%pip install -q scikit-learn
%pip install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings that may be raised while the models are fitted — confirm intended.
warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [10]:
def _load_features(csv_path):
    """Read one feature CSV, skipping the spaces that follow each delimiter."""
    return pd.read_csv(csv_path, skipinitialspace=True)


# POJ104 without obfuscation (O0)
df = _load_features("./../../../features/POJ/poj_features_O0.csv")

# POJ104 with optimizations (O3)
df_O3 = _load_features("./../../../features/POJ/poj_features_O3.csv")

# CloneGen variants of OJClone, each at O0 and O3
df_drlsg = _load_features("./../../../features/OJClone/ojclone_features_DRLSG_O0.csv")
df_drlsg_O3 = _load_features("./../../../features/OJClone/ojclone_features_DRLSG_O3.csv")
df_ga = _load_features("./../../../features/OJClone/ojclone_features_GA_O0.csv")
df_ga_O3 = _load_features("./../../../features/OJClone/ojclone_features_GA_O3.csv")
df_mcmc = _load_features("./../../../features/OJClone/ojclone_features_MCMC_O0.csv")
df_mcmc_O3 = _load_features("./../../../features/OJClone/ojclone_features_MCMC_O3.csv")
df_ojclone = _load_features("./../../../features/OJClone/ojclone_features_O0.csv")
df_ojclone_O3 = _load_features("./../../../features/OJClone/ojclone_features_O3.csv")
df_rs = _load_features("./../../../features/OJClone/ojclone_features_RS_O0.csv")
df_rs_O3 = _load_features("./../../../features/OJClone/ojclone_features_RS_O3.csv")

#### ~> Standardizing the classes
The classes of POJ104 range from 0 to 103, while the classes of OJClone range from 1 to 104, so the POJ104 labels are shifted by one to match.

In [11]:
# Shift the POJ104 labels from 0..103 to 1..104 so they line up with OJClone.
# The shift is applied only while the smallest label is still 0, which makes
# the cell idempotent: re-running it will not move the labels a second time.
# NOTE(review): assumes class 0 is present in both POJ frames — confirm.
for _poj_frame in (df, df_O3):
    if _poj_frame['class'].min() == 0:
        _poj_frame['class'] = _poj_frame['class'] + 1

#### ~> Split the datasets into a training and test set

In [13]:
# Fixed seed so that every "Restart & Run All" yields the same 80/20
# partitions and the metrics reported further down can be reproduced.
SPLIT_SEED = 42

# Without obfuscation
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=SPLIT_SEED)

# CloneGen
train_drlsg, test_drlsg = train_test_split(df_drlsg, test_size=0.2, random_state=SPLIT_SEED)
train_drlsg_O3, test_drlsg_O3 = train_test_split(df_drlsg_O3, test_size=0.2, random_state=SPLIT_SEED)
train_ga, test_ga = train_test_split(df_ga, test_size=0.2, random_state=SPLIT_SEED)
train_ga_O3, test_ga_O3 = train_test_split(df_ga_O3, test_size=0.2, random_state=SPLIT_SEED)
train_mcmc, test_mcmc = train_test_split(df_mcmc, test_size=0.2, random_state=SPLIT_SEED)
train_mcmc_O3, test_mcmc_O3 = train_test_split(df_mcmc_O3, test_size=0.2, random_state=SPLIT_SEED)
train_ojclone, test_ojclone = train_test_split(df_ojclone, test_size=0.2, random_state=SPLIT_SEED)
train_ojclone_O3, test_ojclone_O3 = train_test_split(df_ojclone_O3, test_size=0.2, random_state=SPLIT_SEED)
train_rs, test_rs = train_test_split(df_rs, test_size=0.2, random_state=SPLIT_SEED)
train_rs_O3, test_rs_O3 = train_test_split(df_rs_O3, test_size=0.2, random_state=SPLIT_SEED)

### Case Studies

#### ~> CloneGen

In [14]:
def _to_xy(frame):
    """Split a frame into (X, y): X drops the first and last columns, y is the last column."""
    features = frame.iloc[:, 1:-1]
    labels = frame.iloc[:, -1]
    return features.to_numpy(), labels.to_numpy()


# Training and testing database without obfuscation (O0)
x_train, y_train = _to_xy(train)
x_test, y_test = _to_xy(test)

# Training and testing database compiled with O3
x_train_O3, y_train_O3 = _to_xy(train_O3)
x_test_O3, y_test_O3 = _to_xy(test_O3)

# Number of O0 training samples
size = len(x_train)

##### **DRLSG**

In [15]:
# DRLSG without O3: column 0 is skipped, the last column becomes y
x_train_drlsg = train_drlsg.iloc[:, 1:-1].to_numpy()
y_train_drlsg = train_drlsg.iloc[:, -1].to_numpy()
x_test_drlsg = test_drlsg.iloc[:, 1:-1].to_numpy()
y_test_drlsg = test_drlsg.iloc[:, -1].to_numpy()

# DRLSG with O3: same column layout
x_train_drlsg_O3 = train_drlsg_O3.iloc[:, 1:-1].to_numpy()
y_train_drlsg_O3 = train_drlsg_O3.iloc[:, -1].to_numpy()
x_test_drlsg_O3 = test_drlsg_O3.iloc[:, 1:-1].to_numpy()
y_test_drlsg_O3 = test_drlsg_O3.iloc[:, -1].to_numpy()

Using DRLSG only in the testing phase in a model trained with O0

In [16]:
# Fit on the POJ O0 training split, evaluate on the DRLSG test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6486, 0.6599, 0.6471, 0.6438, 0.3141
Random Forest, 0.9068, 0.9099, 0.9060, 0.9053, 2.9255
SVM, 0.3386, 0.4963, 0.3374, 0.3342, 474.0253
SGD, 0.3129, 0.4544, 0.3086, 0.3073, 13.8794
MLP, 0.5063, 0.5176, 0.5080, 0.4980, 237.4795
Regressão Logística, 0.5269, 0.5327, 0.5257, 0.5136, 42.8994


Using DRLSG only in the testing phase in a model trained with O3

In [17]:
# Fit on the POJ O3 training split, evaluate on the DRLSG test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0396, 0.0920, 0.0419, 0.0390, 0.0156
Random Forest, 0.2257, 0.2610, 0.2324, 0.1982, 2.1099
SVM, 0.0852, 0.0971, 0.0885, 0.0552, 566.0033
SGD, 0.0417, 0.0671, 0.0414, 0.0266, 10.9975
MLP, 0.0743, 0.0908, 0.0744, 0.0618, 333.3146
Regressão Logística, 0.0735, 0.0881, 0.0747, 0.0457, 42.3212


Using DRLSG+O3 only in the testing phase in a model trained with O0

In [18]:
# Fit on the POJ O0 training split, evaluate on the DRLSG+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_drlsg_O3, y_test_drlsg_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0407, 0.0558, 0.0407, 0.0276, 0.0054
Random Forest, 0.1824, 0.2537, 0.1831, 0.1655, 2.5745
SVM, 0.0585, 0.1138, 0.0602, 0.0475, 448.4519
SGD, 0.0597, 0.1293, 0.0607, 0.0511, 12.4716
MLP, 0.0581, 0.0537, 0.0593, 0.0416, 244.2089
Regressão Logística, 0.0596, 0.0820, 0.0591, 0.0437, 36.8069


Using DRLSG+O3 only in the testing phase in a model trained with O3

In [19]:
# Fit on the POJ O3 training split, evaluate on the DRLSG+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_drlsg_O3, y_test_drlsg_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6290, 0.6619, 0.6249, 0.6294, 0.0064
Random Forest, 0.8153, 0.8298, 0.8128, 0.8136, 2.2017
SVM, 0.4195, 0.5796, 0.4191, 0.4147, 467.6131
SGD, 0.3262, 0.4679, 0.3233, 0.3251, 7.0422
MLP, 0.4798, 0.5138, 0.4776, 0.4754, 145.0276
Regressão Logística, 0.5514, 0.5725, 0.5480, 0.5427, 29.2237


##### **GA**

In [20]:
# GA without O3: column 0 is skipped, the last column becomes y
x_train_ga = train_ga.iloc[:, 1:-1].to_numpy()
y_train_ga = train_ga.iloc[:, -1].to_numpy()
x_test_ga = test_ga.iloc[:, 1:-1].to_numpy()
y_test_ga = test_ga.iloc[:, -1].to_numpy()

# GA with O3: same column layout
x_train_ga_O3 = train_ga_O3.iloc[:, 1:-1].to_numpy()
y_train_ga_O3 = train_ga_O3.iloc[:, -1].to_numpy()
x_test_ga_O3 = test_ga_O3.iloc[:, 1:-1].to_numpy()
y_test_ga_O3 = test_ga_O3.iloc[:, -1].to_numpy()

Using GA only in the testing phase in a model trained with O0

In [21]:
# Fit on the POJ O0 training split, evaluate on the GA test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3319, 0.4057, 0.3244, 0.3102, 0.0049
Random Forest, 0.4184, 0.5592, 0.3893, 0.3764, 2.3689
SVM, 0.0748, 0.2020, 0.0564, 0.0637, 399.7630
SGD, 0.1862, 0.2603, 0.1669, 0.1607, 13.5047
MLP, 0.2307, 0.2374, 0.2104, 0.1918, 243.1306
Regressão Logística, 0.3335, 0.4128, 0.3025, 0.3028, 41.3226


Using GA only in the testing phase in a model trained with O3

In [22]:
# Fit on the POJ O3 training split, evaluate on the GA test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0283, 0.0427, 0.0268, 0.0172, 0.0185
Random Forest, 0.1820, 0.2103, 0.1481, 0.1297, 2.5083
SVM, 0.0625, 0.0997, 0.0489, 0.0406, 439.9668
SGD, 0.0408, 0.0605, 0.0417, 0.0246, 8.2044
MLP, 0.0526, 0.0617, 0.0410, 0.0316, 222.7989
Regressão Logística, 0.0805, 0.0682, 0.0636, 0.0379, 43.9243


Using GA+O3 only in the testing phase in a model trained with O0

In [23]:
# Fit on the POJ O0 training split, evaluate on the GA+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_ga_O3, y_test_ga_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0578, 0.0567, 0.0455, 0.0296, 0.0113
Random Forest, 0.2291, 0.2495, 0.1929, 0.1761, 3.2174
SVM, 0.0772, 0.1006, 0.0667, 0.0535, 351.1974
SGD, 0.0714, 0.0990, 0.0592, 0.0513, 11.4536
MLP, 0.0679, 0.0485, 0.0594, 0.0395, 187.5140
Regressão Logística, 0.0833, 0.0951, 0.0668, 0.0511, 28.9354


Using GA+O3 only in the testing phase in a model trained with O3

In [24]:
# Fit on the POJ O3 training split, evaluate on the GA+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_ga_O3, y_test_ga_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7060, 0.6924, 0.6845, 0.6755, 0.0056
Random Forest, 0.8752, 0.8741, 0.8654, 0.8612, 2.3799
SVM, 0.5185, 0.6013, 0.4917, 0.4814, 458.4601
SGD, 0.3780, 0.4733, 0.3603, 0.3454, 8.3708
MLP, 0.5369, 0.5208, 0.5010, 0.4911, 211.1759
Regressão Logística, 0.6004, 0.5808, 0.5734, 0.5542, 36.7911


##### **MCMC**

In [25]:
# MCMC without O3: column 0 is skipped, the last column becomes y
x_train_mcmc = train_mcmc.iloc[:, 1:-1].to_numpy()
y_train_mcmc = train_mcmc.iloc[:, -1].to_numpy()
x_test_mcmc = test_mcmc.iloc[:, 1:-1].to_numpy()
y_test_mcmc = test_mcmc.iloc[:, -1].to_numpy()

# MCMC with O3: same column layout
x_train_mcmc_O3 = train_mcmc_O3.iloc[:, 1:-1].to_numpy()
y_train_mcmc_O3 = train_mcmc_O3.iloc[:, -1].to_numpy()
x_test_mcmc_O3 = test_mcmc_O3.iloc[:, 1:-1].to_numpy()
y_test_mcmc_O3 = test_mcmc_O3.iloc[:, -1].to_numpy()

Using MCMC only in the testing phase in a model trained with O0

In [26]:
# Fit on the POJ O0 training split, evaluate on the MCMC test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3419, 0.4518, 0.3546, 0.3583, 0.0076
Random Forest, 0.3705, 0.7165, 0.3680, 0.3882, 1.9495
SVM, 0.1072, 0.3431, 0.0949, 0.1138, 472.7194
SGD, 0.1684, 0.3646, 0.1628, 0.1648, 14.4547
MLP, 0.2030, 0.3351, 0.1979, 0.1882, 330.3872
Regressão Logística, 0.3504, 0.4852, 0.3376, 0.3506, 52.7876


Using MCMC only in the testing phase in a model trained with O3

In [27]:
# Fit on the POJ O3 training split, evaluate on the MCMC test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0231, 0.0597, 0.0237, 0.0183, 0.0047
Random Forest, 0.1639, 0.1826, 0.1502, 0.1206, 4.2726
SVM, 0.0790, 0.1316, 0.0704, 0.0493, 508.5596
SGD, 0.0427, 0.0675, 0.0449, 0.0292, 9.7408
MLP, 0.0515, 0.0735, 0.0460, 0.0356, 374.1442
Regressão Logística, 0.0708, 0.0659, 0.0615, 0.0373, 53.6922


Using MCMC+O3 only in the testing phase in a model trained with O0

In [28]:
# Fit on the POJ O0 training split, evaluate on the MCMC+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_mcmc_O3, y_test_mcmc_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0590, 0.0669, 0.0484, 0.0311, 0.0177
Random Forest, 0.2198, 0.2476, 0.1924, 0.1731, 3.2401
SVM, 0.0765, 0.1008, 0.0661, 0.0525, 449.0830
SGD, 0.0818, 0.1298, 0.0697, 0.0620, 13.7438
MLP, 0.0739, 0.0628, 0.0667, 0.0490, 389.0927
Regressão Logística, 0.0785, 0.0667, 0.0660, 0.0512, 33.4613


Using MCMC+O3 only in the testing phase in a model trained with O3

In [29]:
# Fit on the POJ O3 training split, evaluate on the MCMC+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_mcmc_O3, y_test_mcmc_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7214, 0.7268, 0.7164, 0.7135, 0.0065
Random Forest, 0.9003, 0.9083, 0.8990, 0.9000, 3.1357
SVM, 0.5245, 0.6139, 0.5072, 0.4958, 471.6886
SGD, 0.3611, 0.5293, 0.3532, 0.3685, 8.6786
MLP, 0.5416, 0.5446, 0.5289, 0.5234, 223.7753
Regressão Logística, 0.6126, 0.6056, 0.6047, 0.5927, 29.0890


##### **OJClone**

In [30]:
# OJClone without O3: column 0 is skipped, the last column becomes y
x_train_ojclone = train_ojclone.iloc[:, 1:-1].to_numpy()
y_train_ojclone = train_ojclone.iloc[:, -1].to_numpy()
x_test_ojclone = test_ojclone.iloc[:, 1:-1].to_numpy()
y_test_ojclone = test_ojclone.iloc[:, -1].to_numpy()

# OJClone with O3: same column layout
x_train_ojclone_O3 = train_ojclone_O3.iloc[:, 1:-1].to_numpy()
y_train_ojclone_O3 = train_ojclone_O3.iloc[:, -1].to_numpy()
x_test_ojclone_O3 = test_ojclone_O3.iloc[:, 1:-1].to_numpy()
y_test_ojclone_O3 = test_ojclone_O3.iloc[:, -1].to_numpy()

Using OJClone only in the testing phase in a model trained with O0

In [31]:
# Fit on the POJ O0 training split, evaluate on the OJClone test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_ojclone, y_test_ojclone)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6950, 0.7083, 0.6946, 0.6908, 0.0175
Random Forest, 0.9581, 0.9583, 0.9578, 0.9576, 1.8496
SVM, 0.4641, 0.5645, 0.4618, 0.4379, 401.7142
SGD, 0.3263, 0.4560, 0.3251, 0.3190, 11.9077
MLP, 0.5614, 0.5530, 0.5602, 0.5491, 239.4679
Regressão Logística, 0.5688, 0.5536, 0.5677, 0.5535, 30.8236


Using OJClone only in the testing phase in a model trained with O3

In [32]:
# Fit on the POJ O3 training split, evaluate on the OJClone test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_ojclone, y_test_ojclone)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0433, 0.1001, 0.0432, 0.0396, 0.0074
Random Forest, 0.2484, 0.3241, 0.2432, 0.2070, 2.4958
SVM, 0.0894, 0.1252, 0.0897, 0.0596, 453.0707
SGD, 0.0556, 0.0996, 0.0550, 0.0397, 9.0113
MLP, 0.0773, 0.0833, 0.0753, 0.0629, 230.0424
Regressão Logística, 0.0787, 0.0773, 0.0775, 0.0493, 36.1879


Using OJClone+O3 only in the testing phase in a model trained with O0

In [33]:
# Fit on the POJ O0 training split, evaluate on the OJClone+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_ojclone_O3, y_test_ojclone_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0536, 0.0771, 0.0534, 0.0344, 0.0071
Random Forest, 0.2161, 0.2761, 0.2109, 0.1896, 2.2183
SVM, 0.0654, 0.1102, 0.0632, 0.0520, 412.1645
SGD, 0.0642, 0.1356, 0.0624, 0.0579, 12.2352
MLP, 0.0653, 0.0587, 0.0630, 0.0447, 241.4157
Regressão Logística, 0.0726, 0.0679, 0.0686, 0.0531, 36.2051


Using OJClone+O3 only in the testing phase in a model trained with O3

In [34]:
# Fit on the POJ O3 training split, evaluate on the OJClone+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_ojclone_O3, y_test_ojclone_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7581, 0.7648, 0.7575, 0.7556, 0.0383
Random Forest, 0.9666, 0.9667, 0.9664, 0.9664, 3.0115
SVM, 0.4687, 0.6403, 0.4678, 0.4667, 471.1438
SGD, 0.3854, 0.5417, 0.3813, 0.3820, 8.5949
MLP, 0.5763, 0.5824, 0.5757, 0.5679, 215.8810
Regressão Logística, 0.6606, 0.6566, 0.6591, 0.6502, 31.1977


##### **RS**

In [35]:
# RS without O3: column 0 is skipped, the last column becomes y
x_train_rs = train_rs.iloc[:, 1:-1].to_numpy()
y_train_rs = train_rs.iloc[:, -1].to_numpy()
x_test_rs = test_rs.iloc[:, 1:-1].to_numpy()
y_test_rs = test_rs.iloc[:, -1].to_numpy()

# RS with O3: same column layout
x_train_rs_O3 = train_rs_O3.iloc[:, 1:-1].to_numpy()
y_train_rs_O3 = train_rs_O3.iloc[:, -1].to_numpy()
x_test_rs_O3 = test_rs_O3.iloc[:, 1:-1].to_numpy()
y_test_rs_O3 = test_rs_O3.iloc[:, -1].to_numpy()

Using RS only in the testing phase in a model trained with O0

In [36]:
# Fit on the POJ O0 training split, evaluate on the RS test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3632, 0.4833, 0.3707, 0.3829, 0.0218
Random Forest, 0.4607, 0.7497, 0.4603, 0.4993, 2.5880
SVM, 0.1944, 0.3940, 0.1766, 0.1780, 492.9888
SGD, 0.2146, 0.4274, 0.2061, 0.2161, 14.6711
MLP, 0.2591, 0.3830, 0.2510, 0.2595, 284.1695
Regressão Logística, 0.3537, 0.4779, 0.3440, 0.3648, 39.3461


Using RS only in the testing phase in a model trained with O3

In [37]:
# Fit on the POJ O3 training split, evaluate on the RS test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0273, 0.0802, 0.0267, 0.0215, 0.0227
Random Forest, 0.1694, 0.2068, 0.1562, 0.1307, 2.4632
SVM, 0.0694, 0.0753, 0.0633, 0.0441, 506.0781
SGD, 0.0400, 0.0807, 0.0416, 0.0256, 10.8411
MLP, 0.0521, 0.0678, 0.0472, 0.0388, 329.5151
Regressão Logística, 0.0714, 0.0656, 0.0635, 0.0372, 55.0989


Using RS+O3 only in the testing phase in a model trained with O0

In [38]:
# Fit on the POJ O0 training split, evaluate on the RS+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train, y_train, x_test_rs_O3, y_test_rs_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0544, 0.0685, 0.0466, 0.0314, 0.1587
Random Forest, 0.2079, 0.2486, 0.1890, 0.1701, 2.4335
SVM, 0.0671, 0.0878, 0.0597, 0.0482, 495.8631
SGD, 0.0767, 0.0879, 0.0677, 0.0554, 14.1962
MLP, 0.0732, 0.0596, 0.0662, 0.0477, 272.0573
Regressão Logística, 0.0818, 0.0994, 0.0677, 0.0526, 34.8583


Using RS+O3 only in the testing phase in a model trained with O3

In [39]:
# Fit on the POJ O3 training split, evaluate on the RS+O3 test split.
print("model, accuracy, precision, recall, fscore, time (s)")
for evaluate in (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
):
    evaluate(x_train_O3, y_train_O3, x_test_rs_O3, y_test_rs_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7090, 0.7177, 0.7031, 0.7005, 0.0077
Random Forest, 0.8998, 0.9045, 0.8978, 0.8972, 2.5216
SVM, 0.4940, 0.6120, 0.4692, 0.4645, 461.0165
SGD, 0.4045, 0.5010, 0.3968, 0.3907, 8.7271
MLP, 0.5452, 0.5541, 0.5372, 0.5308, 215.8709
Regressão Logística, 0.6105, 0.6095, 0.6069, 0.5951, 30.4803
