## Models with OJClone Dataset (CloneGen)
Models trained and tested with CloneGen obfuscation methods

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

# Ignore every warning raised for the rest of the session (no category or
# module filter is given, so this applies globally).
# NOTE(review): this also hides any warnings emitted while the models are
# fitted - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
# Load the six OJClone feature tables. The names on the left are matched
# positionally with the CSV paths below, which are read in the listed order.
df, df_O3, df_drlsg, df_ga, df_mcmc, df_rs = (
    pd.read_csv(csv_path, skipinitialspace=True)
    for csv_path in (
        # Without obfuscations
        "./../../../features/OJClone/ojclone_features_O0.csv",
        # Optimizations
        "./../../../features/OJClone/ojclone_features_O3.csv",
        # Obfuscations: DRLSG, GA, MCMC, RS
        "./../../../features/OJClone/ojclone_features_DRLSG_O0.csv",
        "./../../../features/OJClone/ojclone_features_GA_O0.csv",
        "./../../../features/OJClone/ojclone_features_MCMC_O0.csv",
        "./../../../features/OJClone/ojclone_features_RS_O0.csv",
    )
)

#### ~> Split the datasets into a training and test set

In [4]:
# Fixed seed so the 80/20 splits are identical after every kernel restart.
# Without it train_test_split reshuffles on each run and the metrics recorded
# below cannot be reproduced.
# NOTE(review): with a shared seed, tables that have the same number of rows
# are shuffled with the same permutation; whether that aligns the same
# programs across tables depends on the CSV row order - confirm.
SPLIT_SEED = 42

# Without Obfuscations
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=SPLIT_SEED)

# Obfuscations
train_drlsg, test_drlsg = train_test_split(df_drlsg, test_size=0.2, random_state=SPLIT_SEED)
train_ga, test_ga = train_test_split(df_ga, test_size=0.2, random_state=SPLIT_SEED)
train_mcmc, test_mcmc = train_test_split(df_mcmc, test_size=0.2, random_state=SPLIT_SEED)
train_rs, test_rs = train_test_split(df_rs, test_size=0.2, random_state=SPLIT_SEED)

### Case Studies

#### ~> Obfuscations

In [5]:
# Convert the splits to NumPy arrays. Features are every column except the
# first and the last; the label is the last column.
# NOTE(review): the skipped first column is presumably an identifier - confirm
# against the CSV schema.

# Training and testing data without obfuscation
x_train = train.iloc[:, 1:-1].to_numpy()
y_train = train.iloc[:, -1].to_numpy()
x_test = test.iloc[:, 1:-1].to_numpy()
y_test = test.iloc[:, -1].to_numpy()

# O3
x_train_O3 = train_O3.iloc[:, 1:-1].to_numpy()
y_train_O3 = train_O3.iloc[:, -1].to_numpy()
x_test_O3 = test_O3.iloc[:, 1:-1].to_numpy()
y_test_O3 = test_O3.iloc[:, -1].to_numpy()

# Number of rows in the unobfuscated training set; later cells take half of
# this many rows from each source when building mixed training sets.
size = x_train.shape[0]

##### **DRLSG**


In [7]:
# DRLSG: features are all columns but the first and last; label is the last column.
x_train_drlsg = train_drlsg.iloc[:, 1:-1].to_numpy()
y_train_drlsg = train_drlsg.iloc[:, -1].to_numpy()
x_test_drlsg = test_drlsg.iloc[:, 1:-1].to_numpy()
y_test_drlsg = test_drlsg.iloc[:, -1].to_numpy()

# DRLSG and original: the leading size/2 rows of each training set (fewer if a
# set is shorter), with the unobfuscated rows placed first.
n_half = size // 2
x_train_drlsg_all = np.concatenate((x_train[:n_half, :], x_train_drlsg[:n_half, :]))
y_train_drlsg_all = np.concatenate((y_train[:n_half], y_train_drlsg[:n_half]))

Using DRLSG only in the testing phase

In [7]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the unobfuscated set, test on DRLSG. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6430, 0.6557, 0.6431, 0.6380, 0.0080
Random Forest, 0.9052, 0.9082, 0.9029, 0.9028, 2.2449
SVM, 0.2600, 0.5005, 0.2650, 0.2664, 527.2596
SGD, 0.3034, 0.4798, 0.3039, 0.3028, 15.3073
MLP, 0.4890, 0.5044, 0.4880, 0.4733, 264.1336
Regressão Logística, 0.5215, 0.5266, 0.5196, 0.5071, 32.7296


Using DRLSG in the testing and training phase

In [8]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the mixed original + DRLSG set, test on DRLSG. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_drlsg_all, y_train_drlsg_all, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6282, 0.6390, 0.6272, 0.6215, 0.0065
Random Forest, 0.8539, 0.8513, 0.8524, 0.8507, 2.1165
SVM, 0.2511, 0.4306, 0.2533, 0.2659, 497.3099
SGD, 0.2725, 0.4010, 0.2767, 0.2541, 13.3762
MLP, 0.5021, 0.5115, 0.4994, 0.4875, 239.8843
Regressão Logística, 0.5432, 0.5395, 0.5415, 0.5294, 31.9107


Using DRLSG only in the testing phase in a model trained with O3

In [8]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the O3 set, test on DRLSG. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_drlsg, y_test_drlsg)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0471, 0.0894, 0.0478, 0.0441, 0.1092
Random Forest, 0.2135, 0.2380, 0.2151, 0.1816, 2.7790
SVM, 0.0978, 0.1320, 0.0990, 0.0642, 583.2069
SGD, 0.0346, 0.0946, 0.0330, 0.0301, 9.6509
MLP, 0.0689, 0.0850, 0.0667, 0.0553, 247.6990
Regressão Logística, 0.0739, 0.0831, 0.0722, 0.0466, 35.2024


##### **GA**

In [9]:
# GA: features are all columns but the first and last; label is the last column.
x_train_ga = train_ga.iloc[:, 1:-1].to_numpy()
y_train_ga = train_ga.iloc[:, -1].to_numpy()
x_test_ga = test_ga.iloc[:, 1:-1].to_numpy()
y_test_ga = test_ga.iloc[:, -1].to_numpy()

# GA and original: the leading size/2 rows of each training set (fewer if a
# set is shorter), with the unobfuscated rows placed first.
n_half = size // 2
x_train_ga_all = np.concatenate((x_train[:n_half, :], x_train_ga[:n_half, :]))
y_train_ga_all = np.concatenate((y_train[:n_half], y_train_ga[:n_half]))

Using GA only in the testing phase

In [10]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the unobfuscated set, test on GA. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3369, 0.4057, 0.3303, 0.3169, 0.0069
Random Forest, 0.4206, 0.5657, 0.3953, 0.3814, 2.1516
SVM, 0.1147, 0.2301, 0.0937, 0.0916, 576.9508
SGD, 0.1685, 0.3043, 0.1465, 0.1523, 15.1617
MLP, 0.2101, 0.2681, 0.1947, 0.1853, 280.7227
Regressão Logística, 0.3359, 0.4335, 0.3059, 0.3104, 37.0305


Using GA in the testing and training phase

In [11]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the mixed original + GA set, test on GA. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_ga_all, y_train_ga_all, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.5988, 0.5791, 0.5677, 0.5576, 0.0068
Random Forest, 0.7822, 0.7614, 0.7495, 0.7483, 3.3513
SVM, 0.2071, 0.3599, 0.1646, 0.1791, 612.1969
SGD, 0.3418, 0.3654, 0.2920, 0.2656, 14.1171
MLP, 0.5618, 0.5141, 0.5123, 0.4990, 237.9616
Regressão Logística, 0.5656, 0.5544, 0.5212, 0.5194, 31.9303


Using GA only in the testing phase in a model trained with O3

In [11]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the O3 set, test on GA. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_ga, y_test_ga)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0285, 0.0557, 0.0258, 0.0199, 0.0086
Random Forest, 0.1879, 0.1832, 0.1511, 0.1339, 2.8812
SVM, 0.0708, 0.0989, 0.0512, 0.0419, 677.4367
SGD, 0.0328, 0.0645, 0.0392, 0.0241, 8.7618
MLP, 0.0459, 0.0728, 0.0355, 0.0290, 283.5257
Regressão Logística, 0.0823, 0.0553, 0.0639, 0.0388, 35.1984


##### **MCMC**

In [10]:
# MCMC: features are all columns but the first and last; label is the last column.
x_train_mcmc = train_mcmc.iloc[:, 1:-1].to_numpy()
y_train_mcmc = train_mcmc.iloc[:, -1].to_numpy()
x_test_mcmc = test_mcmc.iloc[:, 1:-1].to_numpy()
y_test_mcmc = test_mcmc.iloc[:, -1].to_numpy()

# MCMC and original: the leading size/2 rows of each training set (fewer if a
# set is shorter), with the unobfuscated rows placed first.
n_half = size // 2
x_train_mcmc_all = np.concatenate((x_train[:n_half, :], x_train_mcmc[:n_half, :]))
y_train_mcmc_all = np.concatenate((y_train[:n_half], y_train_mcmc[:n_half]))

Using MCMC only in the testing phase

In [13]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the unobfuscated set, test on MCMC. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3369, 0.4475, 0.3456, 0.3504, 0.0068
Random Forest, 0.3885, 0.7170, 0.3851, 0.4043, 2.1630
SVM, 0.1078, 0.3395, 0.0915, 0.1075, 577.8695
SGD, 0.2027, 0.3430, 0.1904, 0.1831, 14.4103
MLP, 0.1904, 0.3231, 0.1862, 0.1793, 286.6292
Regressão Logística, 0.3345, 0.4718, 0.3271, 0.3378, 33.0423


Using MCMC in the testing and training phase

In [14]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the mixed original + MCMC set, test on MCMC. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_mcmc_all, y_train_mcmc_all, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.5240, 0.5355, 0.5130, 0.5057, 0.0106
Random Forest, 0.7162, 0.7017, 0.7022, 0.6958, 2.0998
SVM, 0.1950, 0.3474, 0.1674, 0.1637, 634.2993
SGD, 0.2755, 0.3975, 0.2593, 0.2390, 17.1624
MLP, 0.4998, 0.4837, 0.4791, 0.4637, 326.4365
Regressão Logística, 0.5151, 0.4936, 0.4986, 0.4842, 38.4094


Using MCMC only in the testing phase in a model trained with O3

In [12]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the O3 set, test on MCMC. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_mcmc, y_test_mcmc)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0284, 0.0727, 0.0281, 0.0224, 0.0067
Random Forest, 0.1713, 0.2034, 0.1583, 0.1299, 2.3952
SVM, 0.0796, 0.1017, 0.0747, 0.0490, 541.0737
SGD, 0.0331, 0.0564, 0.0399, 0.0216, 8.8514
MLP, 0.0465, 0.0946, 0.0457, 0.0343, 300.3045
Regressão Logística, 0.0689, 0.0704, 0.0638, 0.0359, 34.8614


##### **RS**

In [13]:
# RS: features are all columns but the first and last; label is the last column.
x_train_rs = train_rs.iloc[:, 1:-1].to_numpy()
y_train_rs = train_rs.iloc[:, -1].to_numpy()
x_test_rs = test_rs.iloc[:, 1:-1].to_numpy()
y_test_rs = test_rs.iloc[:, -1].to_numpy()

# RS and original: the leading size/2 rows of each training set (fewer if a
# set is shorter), with the unobfuscated rows placed first.
n_half = size // 2
x_train_rs_all = np.concatenate((x_train[:n_half, :], x_train_rs[:n_half, :]))
y_train_rs_all = np.concatenate((y_train[:n_half], y_train_rs[:n_half]))

Using RS only in the testing phase

In [16]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the unobfuscated set, test on RS. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train, y_train, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3722, 0.4782, 0.3768, 0.3860, 0.0079
Random Forest, 0.4692, 0.7356, 0.4652, 0.4993, 2.4462
SVM, 0.2629, 0.4660, 0.2571, 0.2660, 529.6838
SGD, 0.2320, 0.4507, 0.2134, 0.2177, 12.6970
MLP, 0.2440, 0.3723, 0.2379, 0.2452, 319.7244
Regressão Logística, 0.3598, 0.4707, 0.3517, 0.3674, 36.9420


Using RS in the testing and training phase

In [17]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the mixed original + RS set, test on RS. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_rs_all, y_train_rs_all, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.5004, 0.5061, 0.4927, 0.4835, 0.0060
Random Forest, 0.7174, 0.7089, 0.7050, 0.7004, 3.4776
SVM, 0.3644, 0.4873, 0.3396, 0.3276, 635.1598
SGD, 0.2878, 0.3570, 0.2628, 0.2376, 16.7394
MLP, 0.4776, 0.4502, 0.4554, 0.4413, 242.0287
Regressão Logística, 0.5152, 0.4965, 0.5024, 0.4902, 35.0606


Using RS only in the testing phase in a model trained with O3

In [14]:
print("model, accuracy, precision, recall, fscore, time (s)")

# Train on the O3 set, test on RS. Models run in this order:
# KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for run_model in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    run_model(x_train_O3, y_train_O3, x_test_rs, y_test_rs)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0306, 0.0726, 0.0306, 0.0247, 0.0064
Random Forest, 0.1653, 0.2123, 0.1559, 0.1330, 3.8673
SVM, 0.0765, 0.1129, 0.0703, 0.0458, 589.9705
SGD, 0.0362, 0.0820, 0.0420, 0.0209, 11.1637
MLP, 0.0488, 0.0656, 0.0493, 0.0360, 285.2802
Regressão Logística, 0.0635, 0.0631, 0.0607, 0.0341, 35.0562
