## Models with OJClone Dataset (CloneGen+O3)
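Models trained on unobfuscated (O0) or O3-optimized features and tested on features produced by the CloneGen obfuscation methods (DRLSG, GA, MCMC, RS) combined with O3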

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this hides all warning categories (including deprecation
# warnings) — presumably done to keep the metric output readable; confirm
# that nothing important is being masked.
warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
def _read_features(csv_path):
    """Read one OJClone feature CSV, skipping spaces that follow a delimiter."""
    return pd.read_csv(csv_path, skipinitialspace=True)


# Baseline features (O0 file, no obfuscation)
df = _read_features("./../../../features/OJClone/ojclone_features_O0.csv")

# Features after O3 optimization only
df_O3 = _read_features("./../../../features/OJClone/ojclone_features_O3.csv")

# Features after each obfuscation method (DRLSG, GA, MCMC, RS) combined with O3
df_drlsg_O3 = _read_features("./../../../features/OJClone/ojclone_features_DRLSG_O3.csv")
df_ga_O3 = _read_features("./../../../features/OJClone/ojclone_features_GA_O3.csv")
df_mcmc_O3 = _read_features("./../../../features/OJClone/ojclone_features_MCMC_O3.csv")
df_rs_O3 = _read_features("./../../../features/OJClone/ojclone_features_RS_O3.csv")

#### ~> Split each dataset into training and test sets

In [4]:
# Fixed seed for every split below. Without it, each kernel run draws a
# different 80/20 partition and the metrics reported in this notebook cannot
# be reproduced with Restart Kernel -> Run All.
SPLIT_SEED = 42

# NOTE(review): each dataset is split independently, so this cell does not
# control whether a program in a CloneGen test set also appears (as another
# variant) in the O0/O3 training sets — confirm this is intended.

# Without Obfuscations
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=SPLIT_SEED)

# CloneGen
train_drlsg_O3, test_drlsg_O3 = train_test_split(df_drlsg_O3, test_size=0.2, random_state=SPLIT_SEED)
train_ga_O3, test_ga_O3 = train_test_split(df_ga_O3, test_size=0.2, random_state=SPLIT_SEED)
train_mcmc_O3, test_mcmc_O3 = train_test_split(df_mcmc_O3, test_size=0.2, random_state=SPLIT_SEED)
train_rs_O3, test_rs_O3 = train_test_split(df_rs_O3, test_size=0.2, random_state=SPLIT_SEED)

### Case Studies

#### ~> Obfuscations

In [5]:
# For every split: x_* holds all columns except the first and the last,
# y_* holds the last column (presumably the class label; the skipped first
# column is presumably an identifier — confirm against the CSV schema).

# O3-optimized data
x_train_O3 = train_O3.iloc[:, 1:-1].to_numpy()
y_train_O3 = train_O3.iloc[:, -1].to_numpy()
x_test_O3 = test_O3.iloc[:, 1:-1].to_numpy()
y_test_O3 = test_O3.iloc[:, -1].to_numpy()

# Data without obfuscation (O0 features)
x_train = train.iloc[:, 1:-1].to_numpy()
y_train = train.iloc[:, -1].to_numpy()
x_test = test.iloc[:, 1:-1].to_numpy()
y_test = test.iloc[:, -1].to_numpy()

##### **DRLSG+O3**


In [6]:
# DRLSG + O3: x_* takes every column except the first and the last,
# y_* takes the last column.
x_train_drlsg_O3 = train_drlsg_O3.iloc[:, 1:-1].to_numpy()
y_train_drlsg_O3 = train_drlsg_O3.iloc[:, -1].to_numpy()
x_test_drlsg_O3 = test_drlsg_O3.iloc[:, 1:-1].to_numpy()
y_test_drlsg_O3 = test_drlsg_O3.iloc[:, -1].to_numpy()

Using DRLSG+O3 only in the testing phase in a model trained with O0

In [7]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the unobfuscated (O0) training split, evaluate on the DRLSG+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train, y_train, x_test_drlsg_O3, y_test_drlsg_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0443, 0.0606, 0.0432, 0.0280, 0.4286
Random Forest, 0.1862, 0.2471, 0.1848, 0.1675, 2.2807
SVM, 0.0626, 0.1283, 0.0620, 0.0502, 496.5516
SGD, 0.0524, 0.1017, 0.0519, 0.0439, 13.1461
MLP, 0.0455, 0.0631, 0.0451, 0.0373, 270.0722
Regressão Logística, 0.0628, 0.0921, 0.0605, 0.0437, 33.5647


Using DRLSG+O3 only in the testing phase in a model trained with O3

In [8]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the O3 training split, evaluate on the DRLSG+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train_O3, y_train_O3, x_test_drlsg_O3, y_test_drlsg_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.6388, 0.6703, 0.6394, 0.6410, 0.0302
Random Forest, 0.8169, 0.8352, 0.8176, 0.8158, 2.2665
SVM, 0.3447, 0.5775, 0.3431, 0.3495, 483.9092
SGD, 0.3339, 0.5253, 0.3344, 0.3493, 9.3635
MLP, 0.4632, 0.4914, 0.4646, 0.4559, 201.8056
Regressão Logística, 0.5370, 0.5646, 0.5362, 0.5325, 32.9396


##### **GA+O3**

In [9]:
# GA + O3
x_train_ga_O3, y_train_ga_O3 = train_ga_O3.iloc[:,1:-1].to_numpy(), train_ga_O3.iloc[:,-1].to_numpy()
x_test_ga_O3, y_test_ga_O3 = test_ga_O3.iloc[:,1:-1].to_numpy(), test_ga_O3.iloc[:,-1].to_numpy()

Using GA+O3 only in the testing phase in a model trained with O0

In [10]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the unobfuscated (O0) training split, evaluate on the GA+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train, y_train, x_test_ga_O3, y_test_ga_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0590, 0.0578, 0.0489, 0.0323, 0.0063
Random Forest, 0.2344, 0.2506, 0.1944, 0.1773, 2.0462
SVM, 0.0797, 0.0980, 0.0675, 0.0530, 433.9901
SGD, 0.0904, 0.1146, 0.0772, 0.0592, 11.9325
MLP, 0.0518, 0.0514, 0.0485, 0.0395, 236.3459
Regressão Logística, 0.0874, 0.1030, 0.0689, 0.0514, 33.7084


Using GA+O3 only in the testing phase in a model trained with O3

In [11]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the O3 training split, evaluate on the GA+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train_O3, y_train_O3, x_test_ga_O3, y_test_ga_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7096, 0.7086, 0.7017, 0.6891, 0.0082
Random Forest, 0.8781, 0.8887, 0.8764, 0.8725, 2.2871
SVM, 0.4431, 0.5392, 0.4080, 0.4088, 480.5647
SGD, 0.3937, 0.4944, 0.3832, 0.3753, 8.3529
MLP, 0.5290, 0.5164, 0.4940, 0.4852, 319.5197
Regressão Logística, 0.5893, 0.5796, 0.5716, 0.5549, 33.6989


##### **MCMC+O3**

In [12]:
# MCMC + O3
x_train_mcmc_O3, y_train_mcmc_O3 = train_mcmc_O3.iloc[:,1:-1].to_numpy(), train_mcmc_O3.iloc[:,-1].to_numpy()
x_test_mcmc_O3, y_test_mcmc_O3 = test_mcmc_O3.iloc[:,1:-1].to_numpy(), test_mcmc_O3.iloc[:,-1].to_numpy()

Using MCMC+O3 only in the testing phase in a model trained with O0

In [13]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the unobfuscated (O0) training split, evaluate on the MCMC+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train, y_train, x_test_mcmc_O3, y_test_mcmc_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0549, 0.0890, 0.0486, 0.0346, 0.0070
Random Forest, 0.2347, 0.2813, 0.2095, 0.1884, 1.8245
SVM, 0.0722, 0.1337, 0.0666, 0.0534, 407.9540
SGD, 0.0739, 0.0907, 0.0687, 0.0555, 9.9238
MLP, 0.0559, 0.0629, 0.0527, 0.0430, 207.1046
Regressão Logística, 0.0772, 0.0761, 0.0679, 0.0530, 30.3435


Using MCMC+O3 only in the testing phase in a model trained with O3

In [14]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the O3 training split, evaluate on the MCMC+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train_O3, y_train_O3, x_test_mcmc_O3, y_test_mcmc_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7077, 0.7151, 0.7012, 0.6985, 0.0086
Random Forest, 0.8998, 0.9068, 0.8985, 0.8990, 2.5823
SVM, 0.4732, 0.6346, 0.4479, 0.4572, 508.1566
SGD, 0.3848, 0.5227, 0.3749, 0.3804, 9.0417
MLP, 0.5276, 0.5298, 0.5056, 0.4997, 239.9482
Regressão Logística, 0.6036, 0.6060, 0.5964, 0.5867, 33.3417


##### **RS+O3**

In [15]:
# RS + O3
x_train_rs_O3, y_train_rs_O3 = train_rs_O3.iloc[:,1:-1].to_numpy(), train_rs_O3.iloc[:,-1].to_numpy()
x_test_rs_O3, y_test_rs_O3 = test_rs_O3.iloc[:,1:-1].to_numpy(), test_rs_O3.iloc[:,-1].to_numpy()

Using RS+O3 only in the testing phase in a model trained with O0

In [16]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the unobfuscated (O0) training split, evaluate on the RS+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train, y_train, x_test_rs_O3, y_test_rs_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0533, 0.0634, 0.0454, 0.0312, 0.0058
Random Forest, 0.2165, 0.2631, 0.1954, 0.1741, 2.0691
SVM, 0.0740, 0.1081, 0.0675, 0.0531, 471.0453
SGD, 0.0683, 0.1033, 0.0611, 0.0520, 11.7921
MLP, 0.0535, 0.0627, 0.0502, 0.0413, 289.4929
Regressão Logística, 0.0776, 0.0870, 0.0661, 0.0499, 32.1724


Using RS+O3 only in the testing phase in a model trained with O3

In [17]:
# Header for the comma-separated result lines shown in the cell output.
print("model, accuracy, precision, recall, fscore, time (s)")

# Fit on the O3 training split, evaluate on the RS+O3 test split.
# Order: KNN, Random Forest, SVM, Stochastic Gradient Descent,
# Multi-layer Perceptron, Logistic Regression.
for evaluate in (model_knn, model_rf, model_svm, model_SGD, model_MLP, model_regRegression):
    evaluate(x_train_O3, y_train_O3, x_test_rs_O3, y_test_rs_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.7180, 0.7244, 0.7130, 0.7108, 0.0100
Random Forest, 0.9056, 0.9111, 0.9055, 0.9050, 2.3166
SVM, 0.5189, 0.6492, 0.5045, 0.5026, 569.9685
SGD, 0.3904, 0.5310, 0.3821, 0.3758, 10.0657
MLP, 0.5371, 0.5382, 0.5210, 0.5133, 242.5313
Regressão Logística, 0.6166, 0.6166, 0.6123, 0.6010, 33.2844
