## Models with POJ Dataset (OLLVM)

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
# Without obfuscation
df = pd.read_csv("./../../../features/POJ/poj_features_O0.csv", skipinitialspace=True)

# OLLVM
df_ollvm = pd.read_csv("./../../../features/POJ/poj_features_ollvm.csv", skipinitialspace=True)
df_ollvm_fla = pd.read_csv("./../../../features/POJ/poj_features_ollvm_fla.csv", skipinitialspace=True)
df_ollvm_sub = pd.read_csv("./../../../features/POJ/poj_features_ollvm_sub.csv", skipinitialspace=True)
df_ollvm_bcf = pd.read_csv("./../../../features/POJ/poj_features_ollvm_bcf.csv", skipinitialspace=True)

#### ~> Split the datasets into a training and test set

In [4]:
# Without obfuscation
train, test = train_test_split(df, test_size=0.2)

# OLLVM
train_ollvm, test_ollvm = train_test_split(df_ollvm, test_size=0.2)
train_ollvm_fla, test_ollvm_fla = train_test_split(df_ollvm_fla, test_size=0.2)
train_ollvm_sub, test_ollvm_sub = train_test_split(df_ollvm_sub, test_size=0.2)
train_ollvm_bcf, test_ollvm_bcf = train_test_split(df_ollvm_bcf, test_size=0.2)

### Case of Studies

#### ~> OLLVM

In [5]:
# Training and testing database without obfuscation
x_train, y_train = train.iloc[:,1:-1].to_numpy(), train.iloc[:,-1].to_numpy()
x_test, y_test = test.iloc[:,1:-1].to_numpy(), test.iloc[:,-1].to_numpy()  

size = len(x_train)

##### **OLLVM (sub+fla+bcf)**

In [6]:
# ollvm (fla+sub+bcf)
x_train_ollvm, y_train_ollvm = train_ollvm.iloc[:,1:-1].to_numpy(), train_ollvm.iloc[:,-1].to_numpy()
x_test_ollvm, y_test_ollvm = test_ollvm.iloc[:,1:-1].to_numpy(), test_ollvm.iloc[:,-1].to_numpy()

# train+ollvm
x_train_ollvm_all = np.concatenate((x_train[0:int(size/2),:],x_train_ollvm[0:int(size/2),:]))
y_train_ollvm_all = np.concatenate((y_train[0:int(size/2)],y_train_ollvm[0:int(size/2)]))

Using OLLVM only in the testing phase

In [7]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train, y_train, x_test_ollvm, y_test_ollvm)
# Random Forest
model_rf(x_train, y_train, x_test_ollvm, y_test_ollvm)
# SVM
model_svm(x_train, y_train, x_test_ollvm, y_test_ollvm)
# Stochastic Gradient Descent
model_SGD(x_train, y_train, x_test_ollvm, y_test_ollvm)
# Multi-layer Perceptron
model_MLP(x_train, y_train, x_test_ollvm, y_test_ollvm)
# Regressão Logística
model_regRegression(x_train, y_train, x_test_ollvm, y_test_ollvm)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0320, 0.0972, 0.0336, 0.0219, 0.0071
Random Forest, 0.1544, 0.2712, 0.1556, 0.1169, 2.8186
SVM, 0.0211, 0.0563, 0.0214, 0.0138, 841.6456
SGD, 0.0294, 0.0967, 0.0308, 0.0172, 18.3199
MLP, 0.0201, 0.0369, 0.0195, 0.0104, 408.6895
Regressão Logística, 0.0267, 0.0712, 0.0266, 0.0166, 42.8034


Using OLLVM in the testing and training phase

In [8]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train_ollvm_all, y_train_ollvm_all, x_test_ollvm, y_test_ollvm)
# Random Forest
model_rf(x_train_ollvm_all, y_train_ollvm_all, x_test_ollvm, y_test_ollvm)
# SVM
model_svm(x_train_ollvm_all, y_train_ollvm_all, x_test_ollvm, y_test_ollvm)
# Stochastic Gradient Descent
model_SGD(x_train_ollvm_all, y_train_ollvm_all, x_test_ollvm, y_test_ollvm)
# Multi-layer Perceptron
model_MLP(x_train_ollvm_all, y_train_ollvm_all, x_test_ollvm, y_test_ollvm)
# Regressão Logística
model_regRegression(x_train_ollvm_all, y_train_ollvm_all, x_test_ollvm, y_test_ollvm)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.2234, 0.2576, 0.2237, 0.2253, 0.0065
Random Forest, 0.5536, 0.5481, 0.5552, 0.5448, 5.4987
SVM, 0.1855, 0.3418, 0.1850, 0.1704, 1261.6316
SGD, 0.1667, 0.3116, 0.1660, 0.1517, 28.7273
MLP, 0.3998, 0.3902, 0.4032, 0.3856, 414.2007
Regressão Logística, 0.3004, 0.2886, 0.3017, 0.2827, 59.6846


##### **OLLVM Flattening (-fla)**



In [9]:
# ollvm fla
x_train_ollvm_fla, y_train_ollvm_fla = train_ollvm_fla.iloc[:,0:-1].to_numpy(), train_ollvm_fla.iloc[:,-1].to_numpy()
x_test_ollvm_fla, y_test_ollvm_fla = test_ollvm_fla.iloc[:,0:-1].to_numpy(), test_ollvm_fla.iloc[:,-1].to_numpy() 
# train+fla
x_train_ollvm_fla_all = np.concatenate((x_train[0:int(size/2),:],x_train_ollvm_fla[0:int(size/2),:]))
y_train_ollvm_fla_all = np.concatenate((y_train[0:int(size/2)],y_train_ollvm_fla[0:int(size/2)]))

Using OLLVM `-fla` only in the testing phase

In [10]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train, y_train, x_test_ollvm_fla, y_test_ollvm_fla)
# Random Forest
model_rf(x_train, y_train, x_test_ollvm_fla, y_test_ollvm_fla)
# SVM
model_svm(x_train, y_train, x_test_ollvm_fla, y_test_ollvm_fla)
# Stochastic Gradient Descent
model_SGD(x_train, y_train, x_test_ollvm_fla, y_test_ollvm_fla)
# Multi-layer Perceptron
model_MLP(x_train, y_train, x_test_ollvm_fla, y_test_ollvm_fla)
# Regressão Logística
model_regRegression(x_train, y_train, x_test_ollvm_fla, y_test_ollvm_fla)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.1624, 0.2857, 0.1632, 0.1593, 0.0052
Random Forest, 0.8402, 0.8709, 0.8428, 0.8461, 3.5051
SVM, 0.0706, 0.1591, 0.0712, 0.0581, 536.9073
SGD, 0.1896, 0.3296, 0.1877, 0.1693, 16.3115
MLP, 0.0849, 0.1324, 0.0838, 0.0711, 404.6331
Regressão Logística, 0.1742, 0.2860, 0.1726, 0.1368, 49.7755


Using OLLVM `-fla` in the testing and training phase

In [11]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train_ollvm_fla_all, y_train_ollvm_fla_all, x_test_ollvm_fla, y_test_ollvm_fla)
# Random Forest
model_rf(x_train_ollvm_fla_all, y_train_ollvm_fla_all, x_test_ollvm_fla, y_test_ollvm_fla)
# SVM
model_svm(x_train_ollvm_fla_all, y_train_ollvm_fla_all, x_test_ollvm_fla, y_test_ollvm_fla)
# Stochastic Gradient Descent
model_SGD(x_train_ollvm_fla_all, y_train_ollvm_fla_all, x_test_ollvm_fla, y_test_ollvm_fla)
# Multi-layer Perceptron
model_MLP(x_train_ollvm_fla_all, y_train_ollvm_fla_all, x_test_ollvm_fla, y_test_ollvm_fla)
# Regressão Logística
model_regRegression(x_train_ollvm_fla_all, y_train_ollvm_fla_all, x_test_ollvm_fla, y_test_ollvm_fla)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.5710, 0.5933, 0.5736, 0.5660, 0.0067
Random Forest, 0.8221, 0.8229, 0.8245, 0.8210, 3.3738
SVM, 0.2874, 0.5072, 0.2879, 0.2882, 610.7839
SGD, 0.3167, 0.4256, 0.3166, 0.2827, 20.4140
MLP, 0.4375, 0.4334, 0.4377, 0.4169, 429.1061
Regressão Logística, 0.5491, 0.5398, 0.5521, 0.5358, 45.4069


##### **OLLVM Instruction Substitution (-sub)**



In [12]:
# ollvm sub
x_train_ollvm_sub, y_train_ollvm_sub = train_ollvm_sub.iloc[:,0:-1].to_numpy(), train_ollvm_sub.iloc[:,-1].to_numpy()
x_test_ollvm_sub, y_test_ollvm_sub = test_ollvm_sub.iloc[:,0:-1].to_numpy(), test_ollvm_sub.iloc[:,-1].to_numpy()
# train+sub
x_train_ollvm_sub_all = np.concatenate((x_train[0:int(size/2),:],x_train_ollvm_sub[0:int(size/2),:]))
y_train_ollvm_sub_all = np.concatenate((y_train[0:int(size/2)],y_train_ollvm_sub[0:int(size/2)]))

Using OLLVM `-sub` only in the testing phase

In [13]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train, y_train, x_test_ollvm_sub, y_test_ollvm_sub)
# Random Forest
thaisRF = model_rf(x_train, y_train, x_test_ollvm_sub, y_test_ollvm_sub)
# SVM
model_svm(x_train, y_train, x_test_ollvm_sub, y_test_ollvm_sub)
# Stochastic Gradient Descent
model_SGD(x_train, y_train, x_test_ollvm_sub, y_test_ollvm_sub)
# Multi-layer Perceptron
model_MLP(x_train, y_train, x_test_ollvm_sub, y_test_ollvm_sub)
# Regressão Logística
model_regRegression(x_train, y_train, x_test_ollvm_sub, y_test_ollvm_sub)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.2736, 0.4427, 0.2760, 0.2923, 0.0056
Random Forest, 0.7653, 0.8248, 0.7676, 0.7642, 3.8532
SVM, 0.1782, 0.3224, 0.1822, 0.1572, 597.9117
SGD, 0.1678, 0.3211, 0.1723, 0.1598, 17.3683
MLP, 0.1914, 0.2255, 0.1929, 0.1785, 332.2771
Regressão Logística, 0.1237, 0.2823, 0.1266, 0.1330, 44.7855


Using OLLVM `-sub` in the testing and training phase

In [14]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train_ollvm_sub_all, y_train_ollvm_sub_all, x_test_ollvm_sub, y_test_ollvm_sub)
# Random Forest
model_rf(x_train_ollvm_sub_all, y_train_ollvm_sub_all, x_test_ollvm_sub, y_test_ollvm_sub)
# SVM
model_svm(x_train_ollvm_sub_all, y_train_ollvm_sub_all, x_test_ollvm_sub, y_test_ollvm_sub)
# Stochastic Gradient Descent
model_SGD(x_train_ollvm_sub_all, y_train_ollvm_sub_all, x_test_ollvm_sub, y_test_ollvm_sub)
# Multi-layer Perceptron
model_MLP(x_train_ollvm_sub_all, y_train_ollvm_sub_all, x_test_ollvm_sub, y_test_ollvm_sub)
# Regressão Logística
model_regRegression(x_train_ollvm_sub_all, y_train_ollvm_sub_all, x_test_ollvm_sub, y_test_ollvm_sub)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.5488, 0.5702, 0.5483, 0.5438, 0.0044
Random Forest, 0.8457, 0.8465, 0.8453, 0.8440, 3.4838
SVM, 0.2338, 0.4988, 0.2354, 0.2494, 544.1370
SGD, 0.2769, 0.4302, 0.2740, 0.2726, 17.7742
MLP, 0.5158, 0.5126, 0.5155, 0.5036, 390.0472
Regressão Logística, 0.5205, 0.5164, 0.5209, 0.5085, 41.1766


##### **OLLVM Bogus Control Flow (-bcf)**


In [15]:
# ollvm bcf
x_train_ollvm_bcf, y_train_ollvm_bcf = train_ollvm_bcf.iloc[:,0:-1].to_numpy(), train_ollvm_bcf.iloc[:,-1].to_numpy()
x_test_ollvm_bcf, y_test_ollvm_bcf = test_ollvm_bcf.iloc[:,0:-1].to_numpy(), test_ollvm_bcf.iloc[:,-1].to_numpy() 
#train+bcf
x_train_ollvm_bcf_all = np.concatenate((x_train[0:int(size/2),:],x_train_ollvm_bcf[0:int(size/2),:]))
y_train_ollvm_bcf_all = np.concatenate((y_train[0:int(size/2)],y_train_ollvm_bcf[0:int(size/2)]))

Using OLLVM `-bcf` only in the testing phase

In [16]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train, y_train, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Random Forest
model_rf(x_train, y_train, x_test_ollvm_bcf, y_test_ollvm_bcf)
# SVM
model_svm(x_train, y_train, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Stochastic Gradient Descent
model_SGD(x_train, y_train, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Multi-layer Perceptron
model_MLP(x_train, y_train, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Regressão Logística
model_regRegression(x_train, y_train, x_test_ollvm_bcf, y_test_ollvm_bcf)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0732, 0.2242, 0.0748, 0.0679, 0.0055
Random Forest, 0.2303, 0.3627, 0.2358, 0.2059, 3.3511
SVM, 0.0461, 0.1267, 0.0453, 0.0298, 482.8144
SGD, 0.0652, 0.1984, 0.0676, 0.0478, 17.4223
MLP, 0.0776, 0.0921, 0.0787, 0.0535, 379.9012
Regressão Logística, 0.0679, 0.1500, 0.0679, 0.0486, 58.5566


Using OLLVM `-bcf` in the testing and training phase

In [17]:
print("model, accuracy, precision, recall, fscore, time (s)")
# KNN
model_knn(x_train_ollvm_bcf_all, y_train_ollvm_bcf_all, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Random Forest
model_rf(x_train_ollvm_bcf_all, y_train_ollvm_bcf_all, x_test_ollvm_bcf, y_test_ollvm_bcf)
# SVM
model_svm(x_train_ollvm_bcf_all, y_train_ollvm_bcf_all, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Stochastic Gradient Descent
model_SGD(x_train_ollvm_bcf_all, y_train_ollvm_bcf_all, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Multi-layer Perceptron
model_MLP(x_train_ollvm_bcf_all, y_train_ollvm_bcf_all, x_test_ollvm_bcf, y_test_ollvm_bcf)
# Regressão Logística
model_regRegression(x_train_ollvm_bcf_all, y_train_ollvm_bcf_all, x_test_ollvm_bcf, y_test_ollvm_bcf)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3556, 0.3819, 0.3577, 0.3527, 0.0052
Random Forest, 0.5873, 0.5836, 0.5888, 0.5800, 4.2110
SVM, 0.1018, 0.2715, 0.1028, 0.1109, 605.8398
SGD, 0.1474, 0.3370, 0.1492, 0.1638, 18.5848
MLP, 0.4445, 0.4532, 0.4480, 0.4328, 431.1286
Regressão Logística, 0.4262, 0.4219, 0.4283, 0.4122, 48.5192
