## Models with POJ Dataset (OLLVM + O3)

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): presumably done to keep the result tables free of library
# warnings; this also hides deprecation and convergence warnings -- confirm
# that is intended.
warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [3]:
# Feature CSVs of the POJ dataset, one per build variant. The order here must
# match the order of the names unpacked below.
csv_paths = (
    "./../../../features/POJ/poj_features_O0.csv",        # without obfuscation
    "./../../../features/POJ/poj_features_O3.csv",        # optimizations (O3)
    "./../../../features/POJ/poj_features_ollvm_O3.csv",  # OLLVM (+ O3)
)

# skipinitialspace=True makes pandas drop spaces that follow each delimiter.
df, df_O3, df_ollvm_O3 = (
    pd.read_csv(csv_path, skipinitialspace=True) for csv_path in csv_paths
)

#### ~> Split the datasets into a training and test set

In [4]:
# Fixed seed so the 80/20 partitions -- and therefore every metric reported
# further down -- can be regenerated with Restart Kernel -> Run All.
# NOTE: outputs recorded before this seed was introduced came from an unseeded
# split and will not match exactly until the notebook is re-executed.
SEED = 42

# Without obfuscation
train, test = train_test_split(df, test_size=0.2, random_state=SEED)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=SEED)

# OLLVM
train_ollvm_O3, test_ollvm_O3 = train_test_split(df_ollvm_O3, test_size=0.2, random_state=SEED)

### Case Studies

#### ~> OLLVM + O3

In [5]:
def _features_and_labels(frame, first_feature_col):
    """Split a DataFrame into NumPy arrays (X, y).

    X is taken from columns ``first_feature_col`` up to, but not including,
    the last column; y is the last column.
    """
    features = frame.iloc[:, first_feature_col:-1].to_numpy()
    labels = frame.iloc[:, -1].to_numpy()
    return features, labels


# Unobfuscated data: column 0 is skipped.
x_train, y_train = _features_and_labels(train, 1)
x_test, y_test = _features_and_labels(test, 1)

size = len(x_train)
half = size // 2

# O3: same column layout as the unobfuscated data.
x_train_O3, y_train_O3 = _features_and_labels(train_O3, 1)
x_test_O3, y_test_O3 = _features_and_labels(test_O3, 1)

# OLLVM + O3: features start at column 0 here, unlike the two sets above.
# NOTE(review): presumably this CSV has no leading id/name column -- confirm
# against the file, otherwise the feature columns are misaligned.
x_train_ollvm_O3, y_train_ollvm_O3 = _features_and_labels(train_ollvm_O3, 0)
x_test_ollvm_O3, y_test_ollvm_O3 = _features_and_labels(test_ollvm_O3, 0)

# Mixed training set: the first `half` unobfuscated rows followed by the first
# `half` OLLVM+O3 rows (fewer if the OLLVM training set is shorter than that).
x_train_ollvm_O3_all = np.concatenate([x_train[:half], x_train_ollvm_O3[:half]], axis=0)
y_train_ollvm_O3_all = np.concatenate([y_train[:half], y_train_ollvm_O3[:half]], axis=0)

Using OLLVM+O3 only in the testing phase

In [6]:
# Train on the unobfuscated features, test on the OLLVM+O3 features.
print("model, accuracy, precision, recall, fscore, time (s)")

# Helpers from `models`, called in the order their rows appear under the header.
evaluators = (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
)
for evaluate in evaluators:
    evaluate(x_train, y_train, x_test_ollvm_O3, y_test_ollvm_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0344, 0.0860, 0.0349, 0.0264, 0.6207
Random Forest, 0.2131, 0.3134, 0.2117, 0.1904, 3.5334
SVM, 0.0357, 0.0521, 0.0357, 0.0232, 530.4117
SGD, 0.0722, 0.1464, 0.0725, 0.0561, 17.0893
MLP, 0.0265, 0.0434, 0.0279, 0.0162, 363.3837
Regressão Logística, 0.0431, 0.0875, 0.0430, 0.0277, 44.4315


Using OLLVM+O3 in both the training and testing phases

In [7]:
# Train on the mixed (unobfuscated + OLLVM+O3) set, test on the OLLVM+O3 features.
print("model, accuracy, precision, recall, fscore, time (s)")

# Helpers from `models`, called in the order their rows appear under the header.
evaluators = (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
)
for evaluate in evaluators:
    evaluate(x_train_ollvm_O3_all, y_train_ollvm_O3_all, x_test_ollvm_O3, y_test_ollvm_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.3542, 0.3866, 0.3579, 0.3542, 0.0327
Random Forest, 0.6251, 0.6215, 0.6263, 0.6185, 4.2145
SVM, 0.1576, 0.4413, 0.1585, 0.1721, 699.7807
SGD, 0.2318, 0.3964, 0.2352, 0.2359, 18.3272
MLP, 0.4350, 0.4289, 0.4370, 0.4202, 349.3578
Regressão Logística, 0.4051, 0.3919, 0.4076, 0.3840, 48.4023


Using OLLVM+O3 in the testing phase with a model trained on O3

In [8]:
# Train on the O3 features, test on the OLLVM+O3 features.
print("model, accuracy, precision, recall, fscore, time (s)")

# Helpers from `models`, called in the order their rows appear under the header.
evaluators = (
    model_knn,            # KNN
    model_rf,             # Random Forest
    model_svm,            # SVM
    model_SGD,            # Stochastic Gradient Descent
    model_MLP,            # Multi-layer Perceptron
    model_regRegression,  # Logistic Regression
)
for evaluate in evaluators:
    evaluate(x_train_O3, y_train_O3, x_test_ollvm_O3, y_test_ollvm_O3)

model, accuracy, precision, recall, fscore, time (s)
KNN, 0.0280, 0.1192, 0.0290, 0.0267, 0.0063
Random Forest, 0.1551, 0.2201, 0.1552, 0.1299, 3.6090
SVM, 0.0342, 0.1366, 0.0357, 0.0282, 528.8298
SGD, 0.0204, 0.0886, 0.0216, 0.0179, 11.2371
MLP, 0.0655, 0.0908, 0.0657, 0.0501, 348.6260
Regressão Logística, 0.0319, 0.1182, 0.0338, 0.0239, 36.7364
