## Models with POJ Dataset

---

### Setup

#### ~> Installing Libraries

In [1]:
!pip3 install -q seaborn
!pip3 install -q sklearn
!pip3 install -q keras

#### ~> Imports of the project

In [2]:
from models import model_knn, model_MLP, model_SGD, model_regRegression, model_svm, model_rf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

#### ~> Loading the Datasets

In [None]:
# Read the four POJ feature tables in one pass; the unpacking order mirrors the
# order of the paths below:
#   df          -> without obfuscation (O0 feature file)
#   df_O3       -> optimizations (O3 feature file)
#   df_ollvm    -> OLLVM
#   df_ollvm_O3 -> OLLVM, "ollvm_O3" feature file
# skipinitialspace=True drops the blanks that follow each delimiter.
df, df_O3, df_ollvm, df_ollvm_O3 = (
    pd.read_csv(csv_path, skipinitialspace=True)
    for csv_path in (
        "./../../../features/POJ/poj_features_O0.csv",
        "./../../../features/POJ/poj_features_O3.csv",
        "./../../../features/POJ/poj_features_ollvm.csv",
        "./../../../features/POJ/poj_features_ollvm_O3.csv",
    )
)

#### ~> Split each dataset into training and test sets

In [None]:
# Fixed seed: without it every run draws a different random partition, so the
# metrics reported further down cannot be reproduced after a kernel restart.
SPLIT_SEED = 42

# Each dataset is split independently, holding out 20% of its rows for testing.
# NOTE(review): the splits are drawn per dataset; whether the same source
# program can land in one dataset's training part and another dataset's test
# part depends on how the CSV rows are aligned — confirm if that matters.

# Without obfuscation
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Optimizations
train_O3, test_O3 = train_test_split(df_O3, test_size=0.2, random_state=SPLIT_SEED)

# OLLVM
train_ollvm, test_ollvm = train_test_split(df_ollvm, test_size=0.2, random_state=SPLIT_SEED)
train_ollvm_O3, test_ollvm_O3 = train_test_split(df_ollvm_O3, test_size=0.2, random_state=SPLIT_SEED)

### Case Studies

#### ~> Combination

In [None]:
def _features_and_labels(frame):
    """Split a feature table into ``(X, y)`` NumPy arrays.

    The first column is skipped, the last column becomes the label vector ``y``
    and every column in between forms the feature matrix ``X``.
    """
    return frame.iloc[:, 1:-1].to_numpy(), frame.iloc[:, -1].to_numpy()


# No obfuscation: training and testing arrays
x_train, y_train = _features_and_labels(train)
x_test, y_test = _features_and_labels(test)

# O3: training and testing arrays
x_train_O3, y_train_O3 = _features_and_labels(train_O3)
x_test_O3, y_test_O3 = _features_and_labels(test_O3)

# ollvm (fla+sub+bcf): training and testing arrays
x_train_ollvm, y_train_ollvm = _features_and_labels(train_ollvm)
x_test_ollvm, y_test_ollvm = _features_and_labels(test_ollvm)

# Row count of the non-obfuscated training set; later cells use it to size
# the slices taken from each training set.
size = len(x_train)

**Combination (33% no obfuscation, 33% O3, 33% OLLVM)**

In [None]:
# Combined training set: one third no obfuscation, one third O3, one third OLLVM.
# Each source contributes its first `size // 3` rows (fewer if it has fewer rows),
# stacked in that order; features and labels are sliced identically so they stay aligned.
n_per_source = size // 3

x_train_comb = np.concatenate(
    [features[:n_per_source, :] for features in (x_train, x_train_O3, x_train_ollvm)]
)
y_train_comb = np.concatenate(
    [labels[:n_per_source] for labels in (y_train, y_train_O3, y_train_ollvm)]
)

Testing with non-obfuscated programs

In [None]:
# Train on the combined set and evaluate on the non-obfuscated test set.
# All six models receive exactly the same four arrays, bundled once here.
eval_data = (x_train_comb, y_train_comb, x_test, y_test)

print("model, accuracy, precision, recall, fscore, time (s)")
model_knn(*eval_data)            # k-nearest neighbours
model_rf(*eval_data)             # random forest
model_svm(*eval_data)            # support vector machine
model_SGD(*eval_data)            # stochastic gradient descent
model_MLP(*eval_data)            # multi-layer perceptron
model_regRegression(*eval_data)  # logistic regression

Testing with O3 programs

In [None]:
# Train on the combined set and evaluate on the O3 test set.
# All six models receive exactly the same four arrays, bundled once here.
eval_data_O3 = (x_train_comb, y_train_comb, x_test_O3, y_test_O3)

print("model, accuracy, precision, recall, fscore, time (s)")
model_knn(*eval_data_O3)            # k-nearest neighbours
model_rf(*eval_data_O3)             # random forest
model_svm(*eval_data_O3)            # support vector machine
model_SGD(*eval_data_O3)            # stochastic gradient descent
model_MLP(*eval_data_O3)            # multi-layer perceptron
model_regRegression(*eval_data_O3)  # logistic regression

Testing with OLLVM programs

In [None]:
# Train on the combined set and evaluate on the OLLVM test set.
# All six models receive exactly the same four arrays, bundled once here.
eval_data_ollvm = (x_train_comb, y_train_comb, x_test_ollvm, y_test_ollvm)

print("model, accuracy, precision, recall, fscore, time (s)")
model_knn(*eval_data_ollvm)            # k-nearest neighbours
model_rf(*eval_data_ollvm)             # random forest
model_svm(*eval_data_ollvm)            # support vector machine
model_SGD(*eval_data_ollvm)            # stochastic gradient descent
model_MLP(*eval_data_ollvm)            # multi-layer perceptron
model_regRegression(*eval_data_ollvm)  # logistic regression