# Imports

In [1]:
import yaml

In [106]:
from itertools import product

In [2]:
import logging

In [3]:
import logging.config

In [4]:
import torch

In [5]:
import numpy as np

In [6]:
import pandas as pd

In [7]:
import torch.nn as nn

In [8]:
import torch.utils.data

In [9]:
from os import makedirs

In [10]:
from datetime import datetime

In [11]:
from os.path import join, exists

In [12]:
from torch.autograd import Variable

In [13]:
from sklearn.decomposition import PCA

In [14]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# Utils Functions  

In [15]:
def epochs(train_dataset, n_iters, batch_size):
    num_of_epochs = int(n_iters/(len(train_dataset)/batch_size))
    return num_of_epochs

In [16]:
def feature_from_correlation(correlation_map, lower_range, upper_range):
    filtered_features = []
    corresponding_correlations = []
    
    for index, row in correlation_map.iterrows():
        correlation = round(row.URL_Type_obf_Type, 4)
        
        if correlation<lower_range or  correlation>upper_range:
            filtered_features.append(index)
            corresponding_correlations.append(correlation)
    
    filtered_features.pop(-1)
    corresponding_correlations.pop(-1)
    
    dataframe = pd.DataFrame(data = {"Features":filtered_features, "Correlations":corresponding_correlations})
    return dataframe

In [17]:
def split_dataset(dataset, training_allocation, testing_allocation):
    training_dataset_length = int(training_allocation*len(dataset))    
    dataset_training = dataset[:training_dataset_length]
    dataset_training = dataset_training.to_numpy().astype("float32") 
    dataset_training = torch.tensor(dataset_training)
    
    print("Training set shape: {}".format(dataset_training.shape))
    
    testing_dataset_length = int(testing_allocation*len(dataset))
    dataset_testing = dataset[-testing_dataset_length:]
    dataset_testing = dataset_testing.to_numpy().astype("float32") 
    dataset_testing = torch.tensor(dataset_testing)
    
    print("Testing set shape: {}".format(dataset_testing.shape))
    
    return dataset_training, dataset_testing

# Load Data

In [18]:
dataset = pd.read_csv("all.csv", low_memory = False, na_values = [-1, "nan", "Infinity"])

**Explore dataset**

In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36707 entries, 0 to 36706
Data columns (total 80 columns):
Querylength                        36707 non-null int64
domain_token_count                 36707 non-null int64
path_token_count                   36707 non-null int64
avgdomaintokenlen                  36707 non-null float64
longdomaintokenlen                 36707 non-null int64
avgpathtokenlen                    36427 non-null float64
tld                                36707 non-null int64
charcompvowels                     36707 non-null int64
charcompace                        36707 non-null int64
ldl_url                            36707 non-null int64
ldl_domain                         36707 non-null int64
ldl_path                           36707 non-null int64
ldl_filename                       36707 non-null int64
ldl_getArg                         36707 non-null int64
dld_url                            36707 non-null int64
dld_domain                         36707 non-nu

In [20]:
dataset["URL_Type_obf_Type"].value_counts()

Defacement    7930
benign        7781
phishing      7586
malware       6712
spam          6698
Name: URL_Type_obf_Type, dtype: int64

# Clean data

**Replace target categorical data with encoded label**

In [21]:
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("Defacement", 0.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("benign", 1.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("phishing", 2.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("malware", 3.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("spam", 4.0)

**Fill NaN values with zeros**

In [22]:
dataset = dataset.fillna(0.0)

**Convert entire dataframe to numeric** 

In [23]:
# By examining the output of dataset.info() you can see that 
# there are columns within the dataset that are of type object 
#(they are a string).
dataset = dataset.apply(pd.to_numeric)

In [24]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36707 entries, 0 to 36706
Data columns (total 80 columns):
Querylength                        36707 non-null int64
domain_token_count                 36707 non-null int64
path_token_count                   36707 non-null int64
avgdomaintokenlen                  36707 non-null float64
longdomaintokenlen                 36707 non-null int64
avgpathtokenlen                    36707 non-null float64
tld                                36707 non-null int64
charcompvowels                     36707 non-null int64
charcompace                        36707 non-null int64
ldl_url                            36707 non-null int64
ldl_domain                         36707 non-null int64
ldl_path                           36707 non-null int64
ldl_filename                       36707 non-null int64
ldl_getArg                         36707 non-null int64
dld_url                            36707 non-null int64
dld_domain                         36707 non-nu

**Find all zero columns**

In [25]:
column_names = dataset.columns.tolist()

In [26]:
for name in column_names:
    mean = dataset[name].mean()
    if mean == 0.0:
        print("Column >>{}<< is all zeros".format(name))

Column >>isPortEighty<< is all zeros
Column >>ISIpAddressInDomainName<< is all zeros


Since **isPortEighty** & **ISIpAddressInDomainName** is all zeros, there is no useful information that the model can generalize from those columns. Therefore, they are removed.

In [27]:
dataset = dataset.drop(["isPortEighty", "ISIpAddressInDomainName"], axis=1)

# Feature Selection

**Based on PCA Algorithm**

In [28]:
dataset_PCA = PCA()

**Based on Correlation Map**

In [29]:
correlation_map = dataset.corr(method = "pearson")

In [30]:
correlation_map

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
Querylength,1.000000,0.045174,0.004259,-0.056154,0.008193,-0.122085,0.045174,0.791216,0.844110,0.953382,...,0.116011,0.109792,0.094518,-0.369418,0.001953,0.036747,-0.122276,0.036866,0.175750,0.145545
domain_token_count,0.045174,1.000000,-0.101013,-0.212984,0.068065,-0.024911,1.000000,0.001817,0.039905,0.053445,...,0.108333,0.075094,0.076662,-0.012043,-0.503515,-0.016174,0.076851,0.113176,0.119657,0.249154
path_token_count,0.004259,-0.101013,1.000000,-0.105089,-0.099711,-0.104183,-0.101013,0.459537,0.373155,-0.033006,...,0.409953,0.423085,0.460825,-0.580160,0.060732,-0.032844,-0.310913,-0.101780,0.269168,0.014509
avgdomaintokenlen,-0.056154,-0.212984,-0.105089,1.000000,0.895807,-0.024258,-0.212984,-0.084126,-0.074916,-0.049760,...,-0.054421,-0.027865,-0.007838,-0.144895,-0.220215,-0.130713,-0.054456,-0.122259,-0.152891,-0.195149
longdomaintokenlen,0.008193,0.068065,-0.099711,0.895807,1.000000,-0.042643,0.068065,-0.023848,-0.006579,0.011607,...,0.023638,0.043043,0.049421,-0.176775,-0.366726,-0.138398,-0.042160,-0.082679,-0.073875,-0.113577
avgpathtokenlen,-0.122085,-0.024911,-0.104183,-0.024258,-0.042643,1.000000,-0.024911,0.008739,0.032319,0.039688,...,-0.171966,-0.145126,-0.217422,-0.025996,0.035597,0.177189,0.042192,-0.138430,-0.229490,0.133935
tld,0.045174,1.000000,-0.101013,-0.212984,0.068065,-0.024911,1.000000,0.001817,0.039905,0.053445,...,0.108333,0.075094,0.076662,-0.012043,-0.503515,-0.016174,0.076851,0.113176,0.119657,0.249154
charcompvowels,0.791216,0.001817,0.459537,-0.084126,-0.023848,0.008739,0.001817,1.000000,0.946030,0.739051,...,0.306298,0.311852,0.292281,-0.688249,0.022991,0.004405,-0.246180,-0.083550,0.202443,0.064909
charcompace,0.844110,0.039905,0.373155,-0.074916,-0.006579,0.032319,0.039905,0.946030,1.000000,0.830184,...,0.240647,0.244763,0.216296,-0.638226,0.003651,0.085476,-0.213121,-0.056297,0.188906,0.170606
ldl_url,0.953382,0.053445,-0.033006,-0.049760,0.011607,0.039688,0.053445,0.739051,0.830184,1.000000,...,0.003677,0.001920,-0.026279,-0.331338,0.002309,0.116951,-0.104020,0.011805,0.111187,0.232795


In [31]:
correlated_features = feature_from_correlation(correlation_map, -0.15, 0.25)

In [32]:
correlated_features

Unnamed: 0,Features,Correlations
0,avgdomaintokenlen,-0.1951
1,dld_url,0.2562
2,dld_path,0.2577
3,dld_getArg,0.2568
4,URL_DigitCount,0.2966
5,Extension_DigitCount,0.2823
6,Arguments_LongestWordLength,-0.1934
7,spcharUrl,0.2603
8,delimeter_path,-0.1592
9,NumberRate_URL,0.2727


In [33]:
correlation_based_dataset = dataset[correlated_features.Features.tolist()].sample(frac = 1).reset_index(drop = True)

**Split Data**

In [34]:
correlation_based_dataset_training, correlation_based_dataset_testing = split_dataset(correlation_based_dataset, 0.80, 0.20)

Training set shape: torch.Size([29365, 13])
Testing set shape: torch.Size([7341, 13])


## General Dataset

In [35]:
general_shuffled_dataset = dataset.sample(frac = 1).reset_index(drop = True)

**Split Data**

In [42]:
general_training_set, general_testing_set = split_dataset(general_shuffled_dataset, 0.80, 0.20)

Training set shape: torch.Size([29365, 78])
Testing set shape: torch.Size([7341, 78])


# Logistic Regression

**Define the Network**

In [37]:
class LogisticRegressionModel(nn.Module):
    
    def __init__(self, in_dim, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_dim, num_classes)

    def forward(self, x):
        out = self.linear(x)
        return out

**Train Class**

In [84]:
class Train(LogisticRegressionModel):
    
    def __init__(self, in_dim, num_classes, lr, batch_size):
        super().__init__(in_dim, num_classes)
        self.batch_size = batch_size
        self.learning_rate = lr
        self.input_layer_dim = in_dim
        self.output_layer_dim = num_classes
        self.criterion = nn.CrossEntropyLoss()
        self.model = LogisticRegressionModel(self.input_layer_dim, self.output_layer_dim)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.learning_rate)
        
        self.model_directory = "./models"
        if not exists(self.model_directory):
            makedirs(self.model_directory)
        
    def train_model(self, sub_directory, training_data, encoded_labels, unencoded_labels, n_iters, save_iterations = 20):
        
        subset = str(self.learning_rate)+"_"+str(n_iters)
        sub_dir = join(sub_directory, subset)
        directory = join(self.model_directory, sub_dir)
        
        if not exists(directory):
            makedirs(directory)
        
        report = None
        precision = None
        recall = None
        f1score = None
        accuracy = None
        report = None
        model_names = []
        
        batch_size = self.batch_size
        num_epochs = range(epochs(training_data, n_iters, batch_size))
        training_data = torch.utils.data.DataLoader(dataset=training_data, batch_size=batch_size, shuffle=True)
        
        file_name = join(directory, "training_progress.txt")
        with open(file_name, "a") as txt_file:
            
            time = datetime.strftime(datetime.now(), "%Y-%m-%d_%H:%M:%S")
            txt_file.write("Initiate Training: {}".format(time) + "\n\n")
            
            txt_file.write("Learning Rate: {}".format(self.learning_rate) + "\n")
            txt_file.write("Number of  Epochs: {}".format(num_epochs) + "\n")
            txt_file.write("Batch Size: {}".format(self.batch_size) + "\n")
            txt_file.write(30*"--")
            txt_file.write("\n\n")
            
            for epoch in num_epochs:

                for i, data in enumerate(training_data):

                    X_train = data[:, :-1]
                    Y_train = data[:, -1]

                    if torch.cuda.is_available():
                        x = Variable(X_train).cuda()
                        y = Variable(Y_train).cuda()

                    else:
                        x = X_train.float()
                        y = Y_train.long()

                    out = self.model(x)
                    assert not torch.isnan(out).any(), out
                    loss = self.criterion(out, y)
                    pred = torch.max(out.data, 1)[1]
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                if epoch%20==0 or epoch==num_epochs[-1]:

                    precision = round(precision_score(y, pred, average="macro")*100, 5)
                    recall = round(recall_score(y, pred, average="macro")*100, 5)
                    f1score = round(f1_score(y, pred, average="macro")*100, 5)
                    accuracy = round(accuracy_score(y, pred)*100, 5)
                    report = classification_report(y, pred, encoded_labels, unencoded_labels)
                    
                    txt_file.write("Progress: {}".format(time)+"\n")
                    txt_file.write("Epoch: {}, Running Loss: {}".format(epoch, loss)+"\n")
                    txt_file.write("Running overall precision: {}%".format(precision)+"\n")
                    txt_file.write("Running overall recall: {}%".format(recall)+"\n")
                    txt_file.write("Running overall F1-Score: {}%".format(f1score)+"\n")
                    txt_file.write("Running overall accuracy: {}%".format(accuracy)+"\n")
                    txt_file.write("\n")

                    
                    if epoch != 0:

                        time = datetime.strftime(datetime.now(), "%Y-%m-%d_%H:%M:%S")

                        model_name = join(directory, time)
                        torch.save(self.model.state_dict(), model_name+".pkl")
                        model_names.append(model_name)

                        txt_file.write("Classification Report:"+"\n")
                        txt_file.write(report)
                        txt_file.write("\n")
                        
                        txt_file.write(40*"#"+"\n")
                        txt_file.write("Model {} saved".format(time)+"\n")
                        txt_file.write(40*"#"+"\n\n")

            model_names.append(model_name)
    
        return model_names


**Test Class**

In [162]:
class Test(LogisticRegressionModel):
    
    def __init__(self, input_size, num_classes):
        super().__init__(input_size, num_classes)
        self.input_size = input_size
        self.num_classes = num_classes
        self.model = LogisticRegressionModel(self.input_size, self.num_classes)
        

    def evaluate_model(self, testing_data, path, model_name, encoded_labels, unencoded_labels):
        model_name = name+".pkl"
        self.model.load_state_dict(torch.load(model_name))
        
        X_test = testing_data[:, :-1]
        Y_test = testing_data[:, -1]

        if torch.cuda.is_available():
            x = Variable(X_test).cuda()
            y = Variable(Y_test).cuda()
            
        else:
            x = X_test.float()
            y = Y_test.long()

        out = self.model(x)
        assert not torch.isnan(out).any(), out
        pred = torch.max(out.data, 1)[1]
        
        precision = round(precision_score(y, pred, average="macro")*100, 5)
        recall = round(recall_score(y, pred, average="macro")*100, 5)
        f1score = round(f1_score(y, pred, average="macro")*100, 5)
        accuracy = round(accuracy_score(y, pred)*100, 5)
        report = classification_report(y, pred, encoded_labels, unencoded_labels)
        
        file_name = join(path, "evaluations.txt")
        with open(file_name, "a") as txt_file:
            txt_file.write(35*"--"+"\n")
            txt_file.write("Model Name: {}".format(model_name)+"\n")
            txt_file.write(35*"--"+"\n")
            txt_file.write("Precision: {}%".format(precision)+"\n")
            txt_file.write("Recall: {}%".format(recall)+"\n")
            txt_file.write("F1-Score: {}%".format(f1score)+"\n")
            txt_file.write("Accuracy: {}%".format(accuracy)+"\n\n")

            txt_file.write("Classification Report:"+"\n")
            txt_file.write(report)
            txt_file.write("\n")

### GENERAL DATASET: Training & Testing Models

In [143]:
# These Global variables represent the target labels and their coorresponding encoded values
encoded_labels = [0.0, 1.0, 2.0, 3.0, 4.0]
unencoded_labels = ["Defacement", "benign", "phishing", "malware", "spam"]

**Define model parameters**

In [144]:
# Note that the last column of the training set is the target column. So the input dimensions is one less the 
# total number of columns so that the dimensions of the feature data are only considered
batch_size = 1000
targets = len(encoded_labels)
in_dim = general_training_set.shape[1]-1

- **Learning rate range = [0.01 --> 0.05]**
- **n_iterations range = [3500 --> 4000]**

In [120]:
learning_rates = np.arange(0.01, 0.06, 0.01).tolist()
n_iterations = list(range(3500, 4000, 200))
combination = list(product(learning_rates, n_iterations))

In [124]:
# Instantiate model & Train models
model_names = []
for comb in combination:
    general_dataset_model_train = Train(in_dim, targets, comb[0], batch_size)
    generated_model_names = general_dataset_model_train.train_model("general_dataset", general_training_set, encoded_labels, unencoded_labels, comb[1])
    model_names.extend(generated_model_names)

In [163]:
general_dataset_model_test = Test(in_dim, targets)

for name in model_names:
    general_dataset_model_test.evaluate_model(general_testing_set, "./models/general_dataset", name, encoded_labels, unencoded_labels)