# Imports

In [1]:
import torch

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
import torch.nn as nn

In [6]:
from torch.autograd import Variable

# Load Data

In [8]:
dataset = pd.read_csv("all.csv", low_memory = False)

**Explore dataset**

In [9]:
dataset.dtypes

Querylength                          int64
domain_token_count                   int64
path_token_count                     int64
avgdomaintokenlen                  float64
longdomaintokenlen                   int64
avgpathtokenlen                    float64
tld                                  int64
charcompvowels                       int64
charcompace                          int64
ldl_url                              int64
ldl_domain                           int64
ldl_path                             int64
ldl_filename                         int64
ldl_getArg                           int64
dld_url                              int64
dld_domain                           int64
dld_path                             int64
dld_filename                         int64
dld_getArg                           int64
urlLen                               int64
domainlength                         int64
pathLength                           int64
subDirLen                            int64
fileNameLen

In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36707 entries, 0 to 36706
Data columns (total 80 columns):
Querylength                        36707 non-null int64
domain_token_count                 36707 non-null int64
path_token_count                   36707 non-null int64
avgdomaintokenlen                  36707 non-null float64
longdomaintokenlen                 36707 non-null int64
avgpathtokenlen                    36707 non-null float64
tld                                36707 non-null int64
charcompvowels                     36707 non-null int64
charcompace                        36707 non-null int64
ldl_url                            36707 non-null int64
ldl_domain                         36707 non-null int64
ldl_path                           36707 non-null int64
ldl_filename                       36707 non-null int64
ldl_getArg                         36707 non-null int64
dld_url                            36707 non-null int64
dld_domain                         36707 non-nu

In [11]:
dataset["URL_Type_obf_Type"].value_counts()

Defacement    7930
benign        7781
phishing      7586
malware       6712
spam          6698
Name: URL_Type_obf_Type, dtype: int64

In [12]:
dataset.shape

(36707, 80)

**Clean data-set**

In [26]:
dataset = dataset.fillna(0)

In [29]:
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("Defacement", 1.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("benign", 2.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("phishing", 3.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("malware", 4.0)
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace("spam", 5.0)

# Feature Selection

**Correlation Map**

In [31]:
corr_table = dataset.corr(method = "pearson")

In [32]:
corr_table

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
Querylength,1.000000,0.045174,0.004259,-0.056154,0.008193,-0.122085,0.045174,0.791216,0.844110,0.953382,...,0.116514,0.110574,0.115224,-0.369418,0.001953,0.042716,-0.045895,0.043262,0.219868,0.145545
domain_token_count,0.045174,1.000000,-0.101013,-0.212984,0.068065,-0.024911,1.000000,0.001817,0.039905,0.053445,...,0.109263,0.076454,0.084297,-0.012043,-0.503515,0.006740,0.062294,0.106323,0.121748,0.249154
path_token_count,0.004259,-0.101013,1.000000,-0.105089,-0.099711,-0.104183,-0.101013,0.459537,0.373155,-0.033006,...,0.400531,0.413817,0.459120,-0.580160,0.060732,-0.080279,-0.228183,-0.132102,0.289849,0.014509
avgdomaintokenlen,-0.056154,-0.212984,-0.105089,1.000000,0.895807,-0.024258,-0.212984,-0.084126,-0.074916,-0.049760,...,-0.056301,-0.030137,-0.022116,-0.144895,-0.220215,-0.120673,-0.055794,-0.117593,-0.143593,-0.195149
longdomaintokenlen,0.008193,0.068065,-0.099711,0.895807,1.000000,-0.042643,0.068065,-0.023848,-0.006579,0.011607,...,0.021979,0.041137,0.040299,-0.176775,-0.366726,-0.117265,-0.038615,-0.078006,-0.060054,-0.113577
avgpathtokenlen,-0.122085,-0.024911,-0.104183,-0.024258,-0.042643,1.000000,-0.024911,0.008739,0.032319,0.039688,...,-0.166743,-0.140164,-0.228598,-0.025996,0.035597,0.167814,0.064083,-0.074968,-0.243176,0.133935
tld,0.045174,1.000000,-0.101013,-0.212984,0.068065,-0.024911,1.000000,0.001817,0.039905,0.053445,...,0.109263,0.076454,0.084297,-0.012043,-0.503515,0.006740,0.062294,0.106323,0.121748,0.249154
charcompvowels,0.791216,0.001817,0.459537,-0.084126,-0.023848,0.008739,0.001817,1.000000,0.946030,0.739051,...,0.303838,0.309776,0.302103,-0.688249,0.022991,0.006576,-0.124820,-0.062384,0.253196,0.064909
charcompace,0.844110,0.039905,0.373155,-0.074916,-0.006579,0.032319,0.039905,0.946030,1.000000,0.830184,...,0.239746,0.244214,0.230076,-0.638226,0.003651,0.076410,-0.096484,-0.032214,0.237791,0.170606
ldl_url,0.953382,0.053445,-0.033006,-0.049760,0.011607,0.039688,0.053445,0.739051,0.830184,1.000000,...,0.005754,0.004120,-0.004670,-0.331338,0.002309,0.107878,-0.030034,0.028661,0.148875,0.232795


# Data Split

In [75]:
shuffled_dataset = dataset.sample(frac = 1).reset_index(drop = True)

In [76]:
number_of_target_labels = len(shuffled_dataset.URL_Type_obf_Type.value_counts().index.values.tolist())

**Training Dataset**

In [77]:
training_length = int(0.8*len(shuffled_dataset))

In [78]:
training_set = shuffled_dataset[:training_length]

In [79]:
training_set = training_set.to_numpy().astype('float16') 

In [80]:
training_set = torch.tensor(training_set)

In [90]:
training_set.shape

torch.Size([29365, 80])

**Testing Dataset**

In [82]:
testing_length = int(0.2*len(shuffled_dataset))

In [83]:
testing_set = shuffled_dataset[:testing_length]

In [84]:
testing_set = testing_set.to_numpy().astype('float16') 

In [85]:
testing_set = torch.tensor(testing_set)

In [91]:
testing_set.shape

torch.Size([7341, 80])

# Logistic Regression

**Define the Network**

In [86]:
class LogisticRegressionModel(nn.Module):
    
    def __init__(self, in_dim, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_dim, num_classes)

    def forward(self, x):
        out = self.linear(x)
        return out

**Train Model**

In [127]:
class Train(LogisticRegressionModel):

    def __init__(self, in_dim, num_classes, lr, batch_size):
        super().__init__(in_dim, num_classes)
        self.batch_size = batch_size
        self.learning_rate = lr
        self.input_layer_dim = in_dim
        self.output_layer_dim = num_classes
        
        self.criterion = nn.CrossEntropyLoss()
        self.model = LogisticRegressionModel(self.input_layer_dim, self.output_layer_dim)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.learning_rate)
#         self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.learning_rate)  
    
    def epochs(self, iterations, train_dataset, batch_size):
        epochs = int(iterations/(len(train_dataset)/batch_size))
        return epochs
    
    def train_model(self, training_data, n_iters):
        batch = self.batch_size
        epochs = self.epochs(n_iters, training_data, batch)
        
        training_data = torch.utils.data.DataLoader(dataset = training_data, batch_size = batch, shuffle = False)
        
        for epoch in range(epochs):
            
            for i, data in enumerate(training_data):

                X_train = data[:, :-1]
                Y_train = data[:, -1]

                if torch.cuda.is_available():
                    x = Variable(X_train).cuda()
                    y = Variable(Y_train).cuda()
                    
                else:
                    x = X_train.float()
                    y = Y_train.type(torch.LongTensor)

                # Clear gradients
                self.optimizer.zero_grad()
                
                # Forward propagation
                out = self.model(x).data
                
                # Calculate softmax and cross entropy loss
                loss = self.criterion(out, y)
                
                print(loss)

                # Calculate gradient
                loss.backward()
                
                # Update parameters
                self.optimizer.step()
                

In [128]:
train_class = Train((training_set.shape[1]-1), number_of_target_labels+1, 0.001, batch_size)
temp = train_class.train_model(training_set, batch_size)

tensor(18.8713)


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn