# Multilayer Perceptron Modeling Phishing Data

In [1]:
import numpy as np
import pandas as pd
import csv
import random
from sklearn.model_selection import train_test_split

## Model Class from Titanic Dataset

In [2]:
# Activation Functions
def sigmoid(activation):
    return 1.0 / (1.0 + np.exp(-activation))

def ReLU(activation):
    return max(0, activation)

def leaky_ReLU(activation):
    return max(0.01 * activation, activation)





# Derivatives of Activation Functions
def d_sm(output):
    return output * (1 - output)

def d_ReLU(output):
    return 1 if output > 0 else 0

def d_leaky(output):
    return 1 if output > 0 else 0.01

In [64]:
# Create Neural Network class 
class NN:
    
    # Init arguments
    def __init__(self, n_inputs, n_hidden, n_outputs):
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs
    
    
    
    
    
    # Function for initializing network - when called creates an empty list 'network' and creates hidden layers
    def initialize_network(self):
        network = []

        # Create multiple hidden layers based on n_hidden 
        for n in range(len(self.n_hidden)):
            # For first hidden layer - creates random weights for hidden layer by looping through 'n_inputs + 1' to add a weight to every input and a bias 
            if n == 0:
                hidden_layer = [{'weights': [np.random.rand() for i in range(self.n_inputs + 1)]} for i in range(self.n_hidden[n])]
            # For any other hidden layers defined by n_hidden - 'n-1' is used since we want to create weights for all the connections from row n-1 and row n with a bias
            else:
                hidden_layer = [{'weights':[np.random.rand() for i in range(self.n_hidden[n-1] + 1)]} for i in range(self.n_hidden[n])]
            
            # Appends the hidden layer to empty list - creating the network
            network.append(hidden_layer)

        
        output_layer = [{'weights': [np.random.rand() for i in range(self.n_hidden[-1] + 1)]} for i in range(self.n_outputs)]
        network.append(output_layer)

        
        return network
    
    
    
    
    # Output of a single neuron
    def activate_neuron(self, weights, inputs):
        
        # Adding the bias weight to activation before adding the neuron activations
        bias = weights[-1]
        activation = np.dot(weights[:-1], inputs) + bias
            
        return activation
    
    
    
    
    
    # Forward Pass
    def forward_pass(self, network, inputs, activation_function):
        
        # Loops through each layer aka list item within an arg 'network' which will be our initialized network from before
        for layer in network:
            new_inputs = []
            count = 0
            
            # Loops through each neuron of the layer with activation functions and appends the outputs of that layer's neurons into 'new_inputs' to become the next layers inputs
            for neuron in layer:
                neuron_output = self.activate_neuron(neuron['weights'], inputs)
                neuron['output'] = activation_function(neuron_output)
                new_inputs.append(neuron['output'])

            inputs = new_inputs

        return inputs # Output of output layer
                

    

    
    
    # Backpropagation Error

    def backprop(self, network, expected, d_activation):

        # Loops through the reversed range of network in int - important so that the 'layer' variable isnt messed up
        for i in reversed(range(len(network))):
            layer = network[i]
            error_layer = [] # List of errors for each neuron in layer i 

            if i != len(network) - 1:

                # Loops through 'range(len(layer))' aka all the neurons of the layer once again in int - for 'neuron' variable 
                for j in range(len(layer)):
                    error = 0.0

                    for neuron in network[i + 1]:
                        error += (neuron['weights'][j] * neuron['delta'])
                    error_layer.append(error) # Appends error for each neuron in 'error_layer' list

            else:
                for j in range(len(layer)):
                    neuron = layer[j]
                    error_layer.append(neuron['output'] - expected)


            for j in range(len(layer)):
                neuron = layer[j]
                neuron['delta'] = error_layer[j] * d_activation(neuron['output'])
                

    
        

    
    
    # Update Neuron Weights
    def update_weights(self, network, row, l_rate):
        
        # Loops through all layers of the 'network' variable
        for i in range(len(network)):
            inputs = row + [1.0]
            
            # For all layers of 'network' except the first hidden layer
            if i != 0:
                inputs = [neuron['output'] for neuron in network[i - 1]] + [1.0] 
                
            # Loop through every neuron in the ith layer of 'network'
            for neuron in network[i]:
                
                # Update neuron weights by looping through all the weights for a neuron (except the last one since it's the bias weight)
                for w in range(len(neuron['weights']) - 1): 
                    neuron['weights'][w] -= l_rate * neuron['delta'] * inputs[w] # 'inputs[w]' ensures that the input is the correct one to correspond with the weights and neuron
                    
                # Updating the bias weight in each neuron
                neuron['weights'][-1] -= l_rate * neuron['delta']
                
    
    
                

    # Train network
    def train_network(self, network, train, expected, activation_function, d_activation, l_rate, n_epoch, n_outputs):
        
        # Loops through every 'epoch' - amount set in the arg input
        for epoch in range(n_epoch):
            sum_error = 0

            # Loops through all the rows of the training data
            for x, row in enumerate(train):
                outputs = self.forward_pass(network, row, activation_function) # Outputs of output layer for that row
                mse = sum([(expected[x] - outputs[i])**2 for i in range(len(outputs))]) / len(outputs) # Had errors with normal squared loss error function and MSE fixed it
                self.backprop(network, expected[x], d_activation)
                sum_error += mse
                self.update_weights(network, row, l_rate)

            if epoch % 50 == 0:
                print('>epoch=%d, error=%.3f' % (epoch, sum_error)) 
            
        return
    
    

            
    # Predict with the network
    def predict(self, network, row, activation_function):
        outputs = self.forward_pass(network, row, activation_function)
        return outputs.index(max(outputs))


### Load and Prepare Data

In [4]:
phish_df = pd.read_csv('/Users/jake/Downloads/Phishing_Legitimate_full.csv')
phish_df.head()

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,1,3,1,5,72,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,4,3,1,6,79,1,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,5,3,0,4,46,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


In [5]:
phish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  10000 non-null  int64  
 1   NumDots                             10000 non-null  int64  
 2   SubdomainLevel                      10000 non-null  int64  
 3   PathLevel                           10000 non-null  int64  
 4   UrlLength                           10000 non-null  int64  
 5   NumDash                             10000 non-null  int64  
 6   NumDashInHostname                   10000 non-null  int64  
 7   AtSymbol                            10000 non-null  int64  
 8   TildeSymbol                         10000 non-null  int64  
 9   NumUnderscore                       10000 non-null  int64  
 10  NumPercent                          10000 non-null  int64  
 11  NumQueryComponents                  10000 

In [6]:
phish_df.describe()

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,2.4451,0.5868,3.3003,70.2641,1.818,0.1389,0.0003,0.0131,0.3232,...,0.3396,0.0322,0.0304,0.9566,0.0202,0.3533,0.7932,0.1734,0.3141,0.5
std,2886.89568,1.346836,0.751214,1.863241,33.369877,3.106258,0.545744,0.017319,0.113709,1.11466,...,0.473597,0.17654,0.171694,0.248037,0.820036,0.888908,0.521019,0.755771,0.897843,0.500025
min,1.0,1.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,2500.75,2.0,0.0,2.0,48.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.0,-1.0,1.0,0.0,-1.0,0.0
50%,5000.5,2.0,1.0,3.0,62.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.5
75%,7500.25,3.0,1.0,4.0,84.0,2.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,10000.0,21.0,14.0,18.0,253.0,55.0,9.0,1.0,1.0,18.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Create Expected Output DataFrame
expected_out = phish_df['CLASS_LABEL']
expected_out.head()

0    1
1    1
2    1
3    1
4    1
Name: CLASS_LABEL, dtype: int64

In [8]:
phish_df = phish_df.drop(['id', 'CLASS_LABEL', 'HttpsInHostname'], axis = 1)
phish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 47 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   NumDots                             10000 non-null  int64  
 1   SubdomainLevel                      10000 non-null  int64  
 2   PathLevel                           10000 non-null  int64  
 3   UrlLength                           10000 non-null  int64  
 4   NumDash                             10000 non-null  int64  
 5   NumDashInHostname                   10000 non-null  int64  
 6   AtSymbol                            10000 non-null  int64  
 7   TildeSymbol                         10000 non-null  int64  
 8   NumUnderscore                       10000 non-null  int64  
 9   NumPercent                          10000 non-null  int64  
 10  NumQueryComponents                  10000 non-null  int64  
 11  NumAmpersand                        10000 

In [17]:
# Normalize data

norm_phish = phish_df.copy()
norm_phish = (norm_phish - norm_phish.min()) / (norm_phish.max() - norm_phish.min())
norm_phish.head()

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,0.1,0.071429,0.277778,0.248963,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.5,1.0,1.0,0.0,1.0
1,0.1,0.071429,0.166667,0.547718,0.0,0.0,0.0,0.0,0.111111,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
2,0.1,0.071429,0.111111,0.190871,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.5,0.0,1.0,0.0,0.5
3,0.1,0.071429,0.333333,0.278008,0.018182,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
4,0.1,0.0,0.222222,0.141079,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.5,0.0,0.0


In [21]:
# Splitting the Data for Testing/Training

X_train, X_test, y_train, y_test = train_test_split(norm_phish, expected_out, random_state = 50, test_size = 0.25)
print('Train Inputs Shape: ', X_train.shape)
print('Train Labels Shape: ', y_train.shape)
y_train.head()

Train Inputs Shape:  (7500, 47)
Train Labels Shape:  (7500,)


7777    0
4998    1
4165    1
9645    0
1707    1
Name: CLASS_LABEL, dtype: int64

In [22]:
# Convert DataFrames to NP arrays - Train inputs

inputs = X_train.values
labels = y_train.values

In [88]:
# Convert DataFrames to NP arrays - Train inputs

test_inputs = X_test.values
test_labels = y_test.values

In [108]:
# Initialize Hyperparameters for training

n_inputs = len(inputs[0]) # Number of neurons in input layer
n_hidden = [10, 2, 2] # List of number of neurons in each hidden layer
n_outputs = len(set(labels)) # Number of neurons in output layer
learning_rate = 0.005
n_epochs = 301

print(n_inputs, n_outputs)

print(inputs[0])

47 2
[0.         0.         0.22222222 0.2406639  0.03636364 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.         0.         0.         0.
 1.         0.09774436 0.28571429 0.         0.         0.
 0.         0.025      0.98461538 1.         0.         0.
 1.         0.         0.01428571 0.         0.         0.
 0.         1.         1.         0.         0.         1.
 0.5        0.         0.5        0.         1.        ]


In [109]:
%%time
# Run Network

network = NN(n_inputs, n_hidden, n_outputs)
net_init = network.initialize_network()
forward = network.forward_pass(net_init, inputs[0], leaky_ReLU)
back = network.backprop(net_init, labels[0], d_leaky)
update = network.update_weights(net_init, inputs[0], learning_rate)
train = network.train_network(net_init, inputs, labels, leaky_ReLU, d_leaky, learning_rate, n_epochs, n_outputs)

>epoch=0, error=3102.527
>epoch=50, error=393.154
>epoch=100, error=366.979
>epoch=150, error=356.336
>epoch=200, error=350.929
>epoch=250, error=344.330
>epoch=300, error=341.811
CPU times: user 9min 4s, sys: 1.12 s, total: 9min 5s
Wall time: 9min 5s


In [110]:
# Prediction and Accuracy for Training sets

correct = 0
incorrect = 0

for x, row in enumerate(inputs):
    prediction = network.predict(net_init, row, leaky_ReLU)
    #print('Expected=%d, Got=%d' % (labels[x], prediction))
    

    if labels[x] == prediction:
        correct += 1
    else:
        incorrect += 1
print('Correct: ', correct)
print('Incorrect: ', incorrect)

accuracy = correct / len(labels)
print('Accuracy for training data', accuracy)

Correct:  3039
Incorrect:  4461
Accuracy for training data 0.4052


In [111]:
# Prediction and Accuracy for Test sets

preds = []

for x, row in enumerate(test_inputs):
    prediction = network.predict(net_init, row, leaky_ReLU)
    #print('Expected=%d, Got=%d' % (labels[x], prediction))
    preds.append(prediction)

    if labels[x] == prediction:
        correct += 1
    else:
        incorrect += 1
        
print(preds)
print('Correct: ', correct)
print('Incorrect: ', incorrect)

accuracy = correct / len(labels)
print('Accuracy for training data', accuracy)

[1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 

In [112]:


correct = 0
incorrect = 0

for x, row in enumerate(preds):
    if row == test_labels[x]:
        correct += 1
    else:
        incorrect += 1
        
print('Correct: ', correct)
print('Incorrect: ', incorrect)

accuracy = correct / len(test_labels)
print('Accuracy for test data', accuracy)
    

Correct:  1017
Incorrect:  1483
Accuracy for test data 0.4068
