In [7]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, List
from tqdm import tqdm
import os
import json
import random

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('spm_model.model')

text = "This is a sample sentence to tokenize."
tokens = sp.encode_as_pieces(text)
print(tokens)
print(sp.encode_as_ids(text))

['▁This', '▁is', '▁a', '▁sa', 'mp', 'le', '▁sentence', '▁to', '▁to', 'ken', 'ize', '.']
[102, 22, 14, 1693, 2265, 421, 7627, 6, 6, 4296, 3055, 3]


In [8]:
def find_max_sequence_length():
    max_length = 0
    prefix = './PhishingEmails/'  # Adjust this to your file path
    
    for file in tqdm(os.listdir(prefix)):
        with open(prefix + file) as jsonFile:
            jsonStr = jsonFile.read()
            email_dict = json.loads(jsonStr)
            setupData = sp.encode_as_ids(
                email_dict['email_subject'].strip() + ". " + email_dict['email_body'].strip()
            )
            max_length = max(max_length, len(setupData))
    
    return max_length


In [9]:

def load_emails(batch_size, input_size):

    max_sequence_length = input_size
    dataset = []
    batch_counter = 0
    batch = []

    count = 0 #just to limit dataset size at first
    
    prefix = './PhishingEmails/' #change this to the prefile thing such as './celebA'

    for file in tqdm(os.listdir(prefix)):

        # if count >= 200: #comment this out to get full data
        #     break
        
        with open(prefix + file) as jsonFile:
            jsonStr = jsonFile.read() #json file as a string
            email_dict = json.loads(jsonStr) #converts to dictionary
        
        setupData = sp.encode_as_ids(email_dict['email_subject'].strip() + ". " + email_dict['email_body'].strip())
        pad_length = max_sequence_length - len(setupData)

        if pad_length > 0:
            setupData += [-1] * pad_length


        if batch_counter < batch_size:
            batch.append(setupData)
            batch_counter += 1
        else:
            dataset.append(batch)
            batch = []
            batch_counter = 0
       
    return np.array(dataset)

In [10]:
#Initialize helpful functions for math
def sigmoid(x: np.ndarray):
    return 1/(1+np.exp(-1*x))

def sigmoid_derivative(x: np.ndarray):
    return sigmoid(x)*(1-sigmoid(x))

def tanh(x: np.ndarray):
    return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))
    
def tanh_derivative(x:np.ndarray):
    return 1-np.square(tanh(x))

def softmax(x: np.ndarray):
    return np.exp(x)/np.sum(np.exp(x))

def cross_entropy(yhat, y):
    loss = 0
     
    for i in range(len(yhat)):

        loss = loss + (-1 * y[i]*np.log(yhat[i]))
 
    return loss

In [11]:
#initializes the weights of the network
def initialize_cell(input_size, hidden_size):

    print(input_size)

    cell = {}

    # W = np.random.normal(0, 1, (hidden_size, input_size))  # Dimensions hidden_size x input_size
    # U = np.random.normal(0, 1, (hidden_size, hidden_size))  # Dimensions hidden_size x hidden_size

    # # Check the dimensions
    # print("W shape:", W.shape)  # Should print (256, 28756)
    # print("U shape:", U.shape)  # Should print (256, 256)

    # # Vertically stack W and U
    # input_data = np.vstack((W, U))
    # print("Stacked input_data shape:", input_data.shape)  # Should print (512, 28756)
    print(input_size,"-----")

    cell["W_i"] = np.hstack((np.random.normal(0, 1, (hidden_size, hidden_size)), np.random.normal(0, 1, (hidden_size, input_size)))) #input gate weights
    cell["W_f"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size)))) #forget gate weights
    cell["W_c"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size)))) #candidate gate weights
    cell["W_o"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size)))) #output gate weights
    cell["W_y"] = (np.random.normal(0,1,(hidden_size,hidden_size)))#final gate weights

    #not sure if the biases need to be 3d...
    cell["b_i"] = np.zeros(hidden_size) #input gate biases
    cell["b_f"] = np.zeros(hidden_size) #forget gate biases
    cell["b_c"] = np.zeros(hidden_size) #candidate gate biases
    cell["b_o"] = np.zeros(hidden_size) #output gate biases
    cell["b_y"] = np.zeros(hidden_size) #final gate biases

    return cell

In [12]:
#forward pass of all gates
def forward_pass(cell, prevA, prevC, X):

    # print(X, "/n______-")
    
    input = np.hstack((prevA, X))

    forward = {}

    forward["F"] = sigmoid(cell["W_f"].dot(input) + cell["b_i"])
    
    forward["C"] = tanh(cell["W_c"].dot(input) + cell["b_c"])

    forward["I"] = sigmoid(cell["W_i"].dot(input) + cell["b_i"])

    forward["O"] = sigmoid(cell["W_o"].dot(input) + cell["b_o"])


    forward["prevA"] = prevA
    forward["prevC"] = prevC
    forward["C_t"] = (forward["prevC"] * forward["F"]) + (forward["I"] * forward["C"])
    forward["A_t"] = forward["O"] * tanh(forward["C_t"])

    forward["Z_t"] = cell["W_y"].dot(forward["C_t"] * forward["O"]) + cell["b_y"]
    forward["Yhat"] = softmax(forward["Z_t"])

    return forward

In [13]:
def gradient(forward, cell, X, Y, lprimea, lprimec):

    grads = {}

    print("\nLELELELEL")
    input = np.hstack((forward["prevA"], X))

    dldA_t = np.transpose(cell["W_y"]).dot(forward["Yhat"]-Y) + lprimea
    dldC_t = lprimec + (forward["O"] * tanh_derivative(forward["C_t"])) * dldA_t 

    TdLdw_f = (dldC_t * forward["prevC"] * forward["F"]*(1-forward["F"])) 
    TdLdw_c = (dldC_t * forward["I"])
    TdLdw_o = (dldA_t * tanh(forward["C_t"]) * forward["O"] * (1-forward["O"]))
    TdLdw_i = (dldC_t * forward["C"] * forward["I"] * (1-forward["I"]))

    print(np.transpose(cell["W_y"]).size, " wyT")
    print((forward["Yhat"]-Y).size, " error")
    print(dldC_t.size, "c_t")
    print(forward["prevC"].size, "prevC")
    print(forward["F"].size, " F")
    print(TdLdw_f.size, " Tdldwf")
    print(input.size, " input")

    # np.atleast2d(a).T

    woa = cell["W_o"][:, :128]
    wca = cell["W_c"][:, :128]
    wia = cell["W_i"][:, :128]
    wfa = cell["W_f"][:, :128]

    print(woa.size, " woa")

    grads["dLda_prev"] = woa.T * TdLdw_o + wca.T * TdLdw_c + wia.T * TdLdw_i + wfa.T * TdLdw_f
    grads["dLdc_prev"] = (lprimec + forward["O"] * 1-np.square(tanh(forward["C_t"])) * dldA_t) * forward["F"]

    print(grads["dLda_prev"].size, " dlda")
    print(grads["dLdc_prev"].size, " dldc")

    #not sure which side to transpose.
    grads["dLdw_f"] = np.atleast_2d(TdLdw_f).dot(np.atleast_2d(input).T)
    grads["dLdw_c"] = np.atleast_2d(TdLdw_c).dot(np.atleast_2d(input).T)
    grads["dLdw_o"] = np.atleast_2d(TdLdw_o).dot(np.atleast_2d(input).T)
    grads["dLdw_i"] = np.atleast_2d(TdLdw_i).dot(np.atleast_2d(input).T)
    grads["dLdw_y"] = (forward["Yhat"] - Y).dot(np.transpose(forward["A_t"]))

    grads["dLdb_f"] = 1
    grads["dLdb_c"] = 1
    grads["dLdb_o"] = 1
    grads["dLdb_i"] = 1
    grads["dLdb_y"] = 1
    
    
    

    loss = cross_entropy(forward["Yhat"], Y)

    return grads, loss

In [14]:
def descent(cell, X, input_size, hidden_size, lr, batch_size):
   
    

    for b in range(0, batch_size):

        prevA = np.zeros(hidden_size)
        prevC = np.zeros(hidden_size)

        gradientTot = {}
        lossTot = 0

        allForwards = []
        labels = []

        lprimea = np.zeros(hidden_size)
        lprimec = np.zeros(hidden_size)

        gradientTot["dLdw_f"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size))))
        gradientTot["dLdw_c"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size))))
        gradientTot["dLdw_o"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size))))
        gradientTot["dLdw_i"] = np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size))))
        gradientTot["dLdw_y"] = (np.random.normal(0,1,(hidden_size,hidden_size)))
        gradientTot["dLdb_f"] = np.zeros(hidden_size)
        gradientTot["dLdb_c"] = np.zeros(hidden_size)
        gradientTot["dLdb_o"] = np.zeros(hidden_size)
        gradientTot["dLdb_i"] = np.zeros(hidden_size)
        gradientTot["dLdb_y"] = np.zeros(hidden_size)

        X_b = X[b]

        for i in range(1, len(X_b)-1):  #tqdm will create a loading bar for your loop

            curData = X_b[0:i]

            length = len(curData)

            pad_length = input_size - length

            if pad_length > 0:
                curData = np.concatenate((curData,np.array([0] * pad_length)))
            
            forward = forward_pass(cell, prevA, prevC, curData)

            allForwards.append(forward)

            prevA = forward["A_t"]
            prevC = forward["C_t"]

            curData[length] = X_b[i+1]

            labels.append(curData)

        print("BACKTIME")
        for i in range(0, len(allForwards)):

            grad, loss = gradient(allForwards[i], cell, labels[i], allForwards[i]["Yhat"], lprimea, lprimec)
            lprimea = grad["dLda_prev"]
            lprimec = grad["dLdc_prev"]

            gradientTot["dLdw_f"] += grad["dLdw_f"]
            gradientTot["dLdw_c"] += grad["dLdw_c"]
            gradientTot["dLdw_o"] += grad["dLdw_o"]
            gradientTot["dLdw_i"] += grad["dLdw_i"]
            gradientTot["dLdw_y"] += grad["dLdw_y"]
            
            gradientTot["dLdb_f"] += grad["dLdb_f"]
            gradientTot["dLdb_c"] += grad["dLdb_c"]
            gradientTot["dLdb_o"] += grad["dLdb_o"]
            gradientTot["dLdb_i"] += grad["dLdb_i"]
            gradientTot["dLdb_y"] += grad["dLdb_y"]

            lossTot += loss
        

        cell["W_f"] = cell["W_f"] - gradientTot["dLdw_f"] * lr
        cell["W_c"] = cell["W_c"] - gradientTot["dLdw_c"] * lr
        cell["W_o"] = cell["W_o"] - gradientTot["dLdw_o"] * lr
        cell["W_i"] = cell["W_i"] - gradientTot["dLdw_i"] * lr
        cell["W_y"] = cell["W_y"] - gradientTot["dLdw_y"] * lr

        cell["b_f"] = cell["b_f"] - gradientTot["dLdb_f"] * lr
        cell["b_c"] = cell["b_c"] - gradientTot["dLdb_c"] * lr
        cell["b_o"] = cell["b_o"] - gradientTot["dLdb_o"] * lr
        cell["b_i"] = cell["b_i"] - gradientTot["dLdb_i"] * lr
        cell["b_y"] = cell["b_y"] - gradientTot["dLdb_y"] * lr

    return lossTot


In [15]:
def train_LSTM(input_size, hidden_size, dataset, batch_size):

    cell = initialize_cell(input_size, hidden_size)

    losses = []

    for data in dataset:
        loss = descent(cell, data, input_size, hidden_size, 0.0001, batch_size)
        losses.append(loss)

    return cell
    

In [16]:

batch_size = 16
vocab_size = 28500

hidden_size = 128
input_size = hidden_size + 28500

dataset = load_emails(batch_size, input_size)

100%|██████████| 3332/3332 [00:03<00:00, 863.34it/s]


In [17]:

ex_email = dataset[random.randint(0, 50)]



finalCell = train_LSTM(input_size, hidden_size, dataset, batch_size)

28628
28628 -----


  return 1/(1+np.exp(-1*x))
  return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))
  return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))


BACKTIME

LELELELEL
16384  wyT
128  error
128 c_t
128 prevC
128  F
128  Tdldwf
28756  input
16384  woa
16384  dlda
128  dldc

LELELELEL
16384  wyT
128  error
16384 c_t
128 prevC
128  F
16384  Tdldwf
28756  input
16384  woa
16384  dlda
16384  dldc


ValueError: shapes (128,128) and (1,28756) not aligned: 128 (dim 1) != 1 (dim 0)