In [277]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, List
from tqdm import tqdm
import os
import json
import random
from numpy import longdouble

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('spm_model.model')

text = "This is a sample sentence to tokenize."
tokens = sp.encode_as_pieces(text)
print(tokens)
print(sp.encode_as_ids(text))
print(sp.decode(sp.encode_as_ids(text)))

['▁This', '▁is', '▁a', '▁sa', 'mp', 'le', '▁sentence', '▁to', '▁to', 'ken', 'ize', '.']
[102, 22, 14, 1693, 2265, 421, 7627, 6, 6, 4296, 3055, 3]
This is a sample sentence to tokenize.


In [278]:
def find_max_sequence_length():
    maxlen = 0
    prefix = './PhishingEmails/'  # Adjust this to your file path
    
    for file in tqdm(os.listdir(prefix)):
        with open(prefix + file) as jsonFile:
            jsonStr = jsonFile.read()
            email_dict = json.loads(jsonStr)
            setupData = sp.encode_as_ids(
                email_dict['email_subject'].strip() + ". " + email_dict['email_body'].strip()
            )

            if(len(setupData) < 725):
                maxlen = max(maxlen, len(setupData))

    return maxlen

In [279]:
def load_emails(batch_size):

    max_sequence_length = find_max_sequence_length()
    dataset = []
    batch_counter = 0
    batch = []
    ctr = 0
    
    prefix = './PhishingEmails/' #change this to the prefile thing such as './celebA'

    for file in tqdm(os.listdir(prefix)):
        
        with open(prefix + file) as jsonFile:
            jsonStr = jsonFile.read() #json file as a string
            email_dict = json.loads(jsonStr) #converts to dictionary
        
        setupData = sp.encode_as_ids(email_dict['email_subject'].strip() + ". " + email_dict['email_body'].strip())
        pad_length = max_sequence_length - len(setupData)

        if(len(setupData) <= max_sequence_length and "nan" not in setupData):
            # print(setupData,"\n\n\n")
            ctr+=1
            if pad_length > 0:
                setupData += [1] * pad_length

            if batch_counter < batch_size:
                # setupData = setupData[np.isfinite(setupData)]
                batch.append(setupData)
                batch_counter += 1
            else:
                dataset.append(batch)
                batch = []
                batch_counter = 0
    toRet = np.array(dataset, dtype=longdouble)
    print(ctr)
    # toRet = toRet[~np.isnan(toRet).any(axis=1)]
    np.random.shuffle(toRet)
    # print(toRet[0])
    return toRet

In [280]:
#Initialize helpful functions for math
def sigmoid(x: np.ndarray):
    return 1/(1+np.exp(-1*x))

def sigmoid_derivative(x: np.ndarray):
    return sigmoid(x)*(1-sigmoid(x))

def tanh(x: np.ndarray):
    return np.tanh(x)
    
def tanh_derivative(x:np.ndarray):
    return 1-np.square(tanh(x))

def softmax(x: np.ndarray):
    return np.exp(x)/np.sum(np.exp(x))

def cross_entropy(yhat, y, epsilon=1e-10):
    yhat_clipped = np.clip(yhat, epsilon, 1 - epsilon)  # Clip yhat to avoid zeros
    return -np.sum(y * np.log(yhat_clipped))

def initWeights(input_size, output_size):
    return np.random.uniform(-1, 1, (input_size, output_size)) * np.sqrt(6 / (input_size + output_size))


In [281]:
#initializes the weights of the network
def initialize_cell(input_size, hidden_size):
    

    cell = {}

    cell["W_i"] = np.hstack((initWeights(hidden_size, hidden_size), initWeights(hidden_size, input_size))) #input gate weights
    cell["W_f"] = np.hstack((initWeights(hidden_size, hidden_size), initWeights(hidden_size, input_size))) #forget gate weights
    cell["W_c"] = np.hstack((initWeights(hidden_size, hidden_size), initWeights(hidden_size, input_size))) #candidate gate weights
    cell["W_o"] = np.hstack((initWeights(hidden_size, hidden_size), initWeights(hidden_size, input_size))) #output gate weights
    cell["W_y"] = initWeights(10000, hidden_size)#final gate weights

    #not sure if the biases need to be 3d...
    cell["b_i"] = np.zeros(hidden_size,dtype=longdouble) #input gate biases
    cell["b_f"] = np.zeros(hidden_size,dtype=longdouble) #forget gate biases
    cell["b_c"] = np.zeros(hidden_size,dtype=longdouble) #candidate gate biases
    cell["b_o"] = np.zeros(hidden_size,dtype=longdouble) #output gate biases
    cell["b_y"] = np.zeros(10000) #final gate biases

    return cell

In [282]:
#forward pass of all gates
def forward_pass(cell, prevA, prevC, X):

    # print(X, "/n______-")
    
    input = np.hstack((prevA, X))

    forward = {}
    # print(cell["W_f"])

    forward["F"] = sigmoid(cell["W_f"].dot(input) + cell["b_f"])

    forward["_c"] = cell["W_c"].dot(input) + cell["b_c"]
    
    forward["C"] = tanh(forward["_c"])

    forward["I"] = sigmoid(cell["W_i"].dot(input) + cell["b_i"])

    forward["O"] = sigmoid(cell["W_o"].dot(input) + cell["b_o"])


    forward["prevA"] = prevA
    forward["prevC"] = prevC
    forward["C_t"] = (forward["prevC"] * forward["F"]) + (forward["I"] * forward["C"])
    forward["A_t"] = forward["O"] * tanh(forward["C_t"])

    forward["Z_t"] = cell["W_y"].dot(forward["A_t"]) 
    # + cell["b_y"]
    
    forward["Yhat"] = softmax(forward["Z_t"])
    # 
    # print(forward["Yhat"].size)
    # print(forward["Yhat"], "  Yhat")
    return forward

In [283]:
def gradient(forward, cell, X, Y, lprimea, lprimec):

    grads = {}

    # print("BackProp")
    input = np.hstack((forward["prevA"], X))
    # print((forward["Yhat"]-Y).size, "yhat-y")
    dldA_t = np.transpose(cell["W_y"]).dot(forward["Yhat"]-Y) + lprimea
    print((forward["Yhat"]))
    dldC_t = lprimec + (forward["O"] * tanh_derivative(forward["C_t"])) * dldA_t 
    # print(forward["Yhat"]-Y)

    TdLdw_f = (dldC_t * forward["prevC"] * forward["F"]*(1-forward["F"])) 
    # TdLdw_c = (dldC_t * forward["I"])
    TdLdw_c = (dldC_t * forward["I"]*tanh_derivative(forward["_c"]))
    TdLdw_o = (dldA_t * tanh(forward["C_t"]) * forward["O"] * (1-forward["O"]))
    TdLdw_i = (dldC_t * forward["C"] * forward["I"] * (1-forward["I"]))
    TdLdw_y = (forward["Yhat"] - Y)

    

    # np.atleast2d(a).T

    woa = cell["W_o"][:, :128]
    wca = cell["W_c"][:, :128]
    wia = cell["W_i"][:, :128]
    wfa = cell["W_f"][:, :128]


    grads["dLda_prev"] = woa.T.dot(TdLdw_o) + wca.T.dot(TdLdw_c) + wia.T.dot(TdLdw_i) + wfa.T.dot(TdLdw_f)
    grads["dLdc_prev"] = (lprimec + (forward["O"] * tanh_derivative(forward["C_t"]) * dldA_t)) * forward["F"]


    #not sure which side to transpose.
    grads["dLdw_f"] = np.atleast_2d(TdLdw_f).T.dot(np.atleast_2d(input))
    grads["dLdw_c"] = np.atleast_2d(TdLdw_c).T.dot(np.atleast_2d(input))
    grads["dLdw_o"] = np.atleast_2d(TdLdw_o).T.dot(np.atleast_2d(input))
    grads["dLdw_i"] = np.atleast_2d(TdLdw_i).T.dot(np.atleast_2d(input))
    grads["dLdw_y"] = (np.atleast_2d(TdLdw_y)).T.dot(np.atleast_2d(forward["A_t"]))

    grads["dLdb_f"] = TdLdw_f
    grads["dLdb_c"] = TdLdw_c
    grads["dLdb_o"] = TdLdw_o
    grads["dLdb_i"] = TdLdw_i
    grads["dLdb_y"] = TdLdw_y


    
    loss = cross_entropy(forward["Yhat"], Y)
    # print(loss)

    return grads, loss

In [284]:
def descent(cell, X, input_size, hidden_size, lr, batch_size):
   
    

    # for b in range(0, batch_size):

    prevA = np.zeros(hidden_size)
    prevC = np.zeros(hidden_size)

    gradientTot = {}
    lossTot = 0

    allForwards = []
    labels = []

    lprimea = np.zeros(hidden_size)
    lprimec = np.zeros(hidden_size)

                            # np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size))))
    gradientTot["dLdw_f"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_c"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_o"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_i"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_y"] = np.zeros((10000,hidden_size),dtype=longdouble)
    gradientTot["dLdb_f"] = np.zeros(hidden_size,dtype=longdouble)
    gradientTot["dLdb_c"] = np.zeros(hidden_size,dtype=longdouble)
    gradientTot["dLdb_o"] = np.zeros(hidden_size,dtype=longdouble)
    gradientTot["dLdb_i"] = np.zeros(hidden_size,dtype=longdouble)
    gradientTot["dLdb_y"] = np.zeros(10000,dtype=longdouble)

    X_b = X[0]

    for i in range(1, len(X_b)-1): 
    
        curData = X_b[0:i]
        d = (X_b[0:i].astype(np.int64).tolist())

        length = len(curData)

        pad_length = input_size - length

        if pad_length > 0:
            curData = np.concatenate((curData,np.array([0] * pad_length)))
        
        forward = forward_pass(cell, prevA, prevC, curData)

        allForwards.append(forward)

        # next = np.random.choice(len(forward["Yhat"]), p=forward["Yhat"])
    
        prevA = forward["A_t"]
        prevC = forward["C_t"]

        label = np.zeros(10000, dtype=longdouble)
       
        label[int(X_b[i+1])] = 1.0

        labels.append(label)

    

    


    # print("BACKPROP!!!!!!!!\n\n\n\n")
    for i in range(0, len(allForwards)):
        # print(allForwards[i]["Yhat"].size)
        grad, loss = gradient(allForwards[i], cell, labels[i], allForwards[i]["Yhat"], lprimea, lprimec)
        lprimea = grad["dLda_prev"]
        lprimec = grad["dLdc_prev"]

        gradientTot["dLdw_f"] += grad["dLdw_f"]
        gradientTot["dLdw_c"] += grad["dLdw_c"]
        gradientTot["dLdw_o"] += grad["dLdw_o"]
        gradientTot["dLdw_i"] += grad["dLdw_i"]
        gradientTot["dLdw_y"] += grad["dLdw_y"]
        
        gradientTot["dLdb_f"] += grad["dLdb_f"]
        gradientTot["dLdb_c"] += grad["dLdb_c"]
        gradientTot["dLdb_o"] += grad["dLdb_o"]
        gradientTot["dLdb_i"] += grad["dLdb_i"]
        gradientTot["dLdb_y"] += grad["dLdb_y"]

        # print(grad["dLdw_f"], grad["dLdw_c"], grad["dLdw_o"], grad["dLdw_i"])

        lossTot += loss

    cell["W_f"] = cell["W_f"] - gradientTot["dLdw_f"] * lr
    # print(gradientTot["dLdw_f"])
    cell["W_c"] = cell["W_c"] - gradientTot["dLdw_c"] * lr
    cell["W_o"] = cell["W_o"] - gradientTot["dLdw_o"] * lr
    cell["W_i"] = cell["W_i"] - gradientTot["dLdw_i"] * lr
    cell["W_y"] = cell["W_y"] - gradientTot["dLdw_y"] * lr

    # cell["b_f"] = cell["b_f"] - gradientTot["dLdb_f"]/f_length * lr
    # cell["b_c"] = cell["b_c"] - gradientTot["dLdb_c"]/f_length * lr
    # cell["b_o"] = cell["b_o"] - gradientTot["dLdb_o"]/f_length * lr
    # cell["b_i"] = cell["b_i"] - gradientTot["dLdb_i"]/f_length * lr
    # cell["b_y"] = cell["b_y"] - gradientTot["dLdb_y"]/f_length * lr
    cell["b_f"] = cell["b_f"] - gradientTot["dLdb_f"] * lr
    cell["b_c"] = cell["b_c"] - gradientTot["dLdb_c"] * lr
    cell["b_o"] = cell["b_o"] - gradientTot["dLdb_o"] * lr
    cell["b_i"] = cell["b_i"] - gradientTot["dLdb_i"] * lr
    cell["b_y"] = cell["b_y"] - gradientTot["dLdb_y"] * lr

    return lossTot


In [285]:
def train_LSTM(input_size, hidden_size, dataset, batch_size):

    cell = initialize_cell(input_size, hidden_size)

    losses = []

    for data in dataset:
        # print(data)
        loss = descent(cell, data, input_size, hidden_size, 0.001, batch_size)
        print(loss)
        losses.append(loss)

    return cell
    

In [286]:

batch_size = 1
vocab_size = 10000

hidden_size = 128
input_size = vocab_size
# + hidden_size

dataset = load_emails(batch_size)

100%|██████████| 3332/3332 [00:02<00:00, 1596.19it/s]
100%|██████████| 3332/3332 [00:02<00:00, 1517.01it/s]


2676


In [287]:

ex_email = dataset[random.randint(0, 50)]



finalCell = train_LSTM(input_size, hidden_size, dataset, batch_size)

  return 1/(1+np.exp(-1*x))


[9.77189381e-05 9.90774001e-05 1.00632028e-04 ... 9.63852839e-05
 1.02588473e-04 9.82675712e-05]
[9.44961997e-05 9.64165526e-05 1.02821962e-04 ... 1.02119483e-04
 1.03263192e-04 9.55917704e-05]
[9.84303371e-05 9.65716920e-05 9.63237568e-05 ... 1.14025055e-04
 1.10009974e-04 1.02876787e-04]
[1.02477683e-04 9.76625507e-05 9.74235832e-05 ... 1.16975727e-04
 1.10792189e-04 1.01206920e-04]
[1.03228282e-04 9.82606946e-05 9.74637059e-05 ... 1.17925766e-04
 1.11426555e-04 9.98534008e-05]
[1.03225172e-04 9.83934999e-05 9.68869459e-05 ... 1.17279304e-04
 1.10925598e-04 9.87789747e-05]
[1.04087092e-04 9.97515154e-05 9.86972536e-05 ... 1.20807434e-04
 1.07840725e-04 9.68281125e-05]
[1.04310957e-04 1.01154836e-04 9.95912550e-05 ... 1.21566923e-04
 1.09518972e-04 9.55706519e-05]
[1.02878085e-04 1.04434266e-04 1.01811094e-04 ... 1.20312805e-04
 1.09754699e-04 9.02008883e-05]
[1.03724609e-04 1.03070518e-04 1.02842973e-04 ... 1.19231213e-04
 1.08769059e-04 9.23026326e-05]
[1.05324707e-04 9.55551364e-05

KeyboardInterrupt: 