In [432]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, List
from tqdm import tqdm
import os
import json
import random
from numpy import longdouble

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('spm_model.model')

text = "This is a sample sentence to tokenize."
tokens = sp.encode_as_pieces(text)
print(tokens)
print(sp.encode_as_ids(text))

['▁This', '▁is', '▁a', '▁sa', 'mp', 'le', '▁sentence', '▁to', '▁to', 'ken', 'ize', '.']
[102, 22, 14, 1693, 2265, 421, 7627, 6, 6, 4296, 3055, 3]


In [433]:
def find_max_sequence_length():
    maxlen = 0
    prefix = './PhishingEmails/'  # Adjust this to your file path
    
    for file in tqdm(os.listdir(prefix)):
        with open(prefix + file) as jsonFile:
            jsonStr = jsonFile.read()
            email_dict = json.loads(jsonStr)
            setupData = sp.encode_as_ids(
                email_dict['email_subject'].strip() + ". " + email_dict['email_body'].strip()
            )

            if(len(setupData) < 725):
                maxlen = max(maxlen, len(setupData))

    return maxlen

In [434]:
load_emails(16)

100%|██████████| 3332/3332 [00:02<00:00, 1512.80it/s]
  0%|          | 0/3332 [00:00<?, ?it/s]


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
def load_emails(batch_size):

    max_sequence_length = find_max_sequence_length()
    dataset = []
    batch_counter = 0
    batch = []
    ctr = 0
    
    prefix = './PhishingEmails/' #change this to the prefile thing such as './celebA'

    for file in tqdm(os.listdir(prefix)):
        
        with open(prefix + file) as jsonFile:
            jsonStr = jsonFile.read() #json file as a string
            email_dict = json.loads(jsonStr) #converts to dictionary
        
        setupData = sp.encode_as_ids(email_dict['email_subject'].strip() + ". " + email_dict['email_body'].strip())
        pad_length = max_sequence_length - len(setupData)

        if(len(setupData) <= max_sequence_length):
            ctr+=1
            if pad_length > 0:
                setupData += [-1] * pad_length

            if batch_counter < batch_size:
                # setupData = setupData[np.isfinite(setupData)]
                batch.append(setupData)
                batch_counter += 1
            else:
                dataset.append(batch)
                batch = []
                batch_counter = 0
    toRet = np.array(dataset, dtype=longdouble)
    toRet = toRet[~np.isnan(toRet).any(axis=1)]
    return toRet

In [None]:
#Initialize helpful functions for math
def sigmoid(x: np.ndarray):
    return 1/(1+np.exp(-1*x))

def sigmoid_derivative(x: np.ndarray):
    return sigmoid(x)*(1-sigmoid(x))

def tanh(x: np.ndarray):
    return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))
    
def tanh_derivative(x:np.ndarray):
    return 1-np.square(tanh(x))

def softmax(x: np.ndarray):
    return np.exp(x)/np.sum(np.exp(x))

def cross_entropy(yhat, y, epsilon=1e-10):
    yhat_clipped = np.clip(yhat, epsilon, 1 - epsilon)  # Clip yhat to avoid zeros
    return -np.sum(y * np.log(yhat_clipped))

In [None]:
#initializes the weights of the network
def initialize_cell(input_size, hidden_size):

    print(input_size)

    cell = {}

    cell["W_i"] = np.hstack((np.random.normal(0, 0.01, (hidden_size, hidden_size)), np.random.normal(0, 0.01, (hidden_size, input_size)))) #input gate weights
    cell["W_f"] = np.hstack((np.random.normal(0,0.01,(hidden_size,hidden_size)), np.random.normal(0,0.01,(hidden_size,input_size)))) #forget gate weights
    cell["W_c"] = np.hstack((np.random.normal(0,0.01,(hidden_size,hidden_size)), np.random.normal(0,0.01,(hidden_size,input_size)))) #candidate gate weights
    cell["W_o"] = np.hstack((np.random.normal(0,0.01,(hidden_size,hidden_size)), np.random.normal(0,0.01,(hidden_size,input_size)))) #output gate weights
    cell["W_y"] = (np.random.normal(0,1,(hidden_size,hidden_size)))#final gate weights

    #not sure if the biases need to be 3d...
    cell["b_i"] = np.zeros(hidden_size) #input gate biases
    cell["b_f"] = np.zeros(hidden_size) #forget gate biases
    cell["b_c"] = np.zeros(hidden_size) #candidate gate biases
    cell["b_o"] = np.zeros(hidden_size) #output gate biases
    cell["b_y"] = np.zeros(hidden_size) #final gate biases

    return cell

In [None]:
#forward pass of all gates
def forward_pass(cell, prevA, prevC, X):

    # print(X, "/n______-")
    
    input = np.hstack((prevA, X))

    forward = {}

    forward["F"] = sigmoid(cell["W_f"].dot(input.T) + cell["b_i"])
    
    forward["C"] = tanh(cell["W_c"].dot(input.T) + cell["b_c"])

    forward["I"] = sigmoid(cell["W_i"].dot(input.T) + cell["b_i"])

    forward["O"] = sigmoid(cell["W_o"].dot(input.T) + cell["b_o"])


    forward["prevA"] = prevA
    forward["prevC"] = prevC
    forward["C_t"] = (forward["prevC"] * forward["F"]) + (forward["I"] * forward["C"])
    forward["A_t"] = forward["O"] * tanh(forward["C_t"])

    forward["Z_t"] = cell["W_y"].dot(forward["C_t"] * forward["O"]) + cell["b_y"]
    # print(forward["Z_t"], "  Z")
    forward["Yhat"] = softmax(forward["Z_t"])

    return forward

In [None]:
def gradient(forward, cell, X, Y, lprimea, lprimec):

    grads = {}

    # print("BackProp")
    input = np.hstack((forward["prevA"], X))

    dldA_t = np.transpose(cell["W_y"]).dot(forward["Yhat"]-Y) + lprimea
    dldC_t = lprimec + (forward["O"] * tanh_derivative(forward["C_t"])) * dldA_t 

    TdLdw_f = (dldC_t * forward["prevC"] * forward["F"]*(1-forward["F"])) 
    TdLdw_c = (dldC_t * forward["I"])
    TdLdw_o = (dldA_t * tanh(forward["C_t"]) * forward["O"] * (1-forward["O"]))
    TdLdw_i = (dldC_t * forward["C"] * forward["I"] * (1-forward["I"]))

    # np.atleast2d(a).T

    woa = cell["W_o"][:, :128]
    wca = cell["W_c"][:, :128]
    wia = cell["W_i"][:, :128]
    wfa = cell["W_f"][:, :128]


    grads["dLda_prev"] = woa.T.dot(TdLdw_o) + wca.T.dot(TdLdw_c) + wia.T.dot(TdLdw_i) + wfa.T.dot(TdLdw_f)
    grads["dLdc_prev"] = (lprimec + forward["O"] * 1-np.square(tanh(forward["C_t"])) * dldA_t) * forward["F"]


    #not sure which side to transpose.
    grads["dLdw_f"] = np.atleast_2d(TdLdw_f).T.dot(np.atleast_2d(input))
    grads["dLdw_c"] = np.atleast_2d(TdLdw_c).T.dot(np.atleast_2d(input))
    grads["dLdw_o"] = np.atleast_2d(TdLdw_o).T.dot(np.atleast_2d(input))
    grads["dLdw_i"] = np.atleast_2d(TdLdw_i).T.dot(np.atleast_2d(input))
    grads["dLdw_y"] = (forward["Yhat"] - Y).T.dot(np.transpose(forward["A_t"]))

    grads["dLdb_f"] = 1
    grads["dLdb_c"] = 1
    grads["dLdb_o"] = 1
    grads["dLdb_i"] = 1
    grads["dLdb_y"] = 1


    
    loss = cross_entropy(forward["Yhat"], Y)
    print(loss)

    return grads, loss

In [None]:
def descent(cell, X, input_size, hidden_size, lr, batch_size):
   
    

    # for b in range(0, batch_size):

    prevA = np.zeros(hidden_size)
    prevC = np.zeros(hidden_size)

    gradientTot = {}
    lossTot = 0

    allForwards = []
    labels = []

    lprimea = np.zeros(hidden_size)
    lprimec = np.zeros(hidden_size)

                            # np.hstack((np.random.normal(0,1,(hidden_size,hidden_size)), np.random.normal(0,1,(hidden_size,input_size))))
    gradientTot["dLdw_f"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_c"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_o"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_i"] = np.hstack((np.zeros((hidden_size,hidden_size)), np.zeros((hidden_size,input_size))))
    gradientTot["dLdw_y"] = np.zeros((hidden_size,hidden_size))
    gradientTot["dLdb_f"] = np.zeros(hidden_size)
    gradientTot["dLdb_c"] = np.zeros(hidden_size)
    gradientTot["dLdb_o"] = np.zeros(hidden_size)
    gradientTot["dLdb_i"] = np.zeros(hidden_size)
    gradientTot["dLdb_y"] = np.zeros(hidden_size)

    X_b = X[0]

    for i in range(1, len(X_b)-1):  #tqdm will create a loading bar for your loop
    
        curData = X_b[0:i]

        length = len(curData)

        pad_length = input_size - length

        if pad_length > 0:
            curData = np.concatenate((curData,np.array([0] * pad_length)))
        
        forward = forward_pass(cell, prevA, prevC, curData)

        allForwards.append(forward)

        prevA = forward["A_t"]
        prevC = forward["C_t"]

        curData[length] = X_b[i+1]

        labels.append(curData)


    for i in range(0, len(allForwards)):
      


        grad, loss = gradient(allForwards[i], cell, labels[i], allForwards[i]["Yhat"], lprimea, lprimec)
        lprimea = grad["dLda_prev"]
        lprimec = grad["dLdc_prev"]

        gradientTot["dLdw_f"] += grad["dLdw_f"]
        gradientTot["dLdw_c"] += grad["dLdw_c"]
        gradientTot["dLdw_o"] += grad["dLdw_o"]
        gradientTot["dLdw_i"] += grad["dLdw_i"]
        gradientTot["dLdw_y"] += grad["dLdw_y"]
        
        gradientTot["dLdb_f"] += grad["dLdb_f"]
        gradientTot["dLdb_c"] += grad["dLdb_c"]
        gradientTot["dLdb_o"] += grad["dLdb_o"]
        gradientTot["dLdb_i"] += grad["dLdb_i"]
        gradientTot["dLdb_y"] += grad["dLdb_y"]

        # print(grad["dLdw_f"], grad["dLdw_c"], grad["dLdw_o"], grad["dLdw_i"])

        lossTot += loss
    

    cell["W_f"] = cell["W_f"] - gradientTot["dLdw_f"] * lr
    cell["W_c"] = cell["W_c"] - gradientTot["dLdw_c"] * lr
    cell["W_o"] = cell["W_o"] - gradientTot["dLdw_o"] * lr
    cell["W_i"] = cell["W_i"] - gradientTot["dLdw_i"] * lr
    cell["W_y"] = cell["W_y"] - gradientTot["dLdw_y"] * lr

    cell["b_f"] = cell["b_f"] - gradientTot["dLdb_f"] * lr
    cell["b_c"] = cell["b_c"] - gradientTot["dLdb_c"] * lr
    cell["b_o"] = cell["b_o"] - gradientTot["dLdb_o"] * lr
    cell["b_i"] = cell["b_i"] - gradientTot["dLdb_i"] * lr
    cell["b_y"] = cell["b_y"] - gradientTot["dLdb_y"] * lr

    return lossTot


In [None]:
def train_LSTM(input_size, hidden_size, dataset, batch_size):

    cell = initialize_cell(input_size, hidden_size)

    losses = []

    for data in dataset:
        # print(data)
        loss = descent(cell, data, input_size, hidden_size, 0.0000001, batch_size)
        print(loss)
        losses.append(loss)

    return cell
    

In [None]:

batch_size = 1
vocab_size = 724

hidden_size = 128
input_size = hidden_size + vocab_size

dataset = load_emails(batch_size)

  0%|          | 0/3332 [00:00<?, ?it/s]

100%|██████████| 3332/3332 [00:02<00:00, 1651.86it/s]
100%|██████████| 3332/3332 [00:02<00:00, 1645.81it/s]


In [418]:

ex_email = dataset[random.randint(0, 50)]



finalCell = train_LSTM(input_size, hidden_size, dataset, batch_size)

852


  return np.exp(x)/np.sum(np.exp(x))
  return np.exp(x)/np.sum(np.exp(x))
  return 1/(1+np.exp(-1*x))
  return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))
  return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))


0.876121606335744
1.4331230783156856
0.38752456947956376
0.37423282399255664
0.2131663227673421
0.0008166267040099707
0.4497566096131024
0.789059926341296
0.4115581851074741
0.016417552868998785
0.405844054846491
0.8810087593251433
0.07115235950728982
0.5883263456794929
0.36866683539906975
0.027959955399407967
0.0009139286299569124
0.6463220131131662
0.42174844207763196
7.707939338415451e-09
8.882065671790448e-07
1.353630758834512e-08
0.5806968824756492
0.27073174663353894
0.07352383862880611
0.30212141784940105
0.01965305166806664
0.0007884166475583399
0.01928658779216722
1.1440148075844668e-08
0.23898164259542992
0.14081254101665183
0.013285247371717067
0.1271080638736051
0.7757456081675826
1.0025727927915076e-10
0.37507155784594176
0.8637299744597082
1.9359015282914037e-08
0.6386361331522326
0.01203311738601679
3.6081493216157306e-06
2.7156436439857272e-05
0.0001138014767083307
0.00043263349625422855
8.973828415458993e-06
0.0008438769705098503
0.0009053876082199869
0.081056542567667

KeyboardInterrupt: 