In [1]:
!pip install flax optax unidecode



In [53]:
import re
import pandas as pd
import flax.linen as nn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from jax import numpy as jnp, random, value_and_grad, jit, tree_map
from jax.typing import ArrayLike
from typing import Callable
from pprint import pprint
from unidecode import unidecode
from tqdm import tqdm
# Select matplotlib's "ggplot" style sheet for plots.
plt.style.use("ggplot")

In [3]:
# Download the dataset archive from Google Drive and store it locally as data.zip.
!wget 'https://drive.google.com/uc?export=view&id=154lbEOzdKaKF0hGonuwn-Ff3-YIfcD1x' -O data.zip

--2023-08-18 22:10:49--  https://drive.google.com/uc?export=view&id=154lbEOzdKaKF0hGonuwn-Ff3-YIfcD1x
Resolving drive.google.com (drive.google.com)... 74.125.200.138, 74.125.200.100, 74.125.200.113, ...
Connecting to drive.google.com (drive.google.com)|74.125.200.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-4g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/7etbnnj7hmf752pbdo05ne900jod9td8/1692396600000/16848862265445619282/*/154lbEOzdKaKF0hGonuwn-Ff3-YIfcD1x?e=view&uuid=fa26bba5-57f2-47d6-9ccb-d17249a3a6e4 [following]
--2023-08-18 22:10:54--  https://doc-08-4g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/7etbnnj7hmf752pbdo05ne900jod9td8/1692396600000/16848862265445619282/*/154lbEOzdKaKF0hGonuwn-Ff3-YIfcD1x?e=view&uuid=fa26bba5-57f2-47d6-9ccb-d17249a3a6e4
Resolving doc-08-4g-docs.googleusercontent.com (doc-08-4g-docs.googleusercontent.com)... 172.253.118.132, 2404:6800:4003:c

In [4]:
![[ -f Phishing_Email.csv ]] && rm *.csv
!unzip data.zip

Archive:  data.zip
  inflating: Phishing_Email.csv      


In [5]:
# Load the e-mail dataset, using the CSV's first column as the row index,
# then shorten the two column names to `text` and `label`.
data = pd.read_csv("Phishing_Email.csv", index_col=0)
data = data.rename(columns={"Email Text": "text", "Email Type": "label"})
data.head()

Unnamed: 0,text,label
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [6]:
# Sample Spanish paragraph containing accented characters, pretty-printed for inspection.
x = "En 1905, cuando era un joven físico desconocido, empleado en la Oficina de Patentes de Berna, publicó su teoría de la relatividad especial. En ella incorporó, en un marco teórico simple fundamentado en postulados físicos sencillos, conceptos y fenómenos estudiados antes por Henri Poincaré y Hendrik Lorentz. Como una consecuencia lógica de esta teoría, dedujo la ecuación de la física más conocida a nivel popular: la equivalencia masa-energía, E=mc². Ese año, publicó otros trabajos que sentarían algunas de las bases de la física estadística y de la mecánica cuántica."
pprint(x)

('En 1905, cuando era un joven físico desconocido, empleado en la Oficina de '
 'Patentes de Berna, publicó su teoría de la relatividad especial. En ella '
 'incorporó, en un marco teórico simple fundamentado en postulados físicos '
 'sencillos, conceptos y fenómenos estudiados antes por Henri Poincaré y '
 'Hendrik Lorentz. Como una consecuencia lógica de esta teoría, dedujo la '
 'ecuación de la física más conocida a nivel popular: la equivalencia '
 'masa-energía, E=mc². Ese año, publicó otros trabajos que sentarían algunas '
 'de las bases de la física estadística y de la mecánica cuántica.')


In [7]:
# Transliterate the sample paragraph to plain ASCII with unidecode and show the result.
norm_text = unidecode(x)
pprint(norm_text)

('En 1905, cuando era un joven fisico desconocido, empleado en la Oficina de '
 'Patentes de Berna, publico su teoria de la relatividad especial. En ella '
 'incorporo, en un marco teorico simple fundamentado en postulados fisicos '
 'sencillos, conceptos y fenomenos estudiados antes por Henri Poincare y '
 'Hendrik Lorentz. Como una consecuencia logica de esta teoria, dedujo la '
 'ecuacion de la fisica mas conocida a nivel popular: la equivalencia '
 'masa-energia, E=mc2. Ese ano, publico otros trabajos que sentarian algunas '
 'de las bases de la fisica estadistica y de la mecanica cuantica.')


In [8]:
# Matches any single character that is not a lowercase ASCII letter.
pat_schars = re.compile(r"[^a-z]")
# Matches a run of one or more whitespace characters.
pat_spaces = re.compile(r"\s+")

In [9]:
def preprocess(doc: str) -> str:
    """Normalize a raw document into space-separated lowercase ASCII words.

    Steps, in order:
      1. Transliterate the text to ASCII with ``unidecode``.
      2. Lowercase it.
      3. Replace every character outside ``a-z`` with a space.
      4. Keep only words longer than three characters.

    Transliteration runs *before* lowercasing because ``unidecode`` can emit
    uppercase letters for characters that ``str.lower`` leaves untouched
    (for example, CJK text is romanized with capitalized syllables).
    Lowercasing first would let those capitals be erased by the ``a-z``
    filter, mangling the transliterated words.

    Args:
        doc: Raw text of one document.

    Returns:
        The surviving words joined by single spaces; an empty string when
        no word is longer than three characters.
    """
    ascii_text = unidecode(doc)
    lower_text = ascii_text.lower()
    letters_only = pat_schars.sub(" ", lower_text)
    # After the substitution only letters and spaces remain, so a plain
    # whitespace split yields the words without any empty tokens.
    words = letters_only.split()
    return " ".join(word for word in words if len(word) > 3)

In [10]:
# Try the cleaning function on the first e-mail of the dataset.
preprocess(data.text.iloc[0])

'disc uniformitarianism lang dick hudson observations aughter vocative very thought provoking sure that fair attribute this sons being treated like senior relatives thing normally brother this more than aughter hard imagine natural class comprising senior relatives excluding brother another there seem differences here imagining distinction that there seems that senior relative terms used wider variety contexts calling from distance someone attention hence beginning utterance whereas seems more natural utterances like hand that than ones like help although perhaps these latter ones completely impossible alexis'

In [11]:
# Number of rows and columns in the loaded dataset.
data.shape

(18650, 2)

In [12]:
# Count how many e-mails carry each label.
data.label.value_counts()

Safe Email        11322
Phishing Email     7328
Name: label, dtype: int64

In [13]:
# Integer encoding of the two classes: safe -> 0, phishing -> 1.
label2int = {"Safe Email": 0, "Phishing Email": 1}

In [14]:
# Discard rows with missing values, then clean every e-mail body and
# replace each textual label with its integer code.
preprocess_data = data.dropna().assign(
    text=lambda frame: frame["text"].apply(preprocess),
    label=lambda frame: frame["label"].map(label2int),
)

In [15]:
# Preview the cleaned text alongside the integer-encoded labels.
preprocess_data.head()

Unnamed: 0,text,label
0,disc uniformitarianism lang dick hudson observ...,0
1,other side galicismos galicismo spanish term w...,0
2,equistar deal tickets still available assist r...,0
3,hello your horny dream about very open minded ...,1
4,software incredibly prices lower drapery seven...,1


In [16]:
# Fit a TF-IDF vectorizer on the cleaned e-mails, capped at 500 features.
vect = TfidfVectorizer(max_features=500).fit(preprocess_data.text)

In [17]:
# Vectorize every cleaned e-mail and materialize the result as an array via `.toarray()`.
features = vect.transform(preprocess_data.text).toarray()

In [18]:
# Display the TF-IDF feature matrix.
features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
# Column-wise sums: total TF-IDF weight of each feature across all e-mails.
features.sum(axis=0)

array([ 108.73272176,  416.9052205 ,  109.4684337 ,   47.29327349,
         60.34047005,  129.1482436 ,  155.30900295,   57.58548145,
         89.52434313,  242.88377102,   90.33632876,  185.69170595,
        143.44133377,   70.32829442,   92.52792269,   99.5008633 ,
        297.36175693,  103.99349909,   98.47725298,   84.90341628,
         86.37365718,  119.07540596,  143.33243668,  101.90835834,
         95.08118291,   65.85364962,  113.65063835,   83.76629967,
         50.84278047,  104.50949125,  122.91807188,   50.5977273 ,
        230.66340053,  199.91619138,   97.78662823,  142.87730546,
        197.49080338,  350.00869624,  178.57890675,  162.3248913 ,
         98.29778885,  182.82785036,  290.45212948,  136.11350853,
        126.50528291,   42.43702162,  145.22666495,  139.92107815,
         55.15601684,  288.67402568,  128.91264047,  265.75215991,
         71.85196197,   94.79376688,  114.9806908 ,  121.2575295 ,
         97.83758472,   97.56829954,   49.16836382,  175.89479

In [20]:
class MyNet(nn.Module):
    """Small fully connected binary classifier.

    A hidden ``Dense`` layer with 8 units feeds a single-unit ``Dense`` output
    layer; a sigmoid is applied after each of the two layers.
    """

    def setup(self: "MyNet"):
        """Declare the hidden and output dense layers."""
        self.layer1 = nn.Dense(features=8)
        self.layer2 = nn.Dense(features=1)

    def __call__(self: "MyNet", x: ArrayLike) -> ArrayLike:
        """Run the forward pass on ``x`` and return the sigmoid-activated output."""
        hidden = self.layer1(x)
        hidden = nn.sigmoid(hidden)
        out = self.layer2(hidden)
        return nn.sigmoid(out)

In [34]:
# Convert the inputs to JAX arrays: features as float32, labels reshaped
# into a single column so they line up with the model's one-unit output.
features = jnp.array(features, dtype=jnp.float32)
labels = jnp.array(preprocess_data.label).reshape(-1, 1)

In [35]:
# Inspect the label column vector.
labels

Array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]], dtype=int32)

In [36]:
# Toy arrays for a broadcasting demo: `x` is reshaped into a column, `y` stays flat.
x = jnp.linspace(0, 1, 10).reshape(-1, 1)
y = jnp.linspace(0, 1, 10)

In [37]:
# Shape of the column array.
x.shape

(10, 1)

In [38]:
# Shape of the flat array.
y.shape

(10,)

In [39]:
# Multiplying a column by a flat array broadcasts to a 2-D grid instead of an
# element-wise product. NOTE(review): presumably the reason `labels` is reshaped
# to a column before being compared with the model output — confirm.
x * y

Array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.01234568, 0.02469136, 0.03703704, 0.04938272,
        0.0617284 , 0.07407407, 0.08641975, 0.09876543, 0.11111111],
       [0.        , 0.02469136, 0.04938272, 0.07407407, 0.09876543,
        0.1234568 , 0.14814815, 0.17283951, 0.19753087, 0.22222222],
       [0.        , 0.03703704, 0.07407407, 0.11111112, 0.14814815,
        0.1851852 , 0.22222224, 0.25925928, 0.2962963 , 0.33333334],
       [0.        , 0.04938272, 0.09876543, 0.14814815, 0.19753087,
        0.2469136 , 0.2962963 , 0.34567901, 0.39506173, 0.44444445],
       [0.        , 0.0617284 , 0.1234568 , 0.1851852 , 0.2469136 ,
        0.308642  , 0.3703704 , 0.4320988 , 0.4938272 , 0.5555556 ],
       [0.        , 0.07407407, 0.14814815, 0.22222224, 0.2962963 ,
        0.3703704 , 0.44444448, 0.51851857, 0.5925926 , 0.6666667 ],
       [0.        , 0.08641975, 0.1728395

In [40]:
# Fixed PRNG key (seed 1211) for reproducible initialization, plus the network instance.
key = random.PRNGKey(1211)
model = MyNet()

In [41]:
# Print a per-layer summary of the model, using the first 32 rows as example input.
print(model.tabulate(key, features[:32]))


[3m                                MyNet Summary                                 [0m
┏━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mpath  [0m[1m [0m┃[1m [0m[1mmodule[0m[1m [0m┃[1m [0m[1minputs         [0m[1m [0m┃[1m [0m[1moutputs      [0m[1m [0m┃[1m [0m[1mparams                [0m[1m [0m┃
┡━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
│        │ MyNet  │ [2mfloat32[0m[32,500] │ [2mfloat32[0m[32,1] │                        │
├────────┼────────┼─────────────────┼───────────────┼────────────────────────┤
│ layer1 │ Dense  │ [2mfloat32[0m[32,500] │ [2mfloat32[0m[32,8] │ bias: [2mfloat32[0m[8]       │
│        │        │                 │               │ kernel: [2mfloat32[0m[500,8] │
│        │        │                 │               │                        │
│        │        │                 │               │ [1m4,008 [0m[1;2m(16.0 KB)[0m        │
├────────┼─────

In [42]:
def binary_crossentropy(y: ArrayLike, y_pred: ArrayLike, eps: float = 1e-7) -> ArrayLike:
    """Mean binary cross-entropy between targets and predicted probabilities.

    Args:
        y: Target values (0 or 1), broadcastable against ``y_pred``.
        y_pred: Predicted probabilities of the positive class.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking
            logarithms.

    Returns:
        Scalar array holding the loss averaged over all elements.
    """
    # A prediction that saturates to exactly 0 or 1 would give log(0) = -inf,
    # and the matching `0 * -inf` term turns the whole mean into NaN.
    y_prob = jnp.clip(y_pred, eps, 1 - eps)
    return - (
        y * jnp.log(y_prob) +
        (1 - y) * jnp.log(1 - y_prob)
    ).mean()

In [47]:
def get_loss(
    features: ArrayLike,
    labels: ArrayLike,
    model: nn.Module
    ) -> Callable:
    """Build a jit-compiled loss that depends on the model parameters only.

    The dataset and the model are captured by closure, so the returned
    callable takes just ``params`` and yields the binary cross-entropy of
    the model's predictions on ``features`` against ``labels``.

    Args:
        features: Inputs passed to ``model.apply``.
        labels: Targets compared against the predictions.
        model: Module whose ``apply`` maps ``(params, features)`` to predictions.

    Returns:
        A jitted function ``loss(params)`` returning the scalar loss.
    """
    def loss(params):
        predictions = model.apply(params, features)
        return binary_crossentropy(labels, predictions)

    return jit(loss)

In [48]:
# Bind the full dataset and the model into the loss, then wrap it so that a
# single call returns both the loss value and its gradient w.r.t. the parameters.
loss_fn = get_loss(features, labels, model)
grad_fn = value_and_grad(loss_fn)

In [49]:
# Initialize the parameters from the PRNG key, using the first 32 rows as example input.
params = model.init(key, features[:32])

In [50]:
# Smoke test: evaluate the loss and its gradients at the current parameters.
grad_fn(params)

(Array(0.76836544, dtype=float32),
 {'params': {'layer1': {'bias': Array([ 0.0102425 ,  0.00417563, -0.00650726,  0.02615878, -0.01032241,
            0.03366719, -0.02026459,  0.01346934], dtype=float32),
    'kernel': Array([[ 7.4284646e-05,  3.0275551e-05, -4.7192512e-05, ...,
             2.4417526e-04, -1.4694125e-04,  9.7677068e-05],
           [ 3.1963078e-04,  1.3022321e-04, -2.0302500e-04, ...,
             1.0505173e-03, -6.3225825e-04,  4.2027541e-04],
           [ 6.6196735e-05,  2.6997493e-05, -4.2071464e-05, ...,
             2.1765745e-04, -1.3102515e-04,  8.7064516e-05],
           ...,
           [ 6.9967493e-05,  2.8509496e-05, -4.4440589e-05, ...,
             2.2992164e-04, -1.3838179e-04,  9.1981878e-05],
           [ 5.7827383e-05,  2.3568957e-05, -3.6732050e-05, ...,
             1.9006434e-04, -1.1441826e-04,  7.6013639e-05],
           [-3.4089026e-04, -1.3900347e-04,  2.1636365e-04, ...,
            -1.1199651e-03,  6.7450467e-04, -4.4823688e-04]], dtype=float

In [58]:
# Training hyperparameters.
N_ITERS = 1000  # number of gradient-descent iterations
LR = 0.01  # learning rate (step size of each update)

In [59]:
def _sgd_update(w, g):
    """Move one parameter leaf a step of size LR against its gradient."""
    return w - LR * g


# Full-batch gradient descent: every iteration evaluates the loss and gradients
# on the whole dataset, updates all parameter leaves, and reports progress.
# NOTE(review): newer JAX releases deprecate the top-level `jax.tree_map` alias
# in favour of `jax.tree_util.tree_map` — confirm against the installed version.
pbar = tqdm(range(N_ITERS))
for i in pbar:
    loss_val, grads = grad_fn(params)
    params = tree_map(_sgd_update, params, grads)
    pbar.set_description(f"Iter: {i:<5}Loss: {loss_val}")

Iter: 999  Loss: 0.6680302619934082: 100%|██████████| 1000/1000 [00:08<00:00, 123.17it/s]


In [62]:
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 class labels.
# The dtype is spelled out as 32-bit because the generic "int" asks for int64,
# which JAX (64-bit types are disabled by default) truncates to int32 with a warning.
y_pred = (model.apply(params, features) > .5).astype(jnp.int32)

  y_pred = (model.apply(params, features) > .5).astype("int")


In [63]:
# Inspect the predicted class labels.
y_pred

Array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

In [65]:
# Accuracy: fraction of e-mails whose thresholded prediction matches the label.
# NOTE(review): measured on the same data the parameters were fitted on; no
# held-out split is visible in this notebook, so it is not a generalization estimate.
(y_pred == labels).mean()

Array(0.607599, dtype=float32)