In [1]:
# !pip install opendatasets --upgrade --quiet
# import opendatasets as od
# dataset = 'https://www.kaggle.com/datasets/wcukierski/enron-email-dataset'
# od.download(dataset)

In [4]:
def get_text_from_email(msg):
    '''Return the text/plain content of an email message as one string.

    Every part yielded by ``msg.walk()`` is inspected; parts whose content
    type is not ``text/plain`` are skipped. The payload of each kept part is
    taken as-is from ``get_payload()`` (no transfer-decoding is requested) and
    the pieces are concatenated with no separator.

    Args:
        msg: an ``email.message.Message`` (or compatible) object.

    Returns:
        The concatenated text/plain payloads, or ``''`` if there are none.
    '''
    return ''.join(
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    )

In [5]:
import pandas as pd
# Load the Enron e-mail CSV from the Kaggle input directory.
emails_csv_path = '/kaggle/input/enron-email-dataset/emails.csv'
df = pd.read_csv(emails_csv_path)

In [7]:
import sys, email
# Parse each raw message string and keep only its plain-text body.
# NOTE: this overwrites df['message'] in place, so re-running the cell feeds
# the already-extracted bodies back through the e-mail parser.
df['message'] = [
    get_text_from_email(email.message_from_string(raw_message))
    for raw_message in df['message']
]

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np

class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention then a two-layer feed-forward net.

    Each sub-layer is followed by dropout, a residual connection, and layer
    normalization (normalization is applied after the residual add).

    Args:
        embed_dim: size of the last axis of the input; also used as the
            per-head ``key_dim`` of the attention layer and as the output
            width of the feed-forward net.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward net.
        rate: dropout rate applied after attention and after the feed-forward net.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``), forwarded to the base class so the layer can be
            rebuilt from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: tensor whose last axis has size ``embed_dim`` (required by
                the residual additions below).
            training: forwarded to the dropout layers. Defaults to ``None`` so
                the method can be invoked without the flag.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

def create_model(vocab_size, embed_dim, num_heads, ff_dim, maxlen):
    """Build a token-level model: embedding -> one TransformerBlock -> Dense.

    Args:
        vocab_size: number of token ids; sizes both the embedding table and
            the output layer.
        embed_dim: embedding width, also passed to the TransformerBlock.
        num_heads: number of attention heads in the TransformerBlock.
        ff_dim: hidden width of the TransformerBlock's feed-forward net.
        maxlen: fixed length of each input sequence.

    Returns:
        An uncompiled Keras ``Model`` that maps a ``(maxlen,)`` sequence to a
        ``vocab_size``-wide output per position. The final Dense layer has no
        activation.
    """
    token_ids = Input(shape=(maxlen,))
    embedded = Embedding(vocab_size, embed_dim)(token_ids)
    encoded = TransformerBlock(embed_dim, num_heads, ff_dim)(embedded)
    logits = Dense(vocab_size)(encoded)
    return Model(inputs=token_ids, outputs=logits)

2024-04-26 06:17:41.425077: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 06:17:41.425206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 06:17:41.557067: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# Fit a word-level tokenizer on the message bodies and convert them to id sequences.
texts = df.message.values
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
# +1 because word_index starts at 1; id 0 is never assigned to a word and is
# the value pad_sequences fills with.
vocab_size = len(tokenizer.word_index) + 1
seqs = tokenizer.texts_to_sequences(texts)
# Take the padded width from the tokenized sequences themselves. Counting
# text.split(' ') pieces does not match the tokenizer's own splitting (it also
# breaks on punctuation, newlines and tabs, and drops empty pieces), so that
# width could be too small (pad_sequences would then silently truncate) or
# needlessly large.
maxlen = max(len(seq) for seq in seqs)
seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=maxlen, padding='post')

def mask_input(seqs, mask_prob=0.15, mask_token_id=32365, pad_token_id=0, rng=None):
    """Randomly replace token ids with a mask id for masked-token training.

    Args:
        seqs: integer numpy array of token ids (any shape).
        mask_prob: probability that a maskable position is replaced.
        mask_token_id: id written into masked positions. The default keeps the
            value that was previously hard-coded here.
            NOTE(review): nothing in this function reserves that id, so it may
            coincide with a real vocabulary entry; confirm with the tokenizer.
        pad_token_id: id marking padding; positions holding it are never
            masked. Pass ``None`` to allow masking every position.
        rng: optional ``numpy.random.Generator`` for a reproducible draw. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        ``(masked_seqs, seqs)``: a new array with the masked ids, and the
        original ``seqs`` object unchanged (to be used as labels).
    """
    if rng is None:
        draws = np.random.rand(*seqs.shape)
    else:
        draws = rng.random(seqs.shape)
    random_masks = draws < mask_prob
    if pad_token_id is not None:
        # Leave padding alone: masking it produces no useful training signal.
        random_masks &= seqs != pad_token_id
    masked_seqs = np.where(random_masks, mask_token_id, seqs)
    return masked_seqs, seqs

# Build the training pair: randomly corrupted sequences as inputs, the
# original sequences as targets.
masked_inputs, labels = mask_input(seqs)
# Request the first GPU for building and training the model.
with tf.device('/gpu:0'):
    model = create_model(vocab_size, embed_dim=32, num_heads=2, ff_dim=32, maxlen=maxlen)
    # from_logits=True because the model's final Dense layer has no activation.
    model.compile(optimizer="adam", loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    # NOTE(review): no sample_weight is passed, so the loss covers every
    # position (masked, unmasked and padded alike) — confirm this is intended.
    model.fit(masked_inputs, labels, epochs=10, batch_size=2)

Epoch 1/10


I0000 00:00:1714112277.460848     198 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
