In [1]:
import pandas as pd
# Read the Enron emails CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/enron-email-dataset/emails.csv')

In [4]:
def get_text_from_email(msg):
    '''Return the concatenated payloads of every ``text/plain`` part of ``msg``.

    ``msg`` is walked with ``msg.walk()``; parts with any other content type
    are ignored. The payloads are joined with no separator, and an empty
    string is returned when no ``text/plain`` part is found.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

In [6]:
import sys, email
# Parse each raw email string and keep only its text/plain body. Each message is
# parsed and reduced to its body before the next one is read, so the full list
# of parsed Message objects is never held in memory at once.
# NOTE: this overwrites the column it reads from, so re-running the cell would
# feed the already-extracted bodies back through the parser; reload `df` first.
df['message'] = [
    get_text_from_email(email.message_from_string(raw_text))
    for raw_text in df['message']
]

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np

class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: multi-head self-attention then a feed-forward net.

    The output of each sub-layer goes through dropout, is added back to that
    sub-layer's input (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Size of the last axis of the input. Also used as the
            per-head ``key_dim`` of the attention layer and as the output
            width of the feed-forward network (so the residual sum lines up).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``
                (assumed ``(batch, seq_len, embed_dim)`` -- TODO confirm).
            training: Whether dropout is active. Defaults to ``None`` so the
                layer can be called without the argument and Keras supplies
                the mode of the surrounding fit/predict call. A required
                positional ``training`` makes ``block(x)`` fail on Keras
                versions that only forward the flag when it has a concrete
                value.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the same tensor is used as query and value.
        # No attention mask is passed, so every position can attend to every other.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

def create_model(vocab_size, embed_dim, num_heads, ff_dim, maxlen):
    """Build the (uncompiled) token-level model used for masked pre-training.

    Architecture: Embedding -> TransformerBlock -> Dense(500, relu) -> Dense(vocab_size).

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            output layer.
        embed_dim: Embedding width, also passed to the transformer block.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the transformer block's feed-forward network.
        maxlen: Length of each input sequence of token ids.

    Returns:
        A ``Model`` mapping ``(batch, maxlen)`` token ids to
        ``(batch, maxlen, vocab_size)`` unnormalized scores (the final Dense
        layer has no activation).
    """
    inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocab_size, embed_dim)(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(embedding_layer)
    # Without an activation the two stacked Dense layers collapse into a single
    # linear map. ReLU also matches the fine-tuning model, which reloads these
    # weights into a Dense(500, activation='relu') layer.
    hidden = Dense(500, activation='relu')(x)
    outputs = Dense(vocab_size)(hidden)
    model = Model(inputs=inputs, outputs=outputs)
    return model

2024-05-27 06:53:33.025556: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-27 06:53:33.025613: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-27 06:53:33.027135: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
# Read the conversation CSV; later cells use its `question` and `answer` columns.
new = pd.read_csv(
    filepath_or_buffer='/kaggle/input/3k-conversations-dataset-for-chatbot/Conversation.csv'
)

In [9]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
# Fit on every text source that is later encoded with this tokenizer: email
# bodies, conversation questions AND conversation answers. No oov_token is
# configured, so any word missing from the fitted vocabulary is silently
# dropped by texts_to_sequences -- leaving the answers out would strip
# answer-only words from the fine-tuning labels.
tokenizer.fit_on_texts(
    pd.concat([df.message, new.question, new.answer], ignore_index=True)
)
# +1 because word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
seqs = tokenizer.texts_to_sequences(df.message)
maxlen = 100
# Pad (at the end) or truncate every email to exactly `maxlen` token ids.
seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=maxlen, padding='post')

def mask_input(seqs, mask_prob=0.15, mask_token_id=32365, pad_id=0, rng=None):
    """Randomly replace token ids with a mask id for masked-token training.

    Args:
        seqs: Integer array of token ids (any shape).
        mask_prob: Independent probability of masking each eligible position.
        mask_token_id: Id written into masked positions. Defaults to the
            value that used to be hard-coded here.
        pad_id: Id that marks padding; positions holding it are never masked.
            Pass ``None`` to allow masking at every position.
        rng: Optional ``numpy.random.Generator`` for reproducible masking.
            When ``None`` the global NumPy random state is used.

    Returns:
        ``(masked_seqs, seqs)``: the corrupted copy and the untouched targets.
    """
    seqs = np.asarray(seqs)
    if rng is None:
        draws = np.random.rand(*seqs.shape)
    else:
        draws = rng.random(seqs.shape)

    random_masks = draws < mask_prob
    if pad_id is not None:
        # Padding carries no information, so never spend a mask on it.
        random_masks &= seqs != pad_id

    masked_seqs = np.where(random_masks, mask_token_id, seqs)
    return masked_seqs, seqs

# Corrupt the padded email sequences; the uncorrupted sequences are the targets.
masked_inputs, labels = mask_input(seqs=seqs)
with tf.device('/gpu:0'):
    model = create_model(
        vocab_size=vocab_size,
        embed_dim=8,
        num_heads=2,
        ff_dim=8,
        maxlen=maxlen,
    )
    # The model emits raw scores, hence from_logits=True.
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer="adam",
    )
    model.fit(x=masked_inputs, y=labels, batch_size=100, epochs=3)

Epoch 1/3


I0000 00:00:1716792824.045871     114 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


In [10]:
# Snapshot the weights of layers 1-4 of the trained model so they can be loaded
# into another model later (presumably the embedding, the transformer block and
# the two dense layers -- verify with model.summary()).
w1, w2, w3, w4 = (model.layers[layer_idx].get_weights() for layer_idx in range(1, 5))

In [11]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, Dropout, LayerNormalization, Flatten, Reshape
from tensorflow.keras.models import Model
import numpy as np

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed lengths (in tokens) of the padded prompt and answer sequences.
maxlen_prompt = 20
maxlen_answer = 20

# Encode questions and answers with the already-fitted tokenizer, then pad at
# the end (or truncate) so that every row has the fixed length above.
prompts_seq = pad_sequences(
    tokenizer.texts_to_sequences(new.question),
    maxlen=maxlen_prompt,
    padding='post',
)
answers_seq = pad_sequences(
    tokenizer.texts_to_sequences(new.answer),
    maxlen=maxlen_answer,
    padding='post',
)


In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, Dropout, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: multi-head self-attention then a feed-forward net.

    The output of each sub-layer goes through dropout, is added back to that
    sub-layer's input (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Size of the last axis of the input. Also used as the
            per-head ``key_dim`` of the attention layer and as the output
            width of the feed-forward network (so the residual sum lines up).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``
                (assumed ``(batch, seq_len, embed_dim)`` -- TODO confirm).
            training: Whether dropout is active. Defaults to ``None`` so the
                layer can be called without the argument and Keras supplies
                the mode of the surrounding fit/predict call. A required
                positional ``training`` makes ``block(x)`` fail on Keras
                versions that only forward the flag when it has a concrete
                value.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the same tensor is used as query and value.
        # No attention mask is passed, so every position can attend to every other.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

def create_model(vocab_size, embed_dim, num_heads, ff_dim, maxlen_prompt, maxlen_answer):
    """Build the (uncompiled) prompt -> answer model used for fine-tuning.

    Architecture: Embedding -> TransformerBlock -> Dense(500, relu) -> Dense(vocab_size).

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            output layer.
        embed_dim: Embedding width, also passed to the transformer block.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the transformer block's feed-forward network.
        maxlen_prompt: Length of each input sequence of token ids; the output
            has the same number of positions.
        maxlen_answer: Not used by this function; accepted so existing calls
            keep working.

    Returns:
        A ``Model`` mapping ``(batch, maxlen_prompt)`` token ids to
        ``(batch, maxlen_prompt, vocab_size)`` unnormalized scores.
    """
    inputs = Input(shape=(maxlen_prompt,))
    embedding_layer = Embedding(vocab_size, embed_dim)(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    # Do not hard-code training=True here: that would bake training mode into
    # the graph and keep dropout active during predict()/evaluate(). Leaving it
    # unset lets Keras pass the mode of the surrounding fit/predict call.
    x = transformer_block(embedding_layer)
    x = Dense(500, activation='relu')(x)
    outputs = Dense(vocab_size)(x)
    # Shape-preserving: Dense already yields (maxlen_prompt, vocab_size) per sample.
    outputs = tf.keras.layers.Reshape((maxlen_prompt, vocab_size))(outputs)
    model = Model(inputs=inputs, outputs=outputs)
    return model


In [15]:
# Build the prompt -> answer model.
model = create_model(
    vocab_size,
    embed_dim=8,
    num_heads=2,
    ff_dim=8,
    maxlen_prompt=maxlen_prompt,
    maxlen_answer=maxlen_answer,
)

# Warm-start layers 1-4 from the weight snapshots w1..w4 taken earlier.
for target_layer, pretrained in zip(model.layers[1:5], (w1, w2, w3, w4)):
    target_layer.set_weights(pretrained)

# The model emits raw scores, hence from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
)

# Give the integer targets a trailing axis of size 1.
labels = answers_seq[..., np.newaxis]

with tf.device('/gpu:0'):
    model.fit(x=prompts_seq, y=labels, batch_size=1, epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
