In [40]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from tensorflow.keras.layers import Input, Dense, LayerNormalization, MultiHeadAttention, Dropout
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler

In [41]:
# Load the SMS dataset; the cells below use its 'Category' and 'Message' columns.
df = pd.read_csv("spam.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [42]:
def preprocess_text(text):
    """Normalize one raw message before tokenization.

    Steps, in order: lowercase, drop parenthesised spans, drop double quotes,
    replace every character that is not a letter, digit or one of ``? . ! ,``
    with a space, then collapse each whitespace run into a single space.
    Leading/trailing spaces are not stripped.

    Args:
        text: The raw message. ``None`` and NaN (what pandas produces for an
            empty CSV cell) are treated as an empty message; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned message as a ``str``.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN; avoids AttributeError on .lower().
        if text is None or text != text:
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    # Quotes are removed (not replaced by a space) so the neighbouring characters join.
    text = re.sub('"', '', text)
    text = re.sub(r'[^a-zA-Z?.!,\d]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
# Clean every message in place; the tokenizer below is fitted on the cleaned text.
df['Message'] = df['Message'].map(preprocess_text)

In [50]:
# Tokenizer for input text: learn the vocabulary, then map each message to a list of word ids.
msg_tokenizer = Tokenizer()
msg_tokenizer.fit_on_texts(df['Message'])
X_train = msg_tokenizer.texts_to_sequences(df['Message'])

# Longest message (in tokens) plus one. Computed with built-ins so the result is a
# plain Python int and an empty corpus yields 1 instead of raising.
max_msg_length = max(map(len, X_train), default=0) + 1

# Right-pad every sequence to the common length.
X_train_padded = pad_sequences(X_train, maxlen=max_msg_length, padding='post')

# +1 so that the largest word id in `word_index` is a valid embedding row.
vocab_size = len(msg_tokenizer.word_index) + 1

In [44]:
# Sanity check: (number of messages, max_msg_length).
X_train_padded.shape

(5572, 191)

In [45]:
# Common padded length: token count of the longest message, plus one.
max_msg_length

191

In [67]:
from sklearn.preprocessing import LabelEncoder 
# Turn the string categories into integer codes and use them as the target.
# NOTE: the column is overwritten in place, so re-running this cell re-fits
# the encoder on the integer codes rather than on the original strings.
le = LabelEncoder()
category_codes = le.fit_transform(df['Category'])
df['Category'] = category_codes
y = df['Category']
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64

In [47]:
from tensorflow.keras import layers
class PositionalEncoding(layers.Layer):
    """Adds a fixed sinusoidal position table to a batch of embedded sequences.

    The table is computed once in the constructor. Each row holds the sines of
    all frequencies followed by the cosines of the same frequencies
    (concatenated, not interleaved). `call` slices the table to the input's
    sequence length and feature size and adds it to every sample.

    Args:
        seq_len: Number of positions to precompute. Inputs longer than this
            are not supported.
        d_model: Feature size the frequencies are derived from; expected to
            match the last dimension of the inputs.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, seq_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Plain ints keep the config serializable even if numpy scalars are passed.
        self.seq_len = int(seq_len)
        self.d_model = int(d_model)
        # The layer leaves the time dimension untouched, so an incoming mask
        # (e.g. from Embedding(mask_zero=True)) can be passed through as-is.
        # Without this flag Keras drops the mask at this layer.
        self.supports_masking = True
        self.pos_encoding = self.positional_encoding(self.seq_len, self.d_model)

    def positional_encoding(self, seq_len, d_model):
        """Build the float32 sinusoidal table with `seq_len` rows.

        There is one frequency per even index in `range(0, d_model, 2)`, so the
        table has `2 * ceil(d_model / 2)` columns: exactly `d_model` when it is
        even, one extra when it is odd (`call` trims the surplus column).
        """
        position = tf.range(seq_len, dtype=tf.float32)[:, tf.newaxis]
        div_term = tf.exp(tf.range(0, d_model, 2, dtype=tf.float32) * -(tf.math.log(10000.0) / d_model))
        angles = position * div_term
        pos_encoding = tf.concat([tf.math.sin(angles), tf.math.cos(angles)], axis=-1)
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        """Return `inputs` plus the position table, broadcast over the batch axis."""
        # Slice with the runtime shape so the table always matches the input.
        sequence_length = tf.shape(inputs)[1]
        feature_dim = tf.shape(inputs)[2]

        pos_encoding_resized = self.pos_encoding[:sequence_length, :feature_dim]
        return inputs + tf.expand_dims(pos_encoding_resized, axis=0)

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({"seq_len": self.seq_len, "d_model": self.d_model})
        return config


In [51]:
# Model parameters
d_model = 25 # Embedding size
num_heads = 8  # Number of attention heads
dff = 256  # Feedforward network size
from tensorflow.keras import layers
# Encoder
encoder_inputs = Input(shape=(max_msg_length,))  # (Batch, Time Steps, Features)
encoder_embedding = Embedding(vocab_size, d_model, mask_zero=True)(encoder_inputs)
pos_encoding_enc = PositionalEncoding(max_msg_length,d_model)
encoder_inputs_with_pos = pos_encoding_enc(encoder_embedding)
attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(encoder_inputs_with_pos, encoder_inputs_with_pos)
attention = layers.Dropout(0.2)(attention)
attention = layers.LayerNormalization(epsilon=1e-6)(encoder_inputs_with_pos + attention)
outputs = layers.Dense(256, activation='relu')(attention)
outputs = layers.Dense(d_model)(outputs)
pooled_output = layers.GlobalAveragePooling1D()(outputs) # summarize
classification_output = layers.Dense(1, activation='sigmoid')(pooled_output)
model = Model(inputs=encoder_inputs, outputs=classification_output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, 191)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 191, 25)              217775    ['input_13[0][0]']            
                                                                                                  
 positional_encoding_9 (Pos  (None, 191, 25)              0         ['embedding[0][0]']           
 itionalEncoding)                                                                                 
                                                                                                  
 multi_head_attention_7 (Mu  (None, 191, 25)              20625     ['positional_encoding_9[

In [52]:
# Train for a single epoch, holding out 20% of the samples for validation.
model.fit(
    x=X_train_padded,
    y=y,
    batch_size=64,
    epochs=1,
    validation_split=0.2,
)



<keras.src.callbacks.History at 0x201dfd27950>

In [71]:
# Sample message used to try out the trained model.
test= 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.'

In [72]:
# Send the sample message through the same pipeline that produced the training data.
# `preprocess_text` is the function defined once near the top of the notebook.
test_input = preprocess_text(test)

# Pass the variable, not the literal string 'test_input'. The tokenizer is NOT
# re-fitted here: fitting again updates its word counts and can change the
# word -> id mapping the model was trained with.
seq = msg_tokenizer.texts_to_sequences([test_input])
test_padded = pad_sequences(seq, maxlen=max_msg_length, padding='post')

# One value per message from the model's single sigmoid unit.
predicted_labels = model.predict(test_padded)



In [73]:
# The model ends in a single sigmoid unit, so each prediction is one value.
# argmax over that lone column is always 0; threshold at 0.5 instead.
# Thresholding an already-thresholded 0/1 array is a no-op, so the cell can be re-run.
predicted_labels = (np.asarray(predicted_labels).ravel() > 0.5).astype(int)  # Get class index
print(predicted_labels)

[0]
