In [1]:
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import sys
sys.path.insert(0, os.path.abspath(os.path.join("..", "src")))
from transformer_model import *

In [2]:
# Load the labelled training data: the 'text' column is the model input and
# the 'target' column is the label to predict.
df = pd.read_csv("../data/train.csv")
X, y = df['text'], df['target']

In [3]:
vocab_size = 10000  # Tokenizer keeps only the most frequent words (num_words)
maxlen = 200  # Length every sequence is padded/truncated to

# Read the raw text from `df` instead of from `X`. This cell rebinds `X` to the
# padded integer matrix, so fitting the tokenizer on `X` only worked on the
# very first run; re-running the cell would hand it integers instead of strings.
texts = df['text']

tokenizer = Tokenizer(num_words=vocab_size)
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split, so held-out text influences the word index. Consider
# fitting on the training portion only.
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=maxlen)

In [4]:
# Hold out 15% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [5]:
BATCH_SIZE = 16

# Build each pipeline from the raw arrays in a single step. Rebinding
# `train_dataset` / `test_dataset` to a batched version of themselves in a
# later, separate step is not re-runnable: a second run would batch the
# already-batched dataset and add an extra leading dimension.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train))  # buffer spans the whole training split
    .batch(BATCH_SIZE)
)

# The held-out split is only batched, never shuffled.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE)
)

In [14]:
embed_dim = 16  # Width of each token's embedding vector
num_heads = 2  # Attention heads used by the transformer block
ff_dim = 16  # Hidden width of the feed-forward network inside the transformer

# One integer token id per position, `maxlen` positions per sample.
inputs = layers.Input(shape=(maxlen,))

# Encoder. NOTE(review): TokenAndPositionEmbedding and TransformerBlock are
# presumably supplied by the star-import of `transformer_model` -- confirm.
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: average over the sequence, then dropout -> dense -> dropout.
head_layers = [
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
]
for head_layer in head_layers:
    x = head_layer(x)

# A single sigmoid unit yields one score in [0, 1] per sample.
outputs = layers.Dense(1, activation="sigmoid")(x)



In [15]:
# Wrap the functional graph into a trainable model.
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# NOTE(review): the same held-out split is used as validation data during
# training and as the final test set below, so the reported "test" numbers are
# identical to the last epoch's validation numbers.
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=test_dataset,
)

# evaluate() returns [loss, accuracy] in the order given to compile().
test_metrics = model.evaluate(test_dataset)
test_loss, test_acc = test_metrics
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

Epoch 1/3
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5438 - loss: 0.7058 - val_accuracy: 0.5701 - val_loss: 0.6762
Epoch 2/3
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.5814 - loss: 0.6710 - val_accuracy: 0.7680 - val_loss: 0.5059
Epoch 3/3
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8271 - loss: 0.4092 - val_accuracy: 0.7855 - val_loss: 0.4995
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7883 - loss: 0.4917
Test Loss: 0.49951064586639404, Test Accuracy: 0.7854641079902649


In [16]:
from sklearn.metrics import classification_report, f1_score

# Predicted scores from the sigmoid output, thresholded at 0.5 into 0/1 labels.
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred, digits=4))

# scikit-learn metrics take (y_true, y_pred) in that order. Binary F1 happens
# to be unchanged when the two are swapped, but the reversed call would give
# wrong results as soon as averaging or pos_label options are used, and it was
# inconsistent with the classification_report call above.
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# Keras' own loss/accuracy on the same split, for comparison with the
# scikit-learn figures above.
test_loss, test_acc = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
              precision    recall  f1-score   support

           0     0.8104    0.8141    0.8123       651
           1     0.7520    0.7475    0.7497       491

    accuracy                         0.7855      1142
   macro avg     0.7812    0.7808    0.7810      1142
weighted avg     0.7853    0.7855    0.7854      1142

F1 Score: 0.7497
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7883 - loss: 0.4917
Test Loss: 0.49951064586639404, Test Accuracy: 0.7854641079902649
