In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dfpath = '/content/drive/My Drive/SpamFilterTrain.csv'
df = pd.read_csv(dfpath)

In [5]:
X = df['question_text'].values
y = df['target'].values

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
max_length = max([len(x) for x in X_seq])
X_padded = pad_sequences(X_seq, maxlen=max_length, padding='post')

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [8]:
embedding_dim = 100
embedding_index = {}
with open('/content/drive/My Drive/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [9]:
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [10]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
history = model.fit(X_train, y_train, epochs=1, batch_size=128, validation_data=(X_val, y_val))

[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5001s[0m 612ms/step - accuracy: 0.9487 - loss: 0.1395 - val_accuracy: 0.9565 - val_loss: 0.1101


In [13]:
y_pred = (model.predict(X_val) > 0.5).astype("int32")
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 60ms/step
Validation Accuracy: 0.9565
