In [9]:
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.metrics import classification_report

# Download stopwords
nltk.download('stopwords')

# Load data
train_df = pd.read_csv('Train.csv').head(4000)
valid_df = pd.read_csv('Valid.csv').head(500)
test_df = pd.read_csv('Test.csv').head(500)

print('Train: '+ str(len(train_df)))
print('Valid: '+ str(len(valid_df)))
print('Test: '+ str(len(test_df)))

# Text preprocessing
train_df['text'] = train_df['text'].str.lower()
valid_df['text'] = valid_df['text'].str.lower()
test_df['text'] = test_df['text'].str.lower()

import string
exclude = set(string.punctuation)

def remove_punctuation(x):
    return ''.join(ch for ch in x if ch not in exclude)

train_df['text'] = train_df['text'].apply(remove_punctuation)
valid_df['text'] = valid_df['text'].apply(remove_punctuation)
test_df['text'] = test_df['text'].apply(remove_punctuation)

from nltk.corpus import stopwords
stop = stopwords.words('english')

train_df['text'] = train_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
valid_df['text'] = valid_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
test_df['text'] = test_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Tokenization and Padding
max_words = 20000  # Number of unique words
max_len = 100      # Maximum length of sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['text'])

X_train = tokenizer.texts_to_sequences(train_df['text'])
X_valid = tokenizer.texts_to_sequences(valid_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

X_train_pad = pad_sequences(X_train, maxlen=max_len)
X_valid_pad = pad_sequences(X_valid, maxlen=max_len)
X_test_pad = pad_sequences(X_test, maxlen=max_len)

# Define the CNN model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # For binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_pad, train_df['label'],
    epochs=10,
    validation_data=(X_valid_pad, valid_df['label']),
    batch_size=32
)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_pad, test_df['label'])
print(f"Test accuracy: {test_acc:.4f}")

# Make predictions
predictions = (model.predict(X_test_pad) > 0.5).astype("int32")

# Convert predictions to a list
predictions = predictions.flatten()

# Classification report
report = classification_report(test_df['label'], predictions)
print(report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Train: 4000
Valid: 500
Test: 500




Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 68ms/step - accuracy: 0.5318 - loss: 0.6909 - val_accuracy: 0.7800 - val_loss: 0.4478
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - accuracy: 0.8606 - loss: 0.3437 - val_accuracy: 0.8260 - val_loss: 0.3944
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - accuracy: 0.9879 - loss: 0.0464 - val_accuracy: 0.8280 - val_loss: 0.5731
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 70ms/step - accuracy: 0.9987 - loss: 0.0063 - val_accuracy: 0.8140 - val_loss: 0.8086
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - accuracy: 1.0000 - loss: 8.9715e-04 - val_accuracy: 0.8180 - val_loss: 0.9158
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 69ms/step - accuracy: 1.0000 - loss: 2.4939e-04 - val_accuracy: 0.8140 - val_loss: 1.0096
Epoch 7/10
