In [None]:
import os
import pickle
import pandas as pd
import tensorflow as tf
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Report the TensorFlow build and the GPUs it can see, so a run's output
# records which environment produced it.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {gpu_devices}")

In [None]:
# Central run configuration: every later cell reads its settings from here.
CONFIG = dict(
    # Input CSV and the base directory the trained artifacts are written under.
    data_path='../jigsaw-toxic-comment-classification-challenge/train.csv/train.csv',
    model_base_dir='model',
    # Text vectorization: vocabulary cap and fixed output length per comment.
    max_features=200000,
    sequence_length=1800,
    # Model width.
    embedding_dim=32,
    lstm_units=32,
    # Training schedule.
    batch_size=16,
    epochs=3,
    # Fractions of the batched dataset held out for validation and test.
    validation_split=0.2,
    test_split=0.1,
    # Seed shared by NumPy, TensorFlow and the dataset shuffle.
    random_seed=42,
)


# Seed both TensorFlow's and NumPy's global generators from the single
# configured value so repeated runs start from the same random state.
for seed_fn in (tf.random.set_seed, np.random.seed):
    seed_fn(CONFIG['random_seed'])

In [None]:
# Load data
csv_path = CONFIG['data_path']
print(f"Loading data from: {csv_path}")
df = pd.read_csv(csv_path)

# Quick look at what was read: dimensions, column names, then the first rows
# (left as the cell's last expression so the notebook renders the table).
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")
df.head()

In [None]:
# Binary label columns, in the order later used for the model's targets.
toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Positive count per label, and that count as a share of all comments.
total_comments = len(df)
label_totals = {label: df[label].sum() for label in toxicity_columns}
for label, positives in label_totals.items():
    share = (positives / total_comments) * 100
    print(f"{label}: {positives:,} ({share:.2f}%)")

# Plot: horizontal bars, sorted ascending by count.
sorted_totals = df[toxicity_columns].sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(5, 3))
sorted_totals.plot.barh(ax=ax)
ax.set(title='Distribution of Toxicity Labels', xlabel='Number of Comments')
fig.tight_layout()
plt.show()

In [None]:
from tensorflow.keras.layers import TextVectorization

# Prepare data
# Targets: one column per toxicity label. Inputs: the raw comment strings.
y = df[toxicity_columns].values
X = df['comment_text'].values

num_categories = len(toxicity_columns)
print(f"Input shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Number of toxicity categories: {num_categories}")

In [None]:
# Create and configure text vectorizer
# Integer token ids, capped vocabulary, fixed output length per comment.
vectorizer_settings = {
    'max_tokens': CONFIG['max_features'],
    'output_sequence_length': CONFIG['sequence_length'],
    'output_mode': 'int',
}
vectorizer = TextVectorization(**vectorizer_settings)

# Learn the vocabulary from the full set of comments.
print("Adapting vectorizer to the text data")
vectorizer.adapt(X)

vocabulary = vectorizer.get_vocabulary()
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Sequence length: {CONFIG['sequence_length']}")

In [None]:
# Vectorize the text
# The whole corpus is converted in a single call, so the resulting tensor
# holds every comment at once.
# NOTE(review): this can be large for long sequence lengths - confirm it fits
# in memory on the target machine.
print("Vectorizing text data")
vectorized_text = vectorizer(X)
vectorized_shape = vectorized_text.shape
print(f"Vectorized text shape: {vectorized_shape}")

In [None]:
# Create TensorFlow dataset
print("Creating TensorFlow dataset")
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE with a fixed order. The train/validation/test subsets are carved
# out of this dataset with take()/skip(); if the shuffle were redone on every
# iteration (the default), each epoch and each evaluation pass would see a
# different permutation, so validation and test examples would leak into
# training and the reported metrics would be optimistic.
dataset = dataset.shuffle(
    160000,
    seed=CONFIG['random_seed'],
    reshuffle_each_iteration=False,
)
dataset = dataset.batch(CONFIG['batch_size'])
dataset = dataset.prefetch(tf.data.AUTOTUNE)

# Length is measured in batches because the dataset is already batched.
dataset_size = len(dataset)
print(f"Total dataset size: {dataset_size} batches")

In [None]:
# Split dataset
# Sizes are counted in batches. The test split takes whatever is left after
# train and validation so the three sizes always add up to dataset_size.
val_fraction = CONFIG['validation_split']
test_fraction = CONFIG['test_split']
train_size = int(dataset_size * (1 - val_fraction - test_fraction))
val_size = int(dataset_size * val_fraction)
test_size = dataset_size - (train_size + val_size)

# Consecutive, non-overlapping slices: [train | validation | test].
train_dataset = dataset.take(train_size)
remaining_dataset = dataset.skip(train_size)
val_dataset = remaining_dataset.take(val_size)
test_dataset = remaining_dataset.skip(val_size)

for split_name, num_batches in (
    ("Training", train_size),
    ("Validation", val_size),
    ("Test", test_size),
):
    print(f"{split_name} batches: {num_batches}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def create_model(vocab_size, embedding_dim, lstm_units, num_classes, sequence_length):
    """Build the (uncompiled) bidirectional-LSTM multi-label classifier.

    Args:
        vocab_size: Vocabulary size of the text vectorizer; the embedding
            table is created with ``vocab_size + 1`` rows.
        embedding_dim: Width of each token embedding vector.
        lstm_units: Number of units in each direction of the LSTM.
        num_classes: Number of independent labels (one sigmoid output each).
        sequence_length: Number of token ids per input example.

    Returns:
        A ``Sequential`` model mapping ``(batch, sequence_length)`` integer
        token ids to ``(batch, num_classes)`` sigmoid probabilities.
    """
    model = Sequential([
        # Explicit input spec. This replaces Embedding's `input_length`
        # argument (deprecated in Keras 3) and builds the model immediately,
        # so model.summary() can report output shapes and parameter counts.
        tf.keras.Input(shape=(sequence_length,), dtype='int64'),

        # Embedding layer
        Embedding(vocab_size + 1, embedding_dim),

        # Bidirectional LSTM; only the final state of each direction is kept
        Bidirectional(LSTM(lstm_units, activation='tanh')),

        # Dense layers for feature extraction
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),

        # Output layer (sigmoid for multi-label classification)
        Dense(num_classes, activation='sigmoid')
    ])

    return model

# Create the model
# All architecture settings come from CONFIG; the number of outputs follows
# the number of label columns.
model_settings = dict(
    vocab_size=CONFIG['max_features'],
    embedding_dim=CONFIG['embedding_dim'],
    lstm_units=CONFIG['lstm_units'],
    num_classes=len(toxicity_columns),
    sequence_length=CONFIG['sequence_length'],
)
model = create_model(**model_settings)

model.summary()

In [None]:
# Compile the model
# Metrics are given as explicit objects rather than strings:
#  * the string 'accuracy' is resolved by Keras from the tensor shapes, and for
#    a multi-column output it can pick categorical (argmax) accuracy, which is
#    not meaningful when several labels can be active at once;
#  * the lower-case 'precision' / 'recall' aliases are not accepted by every
#    Keras version.
# The names are kept identical so history keys and the order of the values
# returned by model.evaluate() are unchanged. The 0.5 cut-off matches the
# threshold used when printing predictions at the end of the notebook.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
        tf.keras.metrics.Precision(name='precision', thresholds=0.5),
        tf.keras.metrics.Recall(name='recall', thresholds=0.5),
    ],
)

print("Model compiled")

In [None]:
# Setup callbacks
# Both callbacks watch the validation loss.
monitored_metric = 'val_loss'

# Stop when the monitored loss has not improved for 3 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor=monitored_metric,
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 2 epochs without improvement, never going
# below 1e-7.
lr_reduction = ReduceLROnPlateau(
    monitor=monitored_metric,
    factor=0.5,
    patience=2,
    min_lr=1e-7,
    verbose=1,
)

callbacks = [early_stopping, lr_reduction]

print("Callbacks configured")

In [None]:
# Train the model
num_epochs = CONFIG['epochs']
print(f"Starting training for {num_epochs} epochs")

# Fit on the training split, validating on the held-out validation split after
# each epoch; `history` keeps the per-epoch metrics.
fit_options = dict(
    epochs=num_epochs,
    validation_data=val_dataset,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(train_dataset, **fit_options)

print("Training complete")

In [None]:
# Evaluate on test set
print("Evaluating model on test set")
# evaluate() returns the loss followed by the compiled metrics, in the order
# they were passed to compile().
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_dataset, verbose=1)

# F1 is the harmonic mean of precision and recall. When the model predicts no
# positives both are 0 and the formula would divide by zero, so report 0.0
# instead of crashing the cell.
precision_recall_sum = test_precision + test_recall
if precision_recall_sum > 0:
    test_f1 = 2 * (test_precision * test_recall) / precision_recall_sum
else:
    test_f1 = 0.0

print(f"Loss: {test_loss:.4f}")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")


In [None]:
# Artifacts go in a per-run subdirectory named after the epoch count
# (e.g. 'model/03_epochs/' with the current CONFIG).
run_dir_name = f"{CONFIG['epochs']:02d}_epochs"
epoch_dir = os.path.join(CONFIG['model_base_dir'], run_dir_name)

# Define paths for the two files written into that directory
model_save_path, vectorizer_save_path = (
    os.path.join(epoch_dir, file_name)
    for file_name in ('toxicity.keras', 'vectorizer.pkl')
)

# Ensure directory exists
def ensure_dir_exists(file_path):
    """Create the parent directory of ``file_path`` if it is missing.

    Args:
        file_path: Path of a file that is about to be written.

    Returns:
        ``file_path`` unchanged, so the call can be used inline where the
        path is needed.
    """
    directory = os.path.dirname(file_path)
    # A bare file name has no directory component; nothing to create then.
    if directory and not os.path.exists(directory):
        # exist_ok=True: tolerate the directory being created by someone else
        # between the existence check above and this call.
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")
    return file_path

# Save model (the parent directory is created first if needed)
model_path = ensure_dir_exists(model_save_path)
print(f"Saving model to: {model_path}")
model.save(model_path)

# Save vectorizer by pickling the fitted layer object itself.
# NOTE(review): pickling a Keras layer may depend on the installed Keras
# version - confirm the file loads in the environment that will serve it.
vectorizer_path = ensure_dir_exists(vectorizer_save_path)
print(f"Saving vectorizer to: {vectorizer_path}")
with open(vectorizer_path, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Output file sizes, converted from bytes to MB
bytes_per_mb = 1024 * 1024
print("\nModel and vectorizer saved")
model_size_mb = os.path.getsize(model_path) / bytes_per_mb
print(f"Model file size: {model_size_mb:.1f} MB")
vectorizer_size_mb = os.path.getsize(vectorizer_path) / bytes_per_mb
print(f"Vectorizer file size: {vectorizer_size_mb:.1f} MB")

In [None]:
# Test loading and prediction
print("Testing model")

# Load the saved model and vectorizer.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load vectorizer files that this notebook (or another trusted source) wrote.
loaded_model = tf.keras.models.load_model(model_path)
with open(vectorizer_path, 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# Test with sample comments
test_comments = [
    "You are such an idiot!",
    "This is a great article, thanks for sharing.",
    "I hope you die in a fire!",
    "Could you please explain this better?"
]

# Vectorize and score all sample comments as one batch instead of calling
# predict() once per comment inside the loop; each row of `predictions`
# lines up with the comment at the same position.
vectorized_comments = loaded_vectorizer(test_comments)
predictions = loaded_model.predict(vectorized_comments, verbose=0)

print("\nTest predictions:")
for comment, comment_probs in zip(test_comments, predictions):
    # Format results: one line per label, flagged at the 0.5 threshold
    print(f"\nComment: '{comment}'")
    for category, prob in zip(toxicity_columns, comment_probs):
        is_toxic = prob > 0.5
        print(f"  {category}: {prob:.3f} {'(TOXIC)' if is_toxic else '(clean)'}")

print("\nModel loading and prediction successful")
