In [None]:
import os
import pickle
import pandas as pd
import tensorflow as tf
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix


# Report the runtime environment so results can be tied to a TF build / device.
tf_version = tf.__version__
print(f"TensorFlow version: {tf_version}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU available: {gpu_devices}")

In [None]:
# Central place for every tunable used by this notebook.
CONFIG = dict(
    # Input data and saved-artifact locations
    data_path='../jigsaw-toxic-comment-classification-challenge/train.csv/train.csv',
    model_base_dir='model',
    # Text vectorization / model hyperparameters
    max_features=200000,
    sequence_length=1800,
    embedding_dim=32,
    lstm_units=32,
    # Batching and run selection ('epochs' also selects the artifact subdirectory)
    batch_size=16,
    epochs=3,
    # Fractions of the batched dataset reserved for validation and test
    validation_split=0.2,
    test_split=0.1,
    random_seed=42,
)


# Seed TensorFlow and NumPy from the same configured value for reproducibility.
_seed = CONFIG['random_seed']
tf.random.set_seed(_seed)
np.random.seed(_seed)

In [None]:
# Load the trained model and its text vectorizer.
#
# Artifacts are read from a per-run subdirectory of CONFIG['model_base_dir']
# named after the epoch count, e.g. 'model/03_epochs/' for the current CONFIG.
# Nothing is created here; the directory must already exist.
epoch_dir = os.path.join(CONFIG['model_base_dir'], f"{CONFIG['epochs']:02d}_epochs")

# Define paths
model_path = os.path.join(epoch_dir, 'toxicity.keras')
vectorizer_path = os.path.join(epoch_dir, 'vectorizer.pkl')

# Fail early with an explicit message. The loaders below raise on failure
# rather than returning a falsy value, so checking truthiness of their results
# afterwards can never report a failed load.
for artifact_path in (model_path, vectorizer_path):
    if not os.path.exists(artifact_path):
        raise FileNotFoundError(f"Required artifact not found: {artifact_path}")

loaded_model = tf.keras.models.load_model(model_path)
print(f"Loaded model from {model_path}")

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load vectorizer.pkl files that were produced by a trusted training run.
with open(vectorizer_path, 'rb') as f:
    loaded_vectorizer = pickle.load(f)
print(f"Loaded vectorizer from {vectorizer_path}")

In [None]:
# Read the labelled comments CSV into a DataFrame and summarise it.
data_path = CONFIG['data_path']
print(f"Loading data from: {data_path}")
df = pd.read_csv(data_path)

column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
from tensorflow.keras.layers import TextVectorization

# Label columns, in the order they appear as columns of the target array `y`.
toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Inputs are the raw comment strings; targets are the per-label columns.
label_frame = df[toxicity_columns]
X, y = df['comment_text'].values, label_frame.values

for summary_line in (
    f"Input shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Number of toxicity categories: {len(toxicity_columns)}",
):
    print(summary_line)

In [None]:
# Run every comment through the loaded vectorizer in a single call.
print("Vectorizing text data")
vectorized_text = loaded_vectorizer(X)
vectorized_shape = vectorized_text.shape
print(f"Vectorized text shape: {vectorized_shape}")

In [None]:
# Build the batched tf.data pipeline from the vectorized comments and labels.
print("Creating TensorFlow dataset")
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# reshuffle_each_iteration=False pins a single shuffle order. With the default
# (True) every new iteration over the dataset is reshuffled, so subsets carved
# out of it with take()/skip() would hold different examples on each pass and
# could overlap with one another.
# NOTE(review): the buffer of 160000 only yields a full shuffle while the
# dataset has at most that many rows.
dataset = dataset.shuffle(
    160000,
    seed=CONFIG['random_seed'],
    reshuffle_each_iteration=False,
)
dataset = dataset.batch(CONFIG['batch_size'])
dataset = dataset.prefetch(tf.data.AUTOTUNE)

# Length of a batched dataset is measured in batches, not samples.
dataset_size = len(dataset)
print(f"Total dataset size: {dataset_size} batches")

In [None]:
# Partition the batched dataset into consecutive train / validation / test
# segments. All sizes are counted in batches.
train_fraction = 1 - CONFIG['validation_split'] - CONFIG['test_split']
train_size = int(dataset_size * train_fraction)
val_size = int(dataset_size * CONFIG['validation_split'])
# The test split takes whatever is left after the truncating int() conversions.
test_size = dataset_size - train_size - val_size

train_dataset = dataset.take(train_size)
remaining_dataset = dataset.skip(train_size)
val_dataset = remaining_dataset.take(val_size)
test_dataset = remaining_dataset.skip(val_size)

for split_label, n_batches in (
    ("Training", train_size),
    ("Validation", val_size),
    ("Test", test_size),
):
    print(f"{split_label} batches: {n_batches}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Collect true labels and predicted probabilities from test_dataset.
# Labels and predictions are gathered in the SAME pass over the dataset so the
# rows stay aligned with each other.
y_true_batches = []
y_prob_batches = []

for X_batch, y_batch in test_dataset:
    y_true_batches.append(y_batch.numpy())
    # predict_on_batch runs a single forward pass on one batch. Model.predict
    # is meant for whole datasets and adds per-call setup overhead when it is
    # invoked repeatedly inside a Python loop.
    y_prob_batches.append(loaded_model.predict_on_batch(X_batch))

if not y_true_batches:
    raise ValueError("test_dataset produced no batches; nothing to evaluate")

# Stack the per-batch arrays into (n_samples, n_labels) arrays
y_true = np.vstack(y_true_batches)
y_pred_probs = np.vstack(y_prob_batches)
# Binarize each label independently at a 0.5 probability threshold
y_pred = (y_pred_probs > 0.5).astype(int)


In [None]:

# Plot and save the confusion matrix for a single toxicity label.

# Ensure the output directory exists
save_dir = "../docs"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "confusion_matrix.png")

# Column order of y_true / y_pred follows toxicity_columns, so reuse that list
# instead of keeping a second hand-written copy that could drift.
class_names = list(toxicity_columns)

# Change `label_name` to plot a different label ('severe_toxic', 'obscene', ...).
label_name = "toxic"
i = class_names.index(label_name)
readable_name = class_names[i].replace("_", " ")

# labels=[0, 1] forces a 2x2 matrix even if only one class value occurs in the
# data; without it the matrix can collapse to 1x1 and no longer match the two
# display labels.
cm = confusion_matrix(y_true[:, i], y_pred[:, i], labels=[0, 1])
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    # Tick labels are derived from the selected class so they always match it.
    display_labels=[f"Not {readable_name}", readable_name.capitalize()],
)
disp.plot(cmap="Blues")
plt.title(f"Confusion Matrix: {readable_name.capitalize()}")
plt.tight_layout()
plt.savefig(save_path, bbox_inches='tight', dpi=300)
# Close the figure after saving so it does not stay open in the kernel.
plt.close()
