In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test_labels.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv


In [2]:
# -*- coding: utf-8 -*-
"""
End-to-end script for Jigsaw Multilingual Toxic Comment Classification.

This script trains a multilingual model (XLM-RoBERTa) on English-only data
and evaluates its ability to classify toxic comments in other languages.

Strategy:
1.  Combine the two provided English training datasets.
2.  Use the KerasNLP library to build a classifier with a pre-trained
    multilingual backbone (XLM-RoBERTa).
3.  Train the model on all available English data.
4.  Use the provided non-English `validation.csv` to monitor the primary
    metric (AUC) during training and save the best performing model.
5.  Evaluate the best model's performance on the validation set using a
    range of metrics.
6.  Predict on the unseen test set and generate a submission file.
"""

import os
import pandas as pd
import numpy as np
import tensorflow as tf
import keras_nlp
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress verbose logs
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel('ERROR')

print("--- Library Versions ---")
print(f"TensorFlow: {tf.__version__}")
print(f"Keras: {tf.keras.__version__}")
print(f"KerasNLP: {keras_nlp.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")
print("-" * 25)

# --- 1. Configuration ---
class Config:
    """Configuration class for hyperparameters and file paths."""
    # Data Paths
    BASE_PATH = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification'
    TRAIN_TOXIC_PATH = os.path.join(BASE_PATH, 'jigsaw-toxic-comment-train.csv')
    TRAIN_BIAS_PATH = os.path.join(BASE_PATH, 'jigsaw-unintended-bias-train.csv')
    VALIDATION_PATH = os.path.join(BASE_PATH, 'validation.csv')
    TEST_PATH = os.path.join(BASE_PATH, 'test.csv')

    # Model Configuration
    PRESET = "xlm_roberta_base_multi" # State-of-the-art for multilingual tasks
    SEQUENCE_LENGTH = 192 # Max sequence length for the model
    BATCH_SIZE = 32 # Adjust based on GPU memory (T4x2 can handle 32)
    EPOCHS = 3 # Transformers fine-tune quickly, more epochs risk overfitting
    LEARNING_RATE = 1e-5

    # Target column
    TARGET_COL = 'toxic'
    
cfg = Config()


# --- 2. Data Loading and Preprocessing ---
print("\n--- 2. Loading and Preparing Data ---")

# Load primary training data
print("Loading jigsaw-toxic-comment-train.csv...")
df_toxic = pd.read_csv(cfg.TRAIN_TOXIC_PATH, usecols=['comment_text', 'toxic'])

# Load and process unintended bias data
# The 'toxic' column is a float score from 0.0 to 1.0. We binarize it.
print("Loading jigsaw-unintended-bias-train.csv...")
df_bias = pd.read_csv(cfg.TRAIN_BIAS_PATH, usecols=['comment_text', 'toxic'])
df_bias['toxic'] = (df_bias['toxic'] >= 0.5).astype(int)

# Combine the two English training datasets
train_df = pd.concat([df_toxic, df_bias], ignore_index=True).drop_duplicates()
print(f"Combined English training data shape: {train_df.shape}")
print("Class distribution in combined training data:")
print(train_df[cfg.TARGET_COL].value_counts(normalize=True))

# Load validation and test data
print("\nLoading validation and test data...")
val_df = pd.read_csv(cfg.VALIDATION_PATH)
test_df = pd.read_csv(cfg.TEST_PATH)

print(f"Validation data shape: {val_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Prepare data for the model
X_train = train_df['comment_text']
y_train = train_df[cfg.TARGET_COL]

X_val = val_df['comment_text']
y_val = val_df[cfg.TARGET_COL]

X_test = test_df['content'] # Note the different column name in test.csv

# --- 3. Model Building with KerasNLP ---
print("\n--- 3. Building the KerasNLP Model ---")

# Set up the preprocessor and classifier from the same multilingual preset
# The preprocessor handles tokenization, padding, etc., automatically.
preprocessor = keras_nlp.models.XlmRobertaPreprocessor.from_preset(
    cfg.PRESET,
    sequence_length=cfg.SEQUENCE_LENGTH
)

# The classifier includes the pre-trained backbone and a new classification head
classifier = keras_nlp.models.XlmRobertaClassifier.from_preset(
    cfg.PRESET,
    preprocessor=preprocessor,
    num_classes=1, # Binary classification -> 1 output neuron with sigmoid
)

# You can print the summary to see the architecture
# Note: The output layer is a single neuron with a linear activation.
# We will use `from_logits=True` in our loss function.
# classifier.summary()

# --- 4. Model Training ---
print("\n--- 4. Training the Model ---")

# Compile the model
# We use AdamW optimizer, a standard choice for Transformers.
# BinaryCrossentropy with from_logits=True is the correct loss for this setup.
# AUC is our primary evaluation metric.
classifier.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=cfg.LEARNING_RATE),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.AUC(name="auc"),
    ],
    jit_compile=True # Enable XLA compilation for performance boost on GPUs
)

# Callbacks for training
# ModelCheckpoint saves the best model based on the validation AUC.
# This is crucial as we want the model that generalizes best to other languages.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.keras',
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    save_weights_only=False, # Save the full model
    verbose=1
)

# EarlyStopping prevents wasting resources and overfitting.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=1, # Stop if val_auc doesn't improve for 1 epoch
    mode='max',
    restore_best_weights=True,
    verbose=1
)

# Train the model
# We use the full English dataset for training and the non-English validation
# set for the `validation_data` argument. This directly optimizes for the
# competition's cross-lingual objective.
print("\nStarting training...")
print(f"Training on {len(X_train)} English comments.")
print(f"Validating on {len(X_val)} non-English comments.")

history = classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=cfg.BATCH_SIZE,
    epochs=cfg.EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=[model_checkpoint, early_stopping]
)

print("Training finished.")

# --- 5. Model Evaluation ---
print("\n--- 5. Evaluating the Best Model ---")

# Load the best model saved by ModelCheckpoint
print("Loading the best model from training...")
best_model = tf.keras.models.load_model('best_model.keras')

# Generate predictions on the non-English validation set
print("Generating predictions on the validation set...")
val_predictions_logits = best_model.predict(X_val, batch_size=cfg.BATCH_SIZE)
# Apply sigmoid to convert logits to probabilities
val_predictions_probs = tf.nn.sigmoid(val_predictions_logits).numpy().flatten()
val_predictions_binary = (val_predictions_probs >= 0.5).astype(int)

# Define an evaluation function for the requested metrics
def evaluate_classification(y_true, y_pred_prob, y_pred_binary):
    """Calculates and prints a suite of classification metrics."""
    print("\n--- Detailed Evaluation Metrics ---")
    
    # AUC (the main competition metric)
    auc = roc_auc_score(y_true, y_pred_prob)
    print(f"ROC AUC Score: {auc:.5f}")
    
    # Accuracy
    accuracy = accuracy_score(y_true, y_pred_binary)
    print(f"Accuracy: {accuracy:.5f}")
    
    # Log Loss
    logloss = log_loss(y_true, y_pred_prob)
    print(f"Log Loss: {logloss:.5f}")
    
    # F1-Score (Macro and Weighted)
    f1_macro = f1_score(y_true, y_pred_binary, average='macro')
    f1_weighted = f1_score(y_true, y_pred_binary, average='weighted')
    print(f"F1-Score (Macro): {f1_macro:.5f}")
    print(f"F1-Score (Weighted): {f1_weighted:.5f}")
    
    # Note: roc_auc_ovr is for multi-class problems. For binary, it's equivalent
    # to the standard roc_auc_score, so we report that one.
    
    print("-" * 33)
    return auc

# Run the evaluation
val_auc = evaluate_classification(y_val, val_predictions_probs, val_predictions_binary)

# Plot training history
def plot_history(history):
    plt.figure(figsize=(12, 5))

    # Plot AUC
    plt.subplot(1, 2, 1)
    plt.plot(history.history['auc'], label='Training AUC')
    plt.plot(history.history['val_auc'], label='Validation AUC')
    plt.title('AUC Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('AUC')
    plt.legend()
    plt.grid(True)

    # Plot Loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('training_history.png')
    plt.show()

print("\nPlotting training history...")
plot_history(history)


# --- 6. Prediction and Submission ---
print("\n--- 6. Generating Submission File ---")

print("Predicting on the test set...")
test_predictions_logits = best_model.predict(X_test, batch_size=cfg.BATCH_SIZE)
test_predictions_probs = tf.nn.sigmoid(test_predictions_logits).numpy().flatten()

print("Creating submission dataframe...")
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'toxic': test_predictions_probs
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("Final Validation AUC:", val_auc)
print("Script finished.")

2025-07-01 13:07:49.003484: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751375269.239950      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751375269.309168      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


--- Library Versions ---
TensorFlow: 2.18.0
Keras: 3.8.0
KerasNLP: 0.18.1
Pandas: 2.2.3
NumPy: 1.26.4
-------------------------

--- 2. Loading and Preparing Data ---
Loading jigsaw-toxic-comment-train.csv...
Loading jigsaw-unintended-bias-train.csv...
Combined English training data shape: (2100579, 2)
Class distribution in combined training data:
toxic
0    0.918207
1    0.081793
Name: proportion, dtype: float64

Loading validation and test data...
Validation data shape: (8000, 4)
Test data shape: (63812, 3)

--- 3. Building the KerasNLP Model ---


AttributeError: module 'keras_hub.api.models' has no attribute 'XlmRobertaPreprocessor'