In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test_labels.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv


In [2]:
# -*- coding: utf-8 -*-
"""
Kaggle Solution for Jigsaw Multilingual Toxic Comment Classification.

This program implements an end-to-end pipeline for a cross-lingual text classification task.
It uses a pre-trained multilingual transformer model (XLM-Roberta) to classify online
comments as toxic or non-toxic. The model is fine-tuned on English-only data and
evaluated on non-English data to test its generalization capabilities.

The solution is optimized to run on a Google Cloud TPU (v3-8), leveraging TensorFlow's
distribution strategies for efficient, large-scale training.
"""

import os
import sys
import gc

# Suppress verbose logging and warnings for a cleaner output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import tensorflow as tf
import keras_nlp
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.model_selection import train_test_split

# --- 1. Configuration ---
# All hyperparameters and settings are centralized here for easy tuning.
class Config:
    """
    Configuration class for model and training hyperparameters.
    """
    # File Paths
    BASE_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
    TRAIN_TOXIC_PATH = os.path.join(BASE_PATH, "jigsaw-toxic-comment-train.csv")
    TRAIN_BIAS_PATH = os.path.join(BASE_PATH, "jigsaw-unintended-bias-train.csv")
    VALIDATION_PATH = os.path.join(BASE_PATH, "validation.csv")
    TEST_PATH = os.path.join(BASE_PATH, "test.csv")
    SUBMISSION_PATH = "submission.csv"

    # Model Configuration
    # XLM-Roberta is a strong choice for multilingual tasks.
    PRESET = "xlm_roberta_base_multi"
    SEQUENCE_LENGTH = 192  # Max length of text sequences. Balances context and memory.

    # Training Configuration
    EPOCHS = 2  # Fine-tuning transformers requires only a few epochs.
    BATCH_SIZE_PER_REPLICA = 16 # Batch size for each TPU core.
    LEARNING_RATE = 2e-5  # A standard learning rate for fine-tuning transformers.
    
    # Set a random seed for reproducibility
    SEED = 42

# --- 2. TPU Initialization ---
# This section detects and initializes the TPU for distributed training.
def initialize_tpu():
    """
    Detects and initializes the TPU strategy.
    Returns the distribution strategy and the global batch size.
    """
    print("--- Initializing TPU ---")
    try:
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu_resolver)
        print(f"TPU detected. Found {strategy.num_replicas_in_sync} replicas.")
    except ValueError:
        print("TPU not detected. Falling back to CPU/GPU strategy.")
        strategy = tf.distribute.MirroredStrategy()

    # Calculate the global batch size based on the number of replicas.
    global_batch_size = Config.BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    print(f"Global batch size set to: {global_batch_size}")
    
    # Enable mixed precision for performance boost on TPUs.
    keras.mixed_precision.set_global_policy("mixed_bfloat16")
    
    return strategy, global_batch_size

# --- 3. Data Loading and Preparation ---
def load_and_prepare_data():
    """
    Loads, preprocesses, and combines the training, validation, and test datasets.
    """
    print("\n--- Loading and Preparing Data ---")
    
    # Load primary training data
    train_toxic_df = pd.read_csv(Config.TRAIN_TOXIC_PATH, usecols=['comment_text', 'toxic'])
    
    # Load secondary training data (unintended bias)
    # The 'toxic' column is a float score; we convert it to a binary label.
    train_bias_df = pd.read_csv(Config.TRAIN_BIAS_PATH, usecols=['comment_text', 'toxic'])
    train_bias_df['toxic'] = (train_bias_df['toxic'] >= 0.5).astype(int)
    
    # Combine the two training datasets for a larger, more robust training set
    train_df = pd.concat([train_toxic_df, train_bias_df], ignore_index=True)
    # Remove duplicates that might arise from concatenating datasets
    train_df.drop_duplicates(subset=['comment_text'], keep='first', inplace=True)
    print(f"Combined training data shape: {train_df.shape}")

    # Load multilingual validation data
    valid_df = pd.read_csv(Config.VALIDATION_PATH)
    print(f"Validation data shape: {valid_df.shape}")

    # Load test data
    test_df = pd.read_csv(Config.TEST_PATH)
    print(f"Test data shape: {test_df.shape}")
    
    # Extract text and labels for model input
    X_train = train_df['comment_text'].values
    y_train = train_df['toxic'].values
    
    X_valid = valid_df['comment_text'].values
    y_valid = valid_df['toxic'].values
    
    X_test = test_df['content'].values
    
    return X_train, y_train, X_valid, y_valid, X_test, test_df['id']

# --- 4. Dataset Pipeline ---
# This function creates a tf.data.Dataset for efficient feeding to the model.
def build_dataset(texts, labels=None, batch_size=32, is_training=True):
    """
    Creates a tf.data.Dataset from text and label arrays.
    
    Args:
        texts (np.array): Array of text strings.
        labels (np.array, optional): Array of labels. Defaults to None.
        batch_size (int): The batch size for the dataset.
        is_training (bool): If True, shuffles the dataset.

    Returns:
        tf.data.Dataset: A configured dataset object.
    """
    # Use AUTOTUNE to automatically tune prefetch buffer sizes
    AUTOTUNE = tf.data.AUTOTUNE

    if labels is None:
        dataset = tf.data.Dataset.from_tensor_slices(texts)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    
    if is_training:
        dataset = dataset.shuffle(buffer_size=1024, seed=Config.SEED).repeat()
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE) # Prefetch data for faster consumption
    return dataset

# --- 5. Model Building ---
def build_model(strategy):
    """
    Builds and compiles the Keras NLP classifier model within the TPU strategy scope.
    """
    print("\n--- Building Model ---")
    with strategy.scope():
        # KerasNLP's Classifier handles tokenization and model architecture in one step.
        # It's a high-level API that simplifies building transformer models.
        classifier = keras_nlp.models.Classifier.from_preset(
            Config.PRESET,
            num_classes=1, # Binary classification (toxic/not-toxic)
            preprocessor=keras_nlp.models.XlmRobertaPreprocessor.from_preset(
                Config.PRESET,
                sequence_length=Config.SEQUENCE_LENGTH
            )
        )
        
        # Define optimizer with a learning rate schedule for better convergence
        optimizer = keras.optimizers.AdamW(learning_rate=Config.LEARNING_RATE)
        
        # Compile the model
        # We use from_logits=True because the model outputs raw logits, which is
        # numerically more stable than outputting probabilities.
        classifier.compile(
            optimizer=optimizer,
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[
                keras.metrics.BinaryAccuracy(name="accuracy"),
                keras.metrics.AUC(name="roc_auc", from_logits=True)
            ],
            jit_compile=True # Enable XLA compilation for another speed boost
        )
        
    classifier.summary()
    return classifier

# --- 6. Training, Prediction, and Evaluation ---
def run_pipeline():
    """
    Executes the full ML pipeline: setup, data loading, training, and prediction.
    """
    # Initialize TPU
    strategy, global_batch_size = initialize_tpu()
    
    # Load and prepare data
    X_train, y_train, X_valid, y_valid, X_test, test_ids = load_and_prepare_data()
    
    # Create tf.data.Dataset objects
    train_dataset = build_dataset(X_train, y_train, batch_size=global_batch_size, is_training=True)
    valid_dataset = build_dataset(X_valid, y_valid, batch_size=global_batch_size, is_training=False)
    test_dataset = build_dataset(X_test, batch_size=global_batch_size, is_training=False)
    
    # Build the model within the strategy scope
    model = build_model(strategy)
    
    # Train the model
    print("\n--- Starting Model Training ---")
    # NOTE on evaluation metrics: The user requested a split of the training data for evaluation.
    # However, for this specific cross-lingual problem, it is far more informative to evaluate
    # on the provided multilingual `validation.csv` set. This directly tests the model's ability
    # to generalize across languages, which is the core objective. The metrics below will be
    # calculated on this validation set at the end of each epoch.
    
    # We calculate steps_per_epoch because the training dataset is set to repeat()
    steps_per_epoch = len(X_train) // global_batch_size

    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=Config.EPOCHS,
        steps_per_epoch=steps_per_epoch
    )
    
    print("\n--- Training Complete ---")
    
    # --- Prediction and Evaluation on Validation Set ---
    print("\n--- Evaluating on Validation Set ---")
    valid_preds_logits = model.predict(valid_dataset, verbose=1)
    # Convert logits to probabilities using the sigmoid function
    valid_preds_probs = tf.nn.sigmoid(valid_preds_logits).numpy().flatten()
    
    # Calculate requested evaluation metrics
    accuracy = accuracy_score(y_valid, valid_preds_probs > 0.5)
    loss = log_loss(y_valid, valid_preds_probs)
    roc_auc = roc_auc_score(y_valid, valid_preds_probs)
    f1_macro = f1_score(y_valid, valid_preds_probs > 0.5, average='macro')
    f1_weighted = f1_score(y_valid, valid_preds_probs > 0.5, average='weighted')
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Log Loss: {loss:.4f}")
    print(f"Validation ROC AUC: {roc_auc:.4f}")
    print(f"Validation F1 Macro: {f1_macro:.4f}")
    print(f"Validation F1 Weighted: {f1_weighted:.4f}")
    
    # --- Prediction for Submission ---
    print("\n--- Generating Test Predictions ---")
    test_preds_logits = model.predict(test_dataset, verbose=1)
    test_preds_probs = tf.nn.sigmoid(test_preds_logits).numpy().flatten()
    
    # Create submission file
    submission_df = pd.DataFrame({'id': test_ids, 'toxic': test_preds_probs})
    submission_df.to_csv(Config.SUBMISSION_PATH, index=False)
    print(f"\nSubmission file created at: {Config.SUBMISSION_PATH}")
    print(submission_df.head())
    
    # Clean up memory
    del model, X_train, y_train, X_valid, y_valid, X_test, test_ids
    gc.collect()

# --- Main Execution ---
if __name__ == "__main__":
    run_pipeline()

E0000 00:00:1751465449.261477      10 common_lib.cc:612] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:230
  from .autonotebook import tqdm as notebook_tqdm


--- Initializing TPU ---
TPU not detected. Falling back to CPU/GPU strategy.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


I0000 00:00:1751465470.537716      10 service.cc:148] XLA service 0x5b765b86b0c0 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751465470.537764      10 service.cc:156]   StreamExecutor device (0): TPU, 2a886c8
I0000 00:00:1751465470.537768      10 service.cc:156]   StreamExecutor device (1): TPU, 2a886c8
I0000 00:00:1751465470.537771      10 service.cc:156]   StreamExecutor device (2): TPU, 2a886c8
I0000 00:00:1751465470.537774      10 service.cc:156]   StreamExecutor device (3): TPU, 2a886c8
I0000 00:00:1751465470.537776      10 service.cc:156]   StreamExecutor device (4): TPU, 2a886c8
I0000 00:00:1751465470.537778      10 service.cc:156]   StreamExecutor device (5): TPU, 2a886c8
I0000 00:00:1751465470.537781      10 service.cc:156]   StreamExecutor device (6): TPU, 2a886c8
I0000 00:00:1751465470.537784      10 service.cc:156]   StreamExecutor device (7): TPU, 2a886c8


Global batch size set to: 16


NameError: name 'keras' is not defined

In [3]:
# -*- coding: utf-8 -*-
"""
Kaggle Solution for Jigsaw Multilingual Toxic Comment Classification.

This program implements an end-to-end pipeline for a cross-lingual text classification task.
It uses a pre-trained multilingual transformer model (XLM-Roberta) to classify online
comments as toxic or non-toxic. The model is fine-tuned on English-only data and
evaluated on non-English data to test its generalization capabilities.

The solution is optimized to run on a Google Cloud TPU (v3-8), leveraging TensorFlow's
distribution strategies for efficient, large-scale training.
"""

import os
import sys
import gc

# Suppress verbose logging and warnings for a cleaner output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_nlp
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.model_selection import train_test_split

# --- 1. Configuration ---
# All hyperparameters and settings are centralized here for easy tuning.
class Config:
    """
    Configuration class for model and training hyperparameters.
    """
    # File Paths
    BASE_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
    TRAIN_TOXIC_PATH = os.path.join(BASE_PATH, "jigsaw-toxic-comment-train.csv")
    TRAIN_BIAS_PATH = os.path.join(BASE_PATH, "jigsaw-unintended-bias-train.csv")
    VALIDATION_PATH = os.path.join(BASE_PATH, "validation.csv")
    TEST_PATH = os.path.join(BASE_PATH, "test.csv")
    SUBMISSION_PATH = "submission.csv"

    # Model Configuration
    # XLM-Roberta is a strong choice for multilingual tasks.
    PRESET = "xlm_roberta_base_multi"
    SEQUENCE_LENGTH = 192  # Max length of text sequences. Balances context and memory.

    # Training Configuration
    EPOCHS = 2  # Fine-tuning transformers requires only a few epochs.
    BATCH_SIZE_PER_REPLICA = 16 # Batch size for each TPU core.
    LEARNING_RATE = 2e-5  # A standard learning rate for fine-tuning transformers.
    
    # Set a random seed for reproducibility
    SEED = 42

# --- 2. TPU Initialization ---
# This section detects and initializes the TPU for distributed training.
def initialize_tpu():
    """
    Detects and initializes the TPU strategy.
    Returns the distribution strategy and the global batch size.
    """
    print("--- Initializing TPU ---")
    try:
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu_resolver)
        print(f"TPU detected. Found {strategy.num_replicas_in_sync} replicas.")
    except ValueError:
        print("TPU not detected. Falling back to CPU/GPU strategy.")
        strategy = tf.distribute.MirroredStrategy()

    # Calculate the global batch size based on the number of replicas.
    global_batch_size = Config.BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    print(f"Global batch size set to: {global_batch_size}")
    
    # Enable mixed precision for performance boost on TPUs.
    keras.mixed_precision.set_global_policy("mixed_bfloat16")
    
    return strategy, global_batch_size

# --- 3. Data Loading and Preparation ---
def load_and_prepare_data():
    """
    Loads, preprocesses, and combines the training, validation, and test datasets.
    """
    print("\n--- Loading and Preparing Data ---")
    
    # Load primary training data
    train_toxic_df = pd.read_csv(Config.TRAIN_TOXIC_PATH, usecols=['comment_text', 'toxic'])
    
    # Load secondary training data (unintended bias)
    # The 'toxic' column is a float score; we convert it to a binary label.
    train_bias_df = pd.read_csv(Config.TRAIN_BIAS_PATH, usecols=['comment_text', 'toxic'])
    train_bias_df['toxic'] = (train_bias_df['toxic'] >= 0.5).astype(int)
    
    # Combine the two training datasets for a larger, more robust training set
    train_df = pd.concat([train_toxic_df, train_bias_df], ignore_index=True)
    # Remove duplicates that might arise from concatenating datasets
    train_df.drop_duplicates(subset=['comment_text'], keep='first', inplace=True)
    print(f"Combined training data shape: {train_df.shape}")

    # Load multilingual validation data
    valid_df = pd.read_csv(Config.VALIDATION_PATH)
    print(f"Validation data shape: {valid_df.shape}")

    # Load test data
    test_df = pd.read_csv(Config.TEST_PATH)
    print(f"Test data shape: {test_df.shape}")
    
    # Extract text and labels for model input
    X_train = train_df['comment_text'].values
    y_train = train_df['toxic'].values
    
    X_valid = valid_df['comment_text'].values
    y_valid = valid_df['toxic'].values
    
    X_test = test_df['content'].values
    
    return X_train, y_train, X_valid, y_valid, X_test, test_df['id']

# --- 4. Dataset Pipeline ---
# This function creates a tf.data.Dataset for efficient feeding to the model.
def build_dataset(texts, labels=None, batch_size=32, is_training=True):
    """
    Creates a tf.data.Dataset from text and label arrays.
    
    Args:
        texts (np.array): Array of text strings.
        labels (np.array, optional): Array of labels. Defaults to None.
        batch_size (int): The batch size for the dataset.
        is_training (bool): If True, shuffles the dataset.

    Returns:
        tf.data.Dataset: A configured dataset object.
    """
    # Use AUTOTUNE to automatically tune prefetch buffer sizes
    AUTOTUNE = tf.data.AUTOTUNE

    if labels is None:
        dataset = tf.data.Dataset.from_tensor_slices(texts)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    
    if is_training:
        dataset = dataset.shuffle(buffer_size=1024, seed=Config.SEED).repeat()
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE) # Prefetch data for faster consumption
    return dataset

# --- 5. Model Building ---
def build_model(strategy):
    """
    Builds and compiles the Keras NLP classifier model within the TPU strategy scope.
    """
    print("\n--- Building Model ---")
    with strategy.scope():
        # KerasNLP's Classifier handles tokenization and model architecture in one step.
        # It's a high-level API that simplifies building transformer models.
        classifier = keras_nlp.models.Classifier.from_preset(
            Config.PRESET,
            num_classes=1, # Binary classification (toxic/not-toxic)
            preprocessor=keras_nlp.models.XlmRobertaPreprocessor.from_preset(
                Config.PRESET,
                sequence_length=Config.SEQUENCE_LENGTH
            )
        )
        
        # Define optimizer with a learning rate schedule for better convergence
        optimizer = keras.optimizers.AdamW(learning_rate=Config.LEARNING_RATE)
        
        # Compile the model
        # We use from_logits=True because the model outputs raw logits, which is
        # numerically more stable than outputting probabilities.
        classifier.compile(
            optimizer=optimizer,
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[
                keras.metrics.BinaryAccuracy(name="accuracy"),
                keras.metrics.AUC(name="roc_auc", from_logits=True)
            ],
            jit_compile=True # Enable XLA compilation for another speed boost
        )
        
    classifier.summary()
    return classifier

# --- 6. Training, Prediction, and Evaluation ---
def run_pipeline():
    """
    Executes the full ML pipeline: setup, data loading, training, and prediction.
    """
    # Initialize TPU
    strategy, global_batch_size = initialize_tpu()
    
    # Load and prepare data
    X_train, y_train, X_valid, y_valid, X_test, test_ids = load_and_prepare_data()
    
    # Create tf.data.Dataset objects
    train_dataset = build_dataset(X_train, y_train, batch_size=global_batch_size, is_training=True)
    valid_dataset = build_dataset(X_valid, y_valid, batch_size=global_batch_size, is_training=False)
    test_dataset = build_dataset(X_test, batch_size=global_batch_size, is_training=False)
    
    # Build the model within the strategy scope
    model = build_model(strategy)
    
    # Train the model
    print("\n--- Starting Model Training ---")
    # NOTE on evaluation metrics: The user requested a split of the training data for evaluation.
    # However, for this specific cross-lingual problem, it is far more informative to evaluate
    # on the provided multilingual `validation.csv` set. This directly tests the model's ability
    # to generalize across languages, which is the core objective. The metrics below will be
    # calculated on this validation set after training is complete.
    
    # We calculate steps_per_epoch because the training dataset is set to repeat()
    steps_per_epoch = len(X_train) // global_batch_size

    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=Config.EPOCHS,
        steps_per_epoch=steps_per_epoch
    )
    
    print("\n--- Training Complete ---")
    
    # --- Prediction and Evaluation on Validation Set ---
    print("\n--- Evaluating on Validation Set ---")
    valid_preds_logits = model.predict(valid_dataset, verbose=1)
    # Convert logits to probabilities using the sigmoid function
    valid_preds_probs = tf.nn.sigmoid(valid_preds_logits).numpy().flatten()
    
    # Calculate requested evaluation metrics
    # For AUC, the 'ovr' setting is for multi-class; for binary, it's not needed.
    # We'll calculate both macro and weighted F1 for completeness.
    y_pred_binary = valid_preds_probs > 0.5
    accuracy = accuracy_score(y_valid, y_pred_binary)
    loss = log_loss(y_valid, valid_preds_probs)
    roc_auc = roc_auc_score(y_valid, valid_preds_probs)
    f1_macro = f1_score(y_valid, y_pred_binary, average='macro')
    f1_weighted = f1_score(y_valid, y_pred_binary, average='weighted')
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Log Loss: {loss:.4f}")
    print(f"Validation ROC AUC (AUC): {roc_auc:.4f}")
    print(f"Validation F1 Macro: {f1_macro:.4f}")
    print(f"Validation F1 Weighted: {f1_weighted:.4f}")
    
    # --- Prediction for Submission ---
    print("\n--- Generating Test Predictions ---")
    test_preds_logits = model.predict(test_dataset, verbose=1)
    test_preds_probs = tf.nn.sigmoid(test_preds_logits).numpy().flatten()
    
    # Create submission file
    submission_df = pd.DataFrame({'id': test_ids, 'toxic': test_preds_probs})
    submission_df.to_csv(Config.SUBMISSION_PATH, index=False)
    print(f"\nSubmission file created at: {Config.SUBMISSION_PATH}")
    print(submission_df.head())
    
    # Clean up memory
    del model, X_train, y_train, X_valid, y_valid, X_test, test_ids
    gc.collect()

# --- Main Execution ---
if __name__ == "__main__":
    run_pipeline()

--- Initializing TPU ---
TPU not detected. Falling back to CPU/GPU strategy.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Global batch size set to: 16

--- Loading and Preparing Data ---
Combined training data shape: (2099954, 2)
Validation data shape: (8000, 4)
Test data shape: (63812, 3)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [4]:
# -*- coding: utf-8 -*-
"""
Kaggle Solution for Jigsaw Multilingual Toxic Comment Classification.

This program implements an end-to-end pipeline for a cross-lingual text classification task.
It uses a pre-trained multilingual transformer model (XLM-Roberta) to classify online
comments as toxic or non-toxic. The model is fine-tuned on English-only data and
evaluated on non-English data to test its generalization capabilities.

The solution is optimized to run on a Google Cloud TPU (v3-8), leveraging TensorFlow's
distribution strategies for efficient, large-scale training.
"""

import os
import sys
import gc

# Suppress verbose logging and warnings for a cleaner output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_nlp
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.model_selection import train_test_split

# --- 1. Configuration ---
# All hyperparameters and settings are centralized here for easy tuning.
class Config:
    """
    Configuration class for model and training hyperparameters.
    """
    # File Paths
    BASE_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
    TRAIN_TOXIC_PATH = os.path.join(BASE_PATH, "jigsaw-toxic-comment-train.csv")
    TRAIN_BIAS_PATH = os.path.join(BASE_PATH, "jigsaw-unintended-bias-train.csv")
    VALIDATION_PATH = os.path.join(BASE_PATH, "validation.csv")
    TEST_PATH = os.path.join(BASE_PATH, "test.csv")
    SUBMISSION_PATH = "submission.csv"

    # Model Configuration
    # XLM-Roberta is a strong choice for multilingual tasks.
    PRESET = "xlm_roberta_base_multi"
    SEQUENCE_LENGTH = 192  # Max length of text sequences. Balances context and memory.

    # Training Configuration
    EPOCHS = 2  # Fine-tuning transformers requires only a few epochs.
    BATCH_SIZE_PER_REPLICA = 16 # Batch size for each TPU core.
    LEARNING_RATE = 2e-5  # A standard learning rate for fine-tuning transformers.
    
    # Set a random seed for reproducibility
    SEED = 42

# --- 2. TPU Initialization ---
# This section detects and initializes the TPU for distributed training.
def initialize_tpu():
    """
    Detects and initializes the TPU strategy.
    Returns the distribution strategy and the global batch size.
    """
    print("--- Initializing TPU ---")
    try:
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu_resolver)
        print(f"TPU detected. Found {strategy.num_replicas_in_sync} replicas.")
    except ValueError:
        print("TPU not detected. Falling back to CPU/GPU strategy.")
        strategy = tf.distribute.MirroredStrategy()

    # Calculate the global batch size based on the number of replicas.
    global_batch_size = Config.BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    print(f"Global batch size set to: {global_batch_size}")
    
    # Enable mixed precision for performance boost on TPUs.
    keras.mixed_precision.set_global_policy("mixed_bfloat16")
    
    return strategy, global_batch_size

# --- 3. Data Loading and Preparation ---
def load_and_prepare_data():
    """
    Loads, preprocesses, and combines the training, validation, and test datasets.
    """
    print("\n--- Loading and Preparing Data ---")
    
    # Load primary training data
    train_toxic_df = pd.read_csv(Config.TRAIN_TOXIC_PATH, usecols=['comment_text', 'toxic'])
    
    # Load secondary training data (unintended bias)
    train_bias_df = pd.read_csv(Config.TRAIN_BIAS_PATH, usecols=['comment_text', 'toxic'])
    train_bias_df['toxic'] = (train_bias_df['toxic'] >= 0.5).astype(int)
    
    # Combine the two training datasets
    train_df = pd.concat([train_toxic_df, train_bias_df], ignore_index=True)
    train_df.drop_duplicates(subset=['comment_text'], keep='first', inplace=True)
    print(f"Combined training data shape: {train_df.shape}")

    # Load multilingual validation data
    valid_df = pd.read_csv(Config.VALIDATION_PATH)
    print(f"Validation data shape: {valid_df.shape}")

    # Load test data
    test_df = pd.read_csv(Config.TEST_PATH)
    print(f"Test data shape: {test_df.shape}")
    
    # --- FIX: Handle potential NaN values in text data ---
    # NaN values in a text column cause the numpy array to have dtype=object,
    # which TensorFlow cannot convert directly. We fill them with an empty string.
    print("Cleaning missing values from text columns...")
    train_df['comment_text'].fillna('', inplace=True)
    valid_df['comment_text'].fillna('', inplace=True)
    test_df['content'].fillna('', inplace=True)

    # To be robust, also ensure label columns have no NaNs and are integers.
    train_df['toxic'].fillna(0, inplace=True)
    valid_df['toxic'].fillna(0, inplace=True)
    
    # Extract text and labels for model input, explicitly casting types
    X_train = train_df['comment_text'].astype(str).values
    y_train = train_df['toxic'].astype(int).values
    
    X_valid = valid_df['comment_text'].astype(str).values
    y_valid = valid_df['toxic'].astype(int).values
    
    X_test = test_df['content'].astype(str).values
    
    return X_train, y_train, X_valid, y_valid, X_test, test_df['id']

# --- 4. Dataset Pipeline ---
# This function creates a tf.data.Dataset for efficient feeding to the model.
def build_dataset(texts, labels=None, batch_size=32, is_training=True):
    """
    Creates a tf.data.Dataset from text and label arrays.
    
    Args:
        texts (np.array): Array of text strings.
        labels (np.array, optional): Array of labels. Defaults to None.
        batch_size (int): The batch size for the dataset.
        is_training (bool): If True, shuffles the dataset.

    Returns:
        tf.data.Dataset: A configured dataset object.
    """
    # Use AUTOTUNE to automatically tune prefetch buffer sizes
    AUTOTUNE = tf.data.AUTOTUNE

    if labels is None:
        dataset = tf.data.Dataset.from_tensor_slices(texts)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    
    if is_training:
        dataset = dataset.shuffle(buffer_size=1024, seed=Config.SEED).repeat()
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE) # Prefetch data for faster consumption
    return dataset

# --- 5. Model Building ---
def build_model(strategy):
    """
    Builds and compiles the Keras NLP classifier model within the TPU strategy scope.
    """
    print("\n--- Building Model ---")
    with strategy.scope():
        # KerasNLP's Classifier handles tokenization and model architecture in one step.
        classifier = keras_nlp.models.Classifier.from_preset(
            Config.PRESET,
            num_classes=1, # Binary classification (toxic/not-toxic)
            preprocessor=keras_nlp.models.XlmRobertaPreprocessor.from_preset(
                Config.PRESET,
                sequence_length=Config.SEQUENCE_LENGTH
            )
        )
        
        # Define optimizer with a learning rate schedule for better convergence
        optimizer = keras.optimizers.AdamW(learning_rate=Config.LEARNING_RATE)
        
        # Compile the model
        classifier.compile(
            optimizer=optimizer,
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[
                keras.metrics.BinaryAccuracy(name="accuracy"),
                keras.metrics.AUC(name="roc_auc", from_logits=True)
            ],
            jit_compile=True # Enable XLA compilation for another speed boost
        )
        
    classifier.summary()
    return classifier

# --- 6. Training, Prediction, and Evaluation ---
def run_pipeline():
    """
    Executes the full ML pipeline: setup, data loading, training, and prediction.
    """
    # Initialize TPU
    strategy, global_batch_size = initialize_tpu()
    
    # Load and prepare data
    X_train, y_train, X_valid, y_valid, X_test, test_ids = load_and_prepare_data()
    
    # Create tf.data.Dataset objects
    train_dataset = build_dataset(X_train, y_train, batch_size=global_batch_size, is_training=True)
    valid_dataset = build_dataset(X_valid, y_valid, batch_size=global_batch_size, is_training=False)
    test_dataset = build_dataset(X_test, batch_size=global_batch_size, is_training=False)
    
    # Build the model within the strategy scope
    model = build_model(strategy)
    
    # Train the model
    print("\n--- Starting Model Training ---")
    steps_per_epoch = len(X_train) // global_batch_size

    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=Config.EPOCHS,
        steps_per_epoch=steps_per_epoch
    )
    
    print("\n--- Training Complete ---")
    
    # --- Prediction and Evaluation on Validation Set ---
    print("\n--- Evaluating on Validation Set ---")
    valid_preds_logits = model.predict(valid_dataset, verbose=1)
    # Convert logits to probabilities using the sigmoid function
    valid_preds_probs = tf.nn.sigmoid(valid_preds_logits).numpy().flatten()
    
    # Calculate requested evaluation metrics
    y_pred_binary = valid_preds_probs > 0.5
    accuracy = accuracy_score(y_valid, y_pred_binary)
    loss = log_loss(y_valid, valid_preds_probs)
    roc_auc = roc_auc_score(y_valid, valid_preds_probs)
    f1_macro = f1_score(y_valid, y_pred_binary, average='macro')
    f1_weighted = f1_score(y_valid, y_pred_binary, average='weighted')
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Log Loss: {loss:.4f}")
    print(f"Validation ROC AUC (AUC): {roc_auc:.4f}")
    print(f"Validation F1 Macro: {f1_macro:.4f}")
    print(f"Validation F1 Weighted: {f1_weighted:.4f}")
    
    # --- Prediction for Submission ---
    print("\n--- Generating Test Predictions ---")
    test_preds_logits = model.predict(test_dataset, verbose=1)
    test_preds_probs = tf.nn.sigmoid(test_preds_logits).numpy().flatten()
    
    # Create submission file
    submission_df = pd.DataFrame({'id': test_ids, 'toxic': test_preds_probs})
    submission_df.to_csv(Config.SUBMISSION_PATH, index=False)
    print(f"\nSubmission file created at: {Config.SUBMISSION_PATH}")
    print(submission_df.head())
    
    # Clean up memory
    del model, X_train, y_train, X_valid, y_valid, X_test, test_ids
    gc.collect()

# --- Main Execution ---
if __name__ == "__main__":
    run_pipeline()

--- Initializing TPU ---
TPU not detected. Falling back to CPU/GPU strategy.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Global batch size set to: 16

--- Loading and Preparing Data ---
Combined training data shape: (2099954, 2)
Validation data shape: (8000, 4)
Test data shape: (63812, 3)
Cleaning missing values from text columns...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['comment_text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  valid_df['comment_text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 


--- Building Model ---


AttributeError: module 'keras_hub.api.models' has no attribute 'XlmRobertaPreprocessor'

In [5]:
# -*- coding: utf-8 -*-
"""
Kaggle Solution for Jigsaw Multilingual Toxic Comment Classification.

This program implements an end-to-end pipeline for a cross-lingual text classification task.
It uses a pre-trained multilingual transformer model (XLM-Roberta) to classify online
comments as toxic or non-toxic. The model is fine-tuned on English-only data and
evaluated on non-English data to test its generalization capabilities.

The solution is optimized to run on a Google Cloud TPU (v3-8), leveraging TensorFlow's
distribution strategies for efficient, large-scale training.
"""

import os
import sys
import gc

# Suppress verbose logging and warnings for a cleaner output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_nlp
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.model_selection import train_test_split

# --- 1. Configuration ---
# All hyperparameters and settings are centralized here for easy tuning.
class Config:
    """
    Configuration class for model and training hyperparameters.
    """
    # File Paths
    BASE_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
    TRAIN_TOXIC_PATH = os.path.join(BASE_PATH, "jigsaw-toxic-comment-train.csv")
    TRAIN_BIAS_PATH = os.path.join(BASE_PATH, "jigsaw-unintended-bias-train.csv")
    VALIDATION_PATH = os.path.join(BASE_PATH, "validation.csv")
    TEST_PATH = os.path.join(BASE_PATH, "test.csv")
    SUBMISSION_PATH = "submission.csv"

    # Model Configuration
    # XLM-Roberta is a strong choice for multilingual tasks.
    PRESET = "xlm_roberta_base_multi"
    SEQUENCE_LENGTH = 192  # Max length of text sequences. Balances context and memory.

    # Training Configuration
    EPOCHS = 2  # Fine-tuning transformers requires only a few epochs.
    BATCH_SIZE_PER_REPLICA = 16 # Batch size for each TPU core.
    LEARNING_RATE = 2e-5  # A standard learning rate for fine-tuning transformers.
    
    # Set a random seed for reproducibility
    SEED = 42

# --- 2. TPU Initialization ---
# This section detects and initializes the TPU for distributed training.
def initialize_tpu():
    """
    Detects and initializes the TPU strategy.
    Returns the distribution strategy and the global batch size.
    """
    print("--- Initializing TPU ---")
    try:
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu_resolver)
        print(f"TPU detected. Found {strategy.num_replicas_in_sync} replicas.")
    except ValueError:
        print("TPU not detected. Falling back to CPU/GPU strategy.")
        strategy = tf.distribute.MirroredStrategy()

    # Calculate the global batch size based on the number of replicas.
    global_batch_size = Config.BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    print(f"Global batch size set to: {global_batch_size}")
    
    # Enable mixed precision for performance boost on TPUs.
    keras.mixed_precision.set_global_policy("mixed_bfloat16")
    
    return strategy, global_batch_size

# --- 3. Data Loading and Preparation ---
def load_and_prepare_data():
    """
    Loads, preprocesses, and combines the training, validation, and test datasets.
    """
    print("\n--- Loading and Preparing Data ---")
    
    # Load primary training data
    train_toxic_df = pd.read_csv(Config.TRAIN_TOXIC_PATH, usecols=['comment_text', 'toxic'])
    
    # Load secondary training data (unintended bias)
    train_bias_df = pd.read_csv(Config.TRAIN_BIAS_PATH, usecols=['comment_text', 'toxic'])
    train_bias_df['toxic'] = (train_bias_df['toxic'] >= 0.5).astype(int)
    
    # Combine the two training datasets
    train_df = pd.concat([train_toxic_df, train_bias_df], ignore_index=True)
    train_df.drop_duplicates(subset=['comment_text'], keep='first', inplace=True)
    print(f"Combined training data shape: {train_df.shape}")

    # Load multilingual validation data
    valid_df = pd.read_csv(Config.VALIDATION_PATH)
    print(f"Validation data shape: {valid_df.shape}")

    # Load test data
    test_df = pd.read_csv(Config.TEST_PATH)
    print(f"Test data shape: {test_df.shape}")
    
    # Handle potential NaN values in text data
    print("Cleaning missing values from text columns...")
    train_df['comment_text'].fillna('', inplace=True)
    valid_df['comment_text'].fillna('', inplace=True)
    test_df['content'].fillna('', inplace=True)

    # To be robust, also ensure label columns have no NaNs and are integers.
    train_df['toxic'].fillna(0, inplace=True)
    valid_df['toxic'].fillna(0, inplace=True)
    
    # Extract text and labels for model input, explicitly casting types
    X_train = train_df['comment_text'].astype(str).values
    y_train = train_df['toxic'].astype(int).values
    
    X_valid = valid_df['comment_text'].astype(str).values
    y_valid = valid_df['toxic'].astype(int).values
    
    X_test = test_df['content'].astype(str).values
    
    return X_train, y_train, X_valid, y_valid, X_test, test_df['id']

# --- 4. Dataset Pipeline ---
def build_dataset(texts, labels=None, batch_size=32, is_training=True):
    """
    Creates a tf.data.Dataset from text and label arrays.
    """
    AUTOTUNE = tf.data.AUTOTUNE

    if labels is None:
        dataset = tf.data.Dataset.from_tensor_slices(texts)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    
    if is_training:
        dataset = dataset.shuffle(buffer_size=1024, seed=Config.SEED).repeat()
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

# --- 5. Model Building ---
def build_model(strategy):
    """
    Builds and compiles the Keras NLP classifier model within the TPU strategy scope.
    """
    print("\n--- Building Model ---")
    with strategy.scope():
        # --- FIX APPLIED HERE ---
        # The Classifier.from_preset method is smart. We can pass preprocessor
        # arguments like `sequence_length` directly to it. This avoids the
        # need to manually instantiate the preprocessor, fixing the error.
        classifier = keras_nlp.models.Classifier.from_preset(
            Config.PRESET,
            num_classes=1, # Binary classification (toxic/not-toxic)
            sequence_length=Config.SEQUENCE_LENGTH # This is passed to the preprocessor
        )
        
        # Define optimizer
        optimizer = keras.optimizers.AdamW(learning_rate=Config.LEARNING_RATE)
        
        # Compile the model
        classifier.compile(
            optimizer=optimizer,
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[
                keras.metrics.BinaryAccuracy(name="accuracy"),
                keras.metrics.AUC(name="roc_auc", from_logits=True)
            ],
            jit_compile=True
        )
        
    classifier.summary()
    return classifier

# --- 6. Training, Prediction, and Evaluation ---
def run_pipeline():
    """
    Executes the full ML pipeline: setup, data loading, training, and prediction.
    """
    # Initialize TPU
    strategy, global_batch_size = initialize_tpu()
    
    # Load and prepare data
    X_train, y_train, X_valid, y_valid, X_test, test_ids = load_and_prepare_data()
    
    # Create tf.data.Dataset objects
    train_dataset = build_dataset(X_train, y_train, batch_size=global_batch_size, is_training=True)
    valid_dataset = build_dataset(X_valid, y_valid, batch_size=global_batch_size, is_training=False)
    test_dataset = build_dataset(X_test, batch_size=global_batch_size, is_training=False)
    
    # Build the model within the strategy scope
    model = build_model(strategy)
    
    # Train the model
    print("\n--- Starting Model Training ---")
    steps_per_epoch = len(X_train) // global_batch_size

    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=Config.EPOCHS,
        steps_per_epoch=steps_per_epoch
    )
    
    print("\n--- Training Complete ---")
    
    # --- Prediction and Evaluation on Validation Set ---
    print("\n--- Evaluating on Validation Set ---")
    valid_preds_logits = model.predict(valid_dataset, verbose=1)
    valid_preds_probs = tf.nn.sigmoid(valid_preds_logits).numpy().flatten()
    
    # Calculate requested evaluation metrics
    y_pred_binary = valid_preds_probs > 0.5
    accuracy = accuracy_score(y_valid, y_pred_binary)
    loss = log_loss(y_valid, valid_preds_probs)
    roc_auc = roc_auc_score(y_valid, valid_preds_probs)
    f1_macro = f1_score(y_valid, y_pred_binary, average='macro')
    f1_weighted = f1_score(y_valid, y_pred_binary, average='weighted')
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Log Loss: {loss:.4f}")
    print(f"Validation ROC AUC (AUC): {roc_auc:.4f}")
    print(f"Validation F1 Macro: {f1_macro:.4f}")
    print(f"Validation F1 Weighted: {f1_weighted:.4f}")
    
    # --- Prediction for Submission ---
    print("\n--- Generating Test Predictions ---")
    test_preds_logits = model.predict(test_dataset, verbose=1)
    test_preds_probs = tf.nn.sigmoid(test_preds_logits).numpy().flatten()
    
    # Create submission file
    submission_df = pd.DataFrame({'id': test_ids, 'toxic': test_preds_probs})
    submission_df.to_csv(Config.SUBMISSION_PATH, index=False)
    print(f"\nSubmission file created at: {Config.SUBMISSION_PATH}")
    print(submission_df.head())
    
    # Clean up memory
    del model, X_train, y_train, X_valid, y_valid, X_test, test_ids
    gc.collect()

# --- Main Execution ---
if __name__ == "__main__":
    run_pipeline()

--- Initializing TPU ---
TPU not detected. Falling back to CPU/GPU strategy.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Global batch size set to: 16

--- Loading and Preparing Data ---
Combined training data shape: (2099954, 2)
Validation data shape: (8000, 4)
Test data shape: (63812, 3)
Cleaning missing values from text columns...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['comment_text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  valid_df['comment_text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 


--- Building Model ---



--- Starting Model Training ---
Epoch 1/2




NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 765, in start

  File "/usr/local/lib/python3.10/site-packages/anyio/_core/_eventloop.py", line 74, in run

  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2310, in run

  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 217, in run

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/local/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 435, in process_shell

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 512, in process_shell_message

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 361, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 766, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 457, in do_execute

  File "/usr/local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 606, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/tmp/ipykernel_10/1005185967.py", line 251, in <module>

  File "/tmp/ipykernel_10/1005185967.py", line 206, in run_pipeline

  File "/usr/local/lib/python3.10/site-packages/keras_hub/src/utils/pipeline_model.py", line 177, in fit

  File "/usr/local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

could not find registered transfer manager for platform Host -- check target linkage
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_27370]

In [6]:
# -*- coding: utf-8 -*-
"""
Kaggle Solution for Jigsaw Multilingual Toxic Comment Classification.

This program implements an end-to-end pipeline for a cross-lingual text classification task.
It uses a pre-trained multilingual transformer model (XLM-Roberta) to classify online
comments as toxic or non-toxic. The model is fine-tuned on English-only data and
evaluated on non-English data to test its generalization capabilities.

The solution is optimized to run on a Google Cloud TPU (v3-8), leveraging TensorFlow's
distribution strategies for efficient, large-scale training.
"""

import os
import sys
import gc

# Suppress verbose logging and warnings for a cleaner output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_nlp
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.model_selection import train_test_split

# --- 1. Configuration ---
# All hyperparameters and settings are centralized here for easy tuning.
class Config:
    """
    Configuration class for model and training hyperparameters.
    """
    # File Paths
    BASE_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
    TRAIN_TOXIC_PATH = os.path.join(BASE_PATH, "jigsaw-toxic-comment-train.csv")
    TRAIN_BIAS_PATH = os.path.join(BASE_PATH, "jigsaw-unintended-bias-train.csv")
    VALIDATION_PATH = os.path.join(BASE_PATH, "validation.csv")
    TEST_PATH = os.path.join(BASE_PATH, "test.csv")
    SUBMISSION_PATH = "submission.csv"

    # Model Configuration
    # XLM-Roberta is a strong choice for multilingual tasks.
    PRESET = "xlm_roberta_base_multi"
    SEQUENCE_LENGTH = 192  # Max length of text sequences. Balances context and memory.

    # Training Configuration
    EPOCHS = 2  # Fine-tuning transformers requires only a few epochs.
    BATCH_SIZE_PER_REPLICA = 16 # Batch size for each TPU core.
    LEARNING_RATE = 2e-5  # A standard learning rate for fine-tuning transformers.
    
    # Set a random seed for reproducibility
    SEED = 42

# --- 2. TPU Initialization ---
# This section detects and initializes the TPU for distributed training.
def initialize_tpu():
    """
    Detects and initializes the TPU strategy.
    Returns the distribution strategy and the global batch size.
    """
    print("--- Initializing TPU ---")
    try:
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu_resolver)
        print(f"TPU detected. Found {strategy.num_replicas_in_sync} replicas.")
    except ValueError:
        print("TPU not detected. Falling back to CPU/GPU strategy.")
        strategy = tf.distribute.MirroredStrategy()

    # Calculate the global batch size based on the number of replicas.
    global_batch_size = Config.BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    print(f"Global batch size set to: {global_batch_size}")
    
    # Enable mixed precision for performance boost on TPUs.
    keras.mixed_precision.set_global_policy("mixed_bfloat16")
    
    return strategy, global_batch_size

# --- 3. Data Loading and Preparation ---
def load_and_prepare_data():
    """
    Loads, preprocesses, and combines the training, validation, and test datasets.
    """
    print("\n--- Loading and Preparing Data ---")
    
    # Load primary training data
    train_toxic_df = pd.read_csv(Config.TRAIN_TOXIC_PATH, usecols=['comment_text', 'toxic'])
    
    # Load secondary training data (unintended bias)
    train_bias_df = pd.read_csv(Config.TRAIN_BIAS_PATH, usecols=['comment_text', 'toxic'])
    train_bias_df['toxic'] = (train_bias_df['toxic'] >= 0.5).astype(int)
    
    # Combine the two training datasets
    train_df = pd.concat([train_toxic_df, train_bias_df], ignore_index=True)
    train_df.drop_duplicates(subset=['comment_text'], keep='first', inplace=True)
    print(f"Combined training data shape: {train_df.shape}")

    # Load multilingual validation data
    valid_df = pd.read_csv(Config.VALIDATION_PATH)
    print(f"Validation data shape: {valid_df.shape}")

    # Load test data
    test_df = pd.read_csv(Config.TEST_PATH)
    print(f"Test data shape: {test_df.shape}")
    
    # Handle potential NaN values in text data
    print("Cleaning missing values from text columns...")
    train_df['comment_text'].fillna('', inplace=True)
    valid_df['comment_text'].fillna('', inplace=True)
    test_df['content'].fillna('', inplace=True)

    # To be robust, also ensure label columns have no NaNs and are integers.
    train_df['toxic'].fillna(0, inplace=True)
    valid_df['toxic'].fillna(0, inplace=True)
    
    # Extract text and labels for model input, explicitly casting types
    X_train = train_df['comment_text'].astype(str).values
    y_train = train_df['toxic'].astype(int).values
    
    X_valid = valid_df['comment_text'].astype(str).values
    y_valid = valid_df['toxic'].astype(int).values
    
    X_test = test_df['content'].astype(str).values
    
    return X_train, y_train, X_valid, y_valid, X_test, test_df['id']

# --- 4. Dataset Pipeline ---
def build_dataset(texts, labels=None, batch_size=32, is_training=True):
    """
    Creates a tf.data.Dataset from text and label arrays.
    """
    AUTOTUNE = tf.data.AUTOTUNE

    if labels is None:
        dataset = tf.data.Dataset.from_tensor_slices(texts)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    
    if is_training:
        dataset = dataset.shuffle(buffer_size=1024, seed=Config.SEED).repeat()
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

# --- 5. Model Building ---
def build_model(strategy):
    """
    Builds and compiles the Keras NLP classifier model within the TPU strategy scope.
    """
    print("\n--- Building Model ---")
    with strategy.scope():
        # The Classifier.from_preset method is smart. We can pass preprocessor
        # arguments like `sequence_length` directly to it.
        classifier = keras_nlp.models.Classifier.from_preset(
            Config.PRESET,
            num_classes=1, # Binary classification (toxic/not-toxic)
            sequence_length=Config.SEQUENCE_LENGTH # This is passed to the preprocessor
        )
        
        # Define optimizer
        optimizer = keras.optimizers.AdamW(learning_rate=Config.LEARNING_RATE)
        
        # Compile the model
        classifier.compile(
            optimizer=optimizer,
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[
                keras.metrics.BinaryAccuracy(name="accuracy"),
                keras.metrics.AUC(name="roc_auc", from_logits=True)
            ],
            jit_compile=True
        )
        
    classifier.summary()
    return classifier

# --- 6. Training, Prediction, and Evaluation ---
def run_pipeline():
    """
    Executes the full ML pipeline: setup, data loading, training, and prediction.
    """
    # Initialize TPU
    strategy, global_batch_size = initialize_tpu()
    
    # Load and prepare data
    X_train, y_train, X_valid, y_valid, X_test, test_ids = load_and_prepare_data()
    
    # Create tf.data.Dataset objects
    train_dataset = build_dataset(X_train, y_train, batch_size=global_batch_size, is_training=True)
    valid_dataset = build_dataset(X_valid, y_valid, batch_size=global_batch_size, is_training=False)
    test_dataset = build_dataset(X_test, batch_size=global_batch_size, is_training=False)
    
    # Build the model within the strategy scope
    model = build_model(strategy)
    
    # Train the model
    print("\n--- Starting Model Training ---")
    steps_per_epoch = len(X_train) // global_batch_size

    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=Config.EPOCHS,
        steps_per_epoch=steps_per_epoch
    )
    
    print("\n--- Training Complete ---")
    
    # --- Prediction and Evaluation on Validation Set ---
    print("\n--- Evaluating on Validation Set ---")
    valid_preds_logits = model.predict(valid_dataset, verbose=1)
    valid_preds_probs = tf.nn.sigmoid(valid_preds_logits).numpy().flatten()
    
    # Calculate requested evaluation metrics
    y_pred_binary = valid_preds_probs > 0.5
    accuracy = accuracy_score(y_valid, y_pred_binary)
    loss = log_loss(y_valid, valid_preds_probs)
    roc_auc = roc_auc_score(y_valid, valid_preds_probs)
    f1_macro = f1_score(y_valid, y_pred_binary, average='macro')
    f1_weighted = f1_score(y_valid, y_pred_binary, average='weighted')
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Log Loss: {loss:.4f}")
    print(f"Validation ROC AUC (AUC): {roc_auc:.4f}")
    print(f"Validation F1 Macro: {f1_macro:.4f}")
    print(f"Validation F1 Weighted: {f1_weighted:.4f}")
    
    # --- Prediction for Submission ---
    print("\n--- Generating Test Predictions ---")
    test_preds_logits = model.predict(test_dataset, verbose=1)
    test_preds_probs = tf.nn.sigmoid(test_preds_logits).numpy().flatten()
    
    # Create submission file
    submission_df = pd.DataFrame({'id': test_ids, 'toxic': test_preds_probs})
    submission_df.to_csv(Config.SUBMISSION_PATH, index=False)
    print(f"\nSubmission file created at: {Config.SUBMISSION_PATH}")
    print(submission_df.head())
    
    # Clean up memory
    del model, X_train, y_train, X_valid, y_valid, X_test, test_ids
    gc.collect()

# --- Main Execution ---
if __name__ == "__main__":
    run_pipeline()

--- Initializing TPU ---
TPU not detected. Falling back to CPU/GPU strategy.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Global batch size set to: 16

--- Loading and Preparing Data ---
Combined training data shape: (2099954, 2)
Validation data shape: (8000, 4)
Test data shape: (63812, 3)
Cleaning missing values from text columns...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['comment_text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  valid_df['comment_text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 


--- Building Model ---



--- Starting Model Training ---
Epoch 1/2




NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 765, in start

  File "/usr/local/lib/python3.10/site-packages/anyio/_core/_eventloop.py", line 74, in run

  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2310, in run

  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 217, in run

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/local/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 435, in process_shell

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 512, in process_shell_message

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 361, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 766, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 457, in do_execute

  File "/usr/local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 606, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/tmp/ipykernel_10/4072198260.py", line 249, in <module>

  File "/tmp/ipykernel_10/4072198260.py", line 204, in run_pipeline

  File "/usr/local/lib/python3.10/site-packages/keras_hub/src/utils/pipeline_model.py", line 177, in fit

  File "/usr/local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

could not find registered transfer manager for platform Host -- check target linkage
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_54693]