In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file mounted under the Kaggle input directory,
# printing one full path per line.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test_labels.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv


In [None]:
# -*- coding: utf-8 -*-
"""
This script trains a multilingual model for toxic comment classification.

The core approach is zero-shot cross-lingual transfer:
1.  A pre-trained multilingual transformer model (XLM-RoBERTa) is used.
2.  The model is fine-tuned *only* on the provided English-language training data.
3.  The model's ability to generalize is tested on a non-English validation set.
4.  Final predictions are made on the non-English test set.
"""

# # 1. SETUP AND IMPORTS
# =======================
print("✅ Starting setup...")

import os
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    log_loss, 
    f1_score, 
    roc_auc_score
)
import logging

# Only let ERROR-level (and above) records from the "transformers" logger through.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Fix the NumPy and TensorFlow global seeds so repeated runs start from the
# same random state.
np.random.seed(42)
tf.random.set_seed(42)

print("TensorFlow Version:", tf.__version__)
print("✅ Setup complete.")


# # 2. CONFIGURATION
# ====================
# A configuration class to hold all hyperparameters and settings in one place.
class CFG:
    """Single namespace for the notebook's hyperparameters and file locations.

    Everything here is a plain class attribute; the class is never instantiated.
    """

    # --- Input files -------------------------------------------------------
    # All competition CSVs live in one directory under /kaggle/input/.
    DATA_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
    TRAIN_TOXIC_PATH = os.path.join(DATA_PATH, "jigsaw-toxic-comment-train.csv")
    TRAIN_BIAS_PATH = os.path.join(DATA_PATH, "jigsaw-unintended-bias-train.csv")
    VALIDATION_PATH = os.path.join(DATA_PATH, "validation.csv")
    TEST_PATH = os.path.join(DATA_PATH, "test.csv")
    SUBMISSION_PATH = os.path.join(DATA_PATH, "sample_submission.csv")

    # --- Pre-trained checkpoint --------------------------------------------
    # TensorFlow build of xlm-roberta-base.
    MODEL_NAME = 'jplu/tf-xlm-roberta-base'

    # --- Training ----------------------------------------------------------
    LEARNING_RATE = 1e-5  # learning rate handed to the optimizer
    EPOCHS = 3            # upper bound on training epochs
    MAX_LEN = 192         # sequences are padded/truncated to this many tokens

    # Per-replica batch size; the global batch size is this value multiplied
    # by the number of replicas in the distribution strategy.
    BATCH_SIZE_PER_REPLICA = 16

    # Fraction of the English training data held out for local evaluation.
    VALID_SPLIT_RATIO = 0.1

print("✅ Configuration loaded.")


# # 3. TPU INITIALIZATION
# =========================
# This block is standard boilerplate for using TPUs in TensorFlow.
# Pick the tf.distribute strategy used for the rest of the notebook:
# a TPUStrategy when a TPU can be reached, otherwise TensorFlow's default strategy.
print("🚀 Initializing TPU...")
try:
    # Both the connection attempt and the strategy construction sit inside the
    # try, so a ValueError from either one triggers the fallback below.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print(f"✅ TPU initialized. Running on: {tpu.master()}")
except ValueError:
    # If TPU is not available, fall back to the default strategy (CPU/GPU)
    # NOTE(review): only ValueError is handled here; any other exception type
    # raised while connecting will propagate — confirm that is intended.
    strategy = tf.distribute.get_strategy()
    print("⚠️ TPU not found. Running on CPU/GPU.")

print(f"✔️ Number of replicas: {strategy.num_replicas_in_sync}")

# Calculate global batch size
# The result is attached to CFG as a new attribute (it is not declared in the
# class body), so code that reads CFG.GLOBAL_BATCH_SIZE must run after this line.
CFG.GLOBAL_BATCH_SIZE = CFG.BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
print(f"✔️ Global batch size: {CFG.GLOBAL_BATCH_SIZE}")


# # 4. DATA LOADING AND PREPROCESSING
# =====================================
print("🔄 Loading and preprocessing data...")

# --- Load Raw Data ---
# English training data is spread over two CSV files that are merged below.
train_bias_df = pd.read_csv(CFG.TRAIN_BIAS_PATH)
train_toxic_df = pd.read_csv(CFG.TRAIN_TOXIC_PATH)

# The 'unintended-bias' file stores toxicity as a score in [0.0, 1.0];
# threshold it at 0.5 so both sources share a binary 0/1 label.
train_bias_df['toxic'] = (train_bias_df['toxic'] >= 0.5).astype(int)

# Stack the two sources (text + label only). When the same comment text occurs
# more than once, drop_duplicates keeps the first occurrence, i.e. the row from
# the bias file wins over the row from the toxic-comment file.
train_df = pd.concat([
    train_bias_df[['comment_text', 'toxic']],
    train_toxic_df[['comment_text', 'toxic']]
]).drop_duplicates(subset=['comment_text']).reset_index(drop=True)

print(f"Combined English training data shape: {train_df.shape}")

# Load non-English validation and test data
valid_df = pd.read_csv(CFG.VALIDATION_PATH)
test_df = pd.read_csv(CFG.TEST_PATH)

print(f"Non-English validation data shape: {valid_df.shape}")
print(f"Test data shape: {test_df.shape}")

# --- Split English data for local evaluation ---
# Hold out a stratified slice of the English data so performance on the
# training language can be reported separately from the multilingual set.
X_train, X_val_eng, y_train, y_val_eng = train_test_split(
    train_df['comment_text'].values,
    train_df['toxic'].values,
    test_size=CFG.VALID_SPLIT_RATIO,
    random_state=42,
    stratify=train_df['toxic'].values # Stratify to maintain class balance
)

print(f"Training samples (English): {len(X_train)}")
print(f"Internal validation samples (English): {len(X_val_eng)}")

# The primary validation set is the non-English one
X_val_multi = valid_df['comment_text'].values
y_val_multi = valid_df['toxic'].values

# Test data
# The competition's raw test.csv names its text column `content`, not
# `comment_text` (see the competition data description), so indexing
# `comment_text` unconditionally raises KeyError. Accept either spelling and
# fail with an explicit message if neither is present.
test_text_col = next(
    (col for col in ('comment_text', 'content') if col in test_df.columns),
    None,
)
if test_text_col is None:
    raise KeyError(
        "Expected a 'comment_text' or 'content' column in the test data, "
        f"found: {list(test_df.columns)}"
    )
X_test = test_df[test_text_col].values

print("✅ Data loading and preprocessing complete.")


# # 5. TOKENIZATION
# ===================
print("✍️ Initializing tokenizer...")
# Load the tokenizer registered under CFG.MODEL_NAME, the same checkpoint name
# that is later passed to build_model(), so token IDs match the encoder's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)

def encode_text(texts, tokenizer, max_len, batch_size=1024):
    """Tokenizes texts into fixed-length token-ID and attention-mask arrays.

    The texts are sent to the tokenizer in slices of `batch_size` strings per
    call rather than one call per comment, which lets the tokenizer process a
    whole slice at once and keeps the Python-level call count low on the
    multi-million-row training set.

    Args:
        texts: Iterable of comments. Entries that are None or a float NaN
            (how pandas represents a missing cell) are encoded as the empty
            string; any other non-string entry is converted with str().
        tokenizer: Callable tokenizer that accepts a list of strings plus the
            keyword arguments used below and returns a mapping with
            'input_ids' and 'attention_mask', each one row per input string.
        max_len: Every sequence is truncated / padded to exactly this length.
        batch_size: Number of texts handed to the tokenizer per call. Values
            below 1 are treated as 1.

    Returns:
        Tuple `(input_ids, attention_masks)` of integer NumPy arrays, each of
        shape `(number_of_texts, max_len)`. For empty input both arrays have
        shape `(0, max_len)`.
    """
    # NaN is the only float that is not equal to itself.
    cleaned = [
        "" if text is None or (isinstance(text, float) and text != text) else str(text)
        for text in texts
    ]

    if not cleaned:
        # Keep the 2-D contract even when there is nothing to encode.
        empty = np.zeros((0, max_len), dtype=int)
        return empty, empty.copy()

    step = max(1, int(batch_size))
    id_chunks = []
    mask_chunks = []
    for start in range(0, len(cleaned), step):
        encoded = tokenizer(
            cleaned[start:start + step],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False  # XLM-R doesn't use token_type_ids
        )
        id_chunks.append(np.array(encoded['input_ids']))
        mask_chunks.append(np.array(encoded['attention_mask']))

    return np.concatenate(id_chunks), np.concatenate(mask_chunks)

print("✍️ Tokenizing all datasets...")
# Tokenize all text data splits
# Every split goes through the same tokenizer and the same CFG.MAX_LEN, so all
# resulting ID/mask arrays share one sequence length.
x_train_ids, x_train_mask = encode_text(X_train, tokenizer, CFG.MAX_LEN)          # English training split
x_val_eng_ids, x_val_eng_mask = encode_text(X_val_eng, tokenizer, CFG.MAX_LEN)    # English hold-out split
x_val_multi_ids, x_val_multi_mask = encode_text(X_val_multi, tokenizer, CFG.MAX_LEN)  # non-English validation set
x_test_ids, x_test_mask = encode_text(X_test, tokenizer, CFG.MAX_LEN)             # unlabeled test set

print("✅ Tokenization complete.")


# # 6. CREATE TF.DATA.DATASET
# =============================
# Using tf.data.Dataset is highly recommended for performance, especially on TPUs.
def create_tf_dataset(ids, masks, labels=None, shuffle=False, is_test=False, batch_size=None):
    """Creates a batched, prefetching tf.data.Dataset from in-memory arrays.

    Args:
        ids: Token-ID array; exposed to the model under the key 'input_ids'.
        masks: Attention-mask array; exposed under the key 'attention_mask'.
        labels: Optional target array. When given (and `is_test` is False) the
            dataset yields `(features, labels)` pairs.
        shuffle: If True, shuffle with a buffer covering the whole dataset.
        is_test: If True, yield features only, ignoring `labels`.
        batch_size: Batch size to use. Defaults to `CFG.GLOBAL_BATCH_SIZE`
            when left as None.

    Returns:
        A `tf.data.Dataset` yielding feature dicts (unlabeled case) or
        `(feature dict, labels)` tuples, batched and prefetched.
    """
    features = {'input_ids': ids, 'attention_mask': masks}

    # A missing label array means there is nothing to pair the features with,
    # so build a features-only dataset instead of slicing a `(features, None)`
    # tuple.
    if is_test or labels is None:
        dataset = tf.data.Dataset.from_tensor_slices(features)
    else:
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))

    if shuffle:
        # The buffer spans the full dataset; clamp to 1 because a shuffle
        # buffer must be positive even when `ids` is empty.
        dataset = dataset.shuffle(buffer_size=max(len(ids), 1))

    if batch_size is None:
        batch_size = CFG.GLOBAL_BATCH_SIZE

    # Batch and prefetch for performance
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

print("📊 Creating tf.data.Dataset objects...")
# Create datasets for training, validation (multilingual), and testing
# Only the training dataset is shuffled; the validation and test datasets keep
# the row order of their source arrays.
train_dataset = create_tf_dataset(x_train_ids, x_train_mask, y_train, shuffle=True)
val_multi_dataset = create_tf_dataset(x_val_multi_ids, x_val_multi_mask, y_val_multi)
test_dataset = create_tf_dataset(x_test_ids, x_test_mask, is_test=True)

# Create a separate dataset for evaluating on the English validation split
val_eng_dataset = create_tf_dataset(x_val_eng_ids, x_val_eng_mask, y_val_eng)

print("✅ TF Datasets created.")


# # 7. MODEL BUILDING
# =====================
def build_model(model_name, max_len):
    """Assembles and compiles the toxicity classifier under the global `strategy`.

    The network feeds token IDs and an attention mask through a pre-trained
    transformer encoder, takes the vector at sequence position 0, and maps it
    to a single sigmoid output.

    Args:
        model_name: Checkpoint identifier passed to `TFAutoModel.from_pretrained`.
        max_len: Fixed sequence length expected by both input layers.

    Returns:
        A compiled `tf.keras.Model` (binary cross-entropy loss, AUC metric
        named 'auc', AdamW optimizer using `CFG.LEARNING_RATE`).
    """
    # Everything that creates variables happens inside the strategy scope.
    with strategy.scope():
        # Two integer inputs of identical shape: token IDs and their mask.
        token_ids_in = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
        mask_in = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

        # Pre-trained encoder
        encoder = TFAutoModel.from_pretrained(model_name)

        # Element 0 of the encoder output holds the per-token hidden states;
        # the vector at position 0 of each sequence serves as the summary
        # representation for classification.
        token_states = encoder(token_ids_in, attention_mask=mask_in)[0]
        first_token_vector = token_states[:, 0, :]

        # Single-unit sigmoid head producing the toxicity probability
        toxicity_prob = tf.keras.layers.Dense(1, activation='sigmoid')(first_token_vector)

        model = tf.keras.Model(inputs=[token_ids_in, mask_in], outputs=toxicity_prob)

        model.compile(
            optimizer=tf.keras.optimizers.AdamW(learning_rate=CFG.LEARNING_RATE),
            loss='binary_crossentropy',
            metrics=[tf.keras.metrics.AUC(name='auc')]
        )
    return model

print("🧠 Building the model...")
# Build with the same checkpoint name and sequence length that were used for
# tokenization, so the input layers match the encoded arrays.
model = build_model(CFG.MODEL_NAME, CFG.MAX_LEN)
model.summary()
print("✅ Model built and compiled.")


# # 8. TRAINING
# ===============
print("🚂 Starting training...")

# Callbacks
# Save the best model based on the multilingual validation AUC
# 'val_auc' is the validation-set value of the metric compiled with name='auc'.
# Only weights are written, and the file is overwritten only when 'val_auc' improves.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5', 
    monitor='val_auc', 
    mode='max', 
    save_best_only=True,
    save_weights_only=True
)

# Early stopping to prevent overfitting
# Watches the same 'val_auc' quantity as the checkpoint callback above.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=1, # Stop after 1 epoch of no improvement
    mode='max',
    restore_best_weights=True # This is redundant if save_best_only=True but good practice
)

# Train on the English data; the non-English validation set drives both callbacks.
history = model.fit(
    train_dataset,
    epochs=CFG.EPOCHS,
    validation_data=val_multi_dataset, # Key: validate on non-English data
    callbacks=[model_checkpoint, early_stopping]
)

print("✅ Training complete.")

# # 9. EVALUATION ON ENGLISH SPLIT
# ==================================
# After training, we explicitly evaluate the model's performance on the held-out English data.
print("\n" + "="*50)
print("📈 Evaluating on the held-out English validation set...")

# Swap in the weights written by the checkpoint callback during training.
print("Loading best model weights from 'best_model.h5'...")
model.load_weights('best_model.h5')

# One predicted probability per English hold-out comment, as a flat vector.
y_pred_eng_probs = model.predict(val_eng_dataset, verbose=1).flatten()
# Hard 0/1 decisions using a 0.5 cut-off.
y_pred_eng_class = np.where(y_pred_eng_probs >= 0.5, 1, 0)

# Probability-based metrics
roc_auc = roc_auc_score(y_true=y_val_eng, y_score=y_pred_eng_probs)
loss = log_loss(y_true=y_val_eng, y_pred=y_pred_eng_probs)

# Decision-based metrics
accuracy = accuracy_score(y_true=y_val_eng, y_pred=y_pred_eng_class)
f1_weighted = f1_score(y_true=y_val_eng, y_pred=y_pred_eng_class, average='weighted')
f1_macro = f1_score(y_true=y_val_eng, y_pred=y_pred_eng_class, average='macro')

# Report, four decimals each.
print("\n--- English Validation Metrics ---")
print(f"Accuracy         : {accuracy:.4f}")
print(f"Log Loss         : {loss:.4f}")
print(f"F1 Score (Macro) : {f1_macro:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")
print(f"ROC AUC          : {roc_auc:.4f}")
print("="*50 + "\n")


# # 10. INFERENCE AND SUBMISSION
# ================================
print("🚀 Making predictions on the test set...")

# Predict on the test dataset
# Uses whatever weights `model` currently holds; .flatten() turns the
# per-row single-column output into a 1-D vector.
test_predictions = model.predict(test_dataset, verbose=1).flatten()

# Create submission file
# The sample submission is used as a template and its 'toxic' column is
# overwritten positionally.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the rows of the test data that produced test_dataset — confirm.
print("📝 Creating submission file...")
submission_df = pd.read_csv(CFG.SUBMISSION_PATH)
submission_df['toxic'] = test_predictions
submission_df.to_csv('submission.csv', index=False)

print("\n🎉 Submission file 'submission.csv' created successfully!")
print("Top 5 rows of the submission file:")
print(submission_df.head())

✅ Starting setup...


E0000 00:00:1751383795.746842     372 common_lib.cc:612] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:230
  from .autonotebook import tqdm as notebook_tqdm
