In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from datasets import Dataset
import nltk
from nltk.tokenize import sent_tokenize

In [None]:
# Download the NLTK sentence-tokenizer data used by sent_tokenize.
# Older NLTK releases read the 'punkt' resource, while newer releases read
# 'punkt_tab' instead, so both are requested to keep the notebook runnable
# regardless of the installed NLTK version.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

In [None]:
# Configuration constants (everything a reader might want to tune)

# -- Files and directories --
DATA_PATH = '/kaggle/input/data-under-th4/data_under_th4.csv'  # CSV read by load_and_preprocess_data
OUTPUT_DIR = './results'                # output_dir handed to TrainingArguments
SAVE_PATH = './bias_detection_model'    # where the model and tokenizer are saved, then reloaded for inference

# -- Model and tokenization --
MODEL_NAME = 'unitary/toxic-bert'       # checkpoint used for both the tokenizer and the model
MAX_LENGTH = 128                        # max_length used for padding/truncation when tokenizing

# -- Optimisation --
NUM_EPOCHS = 5
BATCH_SIZE = 128                        # used as both the train and eval per-device batch size
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 0.15

# -- Reproducibility --
RANDOM_STATE = 1234                     # seed for the dataset shuffle

In [None]:
def load_and_preprocess_data(file_path, random_state=None):
    """Load the CSV dataset, clean it, shuffle it and derive binary labels.

    The CSV is expected to carry the text in a column named '0' and the
    toxicity score in a column named '1'; they are renamed to
    'comment_text' and 'toxicity'.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``).
        random_state: Seed for the shuffle. When ``None`` (the default) the
            module-level ``RANDOM_STATE`` constant is used.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index containing
        'comment_text' (as ``str``), 'toxicity' and an integer 'label'
        column (1 when toxicity >= 0.5, else 0).

    Raises:
        KeyError: If the text or toxicity column is missing from the file.
    """
    if random_state is None:
        random_state = RANDOM_STATE

    # Read CSV file and give the positional column names readable ones
    df = pd.read_csv(file_path, index_col=None)
    df = df.rename(columns={'0': 'comment_text', '1': 'toxicity'})

    missing = [col for col in ('comment_text', 'toxicity') if col not in df.columns]
    if missing:
        raise KeyError(
            f"Expected column(s) {missing} not found; available columns: {list(df.columns)}"
        )

    # Drop incomplete rows: a missing toxicity would otherwise be labelled 0
    # (NaN >= 0.5 is False), and a missing text is a float NaN rather than a
    # string by the time it reaches the tokenizer.
    df = df.dropna(subset=['comment_text', 'toxicity'])
    df = df.assign(comment_text=df['comment_text'].astype(str))

    # Shuffle dataset
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Convert toxicity scores to binary labels (threshold = 0.5)
    df = df.assign(label=(df['toxicity'] >= 0.5).astype(int))

    return df

def balance_dataset(train_df):
    """Oversample the positive class by appending one extra copy of it.

    Every row whose 'label' equals 1 is duplicated once. The copies are
    appended after the original rows and keep their original index values,
    so the returned frame is not shuffled and its index is not reset.
    """
    is_positive = train_df['label'].eq(1)
    extra_positives = train_df.loc[is_positive]

    # Original rows first, then the duplicated positive rows
    return pd.concat([train_df, extra_positives])

def compute_class_weights(labels):
    """Return 'balanced' class weights for classes 0 and 1 as a float tensor.

    The weights come from sklearn's ``compute_class_weight`` in 'balanced'
    mode, evaluated on ``labels`` for the fixed class set [0, 1].
    """
    binary_classes = np.array([0, 1])
    balanced = compute_class_weight('balanced', classes=binary_classes, y=labels)
    return torch.tensor(balanced, dtype=torch.float)

def prepare_datasets(train_df, val_df):
    """Turn the train/validation DataFrames into tokenized torch-format Datasets.

    Each frame is converted to a Hugging Face ``Dataset``, its 'comment_text'
    column is tokenized (padded/truncated to ``MAX_LENGTH``), 'label' is
    renamed to 'labels', and the output format is restricted to the
    'input_ids', 'attention_mask' and 'labels' columns as torch tensors.

    Note: ``tokenizer`` is not a parameter; it is looked up from the
    enclosing (module) scope when the mapping runs.

    Returns:
        (train_dataset, val_dataset)
    """
    model_columns = ['input_ids', 'attention_mask', 'labels']

    def encode_batch(batch):
        # Fixed-length encoding: pad and truncate every text to MAX_LENGTH
        return tokenizer(
            batch['comment_text'],
            padding='max_length',
            truncation=True,
            max_length=MAX_LENGTH,
        )

    def build(frame):
        # DataFrame -> Dataset -> tokenized -> 'labels' column -> torch format
        dataset = Dataset.from_pandas(frame)
        dataset = dataset.map(encode_batch, batched=True)
        dataset = dataset.rename_column('label', 'labels')
        dataset.set_format('torch', columns=model_columns)
        return dataset

    # Train is processed before validation
    return build(train_df), build(val_df)

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        class_weights: Tensor of per-class weights handed to the
            cross-entropy loss. It may live on any device; it is moved to
            the logits' device each time the loss is computed.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        # Stored before the base initialiser runs so the attribute already
        # exists if anything in it ends up calling compute_loss.
        self.class_weights = class_weights
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        """Compute weighted cross-entropy between the model logits and 'labels'.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Mapping of model inputs that also contains 'labels'.
                It is not modified.
            num_items_in_batch: Accepted for signature compatibility; unused.
            return_outputs: When true, return ``(loss, outputs)``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs["labels"]
        # Forward everything except the labels, without mutating the caller's
        # mapping, so the loss is computed here rather than from labels passed
        # to the model.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        # Align the weights with the logits' device so a CPU weight tensor
        # (or a model placed on a different device) does not raise.
        weights = self.class_weights.to(logits.device)
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weights)
        return (loss, outputs) if return_outputs else loss

def setup_training_args():
    """Build the TrainingArguments used for fine-tuning.

    Evaluation and checkpointing both happen once per epoch, weight decay is
    applied as regularization, and the checkpoint with the lowest
    ``eval_loss`` is reloaded at the end of training. No early-stopping
    callback is configured here, so all ``NUM_EPOCHS`` epochs are run.

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    import inspect  # local import: only needed for the keyword probe below

    # The per-epoch evaluation keyword was renamed from `evaluation_strategy`
    # to `eval_strategy` in newer transformers releases; use whichever name
    # the installed version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = 'eval_strategy' if 'eval_strategy' in accepted else 'evaluation_strategy'

    return TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        save_strategy='epoch',
        logging_dir='./logs',
        logging_steps=10,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,  # lower eval_loss is better
        report_to="none",
        **{eval_key: 'epoch'},
    )

def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a prediction object.

    ``pred.predictions`` is reduced to class ids with an argmax over the last
    axis and compared against ``pred.label_ids``; precision, recall and F1
    use sklearn's 'binary' averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

def train_model(model, train_dataset, val_dataset, training_args, class_weights):
    """Create a WeightedTrainer, run training and return the trainer.

    Args:
        model: Sequence-classification model to fine-tune.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for evaluation.
        training_args: ``TrainingArguments`` controlling the run.
        class_weights: Tensor of per-class loss weights.

    Returns:
        The trainer after ``train()`` has completed.
    """
    # Place the weights on the device the training arguments resolve to,
    # instead of relying on a module-level `device` variable that only exists
    # once the main cell has run.
    weights_on_device = class_weights.to(training_args.device)

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        class_weights=weights_on_device
    )

    # Train the model
    trainer.train()
    return trainer

def identify_biased_sentences(text, classifier, threshold=0.3):
    """Split ``text`` into sentences and flag those scored as biased.

    A sentence is flagged when its 'LABEL_1' score is strictly greater than
    ``threshold``. All class scores are requested from the classifier so the
    threshold is applied to the 'LABEL_1' score itself, not only to whichever
    label happens to score highest. For thresholds of 0.5 or more this flags
    the same sentences as checking the top label, provided the two class
    scores sum to 1 (presumably softmax output here; confirm).

    Args:
        text: Text to analyse; it is split with ``sent_tokenize``.
        classifier: Callable text-classification pipeline. It is called as
            ``classifier(sentence, top_k=None)`` and must return the
            per-class ``{'label', 'score'}`` dicts.
        threshold: Minimum 'LABEL_1' score (exclusive) for a sentence to be
            flagged.

    Returns:
        (biased_sentences, scores, labels): the flagged sentences, followed by
        the top-scoring prediction's score and label for every sentence, in
        sentence order.
    """
    positive_label = 'LABEL_1'
    biased_sentences = []
    scores = []
    labels = []

    for sentence in sent_tokenize(text):
        predictions = classifier(sentence, top_k=None)
        # Normalise the result shape: tolerate a single dict or a result
        # wrapped in one extra list.
        if isinstance(predictions, dict):
            predictions = [predictions]
        elif predictions and isinstance(predictions[0], list):
            predictions = predictions[0]

        top = max(predictions, key=lambda item: item['score'])
        positive_score = next(
            (item['score'] for item in predictions if item['label'] == positive_label),
            0.0,  # label absent from the output: treat as not biased
        )

        if positive_score > threshold:
            biased_sentences.append(sentence)
        # Reported score/label remain those of the top prediction
        scores.append(top['score'])
        labels.append(top['label'])

    return biased_sentences, scores, labels

In [None]:
# Main execution
if __name__ == "__main__":
    # Seed the RNGs before the model is built so the run is repeatable.
    # NOTE(review): with ignore_mismatched_sizes=True a size-mismatched
    # classification head is presumably re-initialised randomly at load time;
    # seeding first keeps that initialisation reproducible. Confirm.
    np.random.seed(RANDOM_STATE)
    torch.manual_seed(RANDOM_STATE)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and preprocess data
    df = load_and_preprocess_data(DATA_PATH)

    # Split into train and validation sets. The split is stratified on the
    # label so both parts keep the same class ratio, and it uses the shared
    # RANDOM_STATE constant instead of a separate hard-coded seed.
    train_df, val_df = train_test_split(
        df,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=df['label'],
    )

    # Balance the training dataset (validation data is left untouched)
    train_df_balanced = balance_dataset(train_df)

    # Compute class weights from the oversampled training labels
    class_weights = compute_class_weights(train_df_balanced['label'])

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        ignore_mismatched_sizes=True
    ).to(device)

    # Prepare datasets
    train_dataset, val_dataset = prepare_datasets(train_df_balanced, val_df)

    # Setup training arguments
    training_args = setup_training_args()

    # Train the model
    trainer = train_model(model, train_dataset, val_dataset, training_args, class_weights)

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Metrics:", eval_results)

    # Save the trained model and tokenizer
    model.save_pretrained(SAVE_PATH)
    tokenizer.save_pretrained(SAVE_PATH)

    # Load classifier for inference from the files just saved
    classifier = pipeline('text-classification', model=SAVE_PATH, tokenizer=SAVE_PATH)

    # Test inference on sample text
    sample_text = "Girls should only do household chores. Sample text. She is a girl, she can't drive."
    biased_sentences, scores, labels = identify_biased_sentences(sample_text, classifier)

    # Display results
    print("\nInference Results:")
    print("Biased sentences:")
    for sentence in biased_sentences:
        print(f"- {sentence}")
    print("Scores:")
    for score in scores:
        print(f"- {score:.4f}")
    print("Labels:")
    for label in labels:
        print(f"- {label}")