# Ultra-Lightweight Cross-lingual Stance Detection Model Development

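This notebook implements a memory-efficient approach to developing a cross-lingual stance detection model. It fine-tunes the base-size XLM-RoBERTa checkpoint (`xlm-roberta-base`) using short sequences (128 tokens), small batches with gradient accumulation, gradient checkpointing, and mixed precision when a GPU is available.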

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm
import gc

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Enable mixed precision training
scaler = torch.amp.GradScaler()

Using device: cpu


## 1. Load and Prepare Data

In [3]:
# Read the preprocessed train / validation / test splits (in that order).
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        '../data/processed/train.csv',
        '../data/processed/val.csv',
        '../data/processed/test.csv',
    ),
)

print(f"Train: {len(train_data)}, Validation: {len(val_data)}, Test: {len(test_data)}")

# Fit the label encoder on the training stances only, then apply the same
# mapping to every split so the integer ids are consistent across them.
le = LabelEncoder()
le.fit(train_data['stance'])
train_data['stance_encoded'] = le.transform(train_data['stance'])
val_data['stance_encoded'] = le.transform(val_data['stance'])
test_data['stance_encoded'] = le.transform(test_data['stance'])

print(f"Stance labels: {le.classes_}")

Train: 5645, Validation: 807, Test: 1613
Stance labels: [0 1 2]


## 2. Tokenize Data

In [4]:
# Load the pretrained tokenizer for the base-size XLM-RoBERTa checkpoint.
tokenizer_checkpoint = 'xlm-roberta-base'
tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_data(texts, labels, max_length=128):
    """Tokenize texts and bundle them with their labels into a TensorDataset.

    Args:
        texts: A pandas Series or any iterable of texts. Every element is
            converted with ``str`` before tokenization.
        labels: Integer class ids aligned positionally with ``texts``
            (pandas Series, list, or array).
        max_length: Maximum number of tokens per text; longer inputs are
            truncated. Kept short (128) to reduce memory use.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, label)`` tensors.
    """
    if isinstance(texts, pd.Series):
        texts = texts.astype(str).tolist()
    else:
        texts = [str(text) for text in texts]

    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

    # Convert labels positionally through NumPy first: handing a pandas Series
    # straight to torch.tensor goes through the sequence protocol, which can
    # fail or mis-read when the Series index is not the default 0..n-1.
    # int64 is the dtype PyTorch's cross-entropy loss expects for class ids.
    label_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)

# Build one TensorDataset per split (train, validation, test — in that order).
train_dataset, val_dataset, test_dataset = (
    tokenize_data(split_frame['processed_text'], split_frame['stance_encoded'])
    for split_frame in (train_data, val_data, test_data)
)

print("Data tokenized and prepared for model training.")

# The raw DataFrames are no longer needed once the tensors exist; drop them
# and run the garbage collector to release their memory.
del train_data, val_data, test_data
gc.collect()



Data tokenized and prepared for model training.


0

## 3. Define Model and Training Functions

In [5]:
# Build the base-size XLM-RoBERTa sequence classifier with one output per
# encoded stance class, and place it on the selected device.
num_stance_labels = len(le.classes_)
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_stance_labels)
model = model.to(device)

# Gradient checkpointing recomputes activations during the backward pass
# instead of storing them, lowering peak memory at the cost of extra compute.
model.gradient_checkpointing_enable()

def train(model, train_loader, val_loader, epochs=5, lr=1e-5, accumulation_steps=8):
    """Fine-tune ``model`` with gradient accumulation and early stopping.

    Relies on the module-level ``device`` and ``scaler`` objects.

    Args:
        model: Sequence-classification model that returns ``.loss`` when
            called with ``labels``.
        train_loader: DataLoader yielding ``(input_ids, attention_mask, labels)``.
        val_loader: DataLoader with the same batch layout, used for validation.
        epochs: Maximum number of passes over ``train_loader``.
        lr: Learning rate for AdamW.
        accumulation_steps: Number of batches whose gradients are accumulated
            before each optimizer update.

    Returns:
        The model with the weights from the last epoch that ran. The weights
        with the lowest validation loss are saved to
        ``../models/xlm_roberta_stance_detection_best.pth``.
    """
    import math
    import os

    optimizer = AdamW(model.parameters(), lr=lr)

    # One optimizer/scheduler update per accumulation window, counting the
    # trailing partial window of each epoch (see the flush below).
    num_train_batches = len(train_loader)
    updates_per_epoch = math.ceil(num_train_batches / accumulation_steps)
    total_steps = updates_per_epoch * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Autocast is only enabled on CUDA; on CPU the forward pass runs in full
    # precision, matching what the CUDA-only autocast did there.
    use_amp = device.type == 'cuda'

    # Make sure the checkpoint directory exists before the first save.
    checkpoint_path = '../models/xlm_roberta_stance_detection_best.pth'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    best_val_loss = float('inf')
    patience = 3
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        train_loss = 0.0
        for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):  # Mixed precision
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Divide so the gradients summed over one window form an average.
            loss = outputs.loss / accumulation_steps
            scaler.scale(loss).backward()

            # Track the unscaled batch loss so the reported mean is not
            # shrunk by a factor of accumulation_steps.
            train_loss += outputs.loss.item()

            # Update at the end of every full window, and also on the last
            # batch so gradients from a trailing partial window are applied
            # instead of leaking into the next epoch. (That final partial
            # window is still divided by accumulation_steps, so its update is
            # slightly smaller than a full one.)
            window_complete = (i + 1) % accumulation_steps == 0
            last_batch = (i + 1) == num_train_batches
            if window_complete or last_batch:
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {avg_val_loss:.4f}")

        # Early stopping: checkpoint on improvement, otherwise count epochs
        # without improvement and stop once `patience` is reached.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after epoch {epoch+1}")
                break

    return model

def evaluate(model, test_loader):
    """Run ``model`` over ``test_loader`` and return a classification report.

    Relies on the module-level ``device`` and the fitted label encoder ``le``.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        test_loader: DataLoader yielding ``(input_ids, attention_mask, labels)``.

    Returns:
        The text report produced by ``sklearn.metrics.classification_report``
        with four decimal digits, one row per class known to ``le``.
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    # classification_report measures each target name with len(), so the names
    # must be strings; le.classes_ may hold integers. Passing `labels`
    # explicitly keeps the name list aligned with the encoded ids even when a
    # class is absent from this split.
    class_ids = list(range(len(le.classes_)))
    class_names = [str(name) for name in le.classes_]
    return classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
        digits=4,
    )

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Train the Model

In [6]:
# Small batches keep peak memory low; only the training split is shuffled.
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Fine-tune for at most 5 epochs; train() may stop sooner via early stopping.
trained_model = train(model, train_loader, val_loader, epochs=5)

print("Model training completed.")

  with torch.cuda.amp.autocast():  # Mixed precision
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Epoch 1/5:  11%|█▏        | 160/1412 [01:42<14:03,  1.48it/s]

: 

## 5. Evaluate the Model

In [None]:
# Restore the weights with the lowest validation loss saved during training.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs.
model.load_state_dict(
    torch.load(
        '../models/xlm_roberta_stance_detection_best.pth',
        map_location=device,
        weights_only=True,
    )
)

evaluation_report = evaluate(model, test_loader)
print("Model Evaluation Report:")
print(evaluation_report)

## 6. Analyze Results and Next Steps

Based on the evaluation results, analyze the model's performance:

1. Overall accuracy: [Your observation]
2. Performance across different stances: [Your observation]
3. Performance across different languages: [Your observation]

Next steps:
1. If the model performance is satisfactory, proceed to create a stance detection script.
2. If not, consider further optimizations or a different approach (e.g., distilled models or non-transformer-based methods).
3. Analyze misclassifications to understand model weaknesses.
4. Consider data augmentation or additional preprocessing steps if needed.

With this model developed and evaluated, you can now move on to creating a script for detecting stances in new, unseen data, assuming the performance is acceptable.