# Model Development

In this notebook, we'll develop our cross-lingual stance detection model using XLM-RoBERTa.

In [7]:
import gc
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW

## 1. Load and Prepare Data

In [8]:
# Read the three preprocessed splits (train / validation / test) from disk
train_data, val_data, test_data = map(
    pd.read_csv,
    [
        '../data/processed/train.csv',
        '../data/processed/val.csv',
        '../data/processed/test.csv',
    ],
)

print(f"Train: {len(train_data)}, Validation: {len(val_data)}, Test: {len(test_data)}")

# Fit the label encoder on the training stances only, then apply that same
# mapping to every split so the integer codes agree across train/val/test
le = LabelEncoder().fit(train_data['stance'])
for split_frame in (train_data, val_data, test_data):
    split_frame['stance_encoded'] = le.transform(split_frame['stance'])

print(f"Stance labels: {le.classes_}")

# Preview a few rows: text, original stance and its encoded value
preview_columns = ['processed_text', 'stance', 'stance_encoded']
print("\nSample data from training set:")
print(train_data[preview_columns].head())

# Relative frequency of each stance in the training split
stance_shares = train_data['stance'].value_counts(normalize=True)
print("\nClass distribution in training set:")
print(stance_shares)

Train: 5645, Validation: 807, Test: 1613
Stance labels: [0 1 2]

Sample data from training set:
                                      processed_text  stance  stance_encoded
0           worst hurrican season evar accord expert       2               2
1                demand climat action finnish govern       0               0
2       home depot fine million sell ban superpollut       2               2
3  mexiko illegal abholz vertreibt ureinwohn orga...       2               2
4  web mobil dev look help hey web dev realli wan...       0               0

Class distribution in training set:
stance
2    0.888751
0    0.105757
1    0.005492
Name: proportion, dtype: float64


## 2. Tokenize Data

In [12]:
# Load the pretrained tokenizer for the xlm-roberta-base checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

def tokenize_data(texts, labels, max_length=256):
    """Tokenize texts and bundle them with their labels into a TensorDataset.

    Args:
        texts: pandas Series or other iterable of texts. Missing entries
            (None / NaN) are tokenized as empty strings.
        labels: integer class ids aligned with ``texts`` (Series, list or array).
        max_length: maximum number of tokens per example; longer inputs are
            truncated.

    Returns:
        TensorDataset of ``(input_ids, attention_mask, label)`` triples.
    """
    # Map missing values to '' before stringifying: a plain str() cast would
    # turn NaN/None into the literal words "nan"/"None" and feed them to the
    # model as if they were real text.
    texts = pd.Series(list(texts), dtype=object).fillna('').astype(str).tolist()

    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

    # Convert through NumPy so the labels are read positionally (independent of
    # any pandas index on `labels`), and force the integer dtype that the
    # classification loss expects for class ids.
    label_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)

# Build one TensorDataset per split from its processed text and encoded labels
train_dataset, val_dataset, test_dataset = (
    tokenize_data(split['processed_text'], split['stance_encoded'])
    for split in (train_data, val_data, test_data)
)

print("Data tokenized and prepared for model training.")



Data tokenized and prepared for model training.


## 3. Define Model and Training Functions

In [13]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Load pretrained XLM-RoBERTa with a classification head sized to the number of
# encoded stance classes, then place it on the selected device
# (Module.to returns the module itself, so `model` is the moved model)
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(le.classes_),
).to(device)

def train(model, train_loader, val_loader, epochs=3, lr=2e-5):
    """Fine-tune ``model`` and print the mean train/validation loss per epoch.

    Args:
        model: model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object exposing
            ``.loss``.
        train_loader: iterable of ``(input_ids, attention_mask, labels)``
            batches used for optimization.
        val_loader: iterable of batches of the same form, used only to compute
            the validation loss (no gradient updates).
        epochs: number of full passes over ``train_loader``.
        lr: learning rate for the AdamW optimizer.

    Returns:
        The same ``model`` object, with its weights updated in place.
    """
    # Use PyTorch's AdamW rather than the deprecated transformers.AdamW.
    # eps and weight_decay are set explicitly to the defaults of the
    # transformers implementation so the optimizer settings stay as before.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            # Batches arrive on the CPU; move each tensor to the module-level device
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation: same forward pass, but in eval mode and without gradients
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        mean_train_loss = train_loss / max(len(train_loader), 1)
        mean_val_loss = val_loss / max(len(val_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {mean_train_loss:.4f}, Val Loss: {mean_val_loss:.4f}")

    return model

def evaluate(model, test_loader):
    """Run ``model`` over ``test_loader`` and return a per-class text report.

    Args:
        model: model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``.logits``.
        test_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        The string produced by ``sklearn.metrics.classification_report``
        (precision / recall / F1 per class, 4 decimal digits).
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit for each example
            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    # classification_report measures len() of every target name, so the encoder's
    # (possibly integer) classes must be converted to strings first.
    class_names = [str(name) for name in le.classes_]
    # Pass the encoded ids explicitly so the report always has one row per class,
    # even when a rare class is absent from both the labels and the predictions.
    class_ids = list(range(len(class_names)))

    return classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0,  # report 0 for classes with no predictions or no support
    )

Using device: cpu


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Train the Model

In [14]:
# Batch the tokenized splits; only the training loader shuffles its examples
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

trained_model = train(model, train_loader, val_loader, epochs=3)

# Save the trained model. Create the target directory first so a missing
# 'models/' folder cannot make the save fail after the whole training run.
model_path = 'models/xlm_roberta_stance_detection.pth'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
torch.save(trained_model.state_dict(), model_path)
print("Model trained and saved.")

Epoch 1/3:   4%|▍         | 14/353 [01:36<34:31,  6.11s/it] 

: 

## 5. Evaluate the Model

In [None]:
# Score the trained model on the test loader and show the per-class report
evaluation_report = evaluate(trained_model, test_loader)
print("Model Evaluation Report:", evaluation_report, sep="\n")

## 6. Analyze Results and Next Steps

Based on the evaluation results, we can analyze the model's performance:

1. Overall accuracy: [Your observation]
2. Performance across different stances: [Your observation]
3. Performance across different languages: [Your observation]

Next steps:
1. Tune hyperparameters (e.g. learning rate, batch size, number of epochs) if necessary
2. Analyze misclassifications to understand model weaknesses
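3. Consider data augmentation, class weighting, or additional preprocessing steps, since the training set is heavily imbalanced (about 89% of examples are stance 2 and under 1% are stance 1)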
4. Implement the trained model in a stance detection script

With this model developed and evaluated, we can now move on to creating a script for detecting stances in new, unseen data.