In [None]:
!pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.35.57-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# load tagger
tagger = SequenceTagger.load("flair/pos-english")

# make example sentence
sentence = Sentence("I love Berlin.")

# predict NER tags
tagger.predict(sentence)

# print sentence
print(sentence)

# print predicted NER spans
print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('pos'):
    print(entity)

2024-11-11 18:59:13,393 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD
Sentence[4]: "I love Berlin." → ["I"/PRP, "love"/VBP, "Berlin"/NNP, "."/.]
The following NER tags are found:


### Initial pipeline:

In [3]:
!pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    XLMRobertaTokenizerFast,  # Changed from XLMRobertaTokenizer to XLMRobertaTokenizerFast
    XLMRobertaForTokenClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from typing import List, Dict, Tuple
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
class POSDataset(Dataset):
    def __init__(self, texts: List[List[str]], tags: List[List[str]],
                 tokenizer, tag2id: Dict[str, int], max_len: int = 128):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.tag2id = tag2id
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        words = self.texts[idx]
        tags = self.tags[idx]

        encoded = self.tokenizer(
            words,
            is_split_into_words=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        label_ids = []
        word_ids = encoded.word_ids()

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(self.tag2id[tags[word_idx]])

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'labels': torch.tensor(label_ids)
        }

In [7]:
class POSTaggingPipeline:
    def __init__(self, model_name: str = "xlm-roberta-base"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {self.device}")

        self.model_name = model_name
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
        logger.info("Initialized Fast tokenizer")

        self.tag2id = None
        self.id2tag = None
        self.model = None

    def prepare_data(self, dataset_name: str, split: str = "train") -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load and prepare data from Universal Dependencies dataset.

        Args:
            dataset_name (str): Name of the UD dataset (e.g., 'en_ewt', 'wo_wtb')
            split (str): Dataset split ('train', 'validation', 'test')

        Returns:
            Tuple[List[List[str]], List[List[str]]]: Tuple of (texts, tags)
        """
        logger.info(f"Loading {dataset_name} dataset, {split} split")
        try:
            # Load the dataset
            dataset = load_dataset("universal_dependencies", dataset_name)

            # Get the specified split
            data_split = dataset[split]

            # Extract texts and tags
            texts = [item['tokens'] for item in data_split]
            tags = [item['upos'] for item in data_split]

            logger.info(f"Loaded {len(texts)} sentences from {dataset_name} {split} split")

            # Basic validation
            assert all(len(text) == len(tag) for text, tag in zip(texts, tags)), \
                "Mismatch between text and tag lengths"

            return texts, tags

        except Exception as e:
            logger.error(f"Error loading dataset {dataset_name}: {str(e)}")
            raise

    def initialize_model(self, num_labels: int):
        """Initialize the model with proper classification head"""
        logger.info(f"Initializing model with {num_labels} labels")

        self.model = XLMRobertaForTokenClassification.from_pretrained(
            self.model_name,
            num_labels=num_labels
        )

        # Initialize classification layer
        torch.nn.init.xavier_uniform_(self.model.classifier.weight)
        torch.nn.init.zeros_(self.model.classifier.bias)

        self.model = self.model.to(self.device)
        logger.info("Model initialized and moved to device")

    def create_tag_mappings(self, tags: List[List[str]]):
        """Create tag to ID mappings"""
        unique_tags = sorted(list(set(tag for seq in tags for tag in seq)))
        self.tag2id = {tag: i for i, tag in enumerate(unique_tags)}
        self.id2tag = {i: tag for tag, i in self.tag2id.items()}
        logger.info(f"Created mappings for {len(unique_tags)} unique tags: {unique_tags}")

    def train(self, train_texts: List[List[str]], train_tags: List[List[str]],
              eval_texts: List[List[str]] = None, eval_tags: List[List[str]] = None,
              epochs: int = 3, batch_size: int = 16, learning_rate: float = 2e-5):
        """Train the model on source language data"""
        if self.tag2id is None:
            self.create_tag_mappings(train_tags)

        self.initialize_model(len(self.tag2id))

        train_dataset = POSDataset(train_texts, train_tags, self.tokenizer, self.tag2id)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        if eval_texts:
            eval_dataset = POSDataset(eval_texts, eval_tags, self.tokenizer, self.tag2id)
            eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

        optimizer = AdamW(self.model.parameters(), lr=learning_rate, weight_decay=0.01)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=total_steps // 10,
            num_training_steps=total_steps
        )

        logger.info("Starting training...")
        best_f1 = 0
        for epoch in range(epochs):
            self.model.train()
            total_loss = 0

            for batch_idx, batch in enumerate(train_loader):
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_loss += loss.item()

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()

                if (batch_idx + 1) % 100 == 0:
                    logger.info(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

            avg_loss = total_loss / len(train_loader)
            logger.info(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

            if eval_texts:
                metrics = self.evaluate(eval_loader)
                current_f1 = metrics['weighted avg']['f1-score']
                logger.info(f"Validation F1: {current_f1:.4f}")

                if current_f1 > best_f1:
                    best_f1 = current_f1
                    logger.info(f"New best F1 score: {best_f1:.4f}")

    def evaluate(self, eval_loader: DataLoader) -> Dict:
        """Evaluate the model"""
        self.model.eval()
        true_labels = []
        pred_labels = []

        logger.info("Starting evaluation...")
        with torch.no_grad():
            for batch in eval_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels']

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                predictions = torch.argmax(outputs.logits, dim=2)

                for i in range(labels.shape[0]):
                    true_seq = [self.id2tag[l.item()] for l in labels[i] if l.item() != -100]
                    pred_seq = [self.id2tag[p.item()] for p, l in zip(predictions[i], labels[i]) if l.item() != -100]
                    true_labels.extend(true_seq)
                    pred_labels.extend(pred_seq)

        return classification_report(true_labels, pred_labels, output_dict=True)

    def predict(self, texts: List[List[str]]) -> List[List[str]]:
        """Predict POS tags for new texts"""
        if self.model is None:
            raise ValueError("Model has not been trained yet!")

        self.model.eval()
        dataset = POSDataset(texts, [[self.id2tag[0]]*len(text) for text in texts],
                           self.tokenizer, self.tag2id)
        loader = DataLoader(dataset, batch_size=1)
        predictions = []

        logger.info("Starting prediction...")
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                preds = torch.argmax(outputs.logits, dim=2)
                pred_tags = [self.id2tag[p.item()] for p, m in zip(preds[0], batch['attention_mask'][0])
                           if m.item() == 1][1:-1]  # Remove CLS and SEP
                predictions.append(pred_tags)

        logger.info(f"Generated predictions for {len(predictions)} sentences")
        return predictions

In [8]:
# Initialize pipeline
pipeline = POSTaggingPipeline()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [9]:
# Load English data
en_train_texts, en_train_tags = pipeline.prepare_data("en_ewt", "train")
en_eval_texts, en_eval_tags = pipeline.prepare_data("en_ewt", "validation")

README.md:   0%|          | 0.00/191k [00:00<?, ?B/s]

universal_dependencies.py:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

The repository for universal_dependencies contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/universal_dependencies.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2077 [00:00<?, ? examples/s]

In [10]:
import random

In [11]:
# Adjust the dataset size by sampling a smaller fraction for quicker training
'''
en_train_texts_reduced = random.sample(en_train_texts, int(len(en_train_texts) * 0.1))
en_train_tags_reduced = random.sample(en_train_tags, int(len(en_train_tags) * 0.1))
en_eval_texts_reduced = random.sample(en_eval_texts, int(len(en_eval_texts) * 0.1))
en_eval_tags_reduced = random.sample(en_eval_tags, int(len(en_eval_tags) * 0.1))
'''

'\nen_train_texts_reduced = random.sample(en_train_texts, int(len(en_train_texts) * 0.1))\nen_train_tags_reduced = random.sample(en_train_tags, int(len(en_train_tags) * 0.1))\nen_eval_texts_reduced = random.sample(en_eval_texts, int(len(en_eval_texts) * 0.1))\nen_eval_tags_reduced = random.sample(en_eval_tags, int(len(en_eval_tags) * 0.1))\n'

In [12]:
# Adjust dataset size before training
# Assuming `train_texts` and `train_tags` are lists of texts and tags respectively
def reduce_dataset_size(texts, tags, fraction=0.1, random_seed=42):
    random.seed(random_seed)

    # Pair texts with their corresponding tags for sampling
    paired_data = list(zip(texts, tags))

    # Sample a fraction of the paired data
    reduced_data = random.sample(paired_data, int(len(paired_data) * fraction))

    # Unzip the reduced data back into separate lists
    reduced_texts, reduced_tags = zip(*reduced_data)
    return list(reduced_texts), list(reduced_tags)

# Reduce dataset size for training and evaluation
en_train_texts_reduced, en_train_tags_reduced = reduce_dataset_size(en_train_texts, en_train_tags, fraction=0.1)
en_eval_texts_reduced, en_eval_tags_reduced = reduce_dataset_size(en_eval_texts, en_eval_tags, fraction=0.1)

In [13]:
# Train model
pipeline.train(
    train_texts=en_train_texts_reduced,
    train_tags=en_train_tags_reduced,
    eval_texts=en_eval_texts_reduced,
    eval_tags=en_eval_tags_reduced,
    epochs=1
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Load Wolof test data
wo_texts, wo_tags = pipeline.prepare_data("wo_wtb", "test")

Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/717k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1188 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/449 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/470 [00:00<?, ? examples/s]

In [15]:
# Get predictions
predictions = pipeline.predict(wo_texts)

In [None]:
# Calculate metrics
from sklearn.metrics import classification_report
print(classification_report(
    [tag for sent in wo_tags for tag in sent],
    [tag for sent in predictions for tag in sent]
))