In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import spacy
from collections import Counter
from tqdm.notebook import tqdm

# Load spaCy for tokenization and lemmatization
nlp = spacy.load("en_core_web_sm")

In [3]:
class TextPreprocessor:
    def __init__(self, remove_stopwords=True, lemmatize=True):
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
    
    def preprocess(self, text):
        doc = nlp(text)
        tokens = []
        
        for token in doc:
            # Skip stopwords if configured
            if self.remove_stopwords and token.is_stop:
                continue
            
            # Skip punctuation
            if token.is_punct:
                continue
                
            # Lemmatize if configured, otherwise use the original token
            processed_token = token.lemma_ if self.lemmatize else token.text
            tokens.append(processed_token.lower())
            
        return tokens

# NER Dataset


In [4]:
class NERDataset(Dataset):
    def __init__(self, dataset_split, max_length=128):
        self.dataset = dataset_split
        self.max_length = max_length
        
        # Build vocabulary and tag dictionary
        self.word2idx = {"<PAD>": 0, "<UNK>": 1}
        self.tag2idx = {"<PAD>": 0}
        self.idx2tag = {0: "<PAD>"}
        
        # Get tag names from dataset features
        tag_names = dataset_split.features['ner_tags'].feature.names
        for i, tag in enumerate(tag_names):
            self.tag2idx[tag] = i + 1  # +1 because 0 is for PAD
            self.idx2tag[i + 1] = tag
        
        # Build word vocabulary
        word_counter = Counter()
        for example in tqdm(dataset_split, desc="Building vocabulary"):
            for token in example['tokens']:
                word_counter[token.lower()] += 1
        
        # Keep only words that appear at least 2 times
        for word, count in word_counter.items():
            if count >= 2:
                self.word2idx[word] = len(self.word2idx)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        example = self.dataset[idx]
        tokens = example['tokens']
        tags = example['ner_tags']
        
        # Convert tokens to indices
        token_indices = []
        for token in tokens[:self.max_length]:
            token = token.lower()
            if token in self.word2idx:
                token_indices.append(self.word2idx[token])
            else:
                token_indices.append(self.word2idx["<UNK>"])
        
        # Pad sequences
        padding_length = self.max_length - len(token_indices)
        if padding_length > 0:
            token_indices = token_indices + [self.word2idx["<PAD>"]] * padding_length
            tags = tags[:self.max_length] + [0] * padding_length  # 0 is PAD tag
        else:
            token_indices = token_indices[:self.max_length]
            tags = tags[:self.max_length]
        
        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * min(len(tokens), self.max_length) + [0] * padding_length
        
        return {
            'input_ids': torch.tensor(token_indices, dtype=torch.long),
            'tags': torch.tensor(tags, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

# Text Classification Dataset


In [5]:
class TextClassificationDataset(Dataset):
    def __init__(self, dataset_split, preprocessor=None, max_length=128):
        self.dataset = dataset_split
        self.preprocessor = preprocessor or TextPreprocessor()
        self.max_length = max_length
        
        # Build vocabulary
        self.word2idx = {"<PAD>": 0, "<UNK>": 1}
        
        # Process all texts to build vocabulary
        word_counter = Counter()
        for example in tqdm(dataset_split, desc="Building vocabulary"):
            tokens = self.preprocessor.preprocess(example['text'])
            for token in tokens:
                word_counter[token] += 1
        
        # Keep only words that appear at least 5 times
        for word, count in word_counter.items():
            if count >= 5:
                self.word2idx[word] = len(self.word2idx)
        
        # Get class names
        self.num_classes = len(dataset_split.features['label'].names)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        example = self.dataset[idx]
        text = example['text']
        label = example['label']
        
        # Preprocess text
        tokens = self.preprocessor.preprocess(text)
        
        # Convert tokens to indices
        token_indices = []
        for token in tokens[:self.max_length]:
            if token in self.word2idx:
                token_indices.append(self.word2idx[token])
            else:
                token_indices.append(self.word2idx["<UNK>"])
        
        # Pad sequences
        padding_length = self.max_length - len(token_indices)
        if padding_length > 0:
            token_indices = token_indices + [self.word2idx["<PAD>"]] * padding_length
        else:
            token_indices = token_indices[:self.max_length]
        
        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * min(len(tokens), self.max_length) + [0] * padding_length
        
        return {
            'input_ids': torch.tensor(token_indices, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

# Create data loaders


In [6]:
def create_data_loaders(train_dataset, val_dataset, batch_size=32):
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2
    )
    
    return train_loader, val_loader

In [7]:
from datasets import load_dataset

# Load NER dataset
ner_dataset = load_dataset("conll2003")

# Load text classification dataset
text_classification_dataset = load_dataset("ag_news")

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

# Test the datasets


In [8]:
ner_train_dataset = NERDataset(ner_dataset['train'])
ner_val_dataset = NERDataset(ner_dataset['validation'])

text_preprocessor = TextPreprocessor(remove_stopwords=True, lemmatize=True)
text_train_dataset = TextClassificationDataset(text_classification_dataset['train'], preprocessor=text_preprocessor)
text_val_dataset = TextClassificationDataset(text_classification_dataset['test'], preprocessor=text_preprocessor)

print(f"NER vocabulary size: {len(ner_train_dataset.word2idx)}")
print(f"NER tag set size: {len(ner_train_dataset.tag2idx)}")
print(f"Text classification vocabulary size: {len(text_train_dataset.word2idx)}")
print(f"Number of classes: {text_train_dataset.num_classes}")

Building vocabulary:   0%|          | 0/14041 [00:00<?, ?it/s]

Building vocabulary:   0%|          | 0/3250 [00:00<?, ?it/s]

Building vocabulary:   0%|          | 0/120000 [00:00<?, ?it/s]

Building vocabulary:   0%|          | 0/7600 [00:00<?, ?it/s]

NER vocabulary size: 10951
NER tag set size: 10
Text classification vocabulary size: 24429
Number of classes: 4


# Sample a batch


In [9]:
ner_train_loader, ner_val_loader = create_data_loaders(ner_train_dataset, ner_val_dataset)
text_train_loader, text_val_loader = create_data_loaders(text_train_dataset, text_val_dataset)

sample_ner_batch = next(iter(ner_train_loader))
sample_text_batch = next(iter(text_train_loader))

print("NER batch shape:")
print(f"Input IDs: {sample_ner_batch['input_ids'].shape}")
print(f"Tags: {sample_ner_batch['tags'].shape}")
print(f"Attention mask: {sample_ner_batch['attention_mask'].shape}")

print("\nText classification batch shape:")
print(f"Input IDs: {sample_text_batch['input_ids'].shape}")
print(f"Labels: {sample_text_batch['label'].shape}")
print(f"Attention mask: {sample_text_batch['attention_mask'].shape}")

NER batch shape:
Input IDs: torch.Size([32, 128])
Tags: torch.Size([32, 128])
Attention mask: torch.Size([32, 128])

Text classification batch shape:
Input IDs: torch.Size([32, 128])
Labels: torch.Size([32])
Attention mask: torch.Size([32, 128])
