In [3]:
# Import necessary libraries
import pandas as pd

# Define file paths
train_file = "conll2003/eng.testb"
valid_file = "conll2003/eng.testa"
test_file = "conll2003/eng.train"

# Function to read CoNLL-2003 formatted files
def read_conll_data(conll2003):
    with open(conll2003, "r", encoding="utf-8") as f:
        lines = f.readlines()
    
    sentences, sentence, labels = [], [], []
    
    for line in lines:
        line = line.strip()
        if line:  # If the line is not empty
            parts = line.split()
            sentence.append(parts[0])  # Token (word)
            labels.append(parts[-1])  # NER label (last column)
        else:  # Empty line means the end of a sentence
            if sentence:
                sentences.append((sentence, labels))
                sentence, labels = [], []  # Reset for the next sentence
    
    return sentences

# Load dataset
train_data = read_conll_data(train_file)
valid_data = read_conll_data(valid_file)
test_data = read_conll_data(test_file)

# Print sample sentence
print("Sample sentence:", train_data[0])


Sample sentence: (['-DOCSTART-'], ['O'])


### Dataset Structure: First Sample Sentence

The dataset follows the **CoNLL-2003 format**, where each sentence consists of words and their corresponding **Named Entity Recognition (NER) labels**.


### **Explanation:**
- The first sentence contains only the token `-DOCSTART-`, which is a **document separator marker** in the dataset.
- The label **"O"** (Outside) means that this token **does not belong to any named entity**.
- This confirms that our dataset is correctly loaded and structured.

### **Key Takeaways:**
✅ **The dataset is read correctly, and sentences are extracted.**  
✅ **"-DOCSTART-" is a document boundary marker, not an actual sentence.**  
✅ **Each token has a corresponding entity label.**


In [4]:
# Convert dataset into a structured DataFrame
train_sentences, train_labels = zip(*train_data)  # Unpack sentences and labels
train_df = pd.DataFrame({"sentence": train_sentences, "labels": train_labels})

# Display first few rows
print(train_df.head())


                                            sentence  \
0                                       [-DOCSTART-]   
1  [SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...   
2                                     [Nadim, Ladki]   
3    [AL-AIN, ,, United, Arab, Emirates, 1996-12-06]   
4  [Japan, began, the, defence, of, their, Asian,...   

                                              labels  
0                                                [O]  
1       [O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O]  
2                                     [B-PER, I-PER]  
3                 [B-LOC, O, B-LOC, I-LOC, I-LOC, O]  
4  [B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...  


## Dataset Formatting: Converting Sentences to DataFrame

We convert the dataset into a **structured Pandas DataFrame** for easier manipulation and analysis.

### **Output:**
| Sentence Example | Corresponding Labels |
|-----------------|----------------------|
| `[-DOCSTART-]` | `[O]` |
| `[SOCCER, -, JAPAN, GET, LUCKY, WIN, ...]` | `[O, O, B-LOC, O, O, O, ...]` |
| `[Nadim, Ladki]` | `[B-PER, I-PER]` |
| `[AL-AIN, ,, United, Arab, Emirates, ...]` | `[B-LOC, O, B-LOC, I-LOC, ...]` |

### **Explanation:**
- **Sentences and Labels:** The dataset is structured into two columns:
  - `sentence`: A list of tokens (words).
  - `labels`: Corresponding **NER labels**.
- **Entity Tagging:**
  - `B-LOC` (Beginning of a Location) → "JAPAN".
  - `B-PER` (Beginning of a Person) → "Nadim".
  - `I-PER` (Inside a Person Entity) → "Ladki".
- **BIO Tagging Scheme:**
  - `B-` (Beginning of an entity).
  - `I-` (Inside an entity).
  - `O` (Outside, not an entity).

### **Key Takeaways:**
✅ **The dataset is correctly structured in a DataFrame.**  
✅ **Entity labels follow the BIO tagging format.**  
✅ **Named entities (like persons, locations) are properly labeled.**  


In [5]:
from transformers import AutoTokenizer

# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Function to tokenize and align labels
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,  # Keeps word boundaries
        padding="max_length",  # Pads sequences to the same length
        truncation=True,  # Truncates long sequences
        max_length=128,  # Limit input length
        return_tensors="pt"  # Returns PyTorch tensors
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokenized words to original
        new_labels = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        aligned_labels.append(new_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Apply tokenization on training data
tokenized_train_data = tokenize_and_align_labels(train_sentences[:5], train_labels[:5])  # Tokenize first 5 sentences

# Check output
print("Tokenized Example:", tokenizer.convert_ids_to_tokens(tokenized_train_data["input_ids"][0]))
print("Aligned Labels:", tokenized_train_data["labels"][0])


Tokenized Example: ['[CLS]', '-', 'D', '##OC', '##ST', '##AR', '##T', '-', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

## Tokenization with BERT

BERT uses **WordPiece Tokenization**, which splits words into **subwords** and aligns entity labels accordingly.

### **Tokenized Example:**
['[CLS]', '-', 'D', '##OC', '##ST', '##AR', '##T', '-', '[SEP]', '[PAD]', '[PAD]', ...]


### **Explanation:**
- **[CLS]** → Special token added at the start for classification tasks.
- **[SEP]** → Separator token marking the end of a sentence.
- **[PAD]** → Padding tokens added to ensure all sequences have the same length.
- **Subword Tokenization:**  
  - `"-DOCSTART-"` is split into: `'-', 'D', '##OC', '##ST', '##AR', '##T', '-'`.
  - The `##` symbol indicates that the subword is part of a previous word.

### **Aligned Labels:**
[-100, 'O', 'O', 'O', 'O', 'O', 'O', 'O', -100, -100, -100, ...]


- **`-100`** → Used for special tokens like `[CLS]`, `[SEP]`, and `[PAD]`, ensuring they are ignored during training.
- **"O" Labels:** These remain properly mapped to actual tokens.

### **Key Takeaways:**
✅ **BERT’s tokenizer splits words into subwords (WordPiece Tokenization).**  
✅ **Special tokens (`[CLS]`, `[SEP]`, `[PAD]`) are handled correctly.**  
✅ **NER labels are aligned to tokenized words, ignoring padding.**  


In [6]:
import pandas as pd
import torch

# Function to convert tokenized dataset into a structured format
def convert_to_dataframe(tokenized_data, sentences, labels):
    data = []
    for i in range(len(sentences)):  # Iterate over each sentence
        tokens = tokenizer.convert_ids_to_tokens(tokenized_data["input_ids"][i])  # Convert token IDs to words
        input_ids = tokenized_data["input_ids"][i].tolist()  # Token IDs
        attention_mask = tokenized_data["attention_mask"][i].tolist()  # Attention mask
        aligned_labels = tokenized_data["labels"][i]  # Labels aligned with tokens
        
        # Store the processed sentence
        data.append({
            "tokens": tokens,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": aligned_labels
        })

    return pd.DataFrame(data)

# Convert the training dataset to a DataFrame
train_df = convert_to_dataframe(tokenized_train_data, train_sentences[:5], train_labels[:5])  # Storing only 5 sentences for testing

# Display first few rows
print(train_df.head())


                                              tokens  \
0  [[CLS], -, D, ##OC, ##ST, ##AR, ##T, -, [SEP],...   
1  [[CLS], S, ##OC, ##CE, ##R, -, J, ##AP, ##AN, ...   
2  [[CLS], Na, ##di, ##m, La, ##d, ##ki, [SEP], [...   
3  [[CLS], AL, -, AI, ##N, ,, United, Arab, Emira...   
4  [[CLS], Japan, began, the, defence, of, their,...   

                                           input_ids  \
0  [101, 118, 141, 9244, 9272, 12426, 1942, 118, ...   
1  [101, 156, 9244, 10954, 2069, 118, 147, 12240,...   
2  [101, 11896, 3309, 1306, 2001, 1181, 2293, 102...   
3  [101, 18589, 118, 19016, 2249, 117, 1244, 4699...   
4  [101, 1999, 1310, 1103, 6465, 1104, 1147, 3141...   

                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [7]:
train_df.to_csv("conll2003/tokenized_train.csv", index=False)
print("Tokenized training dataset saved as CSV!")


Tokenized training dataset saved as CSV!


In [8]:
train_df.to_json("conll2003/tokenized_train.json", orient="records", lines=True)
print("Tokenized training dataset saved as JSON!")


Tokenized training dataset saved as JSON!


In [9]:
# Load the CSV
loaded_train_df = pd.read_csv("conll2003/tokenized_train.csv")
print("Loaded Tokenized Dataset (CSV):")
print(loaded_train_df.head())

# Load the JSON
loaded_json_df = pd.read_json("conll2003/tokenized_train.json", orient="records", lines=True)
print("Loaded Tokenized Dataset (JSON):")
print(loaded_json_df.head())


Loaded Tokenized Dataset (CSV):
                                              tokens  \
0  ['[CLS]', '-', 'D', '##OC', '##ST', '##AR', '#...   
1  ['[CLS]', 'S', '##OC', '##CE', '##R', '-', 'J'...   
2  ['[CLS]', 'Na', '##di', '##m', 'La', '##d', '#...   
3  ['[CLS]', 'AL', '-', 'AI', '##N', ',', 'United...   
4  ['[CLS]', 'Japan', 'began', 'the', 'defence', ...   

                                           input_ids  \
0  [101, 118, 141, 9244, 9272, 12426, 1942, 118, ...   
1  [101, 156, 9244, 10954, 2069, 118, 147, 12240,...   
2  [101, 11896, 3309, 1306, 2001, 1181, 2293, 102...   
3  [101, 18589, 118, 19016, 2249, 117, 1244, 4699...   
4  [101, 1999, 1310, 1103, 6465, 1104, 1147, 3141...   

                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  [1, 1, 1, 1

### Final Deliverables:


✅ Tokenized and cleaned dataset saved as CSV & JSON.



✅ Ready for training the BERT model.