In [1]:
!pip install transformers datasets seqeval evaluate pytorch-crf

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datas

In [59]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from tqdm import tqdm
from torchcrf import CRF
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

In [60]:
# Load and preprocess data
data = pd.read_csv("/content/ner_dataset_mountains.csv")
data.dropna(subset=['word', 'tag'])
data = data[data['word'] != ',']
data.head()

Unnamed: 0,word,tag
0,Climbing,O
1,famous,O
2,mountains,O
3,around,O
4,the,O


In [61]:
# Assign sentence_id
sentence_id = 0
sentence_ids = []
for word in data['word']:
    sentence_ids.append(sentence_id)
    if word == '.':
        sentence_id += 1
data['sentence_id'] = sentence_ids

# Group words by sentences
data_gr = data.groupby("sentence_id").agg({'word': list, 'tag': list}).reset_index()
data_gr.head()

Unnamed: 0,sentence_id,word,tag
0,0,"[Climbing, famous, mountains, around, the, wor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,1,"[Mount, Everest, the, highest, peak, in, the, ...","[B-Mountain, I-Mountain, O, O, O, O, O, O, O, ..."
2,2,"[Located, in, the, Himalayas, on, the, border,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,3,"[Denali, also, known, as, Mount, McKinley, is,...","[B-Mountain, O, O, O, B-Mountain, O, O, O, O, ..."
4,4,"[Kilimanjaro, Africa, ', s, tallest, mountain,...","[B-Mountain, O, O, O, O, O, O, O, O, O, O, O, ..."


In [62]:
# Encode the tags
tag_encoder = LabelEncoder()
data['tag'] = tag_encoder.fit_transform(data['tag'])
data_gr['tag'] = data.groupby("sentence_id")['tag'].apply(list).reset_index(drop=True)
data_gr.head()

Unnamed: 0,sentence_id,word,tag
0,0,"[Climbing, famous, mountains, around, the, wor...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
1,1,"[Mount, Everest, the, highest, peak, in, the, ...","[0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,2,"[Located, in, the, Himalayas, on, the, border,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,3,"[Denali, also, known, as, Mount, McKinley, is,...","[0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,4,"[Kilimanjaro, Africa, ', s, tallest, mountain,...","[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [64]:
# Split dataset
train_sent, val_sent, train_tag, val_tag = train_test_split(data_gr['word'], data_gr['tag'], test_size=0.2, random_state=10)
train_sent, train_tag = train_sent.tolist(), train_tag.tolist()
val_sent, val_tag = val_sent.tolist(), val_tag.tolist()

In [65]:
# Class to determine the main parameters of the training
class Config:
    CLS = [101]
    SEP = [102]
    VALUE_TOKEN = [0]
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 32
    EPOCHS = 3
    TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    USE_CRF = True  # Option to use CRF or Focal Loss for optimization



In [66]:
# Dataset class to handle tokenization and formatting of the inputs
class Dataset:
    def __init__(self, texts, tags):
        self.texts = texts
        self.tags = tags

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        texts = self.texts[index]
        tags = self.tags[index]
        ids = []
        target_tag = []

        for i, s in enumerate(texts):
            inputs = Config.TOKENIZER.encode(s, add_special_tokens=False)
            input_len = len(inputs)
            ids.extend(inputs)
            target_tag.extend(input_len * [tags[i]])

        # Add special tokens [CLS], [SEP] and handle padding
        ids = ids[:Config.MAX_LEN - 2]
        target_tag = target_tag[:Config.MAX_LEN - 2]
        ids = Config.CLS + ids + Config.SEP
        target_tags = Config.VALUE_TOKEN + target_tag + Config.VALUE_TOKEN

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        # Add padding if the sentence is shorter than MAX_LEN
        padding_len = Config.MAX_LEN - len(ids)
        ids += [0] * padding_len
        target_tags += [0] * padding_len
        mask += [0] * padding_len
        token_type_ids += [0] * padding_len

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tags": torch.tensor(target_tags, dtype=torch.long)
        }

In [67]:
class NERBertModel(nn.Module):
    def __init__(self, num_tag, class_weights=None):
        super(NERBertModel, self).__init__()
        self.num_tag = num_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)
        self.crf = CRF(num_tag, batch_first=True) if Config.USE_CRF else None

        # Optional class weights for imbalance handling
        self.class_weights = class_weights

    def forward(self, ids, mask, token_type_ids, target_tags=None):
        output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)[0]
        bert_out = self.bert_drop(output)
        emissions = self.out_tag(bert_out)

        # Scale logits by class weights if provided
        if self.class_weights is not None:
            class_weights = self.class_weights.to(emissions.device)
            emissions = emissions * class_weights

        if target_tags is not None:
            if self.crf:
                # Training: CRF log-likelihood calculation
                log_likelihood = self.crf(emissions, target_tags, mask=mask.byte(), reduction='mean')
                return emissions, -log_likelihood
            else:
                # Training: Use Focal Loss if not using CRF
                loss_fn = FocalLoss(alpha=0.5, gamma=2)
                return emissions, loss_fn(emissions.view(-1, self.num_tag), target_tags.view(-1))

        if self.crf:
            # Inference: CRF decoding for predicted tag sequence
            pred_tags = self.crf.decode(emissions, mask=mask.byte())
            return pred_tags, None
        return emissions, None

In [68]:
# Calculate class weights based on tag frequency in the dataset
def calculate_class_weights(train_data_loader, num_tag):
    all_tags = [tag for sample in train_data_loader.dataset for tag in sample['target_tags'].tolist()]
    tag_counts = Counter(all_tags)
    total_tags = sum(tag_counts.values())
    class_weights = {tag: total_tags / (len(tag_counts) * count) for tag, count in tag_counts.items()}
    class_weights_list = [class_weights[i] for i in range(num_tag)]
    return torch.tensor(class_weights_list, dtype=torch.float)

In [69]:
# Focal Loss implementation to handle class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, ignore_index=-100):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.ce_loss = nn.CrossEntropyLoss(reduction='none', ignore_index=ignore_index)

    def forward(self, logits, target):
        ce_loss = self.ce_loss(logits, target)
        valid_mask = target != self.ignore_index
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        return focal_loss[valid_mask].mean()

In [70]:
# Train function
def train_fn(train_data_loader, model, optimizer, device, scheduler):
    model.train()
    total_loss = 0
    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        for key, value in data.items():
            data[key] = value.to(device)
        optimizer.zero_grad()
        _, loss = model(ids=data['ids'], mask=data['mask'], token_type_ids=data['token_type_ids'], target_tags=data['target_tags'])
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return model, total_loss / len(train_data_loader)

# Validation function
def val_fn(val_data_loader, model, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in tqdm(val_data_loader, total=len(val_data_loader)):
            for key, value in data.items():
                data[key] = value.to(device)
            _, loss = model(ids=data['ids'], mask=data['mask'], token_type_ids=data['token_type_ids'], target_tags=data['target_tags'])
            total_loss += loss.item()
    return total_loss / len(val_data_loader)

# Test function
def test_fn(test_data_loader, model, device):
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for data in tqdm(test_data_loader, total=len(test_data_loader)):
            for key, value in data.items():
                data[key] = value.to(device)
            pred_tags, _ = model(ids=data['ids'], mask=data['mask'], token_type_ids=data['token_type_ids'])
            all_predictions.extend(pred_tags)
            all_labels.extend(data['target_tags'].cpu().numpy())
    return all_predictions, all_labels

In [77]:
# Predict a single sentence and return the tokens, token IDs, and predicted tags
def predict_sentence(sentence, model, tokenizer, tag_encoder, device):
    model.eval()

    if isinstance(sentence, str):
        sentence = sentence.split()

    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, is_split_into_words=True)
    input_ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs.get('token_type_ids', None)

    if token_type_ids is not None:
        token_type_ids = token_type_ids.to(device)

    with torch.no_grad():
        pred_tags, _ = model(ids=input_ids, mask=mask, token_type_ids=token_type_ids)

    # Decode predicted tags
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    pred_tags = [tag_encoder.inverse_transform([pred])[0] for pred in pred_tags[0]]  # Decode CRF output

    return tokens, input_ids[0].cpu().numpy(), pred_tags

In [71]:
# Setup model
num_tag = len(tag_encoder.classes_)
class_weights_tensor = calculate_class_weights(DataLoader(Dataset(train_sent, train_tag)), num_tag)
model = NERBertModel(num_tag=num_tag, class_weights=class_weights_tensor).to(device)

In [72]:
# Setup optimizer and scheduler
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "gamma", "beta"])], "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in ["bias", "gamma", "beta"])], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-5)
num_train_steps = int(len(train_sent) / Config.TRAIN_BATCH_SIZE * Config.EPOCHS)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

In [73]:
# DataLoader setup
train_dataset = Dataset(train_sent, train_tag)
val_dataset = Dataset(val_sent, val_tag)
train_data_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE)
val_data_loader = DataLoader(val_dataset, batch_size=Config.VAL_BATCH_SIZE)

In [74]:
# Training loop
for epoch in range(Config.EPOCHS):
    model, train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    val_loss = val_fn(val_data_loader, model, device)
    print(f"Epoch: {epoch + 1}, Train_loss: {train_loss:.7f}, Val_loss: {val_loss:.7f}")

100%|██████████| 390/390 [02:20<00:00,  2.79it/s]
100%|██████████| 98/98 [00:16<00:00,  5.91it/s]


Epoch: 1, Train_loss: 10.6893846, Val_loss: 1.1245377


100%|██████████| 390/390 [02:21<00:00,  2.76it/s]
100%|██████████| 98/98 [00:16<00:00,  5.85it/s]


Epoch: 2, Train_loss: 0.8084274, Val_loss: 0.2314504


100%|██████████| 390/390 [02:20<00:00,  2.78it/s]
100%|██████████| 98/98 [00:16<00:00,  5.86it/s]

Epoch: 3, Train_loss: 0.3831736, Val_loss: 0.1696130





In [78]:
# Example sentence prediction
sentence = "Climbing Mount Everest is a great challenge."
tokens, token_ids, pred_tags = predict_sentence(sentence, model, Config.TOKENIZER, tag_encoder, device)
print(f"Tokens: {tokens}\nPredicted Tags: {pred_tags}")

Tokens: ['[CLS]', 'climbing', 'mount', 'everest', 'is', 'a', 'great', 'challenge', '.', '[SEP]']
Predicted Tags: ['B-Mountain', 'O', 'B-Mountain', 'I-Mountain', 'O', 'O', 'O', 'O', 'O', 'B-Mountain']


In [None]:
# Function to save the model
def save_model(model, path='ner_bert_model.pth'):
    torch.save(model.state_dict(), path)
    print(f"Model weights saved to {path}")

# Function to load the model
def load_model(model, path='ner_bert_model.pth'):
    model.load_state_dict(torch.load(path))
    model.eval()
    print(f"Model weights loaded from {path}")
    return model

# Function to save the LabelEncoder
def save_label_encoder(encoder, path='label_encoder.pkl'):
    joblib.dump(encoder, path)
    print(f"LabelEncoder saved to {path}")

# Function to load the model LabelEncoder
def load_label_encoder(path='label_encoder.pkl'):
    encoder = joblib.load(path)
    print(f"LabelEncoder loaded from {path}")
    return encoder

In [None]:
save_model(model, 'ner_bert_model.pth')
save_label_encoder(tag_encoder, 'ner_label_encoder.pkl')