In [1]:
import json
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
!git clone https://github.com/itsZiang/data.git

Cloning into 'data'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 16 (delta 4), reused 6 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (16/16), 7.97 MiB | 8.10 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [3]:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 4

In [4]:
# Load datasets
def load_data(filepath):
    with open(filepath, 'r') as f:
        return json.load(f)

# Extract all unique labels from data
def extract_labels(datasets):
    labels = set()
    for data in datasets:
        for item in data:
            for tag, slot, _, _, _ in item['span_info']:
                labels.add(f"B-{tag}:{slot}")
                labels.add(f"I-{tag}:{slot}")
    labels.add("O")
    return sorted(labels)


In [5]:
# Dataset class
class SlotFillingDataset(Dataset):
    def __init__(self, data, tokenizer, label2id, max_len=MAX_LEN):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        utterance = self.data[idx]['utterance']
        span_info = self.data[idx]['span_info']

        # Tokenize utterance
        tokens = self.tokenizer(utterance, truncation=True, padding='max_length', max_length=self.max_len, return_offsets_mapping=True)
        labels = ['O'] * len(tokens['input_ids'])

        for tag, slot, value, start, end in span_info:
            bio_tag = f"{tag}:{slot}"
            for i, (offset_start, offset_end) in enumerate(tokens['offset_mapping']):
                if offset_start == start:
                    labels[i] = f"B-{bio_tag}"
                elif offset_start > start and offset_end <= end:
                    labels[i] = f"I-{bio_tag}"

        labels = [self.label2id[label] for label in labels]

        return {
            'input_ids': torch.tensor(tokens['input_ids']),
            'attention_mask': torch.tensor(tokens['attention_mask']),
            'labels': torch.tensor(labels)
        }

In [6]:
# Define training function
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs=5):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        total_loss = 0

        # Training loop with tqdm
        train_progress = tqdm(train_loader, desc="Training", leave=False)
        for batch in train_progress:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            train_progress.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            val_progress = tqdm(val_loader, desc="Validating", leave=False)
            for batch in val_progress:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), '/kaggle/working/best_model.pt')

In [7]:
# Load and process data
data_train = load_data('/kaggle/working/data/data_slot_filling_train.json')
data_dev = load_data('/kaggle/working/data/data_slot_filling_dev.json')
data_test = load_data('/kaggle/working/data/data_slot_filling_test.json')



In [8]:
# Extract labels from all datasets
all_labels = extract_labels([data_train, data_dev, data_test])
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}

In [9]:
# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_dataset = SlotFillingDataset(data_train, tokenizer, label2id)
dev_dataset = SlotFillingDataset(data_dev, tokenizer, label2id)
test_dataset = SlotFillingDataset(data_test, tokenizer, label2id)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=VALID_BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [10]:
# Model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(all_labels))
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Train model
train_model(model, train_loader, dev_loader, optimizer, device)

Epoch 1/5


                                                                        

Train Loss: 0.4477, Val Loss: 0.1592
Epoch 2/5


                                                                        

Train Loss: 0.1441, Val Loss: 0.1149
Epoch 3/5


                                                                        

Train Loss: 0.1071, Val Loss: 0.1009
Epoch 4/5


                                                                         

Train Loss: 0.0897, Val Loss: 0.1063
Epoch 5/5


                                                                        

Train Loss: 0.0790, Val Loss: 0.1013




In [14]:
# Load best model
model.load_state_dict(torch.load('/kaggle/working/best_model.pt'))
model.eval()

  model.load_state_dict(torch.load('/kaggle/working/best_model.pt'))


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [48]:
# Predict example utterance
utterance = "I've booked that. Your reference number is 0ICM79OM ."
tokens = tokenizer(utterance, return_tensors="pt", truncation=True, padding=True, return_offsets_mapping=True)
offsets = tokens.pop("offset_mapping")[0]
input_ids = tokens['input_ids'].to(device)
attention_mask = tokens['attention_mask'].to(device)

outputs = model(input_ids, attention_mask=attention_mask)
logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=-1).squeeze().tolist()
predicted_tags = [id2label[label] for label in predicted_labels]

# Map tokens to slot-value pairs
decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
slot_value_pairs = []
current_slot = None
current_value = ""

for token, tag, offset in zip(decoded_tokens, predicted_tags, offsets):
    if tag.startswith("B-"):
        if current_slot is not None and current_value:
            slot_value_pairs.append((current_slot, current_value.strip()))
        current_slot = tag[2:]
        current_value = tokenizer.convert_tokens_to_string([token])
    elif tag.startswith("I-") and current_slot == tag[2:]:
        current_value += tokenizer.convert_tokens_to_string([token])
    else:
        if current_slot is not None and current_value:
            slot_value_pairs.append((current_slot, current_value.strip()))
        current_slot = None
        current_value = ""

if current_slot is not None and current_value:
    slot_value_pairs.append((current_slot, current_value.strip()))

print(f"Tokens: {decoded_tokens}")
print(f"Predicted Tags: {predicted_tags}")
print(f"Slot-Value Pairs: {slot_value_pairs}")

Tokens: ['[CLS]', 'their', 'phone', 'number', 'is', '01', '##22', '##33', '##53', '##11', '##0', '.', 'can', 'i', 'help', 'you', 'with', 'anything', 'else', 'today', '[SEP]']
Predicted Tags: ['O', 'O', 'O', 'O', 'O', 'B-Restaurant-Inform:phone', 'I-Attraction-Inform:phone', 'I-Attraction-Inform:phone', 'I-Attraction-Inform:phone', 'I-Attraction-Inform:phone', 'I-Attraction-Inform:phone', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Slot-Value Pairs: [('Restaurant-Inform:phone', '01')]


In [49]:
def predict_slot_values(text):
    # Tokenize input text
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, return_offsets_mapping=True)
    offsets = tokens.pop("offset_mapping")[0]
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Get model predictions
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=-1).squeeze().tolist()
    predicted_tags = [id2label[label] for label in predicted_labels]

    # Map tokens to slot-value pairs
    decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
    slot_value_pairs = []
    current_slot = None
    current_value_tokens = []
    current_start = None

    for token, tag, offset in zip(decoded_tokens, predicted_tags, offsets):
        if token in ["[CLS]", "[SEP]"]:  # Skip special tokens
            continue

        if tag.startswith("B-"):
            # Save the previous slot-value pair
            if current_slot is not None and current_value_tokens:
                value = text[current_start:offset[0]].strip()
                slot_value_pairs.append((current_slot, value))
            # Start a new slot-value pair
            current_slot = tag[2:]
            current_value_tokens = [token.replace("##", "")]
            current_start = offset[0]
        elif tag.startswith("I-") and current_slot == tag[2:]:
            # Append to the current slot-value pair
            current_value_tokens.append(token.replace("##", ""))
        else:
            # Save the previous slot-value pair
            if current_slot is not None and current_value_tokens:
                value = text[current_start:offset[0]].strip()
                slot_value_pairs.append((current_slot, value))
            # Reset for the next slot
            current_slot = None
            current_value_tokens = []
            current_start = None

    # Save the last slot-value pair
    if current_slot is not None and current_value_tokens:
        value = text[current_start:].strip()
        slot_value_pairs.append((current_slot, value))

    return slot_value_pairs


In [73]:
text = "Sure. There are several churches and an old schools attraction, all in the centre area. Do you have a preference?"
slot_value_pairs = predict_slot_values(text)
for slot, value in slot_value_pairs:
    print(f'slot: {slot} / value: {value}')

slot: Attraction-Inform:choice / value: several
slot: Attraction-Inform:type / value: churches
slot: Attraction-Inform:type / value: old schools
slot: Attraction-Inform:area / value: centre


In [75]:

def calculate_metrics_from_json(json_file_path, tokenizer, model, id2label, device):
    utterance_metrics = []
    entity_metrics = {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}
    total_utterances = 0
    utterance_correct = 0  # To calculate utterance-level accuracy

    # Load the JSON file
    with open(json_file_path, 'r') as f:
        test_data = json.load(f)
    
    total_utterances = len(test_data)

    for sample in test_data:
        # Ground truth entities
        ground_truth = {(tag, slot, value) for tag, slot, value, _, _ in sample['span_info']}
        
        # Predicted entities
        utterance = sample['utterance']
        predicted_slot_value_pairs = predict_slot_values(utterance)
        predicted = {(slot.split(':')[0], slot.split(':')[1], value) for slot, value in predicted_slot_value_pairs}
        
        # Entity-Level Metrics
        true_positives = len(ground_truth & predicted)  # Intersection of ground truth and predicted
        false_positives = len(predicted - ground_truth)  # Predicted but not in ground truth
        false_negatives = len(ground_truth - predicted)  # Ground truth but not predicted

        entity_metrics['true_positive'] += true_positives
        entity_metrics['false_positive'] += false_positives
        entity_metrics['false_negative'] += false_negatives

        # Utterance-Level Metrics
        utterance_correct += 1 if ground_truth == predicted else 0
        precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
        f1 = (2 * precision * recall / (precision + recall)) if precision + recall > 0 else 0

        utterance_metrics.append({'precision': precision, 'recall': recall, 'f1': f1})

    # Aggregate Utterance-Level Metrics
    avg_precision = sum(m['precision'] for m in utterance_metrics) / total_utterances
    avg_recall = sum(m['recall'] for m in utterance_metrics) / total_utterances
    avg_f1 = sum(m['f1'] for m in utterance_metrics) / total_utterances
    utterance_accuracy = utterance_correct / total_utterances

    # Aggregate Entity-Level Metrics
    tp, fp, fn = entity_metrics['true_positive'], entity_metrics['false_positive'], entity_metrics['false_negative']
    entity_precision = tp / (tp + fp) if tp + fp > 0 else 0
    entity_recall = tp / (tp + fn) if tp + fn > 0 else 0
    entity_f1 = (2 * entity_precision * entity_recall / (entity_precision + entity_recall)) if entity_precision + entity_recall > 0 else 0
    entity_accuracy = tp / (tp + fp + fn) if tp + fp + fn > 0 else 0

    # Print Results
    print("Utterance-Level Metrics:")
    print(f"Precision: {avg_precision:.2f}, Recall: {avg_recall:.2f}, F1 Score: {avg_f1:.2f}, Accuracy: {utterance_accuracy:.2f}")
    print("Entity-Level Metrics:")
    print(f"Precision: {entity_precision:.2f}, Recall: {entity_recall:.2f}, F1 Score: {entity_f1:.2f}, Accuracy: {entity_accuracy:.2f}")

# Example usage:
calculate_metrics_from_json('/kaggle/working/data/data_slot_filling_test.json', tokenizer, model, id2label, device)


Utterance-Level Metrics:
Precision: 0.70, Recall: 0.70, F1 Score: 0.70, Accuracy: 0.54
Entity-Level Metrics:
Precision: 0.72, Recall: 0.73, F1 Score: 0.72, Accuracy: 0.57


In [78]:
def calculate_metrics_from_json(json_file_path, tokenizer, model, id2label, device):
    utterance_metrics = []
    entity_metrics = {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}
    total_utterances = 0

    # Load the JSON file
    with open(json_file_path, 'r') as f:
        test_data = json.load(f)
    
    total_utterances = len(test_data)

    for sample in test_data:
        # Ground truth entities (set normalization, unordered)
        ground_truth = {(tag, slot, value) for tag, slot, value, _, _ in sample['span_info']}
        
        # Predicted entities (set normalization, unordered)
        utterance = sample['utterance']
        predicted_slot_value_pairs = predict_slot_values(utterance)
        predicted = {(slot.split(':')[0], slot.split(':')[1], value) for slot, value in predicted_slot_value_pairs}
        
        # Entity-Level Metrics
        true_positives = len(ground_truth & predicted)  # Intersection of ground truth and predicted
        false_positives = len(predicted - ground_truth)  # Predicted but not in ground truth
        false_negatives = len(ground_truth - predicted)  # Ground truth but not predicted

        entity_metrics['true_positive'] += true_positives
        entity_metrics['false_positive'] += false_positives
        entity_metrics['false_negative'] += false_negatives

        # Utterance-Level Metrics (exact match at entity level)
        precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
        f1 = (2 * precision * recall / (precision + recall)) if precision + recall > 0 else 0

        utterance_metrics.append({'precision': precision, 'recall': recall, 'f1': f1})

    # Aggregate Utterance-Level Metrics
    avg_precision = sum(m['precision'] for m in utterance_metrics) / total_utterances
    avg_recall = sum(m['recall'] for m in utterance_metrics) / total_utterances
    avg_f1 = sum(m['f1'] for m in utterance_metrics) / total_utterances

    # Aggregate Entity-Level Metrics
    tp, fp, fn = entity_metrics['true_positive'], entity_metrics['false_positive'], entity_metrics['false_negative']
    entity_precision = tp / (tp + fp) if tp + fp > 0 else 0
    entity_recall = tp / (tp + fn) if tp + fn > 0 else 0
    entity_f1 = (2 * entity_precision * entity_recall / (entity_precision + entity_recall)) if entity_precision + entity_recall > 0 else 0

    # Print Results
    print("Utterance-Level Metrics:")
    print(f"Precision: {avg_precision:.2f}, Recall: {avg_recall:.2f}, F1 Score: {avg_f1:.2f}")
    print("Entity-Level Metrics:")
    print(f"Precision: {entity_precision:.2f}, Recall: {entity_recall:.2f}, F1 Score: {entity_f1:.2f}")

# Example usage:
calculate_metrics_from_json('/kaggle/working/data/data_slot_filling_test.json', tokenizer, model, id2label, device)


Utterance-Level Metrics:
Precision: 0.70, Recall: 0.70, F1 Score: 0.70
Entity-Level Metrics:
Precision: 0.72, Recall: 0.73, F1 Score: 0.72
