<a href="https://colab.research.google.com/github/iamedobor/DynaQA-1/blob/main/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install transformers datasets scikit-learn matplotlib torch nltk

# Import required libraries
import numpy as np
import torch
import torch.nn as nn
import random
from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from transformers import BertModel, BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import torch.optim as optim
import copy
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Set device to CPU
device = torch.device("cpu")

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

**Baseline for SNIPS Dataset**

In [2]:
# Silence scikit-learn's UndefinedMetricWarning so the per-epoch metric
# printouts below are not interleaved with warnings.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Everything in this cell runs on the CPU.
device = torch.device("cpu")

# Load the SNIPS built-in-intents dataset from the Hugging Face Hub.
# NOTE(review): the captured cell output shows an interactive
# "run the custom code? [y/N]" prompt for this call and suggests passing
# `trust_remote_code=True` to avoid it.
snips_dataset = load_dataset("snips_built_in_intents")

# Only a "train" split is used here; carve an 80/20 train/validation split
# out of it by index, with a fixed seed so the split is repeatable.
train_dataset = snips_dataset["train"]
train_indices, val_indices = train_test_split(list(range(len(train_dataset))), test_size=0.2, random_state=42)

# Materialize the two index sets as dataset views.
train_split = train_dataset.select(train_indices)
val_split = train_dataset.select(val_indices)

# Frozen BERT encoder used by SnipsDataset to turn token ids into embeddings.
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.to(device)
bert_model.eval()  # Set BERT to evaluation mode

class SnipsDataset(Dataset):
    """Dataset that serves frozen-BERT token embeddings for each utterance.

    Every item is tokenized to a fixed ``max_len`` and pushed through the
    module-level ``bert_model`` (on the module-level ``device``) with
    gradient tracking disabled.

    Args:
        dataset: Indexable collection whose items expose ``"text"`` and
            ``"label"`` keys.
        tokenizer: Tokenizer called with ``return_tensors="pt"``; its result
            must provide ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every utterance is padded or truncated to.
    """

    def __init__(self, dataset, tokenizer, max_len=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of utterances in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``{"embeddings", "label", "text"}`` for one utterance.

        ``embeddings`` is ``last_hidden_state`` with the leading batch
        dimension removed; ``label`` is a 0-d ``torch.long`` tensor.
        """
        data = self.dataset[index]
        input_text = data["text"]
        label = data["label"]

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # The tokenizer output already carries the batch dimension BERT
        # expects, so feed it straight in. The previous squeeze()/unsqueeze(0)
        # round-trip used a bare squeeze(), which also drops the sequence
        # dimension whenever max_len == 1.
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            outputs = bert_model(input_ids, attention_mask=attention_mask)

        # Remove only the batch dimension.
        embeddings = outputs.last_hidden_state.squeeze(0)

        return {
            "embeddings": embeddings,
            "label": torch.tensor(label, dtype=torch.long),
            "text": input_text
        }

def custom_collate_fn(batch):
    """Combine per-example dicts into a single batch dict.

    Args:
        batch: Sequence of dicts, each with an ``'embeddings'`` tensor, a
            ``'label'`` tensor and a ``'text'`` value.

    Returns:
        Dict with ``'embeddings'`` and ``'label'`` stacked along a new
        leading dimension and ``'text'`` collected into a plain list.
    """
    stacked_embeddings = torch.stack([sample['embeddings'] for sample in batch])
    stacked_labels = torch.stack([sample['label'] for sample in batch])
    raw_texts = [sample['text'] for sample in batch]

    return dict(embeddings=stacked_embeddings, label=stacked_labels, text=raw_texts)

def create_data_loader(dataset, tokenizer, batch_size=16, shuffle=True):
    """Wrap ``dataset`` in a SnipsDataset and return a DataLoader over it.

    Args:
        dataset: Collection of examples accepted by ``SnipsDataset``.
        tokenizer: Tokenizer forwarded to ``SnipsDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles examples every epoch.
            Defaults to ``True`` (the previous hard-coded behaviour); pass
            ``False`` for evaluation data so runs are repeatable.

    Returns:
        A ``DataLoader`` that collates with ``custom_collate_fn``.
    """
    ds = SnipsDataset(dataset, tokenizer)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, collate_fn=custom_collate_fn)

snips_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_data_loader = create_data_loader(train_split, snips_tokenizer)
# Validation data is only evaluated, so keep it in a fixed order: the batch
# composition (and the sample predictions printed later) stay the same
# between runs.
val_data_loader = create_data_loader(val_split, snips_tokenizer, shuffle=False)

# 1. Rule-Based Baseline
def rule_based_response(input_text):
    """Return the canned reply for the first rule keyword found in the text.

    Matching is a case-insensitive substring test, tried in the order the
    rules are listed; a fallback apology is returned when nothing matches.
    """
    keyword_replies = (
        ("weather", "It looks like it will be sunny."),
        ("location", "Your current location is being shared."),
        ("restaurant", "Per Se is quite crowded right now."),
        ("itinerary", "Here is the fastest itinerary for your trip."),
        ("photos", "Here are some photos of Mondrian Soho."),
    )
    lowered = input_text.lower()
    return next(
        (reply for keyword, reply in keyword_replies if keyword in lowered),
        "I'm sorry, I don't understand your request.",
    )

# Map each canned rule-based response string to the integer class id that is
# compared against the dataset labels during evaluation.
# NOTE(review): these ids (0-5) are assigned by hand; nothing in this cell
# derives them from the dataset's own label ids. Confirm they line up with
# the dataset's label names before trusting the rule-based metrics.
rule_based_mapping = {
    "It looks like it will be sunny.": 0,
    "Your current location is being shared.": 1,
    "Per Se is quite crowded right now.": 2,
    "Here is the fastest itinerary for your trip.": 3,
    "Here are some photos of Mondrian Soho.": 4,
    "I'm sorry, I don't understand your request.": 5  # Assuming 5 is an appropriate label for unrecognized intents
}

def evaluate_rule_based(data_loader):
    """Run the keyword baseline over every batch of ``data_loader``.

    Each batch must provide ``"text"`` (an iterable of strings) and
    ``"label"`` (a tensor). Responses are converted to class ids through
    ``rule_based_mapping``; unmapped responses fall back to id 5.

    Returns:
        ``(texts, predictions, labels)`` as three flat lists in loader order.
    """
    seen_texts, predicted, gold = [], [], []

    for batch in data_loader:
        batch_texts = batch["text"]
        batch_gold = batch["label"].detach().cpu().numpy().flatten()

        # Convert each canned response to its numeric class id.
        batch_predicted = [
            rule_based_mapping.get(rule_based_response(utterance), 5)
            for utterance in batch_texts
        ]

        predicted += batch_predicted
        gold.extend(batch_gold)
        seen_texts.extend(batch_texts)

    return seen_texts, predicted, gold

# 2. Retrieval-Based Baseline
def prepare_tfidf_vectorizer(train_texts):
    """Fit a default-configured TfidfVectorizer on ``train_texts`` and return it."""
    tfidf = TfidfVectorizer()
    tfidf.fit(train_texts)
    return tfidf

def retrieval_based_response(input_text, vectorizer, train_texts, train_labels, train_vectors=None):
    """Return the label of the training text most similar to ``input_text``.

    Similarity is cosine similarity between TF-IDF vectors.

    Args:
        input_text: Query utterance.
        vectorizer: Fitted vectorizer exposing ``transform``.
        train_texts: Training utterances, aligned with ``train_labels``.
        train_labels: Labels indexed by training-example position.
        train_vectors: Optional pre-computed ``vectorizer.transform(train_texts)``.
            When omitted the training texts are vectorized on this call,
            which is expensive if done once per query.

    Returns:
        The element of ``train_labels`` at the best-matching position.
    """
    if train_vectors is None:
        train_vectors = vectorizer.transform(train_texts)
    input_vector = vectorizer.transform([input_text])

    cosine_similarities = cosine_similarity(input_vector, train_vectors).flatten()
    most_similar_idx = cosine_similarities.argmax()

    return train_labels[most_similar_idx]

def evaluate_retrieval_based(data_loader, vectorizer, train_texts, train_labels):
    """Run the TF-IDF nearest-neighbour baseline over ``data_loader``.

    Each batch must provide ``"text"`` (an iterable of strings) and
    ``"label"`` (a tensor).

    Returns:
        ``(texts, predictions, labels)`` as three flat lists in loader order.
    """
    all_preds = []
    all_labels = []
    all_texts = []

    # The training matrix does not depend on the query, so build it once
    # instead of re-vectorizing the whole training set for every utterance.
    train_vectors = vectorizer.transform(train_texts)

    for d in data_loader:
        texts = d["text"]
        labels = d["label"].detach().cpu().numpy().flatten()

        preds = [
            retrieval_based_response(text, vectorizer, train_texts, train_labels, train_vectors=train_vectors)
            for text in texts
        ]
        all_preds.extend(preds)
        all_labels.extend(labels)
        all_texts.extend(texts)

    return all_texts, all_preds, all_labels

# Pull the raw texts and labels out of train_split and fit the TF-IDF
# vectorizer that the retrieval-based baseline searches against.
train_texts = [example["text"] for example in train_split]
train_labels = [example["label"] for example in train_split]
vectorizer = prepare_tfidf_vectorizer(train_texts)

# 3. LSTM-Based Seq2Seq Model
class LSTMSeq2Seq(nn.Module):
    """Encoder/decoder LSTM classifier over a sequence of embeddings.

    The encoder condenses the input sequence into its final hidden state.
    That summary is presented to the decoder at every time step, and the
    decoder's last output is projected to class logits.

    Args:
        input_dim: Feature size of each input time step.
        hidden_dim: Hidden size shared by the encoder and the decoder.
        output_dim: Number of output classes.
        num_layers: Number of stacked layers in both LSTMs.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch_size, output_dim)``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, input_dim)``.
        """
        # Unpacking also validates that x is 3-dimensional.
        _, seq_length, _ = x.size()
        _, (hidden, _) = self.encoder(x)

        # hidden is (num_layers, batch, hidden_dim). Use only the top layer:
        # repeating the full stack along dim 0 would interleave layers along
        # the time axis and feed the decoder num_layers * seq_length steps.
        summary = hidden[-1]
        decoder_input = summary.unsqueeze(1).repeat(1, seq_length, 1)

        output, _ = self.decoder(decoder_input)
        # Classify from the decoder's final time step.
        return self.fc(output[:, -1, :])

# Hyperparameters and training objects for the LSTM baseline.
input_dim = 768  # Size of input embeddings from BERT
hidden_dim = 256  # Size of hidden layer
# One logit per distinct label value found in the training split.
# NOTE(review): assumes label ids are contiguous from 0 — confirm.
output_dim = len(snips_dataset["train"].unique("label"))  # Number of output classes

lstm_model = LSTMSeq2Seq(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(lstm_model.parameters(), lr=1e-3)

def train_lstm_model(model, data_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch must provide ``"embeddings"`` and ``"label"`` tensors.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` where ``avg_loss`` is
        the mean of the per-batch losses and precision/recall/F1 are
        weighted averages over everything seen during the pass.
    """
    model.train()
    running_loss = 0
    predicted, gold = [], []

    for batch in data_loader:
        features = batch["embeddings"].to(device)
        targets = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Highest-scoring class per example.
        predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        gold.extend(targets.cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return mean_loss, accuracy, precision, recall, f1

def evaluate_lstm_model(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating its weights.

    Each batch must provide ``"embeddings"`` and ``"label"`` tensors.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` where ``avg_loss`` is
        the mean of the per-batch losses and precision/recall/F1 are
        weighted averages.
    """
    model.eval()
    running_loss = 0
    predicted, gold = [], []

    with torch.no_grad():
        for batch in data_loader:
            features = batch["embeddings"].to(device)
            targets = batch["label"].to(device)

            logits = model(features)
            running_loss += criterion(logits, targets).item()

            # Highest-scoring class per example.
            predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            gold.extend(targets.cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return mean_loss, accuracy, precision, recall, f1

# Early stopping class
class EarlyStopping:
    """Flag when validation loss has stopped improving.

    A call counts as an improvement only if the loss drops below the best
    value seen so far by more than ``min_delta``. After ``patience``
    consecutive calls without improvement, ``early_stop`` becomes ``True``.
    """

    def __init__(self, patience=3, min_delta=0.01):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        # The very first observation just becomes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# The LSTM baseline trains and validates on the same loaders built above.
train_data_loader_lstm = train_data_loader
val_data_loader_lstm = val_data_loader

# Training loop for LSTM: up to EPOCHS passes, stopped early once the
# validation loss fails to improve by min_delta for `patience` epochs in a row.
print("Training LSTM-Based Seq2Seq Model")
EPOCHS = 20
early_stopping = EarlyStopping(patience=3, min_delta=0.01)
for epoch in range(EPOCHS):
    train_loss, train_accuracy, train_precision, train_recall, train_f1 = train_lstm_model(lstm_model, train_data_loader_lstm, criterion, optimizer, device)
    val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate_lstm_model(lstm_model, val_data_loader_lstm, criterion, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1: {train_f1}")
    print(f"Val Loss: {val_loss}, Val Accuracy: {val_accuracy}, Val Precision: {val_precision}, Val Recall: {val_recall}, Val F1: {val_f1}")
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping")
        break

# "Test" evaluation.
# NOTE(review): no separate test split is created in this cell; the
# validation loader is reused, so the "Test" numbers printed below are
# validation numbers for the same data that drove early stopping.
test_data_loader_lstm = val_data_loader  # Using val_data_loader as test data_loader

test_loss, test_accuracy, test_precision, test_recall, test_f1 = evaluate_lstm_model(lstm_model, test_data_loader_lstm, criterion, device)
print(f"LSTM-Based Seq2Seq Model - Test Loss: {test_loss}, Test Accuracy: {test_accuracy}, Test Precision: {test_precision}, Test Recall: {test_recall}, Test F1: {test_f1}")

# Evaluate rule-based baseline on the validation loader and show a few
# sample predictions.
print("\nEvaluating Rule-Based Baseline")
texts, preds, labels = evaluate_rule_based(val_data_loader)
print("Rule-Based Baseline Results:")
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
# Indexing the first five results assumes the loader yielded at least five examples.
for i in range(5):
    print(f"Text: {texts[i]}")
    print(f"Prediction: {preds[i]}")
    print(f"Label: {labels[i]}")
    print()

# Evaluate retrieval-based baseline (TF-IDF nearest neighbour over the
# training texts) on the validation loader and show a few sample predictions.
print("\nEvaluating Retrieval-Based Baseline")
texts, preds, labels = evaluate_retrieval_based(val_data_loader, vectorizer, train_texts, train_labels)
print("Retrieval-Based Baseline Results:")
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
# Indexing the first five results assumes the loader yielded at least five examples.
for i in range(5):
    print(f"Text: {texts[i]}")
    print(f"Prediction: {preds[i]}")
    print(f"Label: {labels[i]}")
    print()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.56k [00:00<?, ?B/s]

The repository for snips_built_in_intents contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/snips_built_in_intents.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/172k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/328 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Training LSTM-Based Seq2Seq Model
Epoch 1, Train Loss: 1.9176029037026798, Train Accuracy: 0.33969465648854963, Train Precision: 0.34345373826937275, Train Recall: 0.33969465648854963, Train F1: 0.2791094305367977
Val Loss: 1.4053269624710083, Val Accuracy: 0.4393939393939394, Val Precision: 0.2911649334063127, Val Recall: 0.4393939393939394, Val F1: 0.34275426859158914
Epoch 2, Train Loss: 1.227330628563376, Train Accuracy: 0.5725190839694656, Train Precision: 0.514892144857915, Train Recall: 0.5725190839694656, Train F1: 0.5252115099428883
Val Loss: 0.8460886180400848, Val Accuracy: 0.6515151515151515, Val Precision: 0.6032608695652173, Val Recall: 0.6515151515151515, Val F1: 0.6055417814508723
Epoch 3, Train Loss: 0.7634110906544853, Train Accuracy: 0.732824427480916, Train Precision: 0.7291737627685169, Train Recall: 0.732824427480916, Train F1: 0.6911036639525605
Val Loss: 0.6511127293109894, Val Accuracy: 0.7121212121212122, Val Precision: 0.7651515151515151, Val Recall: 0.712121

**Baseline for MultiWOZ Dataset**

In [None]:


# Load the MultiWOZ dataset from the Hugging Face Hub; the "train",
# "validation" and "test" splits are all used below.
# NOTE(review): the captured cell output shows an interactive
# "run the custom code? [y/N]" prompt for this call and suggests passing
# `trust_remote_code=True` to avoid it.
multiwoz_dataset = load_dataset("multi_woz_v22")

# Define the rule-based baseline model
def rule_based_response(input_text):
    """Return the canned reply for the first rule keyword found in the text.

    Matching is a case-insensitive substring test, tried in the order the
    rules are listed; a fallback apology is returned when nothing matches.
    """
    keyword_replies = (
        ("weather", "It looks like it will be sunny."),
        ("location", "Your current location is being shared."),
        ("restaurant", "Per Se is quite crowded right now."),
        ("itinerary", "Here is the fastest itinerary for your trip."),
        ("photos", "Here are some photos of Mondrian Soho."),
    )
    lowered = input_text.lower()
    return next(
        (reply for keyword, reply in keyword_replies if keyword in lowered),
        "I'm sorry, I don't understand your request.",
    )

# Map each canned rule-based response string to the integer class id that is
# compared against the gold labels during evaluation.
# NOTE(review): these ids (0-5) are assigned by hand, while the gold labels in
# this cell are indices from `act2idx`; nothing here relates the two id
# spaces. Confirm the intended correspondence before reading the rule-based
# metrics as meaningful.
rule_based_mapping = {
    "It looks like it will be sunny.": 0,
    "Your current location is being shared.": 1,
    "Per Se is quite crowded right now.": 2,
    "Here is the fastest itinerary for your trip.": 3,
    "Here are some photos of Mondrian Soho.": 4,
    "I'm sorry, I don't understand your request.": 5
}

# Define the Feedforward Neural Network Model
class FeedforwardNN(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the logits produced for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Prepare data (example with tokenized input)
class SimpleDataset(Dataset):
    """Flatten dialogues into ``(utterance, first-dialogue-act)`` examples.

    Every turn whose first annotated act type is present in ``act2idx``
    becomes one example; turns without a known act are dropped.

    Args:
        dataset: Iterable of dialogues. Each dialogue has a ``"turns"`` dict
            with an ``"utterance"`` list and, optionally, a parallel
            ``"dialogue_acts"`` list.
        tokenizer: Tokenizer called with ``return_tensors="pt"``; its result
            must provide ``"input_ids"`` and ``"attention_mask"``.
        act2idx: Mapping from act-type string to integer label.
        max_len: Length every utterance is padded or truncated to.
    """

    def __init__(self, dataset, tokenizer, act2idx, max_len=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.act2idx = act2idx
        self.max_len = max_len
        self.data = self._prepare_data()

    def _prepare_data(self):
        """Return the list of ``{"utterance", "label"}`` examples."""
        data = []
        print("Starting to prepare data...")
        # The loop advances once per dialogue, so let tqdm take its total
        # from the dataset itself. Passing the number of *turns* as the total
        # made the bar stall at a few percent, and counting the turns cost an
        # extra full pass over the dataset.
        for dialogue in tqdm(self.dataset, desc="Preparing data"):
            turns = dialogue["turns"]
            for turn_id, utterance in enumerate(turns["utterance"]):
                label = self._first_act_label(turns, turn_id)
                if label != -1:
                    data.append({"utterance": utterance, "label": label})
        print("Data preparation completed.")
        return data

    def _first_act_label(self, turns, turn_id):
        """Return the label of the turn's first act type, or -1 if unavailable."""
        if "dialogue_acts" not in turns or len(turns["dialogue_acts"]) <= turn_id:
            return -1
        turn_dialogue_acts = turns["dialogue_acts"][turn_id]["dialog_act"]
        if "act_type" not in turn_dialogue_acts:
            return -1
        act_types = turn_dialogue_acts["act_type"]
        if len(act_types) == 0:
            return -1
        return self.act2idx.get(act_types[0], -1)

    def __len__(self):
        """Return the number of retained examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"input_ids", "attention_mask", "label", "text"}`` for one example."""
        data = self.data[index]
        input_text = data["utterance"]
        label = data["label"]

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension; a bare squeeze() would also
        # remove the sequence dimension when max_len == 1.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.long),
            "text": input_text
        }

def collate_fn(batch):
    """Collate tokenized examples into one zero-padded batch dict.

    ``input_ids`` and ``attention_mask`` are right-padded with 0 to the
    longest sequence in the batch (batch dimension first); labels become a
    single tensor and texts a plain list.
    """
    def padded(field):
        # Right-pad every example's tensor for this field with zeros.
        return pad_sequence([example[field] for example in batch], batch_first=True, padding_value=0)

    token_ids = padded('input_ids')
    masks = padded('attention_mask')
    label_tensor = torch.tensor([example['label'] for example in batch])
    raw_texts = [example['text'] for example in batch]
    return {
        'input_ids': token_ids,
        'attention_mask': masks,
        'label': label_tensor,
        'text': raw_texts,
    }

# Build the label vocabulary: collect every dialogue-act type that occurs in
# the training split.
all_dialogue_acts = set()
for dialogue in multiwoz_dataset["train"]:
    if "turns" in dialogue:
        for turn_id in range(len(dialogue["turns"]["utterance"])):
            if "dialogue_acts" in dialogue["turns"] and len(dialogue["turns"]["dialogue_acts"]) > turn_id:
                turn_dialogue_acts = dialogue["turns"]["dialogue_acts"][turn_id]["dialog_act"]
                if "act_type" in turn_dialogue_acts and len(turn_dialogue_acts["act_type"]) > 0:
                    for act_type in turn_dialogue_acts["act_type"]:
                        all_dialogue_acts.add(act_type)
# Assign ids in sorted order. Iterating the raw set would give an order that
# depends on string hashing and can differ between interpreter runs, so label
# ids (and any saved model's output layer) would not be reproducible.
act2idx = {act: idx for idx, act in enumerate(sorted(all_dialogue_acts))}

# Tokenizer and encoder used for the MultiWOZ baseline. Note that this
# rebinds the module-level `bert_model` name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Build the full flattened datasets for each split (each constructor walks
# its whole split), then subset them below for quick testing.
train_dataset_full = SimpleDataset(multiwoz_dataset["train"], tokenizer, act2idx)
val_dataset_full = SimpleDataset(multiwoz_dataset["validation"], tokenizer, act2idx)
test_dataset_full = SimpleDataset(multiwoz_dataset["test"], tokenizer, act2idx)

# Keep only the first N examples of each split (not a random sample); this
# assumes every split yields at least that many labelled turns.
train_subset = Subset(train_dataset_full, range(500))  # Increased subset size
val_subset = Subset(val_dataset_full, range(200))      # Increased subset size
test_subset = Subset(test_dataset_full, range(200))    # Increased subset size

# Only the training loader is shuffled; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_subset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_subset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_subset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Early stopping class
class EarlyStopping:
    """Stop training when validation loss stops improving, keeping the best weights.

    Each call compares the new validation loss with the best one seen so far.
    A loss that is equal or lower resets the counter and snapshots the model;
    a higher loss increments the counter, and ``early_stop`` is set once the
    counter reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: When true, print a message on every counter increment and
            every checkpoint.

    Attributes:
        best_loss: Lowest validation loss seen so far (``inf`` initially).
        best_model: Deep copy of the model ``state_dict`` at ``best_loss``,
            or ``None`` until the first call.
    """

    def __init__(self, patience=5, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # float("inf") instead of np.Inf: that alias was removed in NumPy 2.0.
        self.best_loss = float("inf")
        # Defined up front so reading it before the first call yields None
        # rather than raising AttributeError.
        self.best_model = None

    def __call__(self, val_loss, model):
        """Record ``val_loss`` and checkpoint ``model`` if it did not get worse."""
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.best_loss:.6f} --> {val_loss:.6f}).  Saving model ...')
        self.best_loss = val_loss
        # Deep copy so later optimizer steps cannot alter the snapshot.
        self.best_model = copy.deepcopy(model.state_dict())

# Define training loop
def train_nn_model(model, data_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``data_loader`` on frozen BERT features.

    Each batch must provide ``'input_ids'``, ``'attention_mask'`` and
    ``'label'`` tensors. The module-level ``bert_model`` encodes the batch
    without gradients, and the hidden state at position 0 of every sequence
    is fed to ``model``.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` where ``avg_loss`` is
        the mean of the per-batch losses and precision/recall/F1 are
        weighted averages.
    """
    model.train()
    running_loss = 0
    gold, predicted = [], []

    for batch in data_loader:
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        # BERT is a fixed feature extractor here: no gradients flow into it.
        with torch.no_grad():
            encoded = bert_model(input_ids=token_ids, attention_mask=mask)
            features = encoded.last_hidden_state[:, 0, :]

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

        gold.extend(targets.cpu().numpy())
        predicted.extend(logits.argmax(dim=1).cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return mean_loss, accuracy, precision, recall, f1

# Define evaluation function
def evaluate_nn_model(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` using frozen BERT features.

    Each batch must provide ``'input_ids'``, ``'attention_mask'`` and
    ``'label'`` tensors. Nothing is updated; the whole pass runs without
    gradient tracking.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` where ``avg_loss`` is
        the mean of the per-batch losses and precision/recall/F1 are
        weighted averages.
    """
    model.eval()
    running_loss = 0
    gold, predicted = [], []

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            # Hidden state at position 0 of every sequence is the feature vector.
            encoded = bert_model(input_ids=token_ids, attention_mask=mask)
            features = encoded.last_hidden_state[:, 0, :]

            logits = model(features)
            running_loss += criterion(logits, targets).item()

            gold.extend(targets.cpu().numpy())
            predicted.extend(logits.argmax(dim=1).cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return mean_loss, accuracy, precision, recall, f1

# Training example: a feedforward classifier on top of frozen BERT features.
input_dim = 768  # BERT embedding size
hidden_dim = 256  # Reduced hidden size for the feedforward network
output_dim = len(act2idx)  # Number of dialogue acts
nn_model = FeedforwardNN(input_dim, hidden_dim, output_dim)

# Only the feedforward network's parameters are optimized; BERT stays fixed.
optimizer = optim.Adam(nn_model.parameters(), lr=5e-4)  # Adjusted learning rate
criterion = nn.CrossEntropyLoss()

# Unlike the SNIPS cell, this cell uses a GPU when one is available. Both
# models must live on the same device as the batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_model = nn_model.to(device)
bert_model = bert_model.to(device)
epochs = 20
patience = 3  # Early stopping patience
early_stopping = EarlyStopping(patience=patience, verbose=True)

# Train for up to `epochs` passes; stop once validation loss has been worse
# than the best seen for `patience` epochs in a row.
for epoch in range(epochs):
    train_loss, train_acc, train_precision, train_recall, train_f1 = train_nn_model(nn_model, train_loader, optimizer, criterion, device)
    val_loss, val_acc, val_precision, val_recall, val_f1 = evaluate_nn_model(nn_model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train F1: {train_f1:.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1: {val_f1:.4f}')

    # Also snapshots the weights whenever validation loss does not get worse.
    early_stopping(val_loss, nn_model)

    if early_stopping.early_stop:
        print("Early stopping")
        break

# Restore the weights from the best validation epoch before testing.
nn_model.load_state_dict(early_stopping.best_model)

# Final evaluation on test set
test_loss, test_acc, test_precision, test_recall, test_f1 = evaluate_nn_model(nn_model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')

# Evaluate rule-based model
def evaluate_rule_based_model(data_loader):
    """Score the keyword baseline against the labels in ``data_loader``.

    Each batch must provide ``"text"`` (an iterable of strings) and
    ``"label"`` (a tensor). Responses are converted to class ids through
    ``rule_based_mapping``; unmapped responses fall back to id 5.

    Returns:
        ``(accuracy, precision, recall, f1)`` with weighted-average
        precision, recall and F1.
    """
    predicted, gold, seen_texts = [], [], []

    for batch in data_loader:
        batch_texts = batch["text"]
        batch_gold = batch["label"].detach().cpu().numpy().flatten()

        # Convert each canned response to its numeric class id.
        batch_predicted = [
            rule_based_mapping.get(rule_based_response(utterance), 5)
            for utterance in batch_texts
        ]

        predicted += batch_predicted
        gold.extend(batch_gold)
        seen_texts.extend(batch_texts)

    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    return accuracy, precision, recall, f1

# Score the keyword baseline on the same test subset used for the neural model.
# The tuple is (accuracy, precision, recall, f1).
rule_based_results = evaluate_rule_based_model(test_loader)
print(f"Rule-based Results: Accuracy={rule_based_results[0]}, Precision={rule_based_results[1]}, Recall={rule_based_results[2]}, F1={rule_based_results[3]}")


Downloading builder script:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

The repository for multi_woz_v22 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/multi_woz_v22.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/8437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Starting to prepare data...


Preparing data:   7%|▋         | 8437/113552 [00:18<03:44, 468.18it/s]


Data preparation completed.
Starting to prepare data...


Preparing data:   7%|▋         | 1000/14748 [00:01<00:24, 550.74it/s]


Data preparation completed.
Starting to prepare data...


Preparing data:   7%|▋         | 1000/14744 [00:01<00:25, 535.97it/s]


Data preparation completed.
Epoch 1, Train Loss: 3.0304, Train Accuracy: 0.1780, Train Precision: 0.0809, Train Recall: 0.1780, Train F1: 0.0854
Validation Loss: 2.5681, Validation Accuracy: 0.2550, Validation Precision: 0.1679, Validation Recall: 0.2550, Validation F1: 0.1461
Validation loss decreased (inf --> 2.568148).  Saving model ...
Epoch 2, Train Loss: 2.4682, Train Accuracy: 0.3360, Train Precision: 0.3923, Train Recall: 0.3360, Train F1: 0.2172
Validation Loss: 2.2877, Validation Accuracy: 0.4150, Validation Precision: 0.3077, Validation Recall: 0.4150, Validation F1: 0.2892
Validation loss decreased (2.568148 --> 2.287699).  Saving model ...
Epoch 3, Train Loss: 2.1636, Train Accuracy: 0.4380, Train Precision: 0.3855, Train Recall: 0.4380, Train F1: 0.3399
Validation Loss: 2.0506, Validation Accuracy: 0.4850, Validation Precision: 0.3192, Validation Recall: 0.4850, Validation F1: 0.3785
Validation loss decreased (2.287699 --> 2.050615).  Saving model ...


**Baseline for CoNLL-2012 Dataset**

In [None]:

# Load the CoNLL-2012 dataset ("english_v4" configuration) from the Hugging
# Face Hub; the "train", "validation" and "test" splits are all used below.
conll_dataset = load_dataset("conll2012_ontonotesv5", "english_v4")

class RuleBasedCoreferenceResolver:
    """Heuristic baseline that replaces pronouns with a preceding capitalized word.

    A token counts as a pronoun if its lower-cased form is in a small fixed
    set, and as a "noun" if its first character is upper-case. Each pronoun
    is replaced by the closest such noun that comes before it in the same
    sentence; if there is none, the pronoun is left unchanged.
    """

    _PRONOUNS = frozenset({"he", "she", "it", "they", "him", "her", "them"})

    def resolve(self, sentences):
        """Return one resolved token list per sentence.

        Args:
            sentences: Iterable of dicts, each with a ``'words'`` list.
        """
        resolved = []
        for sentence in sentences:
            words = sentence['words']
            resolved.append([
                self.find_nearest_antecedent(word, words, position)
                if self.is_pronoun(word) else word
                for position, word in enumerate(words)
            ])
        return resolved

    def is_pronoun(self, word):
        """Return True if ``word`` (case-insensitively) is a known pronoun."""
        return word.lower() in self._PRONOUNS

    def find_nearest_antecedent(self, pronoun, words, position=None):
        """Return the closest capitalized word before ``position``.

        Args:
            pronoun: The pronoun token; returned unchanged if nothing matches.
            words: Tokens of the sentence containing the pronoun.
            position: Index of the pronoun in ``words``. Only tokens before
                this index are considered, because an antecedent precedes
                the pronoun it refers to. When omitted, the whole sentence
                is scanned from the end (the previous behaviour).
        """
        end = len(words) if position is None else position
        for candidate in reversed(words[:end]):
            if self.is_noun(candidate):
                return candidate
        return pronoun

    def is_noun(self, word):
        """Return True if ``word`` is non-empty and starts with an upper-case letter."""
        # The emptiness check avoids an IndexError on empty tokens.
        return bool(word) and word[0].isupper()

    def evaluate(self, dataset):
        """Score token-level predictions against coreference-span membership.

        A token is predicted positive when ``resolve`` replaced it, and is
        labelled positive when it lies inside any span in the sentence's
        ``'coref_spans'``.

        Args:
            dataset: Iterable of items with a ``'sentences'`` list; each
                sentence has ``'words'`` and ``'coref_spans'``.

        Returns:
            ``(accuracy, precision, recall, f1)`` with weighted-average
            precision, recall and F1.
        """
        all_labels = []
        all_preds = []
        for item in dataset:
            sentences = item['sentences']
            for sentence in sentences:
                words = sentence['words']
                coref_spans = sentence['coref_spans']
                labels = self.get_labels(len(words), coref_spans)
                preds = self.resolve([{'words': words}])
                all_labels.extend(labels)
                all_preds.extend([1 if p != w else 0 for p, w in zip(preds[0], words)])
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        return accuracy, precision, recall, f1

    def get_labels(self, length, coref_spans):
        """Return a 0/1 list of ``length`` marking tokens covered by any span.

        Each span is indexed as ``span[1]`` (start) and ``span[2]``
        (inclusive end); ``span[0]`` is not used here.
        """
        labels = [0] * length
        for span in coref_spans:
            for i in range(span[1], span[2] + 1):
                labels[i] = 1
        return labels

# Initialize the resolver (it holds no state, so one instance serves all splits).
rule_based_resolver = RuleBasedCoreferenceResolver()

# Evaluate the resolver on train, validation, and test sets. Nothing is
# fitted, so the "train" numbers are simply the heuristic's scores on that
# split. Each call walks the whole split.
train_accuracy, train_precision, train_recall, train_f1 = rule_based_resolver.evaluate(conll_dataset['train'])
val_accuracy, val_precision, val_recall, val_f1 = rule_based_resolver.evaluate(conll_dataset['validation'])
test_accuracy, test_precision, test_recall, test_f1 = rule_based_resolver.evaluate(conll_dataset['test'])

# Report the metrics for each split to four decimal places.
print(f"Rule-Based System - Train Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}")
print(f"Rule-Based System - Validation Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")
print(f"Rule-Based System - Test Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")


# Mention-Pair Model using Logistic Regression

import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import vstack
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

class MentionPairDataset(Dataset):
    """Dataset of within-sentence word pairs labelled by shared span membership.

    Every entry is a ``(mention1, mention2, label)`` tuple built from two word
    positions ``i < j`` of one sentence. ``label`` is 1 when both positions
    fall inside one and the same coreference span, 0 otherwise.
    """

    def __init__(self, dataset):
        self.data = self._prepare_data(dataset)

    def _prepare_data(self, dataset):
        """Enumerate all word pairs of every sentence in ``dataset``.

        Args:
            dataset: Iterable of items exposing ``item['sentences']``; each
                sentence provides ``'words'`` and optionally ``'coref_spans'``
                (triples whose last two elements are start and end indices).

        Returns:
            List of ``(mention1, mention2, label)`` tuples.
        """
        pairs = []
        for document in dataset:
            for sentence in document['sentences']:
                tokens = sentence['words']
                spans = sentence.get('coref_spans', [])
                for left, first_word in enumerate(tokens):
                    for right in range(left + 1, len(tokens)):
                        # left < right, so both indices lie inside a span
                        # exactly when the span starts at or before `left`
                        # and ends at or after `right`.
                        same_span = any(
                            start <= left and right <= end
                            for _, start, end in spans
                        )
                        pairs.append((first_word, tokens[right], 1 if same_span else 0))
        return pairs

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Delegates to the underlying list, so slices work as well as ints.
        return self.data[index]

class MentionPairModel:
    """Logistic-regression classifier over simple surface features of a mention pair.

    Datasets passed to the methods must support ``len()``, slicing and
    iteration, and yield ``(mention1, mention2, label)`` tuples.
    """

    def __init__(self):
        self.vectorizer = DictVectorizer(sparse=True)
        self.model = LogisticRegression(solver='liblinear')

    def extract_features(self, mention1, mention2):
        """Return the feature dict for one mention pair.

        Args:
            mention1: Text of the first mention.
            mention2: Text of the second mention.

        Returns:
            Dict with an exact-string-match flag, the absolute difference of
            the two string lengths, and a NOUN/PRON tag per mention derived
            from the capitalisation of its first character. Empty mentions are
            tagged ``'PRON'`` instead of raising IndexError.
        """
        return {
            'string_match': int(mention1 == mention2),
            'mention_distance': abs(len(mention1) - len(mention2)),
            # Slicing keeps the capitalisation test safe for empty strings.
            'mention1_type': 'NOUN' if mention1[:1].isupper() else 'PRON',
            'mention2_type': 'NOUN' if mention2[:1].isupper() else 'PRON',
        }

    def _iter_batches(self, dataset, batch_size):
        """Yield consecutive slices of ``dataset`` holding up to ``batch_size`` items."""
        for start in range(0, len(dataset), batch_size):
            yield dataset[start:start + batch_size]

    def fit(self, dataset, batch_size=500):
        """Fit the vectorizer and the classifier once on the whole dataset.

        ``LogisticRegression.fit`` and ``DictVectorizer.fit_transform`` both
        restart from scratch on every call, so fitting them batch by batch
        would keep only what the last batch taught them. The dataset is
        therefore read in slices of ``batch_size`` (to bound the size of each
        slice) but the features are streamed into a single sparse matrix and
        the model is trained exactly once.

        Args:
            dataset: Training triples ``(mention1, mention2, label)``.
            batch_size: Number of items read from ``dataset`` per slice.

        Raises:
            ValueError: If ``dataset`` contains fewer than two distinct
                labels (this includes an empty dataset), because a binary
                classifier cannot be trained on it.
        """
        labels = [
            label
            for batch in self._iter_batches(dataset, batch_size)
            for _, _, label in batch
        ]
        if len(set(labels)) < 2:
            raise ValueError(
                "MentionPairModel.fit needs at least two distinct labels in the dataset"
            )

        # Generator: feature dicts are consumed one by one by the vectorizer,
        # so they are never all held in memory as Python dicts.
        feature_stream = (
            self.extract_features(m1, m2)
            for batch in self._iter_batches(dataset, batch_size)
            for m1, m2, _ in batch
        )
        feature_matrix = self.vectorizer.fit_transform(feature_stream)
        self.model.fit(feature_matrix, labels)

    def predict(self, dataset, batch_size=500):
        """Predict a label for every pair in ``dataset``.

        Args:
            dataset: Triples ``(mention1, mention2, label)``; the label is ignored.
            batch_size: Number of items vectorized and classified per step.

        Returns:
            List with one predicted label per item, in dataset order.
        """
        all_preds = []
        for batch in self._iter_batches(dataset, batch_size):
            feature_dicts = [self.extract_features(m1, m2) for m1, m2, _ in batch]
            feature_matrix = self.vectorizer.transform(feature_dicts)
            all_preds.extend(self.model.predict(feature_matrix))
        return all_preds

    def evaluate(self, dataset, batch_size=500):
        """Compare predictions with the labels stored in ``dataset``.

        Returns:
            Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
            F1 are computed with ``average='weighted'``.
        """
        all_labels = [label for _, _, label in dataset]
        all_preds = self.predict(dataset, batch_size=batch_size)
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        return accuracy, precision, recall, f1

# Load and process datasets.
# Each MentionPairDataset eagerly builds every within-sentence word pair of
# its split and keeps them in memory.
train_dataset = MentionPairDataset(conll_dataset['train'])
val_dataset = MentionPairDataset(conll_dataset['validation'])
test_dataset = MentionPairDataset(conll_dataset['test'])

# Initialize and train the model
mention_pair_model = MentionPairModel()
mention_pair_model.fit(train_dataset)

# Evaluate the model; each call returns (accuracy, precision, recall, f1).
# These assignments overwrite metric variables of the same name set earlier.
train_accuracy, train_precision, train_recall, train_f1 = mention_pair_model.evaluate(train_dataset)
val_accuracy, val_precision, val_recall, val_f1 = mention_pair_model.evaluate(val_dataset)
test_accuracy, test_precision, test_recall, test_f1 = mention_pair_model.evaluate(test_dataset)

# Report all four metrics per split, rounded to four decimals.
print(f"Mention-Pair Model - Train Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}")
print(f"Mention-Pair Model - Validation Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")
print(f"Mention-Pair Model - Test Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")


#Fine-Tuned BERT Model
class CoreferenceDataset(Dataset):
    """Token-classification view of a coreference corpus.

    Each document is flattened into one word sequence; a word is labelled 1
    when it lies inside any coreference span and 0 otherwise. The word labels
    are then expanded to the tokenizer's output so that every sub-word token
    carries the label of the word it was split from. (Assigning word labels
    to token positions one-to-one would shift every label, because the
    tokenizer inserts special tokens and may split words into several pieces.)

    Args:
        tokenizer: Hugging Face tokenizer. It is called with
            ``is_split_into_words=True`` and ``return_special_tokens_mask=True``
            and its ``tokenize`` method is used to count the pieces per word.
        dataset: Indexable collection of documents exposing ``item['sentences']``.
        max_len: Fixed token length every example is padded/truncated to.
        special_label: Label written at special-token and padding positions.
            Defaults to 0, the label padding positions received before.
    """

    def __init__(self, tokenizer, dataset, max_len=128, special_label=0):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_len = max_len
        self.special_label = special_label

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _collect_words_and_labels(item):
        """Flatten a document into ``(words, word_labels)`` lists of equal length."""
        words, labels = [], []
        for sentence in item['sentences']:
            sentence_words = sentence['words']
            sentence_labels = [0] * len(sentence_words)
            for span in sentence.get('coref_spans', []):
                # span[1] / span[2] are the first and last covered word index.
                for position in range(span[1], span[2] + 1):
                    sentence_labels[position] = 1
            words.extend(sentence_words)
            labels.extend(sentence_labels)
        return words, labels

    @staticmethod
    def _align_labels(word_labels, pieces_per_word, special_mask, special_label=0):
        """Expand word labels to token positions.

        Args:
            word_labels: One label per word.
            pieces_per_word: Number of tokens each word was split into (may be
                shorter than ``word_labels`` when trailing words are truncated).
            special_mask: One flag per token position, truthy for special and
                padding tokens.
            special_label: Label used for flagged positions and for any
                position left over once the word pieces are exhausted.

        Returns:
            List with one label per entry of ``special_mask``.
        """
        piece_labels = (
            label
            for label, n_pieces in zip(word_labels, pieces_per_word)
            for _ in range(n_pieces)
        )
        return [
            special_label if is_special else next(piece_labels, special_label)
            for is_special in special_mask
        ]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and aligned ``labels`` for document ``idx``.

        All three tensors are one-dimensional with ``max_len`` entries.
        """
        words, word_labels = self._collect_words_and_labels(self.dataset[idx])

        inputs = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_special_tokens_mask=True,
            return_tensors='pt',
        )

        # Count the pieces of each word, stopping once max_len pieces are
        # covered: anything beyond that is truncated by the tokenizer anyway.
        # NOTE(review): assumes the tokenizer splits pre-tokenized input word
        # by word, as BERT's WordPiece tokenizer does - confirm when swapping
        # in a different tokenizer.
        pieces_per_word = []
        remaining = self.max_len
        for word in words:
            if remaining <= 0:
                break
            n_pieces = len(self.tokenizer.tokenize(word))
            pieces_per_word.append(n_pieces)
            remaining -= n_pieces

        special_mask = inputs['special_tokens_mask'].squeeze(0).tolist()
        labels = torch.tensor(
            self._align_labels(word_labels, pieces_per_word, special_mask, self.special_label),
            dtype=torch.long,
        )

        # squeeze(0) drops only the batch dimension added by return_tensors='pt'.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }

def custom_collate_fn(batch):
    """Merge per-example tensors into batch tensors padded to a common length.

    Args:
        batch: List of dicts holding 1-D tensors under ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.

    Returns:
        Dict with the same three keys, each a batch-first tensor. Shorter
        sequences are right-padded with 0 for ``input_ids`` and
        ``attention_mask`` and with -100 for ``labels``.
    """
    fill_values = (('input_ids', 0), ('attention_mask', 0), ('labels', -100))
    return {
        key: pad_sequence(
            [example[key] for example in batch],
            batch_first=True,
            padding_value=fill,
        )
        for key, fill in fill_values
    }

# Initialize tokenizer and model (binary token classification head)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prepare datasets (these rebind train_dataset / val_dataset / test_dataset)
train_dataset = CoreferenceDataset(tokenizer, conll_dataset['train'])
val_dataset = CoreferenceDataset(tokenizer, conll_dataset['validation'])
test_dataset = CoreferenceDataset(tokenizer, conll_dataset['test'])

# DataLoader: only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=custom_collate_fn)

# Optimizer and scheduler
# AdamW is imported from torch.optim at the top of this file; that class has
# no `correct_bias` parameter, so passing it raises TypeError.
optimizer = AdamW(model.parameters(), lr=2e-5)
# The linear schedule reaches a learning rate of 0 after `total_steps`
# optimizer steps, so it has to span the full epoch budget of the training
# loop (10 epochs); sizing it for fewer epochs would leave the remaining
# epochs training with a zero learning rate.
num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Device: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader`` and report metrics.

    Args:
        model: Token-classification model whose forward pass accepts
            ``input_ids``, ``attention_mask`` and ``labels`` and returns an
            object with ``loss`` and ``logits``.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. The metrics are
        computed over all positions whose label is not -100; precision, recall
        and F1 use ``average='weighted'``.
    """
    model.train()
    losses = []
    all_labels = []
    all_preds = []

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(logits, dim=2)
        flat_labels = labels.view(-1)
        flat_preds = preds.view(-1)

        # Build the mask once, from the unfiltered labels, and apply the same
        # mask to both tensors. Deriving the prediction mask from the already
        # filtered labels gives a mask of the wrong length as soon as any
        # label is -100.
        keep = flat_labels != -100

        all_labels.extend(flat_labels[keep].cpu().numpy())
        all_preds.extend(flat_preds[keep].cpu().numpy())
        losses.append(loss.item())

    avg_loss = np.mean(losses)
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    return avg_loss, accuracy, precision, recall, f1

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Token-classification model whose forward pass accepts
            ``input_ids``, ``attention_mask`` and ``labels`` and returns an
            object with ``loss`` and ``logits``.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. The metrics are
        computed over all positions whose label is not -100; precision, recall
        and F1 use ``average='weighted'``.
    """
    model.eval()
    losses = []
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            preds = torch.argmax(logits, dim=2)
            flat_labels = labels.view(-1)
            flat_preds = preds.view(-1)

            # Build the mask once, from the unfiltered labels, and apply the
            # same mask to both tensors. Deriving the prediction mask from the
            # already filtered labels gives a mask of the wrong length as soon
            # as any label is -100.
            keep = flat_labels != -100

            all_labels.extend(flat_labels[keep].cpu().numpy())
            all_preds.extend(flat_preds[keep].cpu().numpy())
            losses.append(loss.item())

    avg_loss = np.mean(losses)
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    return avg_loss, accuracy, precision, recall, f1

# Custom training loop with early stopping
num_epochs = 10  # Upper bound on epochs; early stopping may end training sooner
patience = 2  # Number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')  # Lowest validation loss seen so far
epochs_no_improve = 0  # Consecutive epochs without a new best validation loss

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    # One optimisation pass over the training split
    train_loss, train_accuracy, train_precision, train_recall, train_f1 = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train F1: {train_f1:.4f}')

    # Validation metrics after this epoch; val_loss drives early stopping
    val_loss, val_accuracy, val_precision, val_recall, val_f1 = eval_model(model, val_loader, device)
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}')

    # Check early stopping criteria: only a strictly lower validation loss
    # counts as an improvement and resets the counter.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered")
            break

# Evaluate the model on the test set after training
# NOTE(review): no checkpoint of the best-validation epoch is kept, so this
# evaluates the weights from the last epoch that ran - confirm this is intended.
test_loss, test_accuracy, test_precision, test_recall, test_f1 = eval_model(model, test_loader, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


