# Environment Setup:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
datadir = "/content/drive/MyDrive/Proxytuner/"
if not os.path.exists(datadir):
  !ln -s "" $datadir
os.chdir(datadir)
!pwd

/content/drive/MyDrive/Proxytuner


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Model Setup and Data Preparation:

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
accuracy_scores = []
f1_scores = []

In [None]:
class NERDataset(Dataset):
    def __init__(self, file_path, tokenizer, labels_to_id):
        self.tokenizer = tokenizer
        self.labels_to_id = labels_to_id
        self.texts = []
        self.labels = []

        # Read data
        with open(file_path, 'r', encoding='utf-8') as f:
            tokens, label_ids = [], []
            for line in f:
                line = line.strip()
                if line == "":
                    # end of an example; process and reset for the next example
                    if tokens:
                        self.texts.append(tokens)
                        self.labels.append(label_ids)
                        tokens, label_ids = [], []
                else:
                    token, label = line.split('\t')
                    tokens.append(token)
                    label_ids.append(self.labels_to_id[label])
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenization
        encoding = self.tokenizer(
            self.texts[idx],
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors="pt"
        )

        # Convert label IDs to tensor and handle padding
        labels = torch.LongTensor(self.labels[idx])
        labels_padded = torch.ones(128, dtype=torch.long) * -100  # Padding index for labels
        labels_padded[:len(labels)] = labels

        # Set up the dictionary to return
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = labels_padded
        return item


In [None]:
def create_dataset(file_path, tokenizer, label_list):
    # Map labels to IDs
    labels_to_id = {label: idx for idx, label in enumerate(label_list)}

    # Create the dataset
    dataset = NERDataset(file_path, tokenizer, labels_to_id)
    return dataset

In [None]:
class SmallExpertModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size):
        super(SmallExpertModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        # Apply the classifier to every time step
        logits = self.classifier(lstm_out)
        return logits


In [None]:
def train_small_model(file_path, tokenizer, label_list, epochs=50, learning_rate=1e-3):
    dataset = create_dataset(file_path, tokenizer, label_list)
    train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

    # Initialize the small expert model
    embedding_dim = 768  # Dimensionality of the embedding layer
    hidden_dim = 256
    output_dim = len(label_list)
    vocab_size = tokenizer.vocab_size  # Vocabulary size for the embedding layer

    expert_model = SmallExpertModel(embedding_dim, hidden_dim, output_dim, vocab_size).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(expert_model.parameters(), lr=learning_rate)

    # Training loop
    expert_model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Ensure input_ids are converted to float if using embeddings directly
            outputs = expert_model(input_ids)

            # Reshape logits and labels to comply with the expected format for nn.CrossEntropyLoss
            logits = outputs.view(-1, output_dim)  # Reshape logits to [batch_size * sequence_length, num_labels]
            labels = labels.view(-1)  # Flatten labels to [batch_size * sequence_length]
            # print("Logits shape:", logits.shape)
            # print("Labels shape:", labels.shape)
            # Calculate the loss and backpropagate
            loss = criterion(logits.to(device), labels)
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

    # Initialize the anti-expert model with the same architecture but without training
    anti_expert_model = SmallExpertModel(embedding_dim, hidden_dim, output_dim, vocab_size).to(device)
    return expert_model, anti_expert_model


In [None]:
# # Read the contents of the text files
# def read_txt_file(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         lines = file.readlines()
#     return lines

# # File paths
# conll2003_file_path = "./ner_data/conll2003/train.txt"
# other_domain_file_path = "./ner_data/music/train.txt"
# combined_file_path = "./ner_data/combined_dataset/conll_music.txt"

# # Read the contents of the text files
# conll2003_lines = read_txt_file(conll2003_file_path)
# other_domain_lines = read_txt_file(other_domain_file_path)

# # Calculate repetition factor
# repetition_factor = len(conll2003_lines) // len(other_domain_lines)

# # Repeat the lines of the other domain dataset
# repeated_other_domain_lines = other_domain_lines * repetition_factor

# # Combine the lines
# combined_lines = conll2003_lines + repeated_other_domain_lines

# # Write the combined lines to a new text file
# with open(combined_file_path, 'w', encoding='utf-8') as combined_file:
#     combined_file.writelines(combined_lines)

# print(f"Combined dataset saved to {combined_file_path}")


In [None]:
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
from transformers import BertForTokenClassification
from sklearn.metrics import f1_score

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def adjust_bert_predictions(bert_logits, expert_logits, anti_expert_logits):
    # Calculate the expertise difference
    expertise_difference = expert_logits - anti_expert_logits

    # Adjust BERT's logits using the calculated difference
    adjusted_logits = bert_logits + expertise_difference
    return adjusted_logits

In [None]:
def make_adjusted_predictions_dataset(file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list):
    dataset = create_dataset(file_path, tokenizer, label_list)
    data_loader = DataLoader(dataset, batch_size=16, shuffle=False)  # No need to shuffle during testing

    bert_model.eval()
    expert_model.eval()
    anti_expert_model.eval()

    all_adjusted_logits = []
    all_base_logits = []
    all_expert_logits = []
    all_anti_expert_logits = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)

            # Get logits from BERT, Expert, and Anti-Expert models
            bert_outputs = bert_model(input_ids).logits
            expert_outputs = expert_model(input_ids)
            anti_expert_outputs = anti_expert_model(input_ids)

            # Adjust BERT's logits
            adjusted_logits = adjust_bert_predictions(bert_outputs, expert_outputs, anti_expert_outputs)
            all_adjusted_logits.append(adjusted_logits)
            all_base_logits.append(bert_outputs)
            all_expert_logits.append(expert_outputs)
            all_anti_expert_logits.append(anti_expert_outputs)


    # Concatenate all logits from all batches
    all_adjusted_logits = torch.cat(all_adjusted_logits, dim=0)
    all_base_logits = torch.cat(all_base_logits, dim=0)
    all_expert_logits = torch.cat(all_expert_logits, dim=0)
    all_anti_expert_logits = torch.cat(all_anti_expert_logits, dim=0)

    return all_adjusted_logits, all_base_logits, all_expert_logits, all_anti_expert_logits


In [None]:
def get_accuracy(adjusted_logits, file_path, tokenizer, label_list):
  probabilities = torch.softmax(adjusted_logits, dim=-1)
  predictions = torch.argmax(probabilities, dim=-1)
  true_labels = []
  dataset = create_dataset(file_path, tokenizer, label_list)
  data_loader = DataLoader(dataset, batch_size=16, shuffle=False)
  for batch in data_loader:
      batch_true_labels = batch['labels'].to(device)
      true_labels.append(batch_true_labels)

  true_labels = torch.cat(true_labels, dim=0)


  # Flatten predictions and true labels
  flattened_predictions = predictions.view(-1).cpu().numpy()
  flattened_true_labels = true_labels.view(-1).cpu().numpy()

  # Remove padding tokens from true labels and predictions
  mask = (flattened_true_labels != -100)
  flattened_true_labels = flattened_true_labels[mask]
  flattened_predictions = flattened_predictions[mask]

  # Compute accuracy
  accuracy = accuracy_score(flattened_true_labels, flattened_predictions)
  print(f"Accuracy: {accuracy:.4f}")

  f1 = f1_score(flattened_true_labels, flattened_predictions, average='weighted')
  print(f"F1 score: {f1:.4f}")
  return accuracy, f1


In [None]:
train_file_path = "./ner_data/music/test.txt"
test_file_path = "./ner_data/music/train.txt"
label_list = ['O', 'B-musicgenre', 'I-musicgenre', 'B-song', 'I-song', 'B-band', 'I-band',
              'B-album', 'I-album', 'B-musicalartist', 'I-musicalartist', 'B-musicalinstrument',
              'I-musicalinstrument', 'B-award', 'I-award', 'B-event', 'I-event', 'B-country',
              'I-country', 'B-location', 'I-location', 'B-organisation', 'I-organisation',
              'B-person', 'I-person', 'B-misc', 'I-misc']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)
# print("Results from Base logits")
# get_accuracy(base_logits, test_file_path, tokenizer, label_list)
# print("Results from Expert logits")
# get_accuracy(expert_logits, test_file_path, tokenizer, label_list)
# print("Results from Anti-Expert logits")
# get_accuracy(anti_expert_logits, test_file_path, tokenizer, label_list)




Epoch 1/50, Loss: 1.0707906484603882
Epoch 2/50, Loss: 1.175512433052063
Epoch 3/50, Loss: 0.9448374509811401
Epoch 4/50, Loss: 1.594366431236267
Epoch 5/50, Loss: 1.4827059507369995
Epoch 6/50, Loss: 1.8560222387313843
Epoch 7/50, Loss: 1.2364422082901
Epoch 8/50, Loss: 0.5915889143943787
Epoch 9/50, Loss: 0.8058557510375977
Epoch 10/50, Loss: 0.5236943960189819
Epoch 11/50, Loss: 0.3472554683685303
Epoch 12/50, Loss: 0.3753148913383484
Epoch 13/50, Loss: 0.34741467237472534
Epoch 14/50, Loss: 0.3465626835823059
Epoch 15/50, Loss: 0.7915300726890564
Epoch 16/50, Loss: 0.10508536547422409
Epoch 17/50, Loss: 0.166172057390213
Epoch 18/50, Loss: 0.33911100029945374
Epoch 19/50, Loss: 0.1255120486021042
Epoch 20/50, Loss: 0.2241009920835495
Epoch 21/50, Loss: 0.2792472541332245
Epoch 22/50, Loss: 0.11981160938739777
Epoch 23/50, Loss: 0.16572533547878265
Epoch 24/50, Loss: 0.06991113722324371
Epoch 25/50, Loss: 0.0634121522307396
Epoch 26/50, Loss: 0.2729297876358032
Epoch 27/50, Loss: 0.

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Results from Adjusted logits
Accuracy: 0.6007
F1 score: 0.5431


In [None]:
def train_Bert_model(file_path, tokenizer, label_list, epochs=50):
    # Load pre-trained BERT model
    bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)
    optimizer = optim.AdamW(bert_model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    # Create dataset
    dataset = create_dataset(file_path, tokenizer, label_list)
    train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

    # Training loop
    bert_model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = bert_model(input_ids)

            # Compute loss
            loss = criterion(outputs.logits.view(-1, len(label_list)), labels.view(-1))
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}')

    return bert_model

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def test_Bert_model(model, file_path, tokenizer, label_list):
    model.eval()
    dataset = create_dataset(file_path, tokenizer, label_list)
    test_loader = DataLoader(dataset, batch_size=16, shuffle=False)

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids)
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Collect predictions and labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Flatten labels and predictions
    all_labels = [label for sublist in all_labels for label in sublist]
    all_predictions = [pred for sublist in all_predictions for pred in sublist]

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Accuracy: {accuracy:.4f}")

    # Calculate F1 score
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    print(f"F1 score: {f1:.4f}")

    return accuracy, f1


In [None]:
train_file_path = "./ner_data/music/train.txt"
test_file_path = "./ner_data/music/test.txt"
label_list = ['O', 'B-musicgenre', 'I-musicgenre', 'B-song', 'I-song', 'B-band', 'I-band',
              'B-album', 'I-album', 'B-musicalartist', 'I-musicalartist', 'B-musicalinstrument',
              'I-musicalinstrument', 'B-award', 'I-award', 'B-event', 'I-event', 'B-country',
              'I-country', 'B-location', 'I-location', 'B-organisation', 'I-organisation',
              'B-person', 'I-person', 'B-misc', 'I-misc']

# Create the dataset
bert_model = train_Bert_model(train_file_path, tokenizer, label_list)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50, Loss: 3.6289497954504832
Epoch 2/50, Loss: 1.880358440535409
Epoch 3/50, Loss: 1.8859858683177404
Epoch 4/50, Loss: 1.8299418006624495
Epoch 5/50, Loss: 1.8447983946119035
Epoch 6/50, Loss: 1.8700364487511771
Epoch 7/50, Loss: 1.8698253801890783
Epoch 8/50, Loss: 1.8766675336020333
Epoch 9/50, Loss: 1.8452668019703455
Epoch 10/50, Loss: 1.860329576901027
Epoch 11/50, Loss: 1.8348181077412196
Epoch 12/50, Loss: 1.8587547370365687
Epoch 13/50, Loss: 1.8788625172206335
Epoch 14/50, Loss: 1.8694498368671961
Epoch 15/50, Loss: 1.83830041544778
Epoch 16/50, Loss: 1.8801157815115792
Epoch 17/50, Loss: 1.8323993001665388
Epoch 18/50, Loss: 1.8554250512804304
Epoch 19/50, Loss: 1.8914777040481567
Epoch 20/50, Loss: 1.8452007600239344
Epoch 21/50, Loss: 1.839153630392892
Epoch 22/50, Loss: 1.8614376102175032
Epoch 23/50, Loss: 1.8693219763892037
Epoch 24/50, Loss: 1.8436578171593803
Epoch 25/50, Loss: 1.8110975537981306
Epoch 26/50, Loss: 1.84382506779262
Epoch 27/50, Loss: 1.8910584

In [None]:
accuracy, f1 = test_Bert_model(bert_model, test_file_path, tokenizer, label_list)

Accuracy: 0.1981
F1 score: 0.0655


In [None]:
train_file_path = "./ner_data/ai/train.txt"
test_file_path = "./ner_data/ai/test.txt"
label_list = ["O", "B-field", "I-field", "B-task", "I-task", "B-product", "I-product", "B-algorithm", "I-algorithm", "B-researcher", "I-researcher", "B-metrics", "I-metrics", "B-programlang", "I-programlang", "B-conference", "I-conference", "B-university", "I-university", "B-country", "I-country", "B-person", "I-person", "B-organisation", "I-organisation", "B-location", "I-location", "B-misc", "I-misc"]


# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)

Epoch 1/50, Loss: 2.6757233142852783
Epoch 2/50, Loss: 1.7787096500396729
Epoch 3/50, Loss: 1.6894540786743164
Epoch 4/50, Loss: 1.4639577865600586
Epoch 5/50, Loss: 1.5837465524673462
Epoch 6/50, Loss: 1.4051146507263184
Epoch 7/50, Loss: 1.0360360145568848
Epoch 8/50, Loss: 1.4014614820480347
Epoch 9/50, Loss: 0.8052220940589905
Epoch 10/50, Loss: 0.9003109335899353
Epoch 11/50, Loss: 0.8782998323440552
Epoch 12/50, Loss: 0.5167825222015381
Epoch 13/50, Loss: 0.8584740161895752
Epoch 14/50, Loss: 0.7150780558586121
Epoch 15/50, Loss: 0.6742351651191711
Epoch 16/50, Loss: 0.5124561786651611
Epoch 17/50, Loss: 0.3024795949459076
Epoch 18/50, Loss: 0.33895745873451233
Epoch 19/50, Loss: 0.21179495751857758
Epoch 20/50, Loss: 0.37265923619270325
Epoch 21/50, Loss: 0.2082100361585617
Epoch 22/50, Loss: 0.14610619843006134
Epoch 23/50, Loss: 0.36366936564445496
Epoch 24/50, Loss: 0.18076945841312408
Epoch 25/50, Loss: 0.19847938418388367
Epoch 26/50, Loss: 0.17952807247638702
Epoch 27/50, 

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.6781
F1 score: 0.5902


In [None]:
train_file_path = "./ner_data/literature/train.txt"
test_file_path = "./ner_data/literature/test.txt"
label_list = ["O", "B-book", "I-book", "B-writer", "I-writer", "B-award", "I-award", "B-poem", "I-poem", "B-event", "I-event", "B-magazine", "I-magazine", "B-literarygenre", "I-literarygenre", 'B-country', 'I-country', "B-person", "I-person", "B-location", "I-location", 'B-organisation', 'I-organisation', 'B-misc', 'I-misc']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)

Epoch 1/50, Loss: 2.4225876331329346
Epoch 2/50, Loss: 1.702901840209961
Epoch 3/50, Loss: 1.2227262258529663
Epoch 4/50, Loss: 1.80735445022583
Epoch 5/50, Loss: 1.7784662246704102
Epoch 6/50, Loss: 1.2684688568115234
Epoch 7/50, Loss: 1.2449848651885986
Epoch 8/50, Loss: 1.1433204412460327
Epoch 9/50, Loss: 1.0347270965576172
Epoch 10/50, Loss: 0.9720298051834106
Epoch 11/50, Loss: 0.6538082361221313
Epoch 12/50, Loss: 0.7682870626449585
Epoch 13/50, Loss: 0.8581915497779846
Epoch 14/50, Loss: 0.6217719912528992
Epoch 15/50, Loss: 0.46641144156455994
Epoch 16/50, Loss: 0.3201490640640259
Epoch 17/50, Loss: 0.36732131242752075
Epoch 18/50, Loss: 0.3468407690525055
Epoch 19/50, Loss: 0.20014673471450806
Epoch 20/50, Loss: 0.23919492959976196
Epoch 21/50, Loss: 0.17988871037960052
Epoch 22/50, Loss: 0.180881068110466
Epoch 23/50, Loss: 0.19125641882419586
Epoch 24/50, Loss: 0.18102161586284637
Epoch 25/50, Loss: 0.1039678305387497
Epoch 26/50, Loss: 0.14909544587135315
Epoch 27/50, Loss

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.6415
F1 score: 0.5520


In [None]:
train_file_path = "./ner_data/politics/test.txt"
test_file_path = "./ner_data/politics/train.txt"
label_list = ['O', 'B-country', 'B-politician', 'I-politician', 'B-election', 'I-election', 'B-person', 'I-person', 'B-organisation', 'I-organisation', 'B-location', 'B-misc', 'I-location', 'I-country', 'I-misc', 'B-politicalparty', 'I-politicalparty', 'B-event', 'I-event']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)

Epoch 1/50, Loss: 1.6258951425552368
Epoch 2/50, Loss: 1.3930851221084595
Epoch 3/50, Loss: 1.2307376861572266
Epoch 4/50, Loss: 1.2027338743209839
Epoch 5/50, Loss: 0.9015422463417053
Epoch 6/50, Loss: 0.8208214044570923
Epoch 7/50, Loss: 0.6202168464660645
Epoch 8/50, Loss: 0.5688968896865845
Epoch 9/50, Loss: 0.5806240439414978
Epoch 10/50, Loss: 0.5224183797836304
Epoch 11/50, Loss: 0.4191683232784271
Epoch 12/50, Loss: 0.31254467368125916
Epoch 13/50, Loss: 0.30535367131233215
Epoch 14/50, Loss: 0.198614239692688
Epoch 15/50, Loss: 0.201168954372406
Epoch 16/50, Loss: 0.24832430481910706
Epoch 17/50, Loss: 0.14115175604820251
Epoch 18/50, Loss: 0.12717105448246002
Epoch 19/50, Loss: 0.09347664564847946
Epoch 20/50, Loss: 0.10440166294574738
Epoch 21/50, Loss: 0.07428496330976486
Epoch 22/50, Loss: 0.09084834903478622
Epoch 23/50, Loss: 0.12093643099069595
Epoch 24/50, Loss: 0.06477463990449905
Epoch 25/50, Loss: 0.05740691348910332
Epoch 26/50, Loss: 0.04036489874124527
Epoch 27/5

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.6059
F1 score: 0.5418


In [None]:
train_file_path = "./ner_data/science/test.txt"
test_file_path = "./ner_data/science/train.txt"
label_list = ['O', 'B-scientist', 'I-scientist', 'B-person', 'I-person', 'B-university', 'I-university', 'B-organisation', 'I-organisation', 'B-country', 'I-country', 'B-location', 'I-location', 'B-discipline', 'I-discipline', 'B-enzyme', 'I-enzyme', 'B-protein', 'I-protein', 'B-chemicalelement', 'I-chemicalelement', 'B-chemicalcompound', 'I-chemicalcompound', 'B-astronomicalobject', 'I-astronomicalobject', 'B-academicjournal', 'I-academicjournal', 'B-event', 'I-event', 'B-theory', 'I-theory', 'B-award', 'I-award', 'B-misc', 'I-misc']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)

Epoch 1/50, Loss: 1.8180031776428223
Epoch 2/50, Loss: 1.541435956954956
Epoch 3/50, Loss: 1.3895117044448853
Epoch 4/50, Loss: 1.4568172693252563
Epoch 5/50, Loss: 1.0831571817398071
Epoch 6/50, Loss: 0.9061042666435242
Epoch 7/50, Loss: 0.8704869151115417
Epoch 8/50, Loss: 0.7098038196563721
Epoch 9/50, Loss: 0.5622011423110962
Epoch 10/50, Loss: 0.5435489416122437
Epoch 11/50, Loss: 0.43850836157798767
Epoch 12/50, Loss: 0.35388848185539246
Epoch 13/50, Loss: 0.3335142135620117
Epoch 14/50, Loss: 0.28757917881011963
Epoch 15/50, Loss: 0.22688508033752441
Epoch 16/50, Loss: 0.169193834066391
Epoch 17/50, Loss: 0.173414409160614
Epoch 18/50, Loss: 0.16365858912467957
Epoch 19/50, Loss: 0.14983482658863068
Epoch 20/50, Loss: 0.11406044661998749
Epoch 21/50, Loss: 0.10891503840684891
Epoch 22/50, Loss: 0.077363021671772
Epoch 23/50, Loss: 0.07227202504873276
Epoch 24/50, Loss: 0.0735524371266365
Epoch 25/50, Loss: 0.07692818343639374
Epoch 26/50, Loss: 0.05141071230173111
Epoch 27/50, L

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.6266
F1 score: 0.5583


In [None]:
domains = ['music', 'ai', 'literature', 'politics', 'science']
results_df = pd.DataFrame({'Domain': domains, 'Accuracy': accuracy_scores, 'F1 Score': f1_scores})

results_df['Accuracy'] = results_df['Accuracy'].round(2)
results_df['F1 Score'] = results_df['F1 Score'].round(2)
# Display the results
print("Results:")
results_df

Results:


Unnamed: 0,Domain,Accuracy,F1 Score
0,music,0.6,0.54
1,ai,0.68,0.59
2,literature,0.64,0.55
3,politics,0.61,0.54
4,science,0.63,0.56


In [None]:
# Define the label lists for each domain
politics_labels = ['O', 'B-country', 'B-politician', 'I-politician', 'B-election', 'I-election', 'B-person', 'I-person', 'B-organisation', 'I-organisation', 'B-location', 'B-misc', 'I-location', 'I-country', 'I-misc', 'B-politicalparty', 'I-politicalparty', 'B-event', 'I-event']
science_labels = ['O', 'B-scientist', 'I-scientist', 'B-person', 'I-person', 'B-university', 'I-university', 'B-organisation', 'I-organisation', 'B-country', 'I-country', 'B-location', 'I-location', 'B-discipline', 'I-discipline', 'B-enzyme', 'I-enzyme', 'B-protein', 'I-protein', 'B-chemicalelement', 'I-chemicalelement', 'B-chemicalcompound', 'I-chemicalcompound', 'B-astronomicalobject', 'I-astronomicalobject', 'B-academicjournal', 'I-academicjournal', 'B-event', 'I-event', 'B-theory', 'I-theory', 'B-award', 'I-award', 'B-misc', 'I-misc']
music_labels = ['O', 'B-musicgenre', 'I-musicgenre', 'B-song', 'I-song', 'B-band', 'I-band', 'B-album', 'I-album', 'B-musicalartist', 'I-musicalartist', 'B-musicalinstrument', 'I-musicalinstrument', 'B-award', 'I-award', 'B-event', 'I-event', 'B-country', 'I-country', 'B-location', 'I-location', 'B-organisation', 'I-organisation', 'B-person', 'I-person', 'B-misc', 'I-misc']
literature_labels = ["O", "B-book", "I-book", "B-writer", "I-writer", "B-award", "I-award", "B-poem", "I-poem", "B-event", "I-event", "B-magazine", "I-magazine", "B-literarygenre", "I-literarygenre", 'B-country', 'I-country', "B-person", "I-person", "B-location", "I-location", 'B-organisation', 'I-organisation', 'B-misc', 'I-misc']
ai_labels = ["O", "B-field", "I-field", "B-task", "I-task", "B-product", "I-product", "B-algorithm", "I-algorithm", "B-researcher", "I-researcher", "B-metrics", "I-metrics", "B-programlang", "I-programlang", "B-conference", "I-conference", "B-university", "I-university", "B-country", "I-country", "B-person", "I-person", "B-organisation", "I-organisation", "B-location", "I-location", "B-misc", "I-misc"]

# Combine all labels into one list
combined_labels = politics_labels + science_labels + music_labels + literature_labels + ai_labels

# Remove duplicates and preserve order
combined_labels = list(dict.fromkeys(combined_labels))

print(combined_labels)


['O', 'B-country', 'B-politician', 'I-politician', 'B-election', 'I-election', 'B-person', 'I-person', 'B-organisation', 'I-organisation', 'B-location', 'B-misc', 'I-location', 'I-country', 'I-misc', 'B-politicalparty', 'I-politicalparty', 'B-event', 'I-event', 'B-scientist', 'I-scientist', 'B-university', 'I-university', 'B-discipline', 'I-discipline', 'B-enzyme', 'I-enzyme', 'B-protein', 'I-protein', 'B-chemicalelement', 'I-chemicalelement', 'B-chemicalcompound', 'I-chemicalcompound', 'B-astronomicalobject', 'I-astronomicalobject', 'B-academicjournal', 'I-academicjournal', 'B-theory', 'I-theory', 'B-award', 'I-award', 'B-musicgenre', 'I-musicgenre', 'B-song', 'I-song', 'B-band', 'I-band', 'B-album', 'I-album', 'B-musicalartist', 'I-musicalartist', 'B-musicalinstrument', 'I-musicalinstrument', 'B-book', 'I-book', 'B-writer', 'I-writer', 'B-poem', 'I-poem', 'B-magazine', 'I-magazine', 'B-literarygenre', 'I-literarygenre', 'B-field', 'I-field', 'B-task', 'I-task', 'B-product', 'I-produc

In [None]:
train_file_path = "./all/combined_train.txt"
test_file_path = "./all/combined_test.txt"
label_list = combined_labels

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)

Epoch 1/50, Loss: 1.9446995258331299
Epoch 2/50, Loss: 2.0965521335601807
Epoch 3/50, Loss: 2.0780014991760254
Epoch 4/50, Loss: 1.4251056909561157
Epoch 5/50, Loss: 1.3207924365997314
Epoch 6/50, Loss: 1.2082256078720093
Epoch 7/50, Loss: 1.1301593780517578
Epoch 8/50, Loss: 0.8470916152000427
Epoch 9/50, Loss: 0.8159968852996826
Epoch 10/50, Loss: 0.5798310041427612
Epoch 11/50, Loss: 0.5087597370147705
Epoch 12/50, Loss: 0.5224311947822571
Epoch 13/50, Loss: 0.42370039224624634
Epoch 14/50, Loss: 0.3406713604927063
Epoch 15/50, Loss: 0.29042237997055054
Epoch 16/50, Loss: 0.21046821773052216
Epoch 17/50, Loss: 0.18546825647354126
Epoch 18/50, Loss: 0.2008923888206482
Epoch 19/50, Loss: 0.17866815626621246
Epoch 20/50, Loss: 0.13390757143497467
Epoch 21/50, Loss: 0.13431555032730103
Epoch 22/50, Loss: 0.10104426741600037
Epoch 23/50, Loss: 0.07286270707845688
Epoch 24/50, Loss: 0.10074407607316971
Epoch 25/50, Loss: 0.10700071603059769
Epoch 26/50, Loss: 0.10367215424776077
Epoch 27/

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.6108
F1 score: 0.5240


In [None]:
class NER_Domain_Dataset(Dataset):
    def __init__(self, file_path, tokenizer, labels_to_id):
        self.tokenizer = tokenizer
        self.labels_to_id = labels_to_id
        self.texts = []
        self.labels = []
        self.domains = []  # New list to store domain labels

        # Read data
        with open(file_path, 'r', encoding='utf-8') as f:
            tokens, label_ids, domain_ids = [], [], []
            for line in f:
                line = line.strip()
                if line == "":
                    # end of an example; process and reset for the next example
                    if tokens:
                        self.texts.append(tokens)
                        self.labels.append(label_ids)
                        self.domains.append(domain_ids)  # Store the domain labels
                        tokens, label_ids, domain_ids = [], [], []
                else:
                    domain, token, label = line.split('\t')  # Split by tabs
                    tokens.append(token)
                    label_ids.append(self.labels_to_id[label])
                    domain_ids.append(int(domain))  # Convert domain label to integer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenization
        encoding = self.tokenizer(
            self.texts[idx],
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors="pt"
        )

        # Convert label IDs to tensor and handle padding
        labels = torch.LongTensor(self.labels[idx])
        labels_padded = torch.ones(128, dtype=torch.long) * -100  # Padding index for labels
        labels_padded[:len(labels)] = labels

        # Set up the dictionary to return
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = labels_padded
        item['domains'] = torch.tensor(self.domains[idx])  # Add domain labels to the item
        return item

NameError: name 'Dataset' is not defined

In [None]:
def create_domain_dataset(file_path, tokenizer, label_list):
    # Map labels to IDs
    labels_to_id = {label: idx for idx, label in enumerate(label_list)}

    # Create the dataset
    dataset = NER_Domain_Dataset(file_path, tokenizer, labels_to_id)
    return dataset

In [None]:
train_file_path = "./all/combined_train.txt"
test_file_path = "./all/combined_test.txt"
label_list = combined_labels

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
accuracy, f1 = get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
accuracy_scores.append(accuracy)
f1_scores.append(f1)

In [None]:
# import os
# import glob

# # Path to the directory containing train and test files for each domain
# train_dir = "./ner_data/*/train.txt"
# test_dir = "./ner_data/*/test.txt"

# # Initialize empty strings to store combined train and test data
# combined_train_data = ""
# combined_test_data = ""

# # Read train files for all domains and append to combined_train_data
# for file in glob.glob(train_dir):
#     with open(file, 'r') as f:
#         combined_train_data += f.read() + "\n"

# # Read test files for all domains and append to combined_test_data
# for file in glob.glob(test_dir):
#     with open(file, 'r') as f:
#         combined_test_data += f.read() + "\n"

# # Write the combined train data to a new file
# with open("./all/combined_train.txt", 'w') as f:
#     f.write(combined_train_data)

# # Write the combined test data to a new file
# with open("./all/combined_test.txt", 'w') as f:
#     f.write(combined_test_data)

# print("Combined train data saved to 'combined_train.txt'")
# print("Combined test data saved to 'combined_test.txt'")


Combined train data saved to 'combined_train.txt'
Combined test data saved to 'combined_test.txt'


In [None]:
# import glob
# import os

# # Path to the directory containing train and test files for each domain
# train_dir = "./ner_data/*/train.txt"
# test_dir = "./ner_data/*/test.txt"

# # Method 1: Adding label to each line
# combined_train_data_line_label = ""
# combined_test_data_line_label = ""

# # Method 2: Adding label to the entire document
# combined_train_data_doc_label = ""
# combined_test_data_doc_label = ""

# # Iterate through each domain
# for domain_index, domain_path in enumerate(glob.glob(train_dir)):
#     # Method 1: Adding label to each line
#     with open(domain_path, 'r') as f:
#         for line in f:
#             combined_train_data_line_label += f"{domain_index}\t{line}"

#     # Method 2: Adding label to the entire document
#     with open(domain_path, 'r') as f:
#         doc_content = f.read()
#         combined_train_data_doc_label += f"{domain_index}\t{doc_content}\n"

# # Save the combined data with labels for Method 1
# with open("./domain_label/combined_train_line_label.txt", 'w') as f:
#     f.write(combined_train_data_line_label)

# # Save the combined data with labels for Method 2
# with open("./domain_label/combined_train_doc_label.txt", 'w') as f:
#     f.write(combined_train_data_doc_label)

# print("Combined train data with labels saved.")

# # Repeat the same process for test data
# # Iterate through each domain
# for domain_index, domain_path in enumerate(glob.glob(test_dir)):
#     # Method 1: Adding label to each line
#     with open(domain_path, 'r') as f:
#         for line in f:
#             combined_test_data_line_label += f"{domain_index}\t{line}"

#     # Method 2: Adding label to the entire document
#     with open(domain_path, 'r') as f:
#         doc_content = f.read()
#         combined_test_data_doc_label += f"{domain_index}\t{doc_content}\n"

# # Save the combined data with labels for Method 1
# with open("./domain_label/combined_test_line_label.txt", 'w') as f:
#     f.write(combined_test_data_line_label)

# # Save the combined data with labels for Method 2
# with open("./domain_label/combined_test_doc_label.txt", 'w') as f:
#     f.write(combined_test_data_doc_label)

# print("Combined test data with labels saved.")


Combined train data with labels saved.
Combined test data with labels saved.


In [None]:
train_file_path = "./ner_data/combined_dataset/conll_music.txt"
test_file_path = "./ner_data/music/test.txt"
label_list = ['O', 'B-musicgenre', 'I-musicgenre', 'B-song', 'I-song', 'B-band', 'I-band',
              'B-album', 'I-album', 'B-musicalartist', 'I-musicalartist', 'B-musicalinstrument',
              'I-musicalinstrument', 'B-award', 'I-award', 'B-event', 'I-event', 'B-country',
              'I-country', 'B-location', 'I-location', 'B-organisation', 'I-organisation',
              'B-person', 'I-person', 'B-misc', 'I-misc']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
print("Results from Base logits")
get_accuracy(base_logits, test_file_path, tokenizer, label_list)
print("Results from Expert logits")
get_accuracy(expert_logits, test_file_path, tokenizer, label_list)
print("Results from Anti-Expert logits")
get_accuracy(anti_expert_logits, test_file_path, tokenizer, label_list)

Epoch 1/50, Loss: 0.22542531788349152
Epoch 2/50, Loss: 0.3994554579257965
Epoch 3/50, Loss: 0.19470170140266418
Epoch 4/50, Loss: 0.213877871632576
Epoch 5/50, Loss: 0.17817609012126923
Epoch 6/50, Loss: 0.08368116617202759
Epoch 7/50, Loss: 0.14772160351276398
Epoch 8/50, Loss: 0.13223271071910858
Epoch 9/50, Loss: 0.12847769260406494
Epoch 10/50, Loss: 0.1697915494441986
Epoch 11/50, Loss: 0.14490973949432373
Epoch 12/50, Loss: 0.12253077328205109
Epoch 13/50, Loss: 0.12795716524124146
Epoch 14/50, Loss: 0.09354793280363083
Epoch 15/50, Loss: 0.08183693885803223
Epoch 16/50, Loss: 0.10796994715929031
Epoch 17/50, Loss: 0.1214682087302208
Epoch 18/50, Loss: 0.11130332201719284
Epoch 19/50, Loss: 0.12833459675312042
Epoch 20/50, Loss: 0.08328857272863388
Epoch 21/50, Loss: 0.1634770929813385
Epoch 22/50, Loss: 0.10231215506792068
Epoch 23/50, Loss: 0.07035768032073975
Epoch 24/50, Loss: 0.07744460552930832
Epoch 25/50, Loss: 0.11645201593637466
Epoch 26/50, Loss: 0.11697728931903839
E

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.5757
F1 score: 0.4791
Results from Base logits
Accuracy: 0.0284
F1 score: 0.0123
Results from Expert logits
Accuracy: 0.5809
F1 score: 0.4799
Results from Anti-Expert logits
Accuracy: 0.0359
F1 score: 0.0509


In [None]:
train_file_path = "./ner_data/conll2003/train.txt"
test_file_path = "./ner_data/conll2003/test.txt"
label_list = ['O', 'B-organisation', 'I-organisation', 'B-person', 'I-person', 'B-location', 'I-location', 'B-misc', 'I-misc']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits, base_logits, expert_logits, anti_expert_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
print("Results from Adjusted logits")
get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)
print("Results from Base logits")
get_accuracy(base_logits, test_file_path, tokenizer, label_list)
print("Results from Expert logits")
get_accuracy(expert_logits, test_file_path, tokenizer, label_list)
print("Results from Anti-Expert logits")
get_accuracy(anti_expert_logits, test_file_path, tokenizer, label_list)




Epoch 1/50, Loss: 0.3977467119693756
Epoch 2/50, Loss: 0.4069274663925171
Epoch 3/50, Loss: 0.6941981315612793
Epoch 4/50, Loss: 0.6062135100364685
Epoch 5/50, Loss: 0.41429606080055237
Epoch 6/50, Loss: 0.2667192816734314
Epoch 7/50, Loss: 0.3267687261104584
Epoch 8/50, Loss: 0.19777198135852814
Epoch 9/50, Loss: 0.1795627921819687
Epoch 10/50, Loss: 0.1420699954032898
Epoch 11/50, Loss: 0.09733942896127701
Epoch 12/50, Loss: 0.22917130589485168
Epoch 13/50, Loss: 0.25425130128860474
Epoch 14/50, Loss: 0.17225027084350586
Epoch 15/50, Loss: 0.2149767428636551
Epoch 16/50, Loss: 0.13717620074748993
Epoch 17/50, Loss: 0.19100897014141083
Epoch 18/50, Loss: 0.06589536368846893
Epoch 19/50, Loss: 0.17665784060955048
Epoch 20/50, Loss: 0.1447449028491974
Epoch 21/50, Loss: 0.32925817370414734
Epoch 22/50, Loss: 0.2287328988313675
Epoch 23/50, Loss: 0.12167973816394806
Epoch 24/50, Loss: 0.13208100199699402
Epoch 25/50, Loss: 0.15178610384464264
Epoch 26/50, Loss: 0.36878249049186707
Epoch 

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results from Adjusted logits
Accuracy: 0.8117
F1 score: 0.7751
Results from Base logits
Accuracy: 0.0976
F1 score: 0.1454
Results from Expert logits
Accuracy: 0.8128
F1 score: 0.7759
Results from Anti-Expert logits
Accuracy: 0.1499
F1 score: 0.2273


In [None]:
train_file_path = "./ner_data/politics/train.txt"
test_file_path = "./ner_data/politics/test.txt"
label_list = ['O', 'B-country', 'B-politician', 'I-politician', 'B-election', 'I-election', 'B-person', 'I-person', 'B-organisation', 'I-organisation', 'B-location', 'B-misc', 'I-location', 'I-country', 'I-misc', 'B-politicalparty', 'I-politicalparty', 'B-event', 'I-event']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)).to(device)

adjusted_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)




Epoch 1/50, Loss: 2.73720121383667
Epoch 2/50, Loss: 2.4956588745117188
Epoch 3/50, Loss: 2.341888189315796
Epoch 4/50, Loss: 2.191075086593628
Epoch 5/50, Loss: 1.9190109968185425
Epoch 6/50, Loss: 1.9749963283538818
Epoch 7/50, Loss: 1.9130913019180298
Epoch 8/50, Loss: 1.6417940855026245
Epoch 9/50, Loss: 1.595287799835205
Epoch 10/50, Loss: 1.5895259380340576
Epoch 11/50, Loss: 1.3027013540267944
Epoch 12/50, Loss: 1.5907151699066162
Epoch 13/50, Loss: 1.319096565246582
Epoch 14/50, Loss: 1.524005651473999
Epoch 15/50, Loss: 1.2947558164596558
Epoch 16/50, Loss: 1.1210548877716064
Epoch 17/50, Loss: 1.2298129796981812
Epoch 18/50, Loss: 1.3885982036590576
Epoch 19/50, Loss: 1.1997357606887817
Epoch 20/50, Loss: 1.1736689805984497
Epoch 21/50, Loss: 1.5108555555343628
Epoch 22/50, Loss: 1.1268004179000854
Epoch 23/50, Loss: 1.0821619033813477
Epoch 24/50, Loss: 0.9109289646148682
Epoch 25/50, Loss: 1.245213270187378
Epoch 26/50, Loss: 1.2401418685913086
Epoch 27/50, Loss: 1.19729375

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: The size of tensor a (128) must match the size of tensor b (4096) at non-singleton dimension 1

In [None]:
train_file_path = "./ner_data/science/train.txt"
test_file_path = "./ner_data/science/test.txt"
label_list = ['O', 'B-scientist', 'I-scientist', 'B-person', 'I-person', 'B-university', 'I-university', 'B-organisation', 'I-organisation', 'B-country', 'I-country', 'B-location', 'I-location', 'B-discipline', 'I-discipline', 'B-enzyme', 'I-enzyme', 'B-protein', 'I-protein', 'B-chemicalelement', 'I-chemicalelement', 'B-chemicalcompound', 'I-chemicalcompound', 'B-astronomicalobject', 'I-astronomicalobject', 'B-academicjournal', 'I-academicjournal', 'B-event', 'I-event', 'B-theory', 'I-theory', 'B-award', 'I-award', 'B-misc', 'I-misc']

# Create the dataset
expert_model, anti_expert_model = train_small_model(train_file_path, tokenizer, label_list)
bert_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list))

adjusted_logits = make_adjusted_predictions_dataset(test_file_path, tokenizer, bert_model, expert_model, anti_expert_model, label_list)
get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)




Epoch 1/20, Loss: 3.431079387664795
Epoch 2/20, Loss: 3.260760545730591
Epoch 3/20, Loss: 3.203333854675293
Epoch 4/20, Loss: 2.9550888538360596
Epoch 5/20, Loss: 2.818697214126587
Epoch 6/20, Loss: 2.5196352005004883
Epoch 7/20, Loss: 2.1582717895507812
Epoch 8/20, Loss: 1.9835264682769775
Epoch 9/20, Loss: 1.8423062562942505
Epoch 10/20, Loss: 1.8925508260726929
Epoch 11/20, Loss: 1.5167534351348877
Epoch 12/20, Loss: 1.4755611419677734
Epoch 13/20, Loss: 1.2472034692764282
Epoch 14/20, Loss: 1.5121597051620483
Epoch 15/20, Loss: 1.5376965999603271
Epoch 16/20, Loss: 1.5862619876861572
Epoch 17/20, Loss: 1.1666269302368164
Epoch 18/20, Loss: 1.421738862991333
Epoch 19/20, Loss: 1.3909367322921753
Epoch 20/20, Loss: 1.5014952421188354


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.6505
F1 score: 0.0248


In [None]:
get_accuracy(adjusted_logits, test_file_path, tokenizer, label_list)

Accuracy: 0.6505
F1 score: 0.5193


In [None]:
# Get predicted labels


# Print shapes
print("Shapes:")
print("Adjusted logits:", adjusted_logits.shape)
print("Probabilities:", probabilities.shape)
print("Predictions:", predictions.shape)

Shapes:
Adjusted logits: torch.Size([465, 128, 27])
Probabilities: torch.Size([465, 128, 27])
Predictions: torch.Size([465, 128])


In [None]:
from sklearn.metrics import accuracy_score

# Flatten predictions and true labels
flattened_predictions = predictions.view(-1).cpu().numpy()
flattened_true_labels = true_labels.view(-1).cpu().numpy()

# Remove padding tokens from true labels and predictions
mask = (flattened_true_labels != -100)
flattened_true_labels = flattened_true_labels[mask]
flattened_predictions = flattened_predictions[mask]

# Compute accuracy
accuracy = accuracy_score(flattened_true_labels, flattened_predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5793


In [None]:
from sklearn.metrics import accuracy_score

# Flatten predictions and true labels
flattened_predictions = []
true_labels = []

dataset = create_dataset("./ner_data/music/test.txt", tokenizer, label_list)
data_loader = DataLoader(dataset, batch_size=32, shuffle=False)
adjusted_predictions = torch.argmax(probabilities, dim=-1)

for i, batch in enumerate(data_loader):
    input_ids = batch['input_ids']
    # Assuming adjusted_predictions is a tensor of shape (batch_size, sequence_length, num_labels)
    batch_predictions = adjusted_predictions[i]

    # Convert the tensor to a numpy array
    batch_predictions_np = batch_predictions.cpu().numpy()

    # Get the lengths of non-padded sequences
    non_padded_lengths = torch.sum(input_ids != tokenizer.pad_token_id, dim=1)

    # Iterate over each sequence in the batch
    for j, length in enumerate(non_padded_lengths):
        # Get the predictions for this sequence
        sequence_predictions = batch_predictions_np[j, :length].tolist()
        # Get the true labels for this sequence
        sequence_labels = batch['labels'][j][:length].tolist()

        # Append individual values
        flattened_predictions.extend(sequence_predictions)
        true_labels.extend(sequence_labels)

# Compute accuracy
accuracy = accuracy_score(true_labels, flattened_predictions)
print(f"Accuracy: {accuracy:.4f}")


KeyError: 'B-song'

In [None]:
def get_true_labels(file_path, tokenizer, label_list):
    dataset = create_dataset(file_path, tokenizer, label_list)
    data_loader = DataLoader(dataset, batch_size=32, shuffle=False)

    all_labels = []
    for batch in data_loader:
        labels = batch['labels']
        input_ids = batch['input_ids']

        non_padded_lengths = (input_ids != tokenizer.pad_token_id).sum(1)
        for j, length in enumerate(non_padded_lengths):
            all_labels.extend(labels[j][:length].tolist())

    return np.array(all_labels)

In [None]:
from sklearn.metrics import accuracy_score
predictions = torch.argmax(probabilities, dim=-1)


adjusted_predictions = torch.argmax(probabilities, dim=-1)

print(adjusted_logits.shape)
print(probabilities.shape)
print(adjusted_predictions.shape)


# Assuming you have a function to get true labels for the test dataset
true_labels = get_true_labels("./ner_data/music/test.txt", tokenizer, label_list)

# Extract the NumPy array from the list
true_labels_flat = true_labels[0]
print(true_labels.shape)
print(true_labels_flat.shape)

# Flatten the array
true_labels_flat = true_labels_flat.flatten()
print(true_labels_flat.shape)
print(true_labels_flat)

adjusted_predictions_flat = adjusted_predictions.numpy().flatten()

# Calculate accuracy
accuracy = accuracy_score(true_labels_flat, adjusted_predictions_flat)
print(f"Accuracy: {accuracy:.4f}")

torch.Size([465, 128, 27])
torch.Size([465, 128, 27])
torch.Size([465, 128])
(23889,)
()
(1,)
[0]


ValueError: Found input variables with inconsistent numbers of samples: [1, 59520]

# Model Training:

# Evaluation

# Results