In [1]:
!cp -r /content/drive/MyDrive/RISE_Assignment/tokenized_filtered_dataset /content/

In [2]:
!pip install datasets
!pip install transformers[torch]
!pip install wandb
!pip install seqeval
!pip install evaluate

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-p

In [3]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
from transformers import BertTokenizerFast

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Step 3: Filter out Non-English Examples
from datasets import load_dataset
from datasets import load_from_disk
import os

def load_and_filter_dataset():
    dataset = load_dataset("Babelscape/multinerd")
    english_dataset = dataset.filter(lambda example: example['lang'] == 'en')
    return english_dataset

# Check if the processed English dataset already exists
if os.path.exists("./tokenized_english_dataset"):

    tokenized_english_dataset = load_from_disk("./tokenized_english_dataset")
else:
    english_dataset = load_and_filter_dataset()
    # The tokenization and saving will be handled in the next step.


In [8]:
# Step 4 & 6: Tokenize and Align Labels for System A and B
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, padding="max_length", is_split_into_words=True)
    # labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # label_ids = []
        # for word_idx in word_ids:
        #     if word_idx is None:
        #         label_ids.append(-100)
        #     elif label_all_tokens:
        #         label_ids.append(label[word_idx])
        #     else:
        #         label_ids.append(-100)
        # labels.append(label_ids)
    tokenized_inputs["labels"] = examples["ner_tags"]
    return tokenized_inputs

# Tokenize the English dataset for System A
if not os.path.exists("./tokenized_english_dataset"):
    tokenized_english_dataset = english_dataset.map(tokenize_and_align_labels, batched=True)
    tokenized_english_dataset.save_to_disk("./tokenized_english_dataset")
# Step 5: Preprocess Dataset for Specific Entity Types (System B)
def filter_labels_system_b(examples):
    # Mapping of required entities to their indices
    required_entities = {
        1, 2,   # B-PER, I-PER
        3, 4,   # B-ORG, I-ORG
        5, 6,   # B-LOC, I-LOC
        13, 14, # B-DIS, I-DIS
        7, 8    # B-ANIM, I-ANIM
    }

    label_mapping = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 13: 9, 14: 10}
    # Filter labels, map to 0 if not in required entities
    examples["ner_tags"] = [label_mapping[label] if label in required_entities else 0 for label in examples["ner_tags"]]    # examples["ner_tags"] = filtered_labels
    return examples

# Apply filtering and tokenization for System B
if os.path.exists("./tokenized_filtered_dataset"):
    tokenized_filtered_dataset = load_from_disk("./tokenized_filtered_dataset")
else:
    filtered_dataset = english_dataset.map(filter_labels_system_b)
    tokenized_filtered_dataset = filtered_dataset.map(tokenize_and_align_labels, batched=True)
    tokenized_filtered_dataset.save_to_disk("./tokenized_filtered_dataset")


In [7]:
# Step 4 & 6 & 7: Fine-tune the Model System A
from transformers import BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import evaluate
seqeval = evaluate.load("seqeval")



# Model Initialization function
def model_init(num_labels):
    return BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

# Function to find the number of unique labels in the dataset
# Technically, it can be easier to directly set the label_num in system A as 31 and in system B as 11 as what we have known from the dataset.
# But just to still write down the function to generalize in different datasets.
def get_num_labels(dataset):
    unique_labels = set()
    for example in dataset:
        unique_labels.update(example['ner_tags'])
    return len(unique_labels)

label_list_a = [i for i in range(31)]

def compute_metrics_a(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Convert predictions and labels from integers to their corresponding string labels
    true_predictions = [
        [str(label_list_a[p]) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [str(label_list_a[l]) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Get the number of labels for System A
# num_labels_a = get_num_labels(tokenized_english_dataset['train'])
num_labels_a = 31

# Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training Arguments with model checkpoint saving
training_args_a = TrainingArguments(
    output_dir="./results_system_a",
    evaluation_strategy="steps",
    eval_steps=2000,  # Evaluate every 2000 steps
    learning_rate=2e-5,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",  # Enable wandb logging
    run_name="system_a_training_run"  # Name of the wandb run for System A
)

# Initialize and Train Trainer for System A
trainer_system_a = Trainer(
    model_init=lambda: model_init(num_labels_a),
    args=training_args_a,
    train_dataset=tokenized_english_dataset["train"],
    eval_dataset=tokenized_english_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_a
)

trainer_system_a.train()

evaluation_results_a = trainer_system_a.evaluate(tokenized_english_dataset["test"])

# Output evaluation result
print("Evaluation Results for System A:", evaluation_results_a)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mhenryluo[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
2000,0.1179,0.089557
4000,0.0848,0.065507
6000,0.0696,0.055865


Step,Training Loss,Validation Loss
2000,0.1179,0.089557
4000,0.0848,0.065507
6000,0.0696,0.055865
8000,0.0579,0.055666
10000,0.0495,0.049367
12000,0.0458,0.047713


Evaluation Results for System A: {'eval_loss': 0.047107815742492676, 'eval_runtime': 246.704, 'eval_samples_per_second': 133.391, 'eval_steps_per_second': 3.336, 'epoch': 2.0}


In [10]:
# Step 4 & 6 & 7: Fine-tune the Model System B
from transformers import BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import evaluate
seqeval = evaluate.load("seqeval")



# Model Initialization function
def model_init(num_labels):
    return BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

# Function to find the number of unique labels in the dataset
# Technically, it can be easier to directly set the label_num in system A as 31 and in system B as 11 as what we have known from the dataset.
# But just to still write down the function to generalize in different datasets.
def get_num_labels(dataset):
    unique_labels = set()
    for example in dataset:
        unique_labels.update(example['ner_tags'])
    return len(unique_labels)

label_list_b = [i for i in range(11)]

def compute_metrics_b(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Convert predictions and labels from integers to their corresponding string labels
    true_predictions = [
        [str(label_list_b[p]) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [str(label_list_b[l]) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Get the number of labels for System B
# num_labels_b = get_num_labels(tokenized_english_dataset['train'])
num_labels_b = 11

# Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training Arguments with model checkpoint saving
training_args_b = TrainingArguments(
    output_dir="./results_system_b",
    evaluation_strategy="steps",
    eval_steps=4000,  # Evaluate every 2000 steps
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",  # Enable wandb logging
    run_name="system_b_training_run"  # Name of the wandb run for System A
)

# Initialize and Train Trainer for System B
trainer_system_b = Trainer(
    model_init=lambda: model_init(num_labels_b),
    args=training_args_b,
    train_dataset=tokenized_filtered_dataset["train"],
    eval_dataset=tokenized_filtered_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics_b
)

trainer_system_b.train()

evaluation_results_b = trainer_system_b.evaluate(tokenized_filtered_dataset["test"])

# Output evaluation result
print("Evaluation Results for System B:", evaluation_results_b)



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mhenryluo[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
4000,0.0737,0.066246
8000,0.05,0.048323


Step,Training Loss,Validation Loss
4000,0.0737,0.066246
8000,0.05,0.048323
12000,0.0392,0.043115
16000,0.0305,0.039687
20000,0.0262,0.037295
24000,0.0208,0.03652


Evaluation Results for System B: {'eval_loss': 0.030493086203932762, 'eval_runtime': 315.663, 'eval_samples_per_second': 104.25, 'eval_steps_per_second': 5.214, 'epoch': 2.0}


In [11]:

!cp -r /content/results_system_b /content/drive/MyDrive/RISE_Assignment/




In [13]:
import evaluate

seqeval = evaluate.load("seqeval")
import numpy as np



label_list_b = [i for i in range(11)]



def compute_metrics_b(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Convert predictions and labels from integers to their corresponding string labels
    true_predictions = [
        [str(label_list_b[p]) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [str(label_list_b[l]) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]



    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



trainer_system_b.compute_metrics=compute_metrics_b

evaluation_results_b = trainer_system_b.evaluate(tokenized_filtered_dataset["test"])

# Output evaluation result
print("Evaluation Results for System B:", evaluation_results_b)





Evaluation Results for System B: {'eval_loss': 0.030493086203932762, 'eval_precision': 0.5105421686746988, 'eval_recall': 0.49130434782608695, 'eval_f1': 0.500738552437223, 'eval_accuracy': 0.9909891218211945, 'eval_runtime': 349.5822, 'eval_samples_per_second': 94.135, 'eval_steps_per_second': 4.708, 'epoch': 2.0}


In [83]:
# # Step 4 & 6 & 7: Fine-tune the Model System B


# # Get the number of labels for System B (after preprocessing)
# # num_labels_b = get_num_labels(tokenized_filtered_dataset['train'])
# num_labels_b = 11

# # Update training arguments for System B
# training_args_b = TrainingArguments(
#     output_dir="./results_system_b",
#     evaluation_strategy="steps",
#     eval_steps=2000,  # Evaluate every 2000 steps
#     learning_rate=2e-5,
#     per_device_train_batch_size=40,
#     per_device_eval_batch_size=40,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     save_strategy="epoch",
#     save_total_limit=2
# )

# # Initialize and Train Trainer for System B
# trainer_system_b = Trainer(
#     model_init=lambda: model_init(num_labels_b),
#     args=training_args_b,
#     train_dataset=tokenized_filtered_dataset["train"],
#     eval_dataset=tokenized_filtered_dataset["validation"],
#     tokenizer=tokenizer,
#     data_collator=data_collator
# )

# trainer_system_b.train()


# evaluation_results_b = trainer_system_b.evaluate(tokenized_filtered_dataset["test"])

# # Output evaluation results
# print("Evaluation Results for System B:", evaluation_results_b)

RuntimeError: ignored

[[0, 0, 0, 0, 5, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 0],
 [0, 0, 1, 2, 0, 1, 2, 2, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 10, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 5, 0],
 [0,

In [52]:
print(five_samples)

Dataset({
    features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 262560
})
