In [1]:
# %% Import Statements (may be more than necessary)
import json
import torch
import random
import datasets
import evaluate
import numpy as np
import transformers
import pandas as pd
from tqdm.auto import tqdm
from itertools import groupby
from operator import itemgetter
from datasets import Dataset, load_dataset
from transformers import BartTokenizer, DataCollatorForSeq2Seq, BartForConditionalGeneration
from transformers import AdamW, Seq2SeqTrainer, Seq2SeqTrainingArguments, get_scheduler, Trainer, TrainingArguments, GenerationConfig, set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %% Ensure CUDA Availability
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
!nvidia-smi

True
NVIDIA A100-SXM4-80GB
Thu May  2 20:00:41 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:41:00.0 Off |                    0 |
| N/A   32C    P0             61W /  500W |       4MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
          

In [3]:
# %% Random Seed Function (Charles)
from transformers.utils import logging

def set_logging_and_seed(seed=42):
    # Set logging output settings
    logging.set_verbosity_info()
    logger = logging.get_logger("transformers")
    logging.set_verbosity(30)
    logger.warning("WARN")

    # Set random seed outputs
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if (torch.cuda.is_available()): torch.cuda.manual_seed_all(seed)
    set_seed(seed)
    return

In [4]:
# %% Load ATOMIC Dataset (Charles)
set_logging_and_seed(seed=42)

atomic_train = pd.read_csv("/scratch/gpfs/kc4642/Datasets/ATOMIC/train.tsv", sep="\t").values.tolist()
atomic_train = [s for s in atomic_train if "_" not in s[0] and type(s[-1]) == type("") and len(s[-1].split(" ")) == 1 and s[-1] != "none"]

WARN


In [5]:
# %% Construct ATOMIC Dataset (Charles)
set_logging_and_seed(seed=42)

relationship_dictionary = {
    "xWant" : ". As a result, PersonX wants ",
    "oEffect" : ". As a result, Y or others will ", 
    "oReact" : ". As a result, Y or others feels ",
    "xReason" : " because ",
    "xNeed" : ". But before, PersonX needed ",
    "AtLocation" : " located or found at ",
    "xEffect" : ". As a result, PersonX will ",
    "ObjectUse" : " used for ",
    "MadeUpOf" : " made up of ",
    "CapableOf" : " is capable of ",
    "Causes" : " causes ",
    "HinderedBy" : " can be hindered by ",
    "xIntent" : " because PersonX wanted ",
    "xReact" : ". As a result, PersonX feels ",
    "Desires" : " desires ",
    "HasProperty" : " can be characterized as having ",
    "NotDesires" : " does not desire ",
    "oWant" : ". As a result, Y or others want ",
    "xAttr" : ". PersonX is seen as ",
    "HasSubEvent" : " includes the event "}

atomic_training_dataset = {"inputs" : [], "labels" : []}
for i in range(len(atomic_train)):
    label = atomic_train[i][-1]
    sentence = atomic_train[i][0] + relationship_dictionary[atomic_train[i][1]] + "<mask>."
    
    if (atomic_train[i][1] not in ["MadeUpOf", "HasProperty"]): continue
    atomic_training_dataset["inputs"].append(sentence)
    atomic_training_dataset["labels"].append(label)
    
atomic_training_dataset = Dataset.from_dict(atomic_training_dataset).shuffle(seed=42)
print(atomic_training_dataset)

WARN


Dataset({
    features: ['inputs', 'labels'],
    num_rows: 3809
})


In [4]:
# %% Load NumerSense Train Dataset (Mahsa)
set_logging_and_seed(seed=42)

train = pd.read_csv("train.tsv", sep="\t").values.tolist()

WARN


In [5]:
# %% Create NumerSense Train Dataset (Mahsa)
set_logging_and_seed(seed=42)

training_dataset = {"inputs" : [], "labels" : []}
for i in range(len(train)):
    training_dataset["inputs"].append(train[i][0])
    training_dataset["labels"].append(train[i][1]) # label
    pass

training_dataset = Dataset.from_dict(training_dataset).shuffle(seed=42)
print(training_dataset)

WARN


Dataset({
    features: ['inputs', 'labels'],
    num_rows: 10443
})


In [6]:
# %% Define model & tokenizer (Kellen)
from transformers import AutoModel, AutoModelForMaskedLM, AutoModelForSeq2SeqLM # AutoModelWithLMHead
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer
set_logging_and_seed(seed=42)

model_path = "/scratch/gpfs/kc4642/Models/bart-large"
tokenizer_path = "/scratch/gpfs/kc4642/Tokenizers/bart-large-tokenizer"

model = AutoModelForMaskedLM.from_pretrained(model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Add pad token if not previously seen
if tokenizer.mask_token != "<mask>":
    tokenizer.add_special_tokens({"mask_token" : "<mask>"})
    model.resize_token_embeddings(len(tokenizer))
    
# Freeze the base model if required
freeze = False

if (freeze == True):
    for param in model.roberta.parameters(): param.requires_grad = False # model.bert.parameters() for BERT-Large
    # for param in model.lm_head.parameters(): param.requires_grad = True # Uncomment for BART-Large

WARN


In [7]:
# %% Tokenize Dataset (Charles)
from transformers import DataCollatorForLanguageModeling
set_logging_and_seed(seed=42)

max_input_length = max_target_length = 128

def tokenize_function(examples):
    inputs = examples["inputs"]
    labels = examples["labels"]

    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)
    model_labels = tokenizer(labels, max_length=max_target_length, padding="max_length", truncation=True).input_ids
    
    padded_labels = []
    for label_example in model_labels:
        temp_label = [label if label != tokenizer.pad_token_id else -100 for label in label_example]
        padded_labels.append(temp_label)
    
    model_inputs["labels"] = padded_labels
    # model_inputs["labels"] = model_labels
    return model_inputs

# Map our tokenization scheme onto our datasets (uncomment whatever is necessary)
encoded_train = training_dataset.map(tokenize_function, batched=True)
# encoded_train = atomic_training_dataset.map(tokenize_function, batched=True)
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

WARN
                                                                   

In [None]:
# %% Manual Training Data Preparation (Kellen)
from torch.utils.data import DataLoader

set_logging_and_seed(seed=42)

# Remove unnecessary column names from our data
encoded_train = encoded_train.remove_columns(["inputs"])
encoded_train.set_format("torch")

# Create training dataloader
train_dataloader = DataLoader(encoded_train, shuffle=False, batch_size=32)

In [None]:
# %% Curriculum Learning (Kellen)
from transformers import get_scheduler, AdamW
set_logging_and_seed(seed=42)

# Define an optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Define lr scheduler
num_epochs = 15
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# Training loop
progress_bar = tqdm(range(num_training_steps), position=0, leave=True)
model.to("cuda")

model.train()
for epoch in range(num_epochs):
    # Store our curriculum scores
    softmax_scores = []
    epoch_loss = 0.0
    for batch in train_dataloader:
        batch = {k: v.to("cuda") for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        # Append our per sample softmax scores
        logits = outputs.logits
        sample_index, mask_token_index = (batch["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)
        # softmax_logits = torch.softmax(logits[sample_index, mask_token_index, :], dim=1)
        # scores = softmax_logits[sample_index, batch["labels"][:, 1]]
        
        scores = logits[sample_index, mask_token_index, :][sample_index, batch["labels"][:, 1]]
        softmax_scores += scores.tolist()
        epoch_loss += loss.item()
        pass
    
    # Update our curriculum
    all_scores = np.array(softmax_scores)
    subset_idx = np.argsort(all_scores)[::-1] # sort scores in descending order, maximal scores first
    encoded_train = encoded_train.select(subset_idx)
    train_dataloader = DataLoader(encoded_train, batch_size=32, shuffle=False)
    
    # Print our loss per epoch
    print("Loss:", str(epoch_loss / len(train_dataloader)))
    pass

print("Outside of the training loop!")

In [8]:
# %% Instantiate Data Collator & Training (Mahsa)
from transformers import AutoModelForMaskedLM
set_logging_and_seed(seed=42)

training_args = TrainingArguments(output_dir="/tmp/", 
                                         evaluation_strategy="no",
                                         num_train_epochs=15,
                                         learning_rate=2e-5, 
                                         weight_decay=0.01, 
                                         per_device_train_batch_size=32,
                                         per_device_eval_batch_size=32,
                                         fp16=True, # Uncomment only when on GPU
                                         push_to_hub=False)

trainer = Trainer(model=model, 
                         args=training_args, 
                         train_dataset=encoded_train, 
                         eval_dataset=encoded_train, 
                         data_collator=data_collator,
                         tokenizer=tokenizer)

WARN


In [9]:
# %% Train Model (Mahsa)
set_logging_and_seed(seed=42)

trainer.train()

WARN
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.1615
1000,2.6723
1500,2.3673
2000,2.1776
2500,2.0269
3000,1.9533
3500,1.8436
4000,1.7761
4500,1.7676


TrainOutput(global_step=4905, training_loss=2.1528380912135257, metrics={'train_runtime': 716.8869, 'train_samples_per_second': 218.507, 'train_steps_per_second': 6.842, 'total_flos': 3.64992688809792e+16, 'train_loss': 2.1528380912135257, 'epoch': 15.0})

In [10]:
# %% Save Model (Kellen)
set_logging_and_seed(seed=42)

# Replace path below with own desired path
trainer.save_model("/scratch/gpfs/kc4642/Models/Trained_Numersense/bert-large-linear-FT")

WARN


In [9]:
# %% Evaluate Model (Mahsa)
set_logging_and_seed(seed=42)

def generate_json_file(input_filename,output_filename, model, tokenizer):
    # Read all sentences from the file
    with open(input_filename, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines()]

    # Open the output JSONL file for writing
    with open(output_filename, 'w', encoding='utf-8') as f:
        for sentence in tqdm(sentences, desc="Processing sentences", position=0, leave=True):
            # Replace "<mask>" with the actual mask token used by the tokenizer
            mask_token = tokenizer.mask_token
            input_ids = tokenizer(sentence.replace("<mask>", mask_token), return_tensors="pt").input_ids

            # Compute logits with no gradient calculation
            with torch.no_grad():
                logits = model(input_ids.to("cuda")).logits

            # Find the position of the mask token
            mask_token_index = (input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()

            # Apply softmax to logits at the mask token position
            softmax_logits = torch.softmax(logits[0, mask_token_index], dim=0)

            # List of candidates
            candidates = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "no"]

            # Calculate scores for each candidate
            results = []
            for candidate in candidates:
                candidate_id = tokenizer.convert_tokens_to_ids(candidate)
                candidate_score = softmax_logits[candidate_id].item()  # Extract softmax score for each candidate
                results.append({"word": candidate, "score": candidate_score})

            # Sort results by score descending
            results.sort(key=lambda x: x['score'], reverse=True)

            # Create the result dictionary
            result_data = {
                "probe": sentence,
                "result_list": results
            }

            # Write result to the JSONL file
            json_string = json.dumps(result_data)
            f.write(json_string + '\n')

WARN


In [12]:
# %% Extract JSON Results (Kellen)
set_logging_and_seed(seed=42)

# Define test split
test = "core"

if test == "core":
    input_filename = "test_initial_cleaned.txt"
    output_filename= "result_core.jsonl"
else:
    input_filename = "test_dataset.txt"
    output_filename= "result_all.jsonl"

# Generate our JSON results
generate_json_file(input_filename, output_filename, model.to("cuda"), tokenizer)
print("Completed!")

WARN
Processing sentences: 100%|██████████| 1132/1132 [00:18<00:00, 62.88it/s]

Completed!



