In [None]:
import json
import os
import argparse
import torch
import numpy as np
from tqdm import tqdm
import evaluate
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
    
# Target device for the model: the first CUDA GPU when torch can see one,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

class Experiment:
    """Evaluate a sequence-classification model on a derivation dataset.

    Construction loads the tokenizer, builds and tokenizes the dataset,
    loads the classifier onto ``device`` and prepares the
    ``TrainingArguments``.  ``train_and_eval`` then runs an evaluation pass
    over the "test" split.
    """

    def __init__(self, learning_rate, model, epochs, batch_size, neg, dataset_path,
                 tokenizer_name="roberta-base"):
        """Set up tokenizer, data, model, metric and training arguments.

        Args:
            learning_rate: Learning rate forwarded to ``TrainingArguments``.
            model: Name or path passed to
                ``AutoModelForSequenceClassification.from_pretrained``.
            epochs: Value for ``num_train_epochs``.
            batch_size: Value for ``per_device_train_batch_size``.
            neg: Maximum number of negative examples kept per derivation.
            dataset_path: Path of the JSON dataset file.
            tokenizer_name: Name or path passed to
                ``AutoTokenizer.from_pretrained``.  Defaults to
                ``"roberta-base"``, the value that used to be hard-coded.
        """
        self.model_name = model
        self.dataset_path = dataset_path
        self.learning_rate = learning_rate
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.dataset = self.process_dataset(dataset_path, neg)
        self.tokenized_datasets = self.dataset.map(self.tokenize_function, batched=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2).to(device)
        self.metric = evaluate.load("glue", "mrpc")
        self.training_args = TrainingArguments(
            # os.path.join keeps the path valid on non-Windows systems too.
            output_dir=os.path.join(os.getcwd(), "output"),
            logging_steps=500,
            evaluation_strategy="steps",
            eval_steps=500,
            num_train_epochs=epochs,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
        )

    @staticmethod
    def _format_example(example, neg):
        """Turn one raw derivation record into labelled text examples.

        Args:
            example: Mapping with the keys ``"derivation"``,
                ``"positive_idxs"`` and ``"negatives"``.
            neg: Maximum number of entries of ``example["negatives"]`` to use.

        Returns:
            A list of ``{"text": ..., "label": ...}`` dicts: one entry with
            label 1 for the positive candidate(s), followed by up to ``neg``
            entries with label 0.
        """
        derivation = example["derivation"]
        # positive_idxs are shifted down by one before indexing the derivation.
        candidate = [derivation[idx - 1][1] for idx in example["positive_idxs"]]
        # str() of the nested list produces "[[...]]" and doubles every
        # backslash (repr of the inner strings); the replaces undo both.
        context = (
            (str(derivation[:-1]) + ' [SEP] ' + str(derivation[-1][0]))
            .replace('[[', '[')
            .replace(']]', ']')
            .replace('\\\\', '\\')
        )
        formatted = [{"text": context + " [SEP] " + " ".join(candidate), "label": 1}]
        for count, negative in enumerate(example["negatives"]):
            # Stop once `neg` negatives were emitted.
            if count == neg:
                break
            formatted.append({"text": context + " [SEP] " + negative, "label": 0})
        return formatted

    def process_dataset(self, dataset_path, neg):
        """Load the JSON dataset and return a train/test split.

        Args:
            dataset_path: Path of a JSON file holding a list of derivation
                records (see ``_format_example`` for the keys that are read).
            neg: Maximum number of negatives kept per record.

        Returns:
            The result of ``Dataset.train_test_split(test_size=0.99)`` on the
            formatted examples.
        """
        # Context manager guarantees the file handle is closed after parsing.
        with open(dataset_path, 'r', encoding="utf-8") as d_file:
            d_json = json.load(d_file)
        formatted_examples = []
        for example in tqdm(d_json, desc="Loading Dataset"):
            formatted_examples.extend(self._format_example(example, neg))
        print("Data examples", formatted_examples[:4])
        # Random split; 99% of the examples land in "test", which is the
        # split evaluated by train_and_eval.
        dataset = Dataset.from_list(formatted_examples)
        dataset_split = dataset.train_test_split(test_size=0.99)
        return dataset_split

    def tokenize_function(self, examples):
        """Tokenize the ``"text"`` field with max-length padding and truncation."""
        return self.tokenizer(examples["text"], padding="max_length", truncation=True)

    def compute_metrics(self, eval_pred):
        """Score predictions and print an all-ones baseline for comparison.

        Args:
            eval_pred: Pair ``(logits, labels)``.

        Returns:
            The result of ``self.metric.compute`` for the argmax predictions.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # Baseline that predicts label 1 for every example.
        majority_class_preds = [1] * len(predictions)
        majority_baseline_score = self.metric.compute(predictions=majority_class_preds, references=labels)
        print("majority_class_baseline:", majority_baseline_score)
        score = self.metric.compute(predictions=predictions, references=labels)
        print(score)
        return score

    def train_and_eval(self):
        """Evaluate the model on the "test" split and return the metrics.

        Returns:
            Whatever ``Trainer.evaluate()`` returns (previously discarded).
        """
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["test"],
            compute_metrics=self.compute_metrics,
        )
        return trainer.evaluate()
        trainer.evaluate()

if __name__ == '__main__':
    # Model and data are looked up relative to the current working directory.
    # os.path.join is used instead of hard-coded "\\" separators so the
    # script also runs outside Windows.
    base_dir = os.getcwd()
    model = os.path.join(base_dir, "models", "NES_steps=2.json_roberta-base")
    data_path = os.path.join(base_dir, "data", "EVAL_NES_steps=2.json")

    # Alternative evaluation files in the same "data" directory:
    #   EVAL_NES_steps=3.json, NES_VAR_RE_steps=3.json,
    #   NES_EXPR_EXC_steps=3.json, NES_OP_SWAP_steps=3.json,
    #   EVAL_NES_steps=4.json, NES_VAR_RE_steps=4.json,
    #   NES_EXPR_EXC_steps=4.json, NES_OP_SWAP_steps=4.json

    # Seed numpy and torch for repeatable runs.
    torch.backends.cudnn.deterministic = True
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    # is_available must be called: the bare function object is always truthy,
    # so the original check never actually tested for CUDA.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    experiment = Experiment(
        learning_rate=5e-5,
        batch_size=8,
        neg=1,
        epochs=1,
        model=model,
        dataset_path=data_path,
    )
    experiment.train_and_eval()
                


Could not locate the tokenizer configuration file, will try to use the model config instead.
https://huggingface.co/roberta-base/resolve/main/config.json not found in cache or force_download set to True, downloading to C:\Users\JCMea\.cache\huggingface\transformers\tmp_9qyekva


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/config.json in cache at C:\Users\JCMea/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
creating metadata file for C:\Users\JCMea/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\JCMea/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/vocab.json in cache at C:\Users\JCMea/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
creating metadata file for C:\Users\JCMea/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
https://huggingface.co/roberta-base/resolve/main/merges.txt not found in cache or force_download set to True, downloading to C:\Users\JCMea\.cache\huggingface\transformers\tmppfzgl2p3


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/merges.txt in cache at C:\Users\JCMea/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for C:\Users\JCMea/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/roberta-base/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to C:\Users\JCMea\.cache\huggingface\transformers\tmp47kj8mo6


Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/tokenizer.json in cache at C:\Users\JCMea/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
creating metadata file for C:\Users\JCMea/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\JCMea/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\JCMea/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading

Data examples [{'text': "['premise', 'h{(v)} = \\log{(v)}'] [SEP] ['differentiate', 1, 'v'] [SEP] \\frac{d}{d v} h{(v)} = \\frac{d}{d v} \\log{(v)}", 'label': 1}, {'text': "['premise', 'h{(v)} = \\log{(v)}'] [SEP] ['differentiate', 1, 'v'] [SEP] \\int h{(v)}\\, dv = \\int \\log{(v)}\\, dv", 'label': 0}, {'text': "['premise', 'T{(l)} = \\cos{(l)}'] [SEP] ['divide', 1, '\\cos{(l)}'] [SEP] \\frac{T{(l)}}{\\cos{(l)}} = 1", 'label': 1}, {'text': "['premise', 'T{(l)} = \\cos{(l)}'] [SEP] ['divide', 1, '\\cos{(l)}'] [SEP] T^{2}{(l)} = T{(l)} \\cos{(l)}", 'label': 0}]


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

loading configuration file D:\PhD_on_D\Projects\Wikipedia_Derivations\Project 5\Classification\models\NES_steps=2.json_roberta-base\config.json
Model config RobertaConfig {
  "_name_or_path": "D:\\PhD_on_D\\Projects\\Wikipedia_Derivations\\Project 5\\Classification\\models\\NES_steps=2.json_roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 502