<a href="https://colab.research.google.com/github/kaimihata/geo-bert/blob/main/geo-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook based on the HuggingFace script for Glue evaluation
# https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py

# Google Colab Configuration
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets
from google.colab import drive
import warnings
drive.mount('/content/drive', force_remount=True)
warnings.filterwarnings('ignore')

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-1j4sxvwx
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-1j4sxvwx
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 8.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 60.6MB/s 
Building wheels for collected packages: trans

In [2]:
# Imports
import os
import logging
import random
import torch

from torch.nn import CrossEntropyLoss, MSELoss
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap as Iso

import numpy as np
from datasets import load_dataset, load_metric, load_from_disk, Dataset, DatasetDict

import transformers
from transformers.trainer_utils import is_main_process
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    BertModel
)

In [3]:
# Parameters
model_args = {
  "model_name": "bert-base-uncased", # tokenizer/embedding model
  "config_name": None, # config
  "cache_dir": "/content/drive/My Drive/Colab Notebooks/GLUEv2/cache" # preprocessed data stored in this cache
}

data_args = {
  "task_name": "sst2", # Task to eval on
  "max_seq_length": 128, # trunc all sentences to this length
  "pad_to_max_length": True, # pad shorter sentences to max length
  "train_file": "/content/drive/My Drive/Colab Notebooks/amazon_review_polarity/amazon_review_polarity_csv/test-sm.csv", # Download if None
  "validation_file": None # Download if None
}

training_args = {
  "output_dir": "/content/drive/My Drive/Colab Notebooks/GLUEv2/model_output", # The output directory where the model predictions and checkpoints will be written
  "do_train": True, # Run training
  "do_eval": True, # Run eval
  "do_predict": False, # Run predictions
  "evaluation_strategy": "epoch", # no, steps or epoch (eval location)
  "learning_rate": 1e-4, # training initial learning rate
  "local_rank": -1,
  "seed": 42,
  "num_train_epochs": 25
}

dim_params = {
  "l_pca_dim": 256,
  "s_pca_dim": 64,
  "iso_dim": 32,
  "stack_pca_dim": 96,
  "iso_neighbors": 96,
  "hidden_dim": 64
}

device = "cuda:0"

trainer_args = TrainingArguments(**training_args)

task_to_keys = {
  "amazon": ("sentence1", "sentence2"),
  "cola": ("sentence", None),
  "mnli": ("premise", "hypothesis"),
  "mrpc": ("sentence1", "sentence2"),
  "qnli": ("question", "sentence"),
  "qqp": ("question1", "question2"),
  "rte": ("sentence1", "sentence2"),
  "sst2": ("sentence", None),
  "stsb": ("sentence1", "sentence2"),
  "wnli": ("sentence1", "sentence2"),
}

num_datapoints = 8000
load_bert = True
load_embeddings = True

In [4]:
# Model and output setup
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if is_main_process(training_args["local_rank"]) else logging.WARN,
)

# Log on each process the small summary:
logger = logging.getLogger(__name__)
logger.info(f"Training/evaluation parameters {training_args}")
set_seed(training_args["seed"])

num_labels = 2

# Prepare Tokenizer 
config = AutoConfig.from_pretrained(
  model_args["model_name"],
  num_labels=num_labels,
  finetuning_task=data_args["task_name"]
)
tokenizer = AutoTokenizer.from_pretrained(
  model_args["model_name"],
  use_fast=True,
)

# Download Datasets
if data_args["task_name"] is not None:
    # Downloading and loading a dataset from the hub.
    datasets = load_dataset("glue", data_args["task_name"])
elif data_args["train_file"].endswith(".csv"):
    # Loading a dataset from local csv files
    datasets = load_dataset(
        "csv", data_files={"train": data_args["train_file"], "validation": data_args["validation_file"]}
    )
else:
    # Loading a dataset from local json files
    datasets = load_dataset(
        "json", data_files={"train": data_args["train_file"], "validation": data_args["validation_file"]}
    )

# Prepare Labels
if data_args["task_name"] is not None:
    is_regression = data_args["task_name"] == "stsb"
    if not is_regression:
        label_list = datasets["train"]["label"]
        num_labels = len(label_list)
    else:
        num_labels = 1
else:
    # Trying to have good defaults here, don't hesitate to tweak to your needs.
    is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
    if is_regression:
        num_labels = 1
    else:
        # A useful fast method:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
        label_list = datasets["train"].unique("label")
        label_list.sort()  # Let's sort it for determinism
        num_labels = len(label_list)

# Initial Preprocessing of Datasets
# Preprocessing the datasets
if data_args["task_name"] is not None:
    sentence1_key, sentence2_key = task_to_keys[data_args["task_name"]]
else:
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

# Padding strategy
if data_args["pad_to_max_length"]:
    padding = "max_length"
    max_length = data_args["max_seq_length"]
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False
    max_length = None

# Some models have set the order of the labels to use, so let's make sure we do use it.
label_to_id = {v: i for i, v in enumerate(label_list)}


04/21/2021 06:00:35 - INFO - __main__ -   Training/evaluation parameters {'output_dir': '/content/drive/My Drive/Colab Notebooks/GLUEv2/model_output', 'do_train': True, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'learning_rate': 0.0001, 'local_rank': -1, 'seed': 42, 'num_train_epochs': 25}
04/21/2021 06:00:36 - INFO - filelock -   Lock 140078228421072 acquired on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…

04/21/2021 06:00:36 - INFO - filelock -   Lock 140078228421072 released on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock





04/21/2021 06:00:37 - INFO - filelock -   Lock 140078127346000 acquired on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

04/21/2021 06:00:37 - INFO - filelock -   Lock 140078127346000 released on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock





04/21/2021 06:00:37 - INFO - filelock -   Lock 140078127198160 acquired on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…

04/21/2021 06:00:38 - INFO - filelock -   Lock 140078127198160 released on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock





04/21/2021 06:00:38 - INFO - filelock -   Lock 140078127198160 acquired on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…

04/21/2021 06:00:39 - INFO - filelock -   Lock 140078127198160 released on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7777.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4473.0, style=ProgressStyle(description…


Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7439277.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [5]:
# Download embedding model and embed training data
if load_bert:
  embedding_model = transformers.BertModel.from_pretrained(model_args["model_name"])
  embedding_model.to(device)

  model_with_weights = AutoModelForSequenceClassification.from_pretrained(
    model_args["model_name"],
    model_type=None,
  )
  embedding_model = model_with_weights.bert.cuda()

  def preprocess_function(examples):
    # Tokenize the texts
    args = (
        (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
    )
    result = tokenizer(*args, padding=padding, max_length=max_length, truncation=True)
    embedded_output = embedding_model(
      torch.tensor(
        result["input_ids"],
        device=torch.device(device)
      ),
      attention_mask=torch.tensor(
        result["attention_mask"],
        device=torch.device(device)
      ))
    return { "embedding": embedded_output[0][:,0,:].cpu().detach().numpy() }

  datasets = datasets.map(preprocess_function, batched=True, batch_size=128)

  datasets.save_to_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"])

else:
  # Load Datasets from cache
  datasets = load_from_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"])

04/21/2021 06:00:43 - INFO - filelock -   Lock 140074989152976 acquired on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…

04/21/2021 06:00:50 - INFO - filelock -   Lock 140074989152976 released on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=527.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [6]:
# Limit size of train set if needed
if (num_datapoints > 0):
  datasets["train"] = datasets["train"].filter(lambda example, index: index < num_datapoints, with_indices=True)
  print(datasets)

HBox(children=(FloatProgress(value=0.0, max=68.0), HTML(value='')))


DatasetDict({
    train: Dataset({
        features: ['embedding', 'idx', 'label', 'sentence'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['embedding', 'idx', 'label', 'sentence'],
        num_rows: 872
    })
    test: Dataset({
        features: ['embedding', 'idx', 'label', 'sentence'],
        num_rows: 1821
    })
})


In [7]:
if load_embeddings:
  # Dimensionality Reduction
  # Names of validation and test sets in case of MNLI dataset
  validation_set_name = "validation_matched" if data_args["task_name"] == "mnli" else "validation"
  test_set_name = "test_matched" if data_args["task_name"] == "mnli" else "test"

  # Number of rows for each set
  train_rows = len(datasets["train"])
  eval_rows = len(datasets[validation_set_name])
  test_rows = len(datasets[test_set_name])

  # Embedded Sets
  train_emb = np.asarray(datasets["train"]["embedding"])
  valid_emb = np.asarray(datasets[validation_set_name]["embedding"])
  test_emb = np.asarray(datasets[test_set_name]["embedding"])
  features = np.vstack((train_emb, test_emb, valid_emb))
  transformed_features = []

  # Reduce dimensions
  # Large PCA
  pca = PCA(n_components=dim_params["l_pca_dim"], svd_solver='full')
  transformed_features.append(pca.fit_transform(features))

  # IsoMAP
  iso = Iso(n_neighbors=dim_params["iso_neighbors"], n_components=dim_params["iso_dim"], n_jobs=-1)
  transformed_features.append(iso.fit_transform(features))

  # Small PCA
  pca = PCA(n_components=dim_params["s_pca_dim"], svd_solver='full')
  transformed_features.append(pca.fit_transform(features))

  # Second PCA
  pca = PCA(n_components=dim_params["stack_pca_dim"], svd_solver='full')
  transformed_features.append(pca.fit_transform(features))

  # Split dimensionality reduced datsets into train, test, eval
  # Thresholds
  c1 = train_rows
  c2 = train_rows + test_rows

  # Prepare Large PCA
  large_pca_embedded_train = Dataset.from_dict({
    "embedding": transformed_features[0][:c1],
    "label": datasets["train"]["label"]
  })
  large_pca_embedded_test = Dataset.from_dict({
    "embedding": transformed_features[0][c1:c2],
    "label": datasets["test"]["label"]
  })
  large_pca_embedded_eval = Dataset.from_dict({
    "embedding": transformed_features[0][c2:],
    "label": datasets["validation"]["label"]
  })
  large_pca_embedded = DatasetDict({
    "train": large_pca_embedded_train,
    "test": large_pca_embedded_test,
    "validation": large_pca_embedded_eval
  })

  # Prepare Small PCA
  small_pca_embedded_train = Dataset.from_dict({
    "embedding": transformed_features[2][:c1],
    "label": datasets["train"]["label"]
  })
  small_pca_embedded_test = Dataset.from_dict({
    "embedding": transformed_features[2][c1:c2],
    "label": datasets["test"]["label"]
  })
  small_pca_embedded_eval = Dataset.from_dict({
    "embedding": transformed_features[2][c2:],
    "label": datasets["validation"]["label"]
  })
  small_pca_embedded = DatasetDict({
    "train": small_pca_embedded_train,
    "test": small_pca_embedded_test,
    "validation": small_pca_embedded_eval
  })

  # Prepare Isomap
  isomap_embedded_train = Dataset.from_dict({
    "embedding": transformed_features[1][:c1],
    "label": datasets["train"]["label"]
  })
  isomap_embedded_test = Dataset.from_dict({
    "embedding": transformed_features[1][c1:c2],
    "label": datasets["test"]["label"]
  })
  isomap_embedded_eval = Dataset.from_dict({
    "embedding": transformed_features[1][c2:],
    "label": datasets["validation"]["label"]
  })
  isomap_embedded = DatasetDict({
    "train": isomap_embedded_train,
    "test": isomap_embedded_test,
    "validation": isomap_embedded_eval
  })

  # Prepare stack
  stack_embedded_train = Dataset.from_dict({
    "embedding": np.concatenate((transformed_features[3][:c1], transformed_features[1][:c1]), axis=1),
    "label": datasets["train"]["label"]
  })
  stack_embedded_test = Dataset.from_dict({
    "embedding": np.concatenate((transformed_features[3][c1:c2], transformed_features[1][c1:c2]), axis=1),
    "label": datasets["test"]["label"]
  })
  stack_embedded_eval = Dataset.from_dict({
    "embedding": np.concatenate((transformed_features[3][c2:], transformed_features[1][c2:]), axis=1),
    "label": datasets["validation"]["label"]
  })
  stack_embedded = DatasetDict({
    "train": stack_embedded_train,
    "test": stack_embedded_test,
    "validation": stack_embedded_eval
  })

  # Save for future use
  large_pca_embedded.save_to_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/large_pca/" + str(dim_params["l_pca_dim"]))
  small_pca_embedded.save_to_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/small_pca/" + str(dim_params["s_pca_dim"]))
  isomap_embedded.save_to_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/isomap/" + str(dim_params["iso_dim"]))
  stack_embedded.save_to_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/stack/" + str(dim_params["stack_pca_dim"]))
else:
  # Load from preloaded
  large_pca_embedded = load_from_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/large_pca/" + str(dim_params["l_pca_dim"]))
  small_pca_embedded = load_from_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/small_pca/" + str(dim_params["s_pca_dim"]))
  isomap_embedded = load_from_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/isomap/" + str(dim_params["iso_dim"]))
  stack_embedded = load_from_disk(model_args["cache_dir"] + "/datasets/" + data_args["task_name"] + "/stack/" + str(dim_params["stack_pca_dim"]))

In [8]:
# SELECT DATASET
datasets = large_pca_embedded

In [9]:
# Train/Eval Split
train_dataset = datasets["train"]
eval_dataset = datasets["validation_matched" if data_args["task_name"] == "mnli" else "validation"]

In [10]:
# Modified Logisitic Regression
class SimpleRegression(torch.nn.Module):
  # Constructor
  def __init__(self, input_dim, output_dim):
    super(SimpleRegression, self).__init__()
    self.num_labels = num_labels
    self.linear = torch.nn.Linear(input_dim, dim_params["hidden_dim"])
    self.relu = torch.nn.ReLU()
    self.linear2 = torch.nn.Linear(dim_params["hidden_dim"], output_dim)

  # forward pass
  def forward(self, embedding, labels=None):
    outputs = self.linear2(self.relu(self.linear(torch.tensor(embedding, device=device))))
    if labels is not None:
      labels = torch.tensor(labels, device=device)

    loss = torch.tensor([])
    if labels is not None:
        if self.num_labels == 1:
            #  We are doing regression
            loss_fct = MSELoss()
            loss = loss_fct(outputs.view(-1), labels.view(-1))
        else:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1))
    else:
      outputs=torch.argmax(outputs, dim=1)

    return (loss, outputs)

In [11]:
# Define/load evaluation and trainer
if data_args["task_name"] is not None:
  test_dataset = datasets["test_matched" if data_args["task_name"] == "mnli" else "test"]
  
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
  logger.info(f"Sample {index} of the training set: {train_dataset[index].keys()}.")

# Get the metric function
if data_args["task_name"] is not None:
  metric = load_metric("glue", data_args["task_name"])

def compute_metrics(p: EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
  if data_args["task_name"] is not None:
    result = metric.compute(predictions=preds, references=p.label_ids)
    if len(result) > 1:
      result["combined_score"] = np.mean(list(result.values())).item()
    return result
  elif is_regression:
    return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
  else:
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

model = SimpleRegression(len(train_dataset["embedding"][0]), num_labels)

# Initialize our Trainer
trainer = Trainer(
  model=model,
  args=trainer_args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset if training_args["do_eval"] else None,
  compute_metrics=compute_metrics,
  tokenizer=tokenizer,
  # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
  data_collator=default_data_collator if data_args["pad_to_max_length"] else None,
)

04/21/2021 06:12:40 - INFO - __main__ -   Sample 5238 of the training set: dict_keys(['embedding', 'label']).
04/21/2021 06:12:40 - INFO - __main__ -   Sample 912 of the training set: dict_keys(['embedding', 'label']).
04/21/2021 06:12:40 - INFO - __main__ -   Sample 204 of the training set: dict_keys(['embedding', 'label']).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1848.0, style=ProgressStyle(description…




In [12]:
# Training
if training_args["do_train"]:
    train_result = trainer.train(
        model_path=model_args["model_name"] if os.path.isdir(model_args["model_name"]) else None
    )
    metrics = train_result.metrics

    trainer.save_model()  # Saves the tokenizer too for easy upload

    output_train_file = os.path.join(training_args["output_dir"], "train_results.txt")
    if trainer.is_world_process_zero():
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

        # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
        trainer.state.save_to_json(os.path.join(training_args["output_dir"], "trainer_state.json"))

# Evaluation
eval_results = {}
if training_args["do_eval"]:
    logger.info("*** Evaluate ***")

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    tasks = [data_args["task_name"]]
    eval_datasets = [eval_dataset]
    if data_args["task_name"] == "mnli":
        tasks.append("mnli-mm")
        eval_datasets.append(datasets["validation_mismatched"])

    for eval_dataset, task in zip(eval_datasets, tasks):
        eval_result = trainer.evaluate(eval_dataset=eval_dataset)

        output_eval_file = os.path.join(training_args["output_dir"], f"eval_results_{task}.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info(f"***** Eval results {task} *****")
                for key, value in sorted(eval_result.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        eval_results.update(eval_result)

if training_args["do_predict"]:
    logger.info("*** Test ***")

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    tasks = [data_args["task_name"]]
    test_datasets = [test_dataset]
    if data_args["task_name"] == "mnli":
        tasks.append("mnli-mm")
        test_datasets.append(datasets["test_mismatched"])

    for test_dataset, task in zip(test_datasets, tasks):
        # Removing the `label` columns because it contains -1 and Trainer won't like that.
        test_dataset.remove_columns_("label")
        predictions = trainer.predict(test_dataset=test_dataset).predictions
        predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

        output_test_file = os.path.join(training_args["output_dir"], f"test_results_{task}.txt")
        if trainer.is_world_process_zero():
            with open(output_test_file, "w") as writer:
                logger.info(f"***** Test results {task} *****")
                writer.write("index\tprediction\n")
                for index, item in enumerate(predictions):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        item = label_list[item]
                        writer.write(f"{index}\t{item}\n")

print("Evaluation Results:" + str(eval_results))

Epoch,Training Loss,Validation Loss,Accuracy
1,9.3879,9.728293,0.770642
2,4.5065,7.847508,0.78555
3,1.4269,3.750697,0.818807
4,0.5151,0.731075,0.848624
5,0.374,0.425203,0.845183
6,0.3491,0.378043,0.84289
7,0.3332,0.360447,0.848624
8,0.332,0.352176,0.854358
9,0.3208,0.34643,0.855505
10,0.3227,0.34595,0.847477


04/21/2021 06:12:58 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
04/21/2021 06:13:14 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
04/21/2021 06:13:30 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
04/21/2021 06:13:45 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
04/21/2021 06:14:00 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
04/21/2021 06:14:16 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/ss

04/21/2021 06:19:15 - INFO - /usr/local/lib/python3.7/dist-packages/datasets/metric.py -   Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
04/21/2021 06:19:16 - INFO - __main__ -   ***** Eval results sst2 *****
04/21/2021 06:19:16 - INFO - __main__ -     epoch = 25.0
04/21/2021 06:19:16 - INFO - __main__ -     eval_accuracy = 0.8451834862385321
04/21/2021 06:19:16 - INFO - __main__ -     eval_loss = 0.3460017144680023
04/21/2021 06:19:16 - INFO - __main__ -     eval_mem_cpu_alloc_delta = 0
04/21/2021 06:19:16 - INFO - __main__ -     eval_mem_cpu_peaked_delta = 0
04/21/2021 06:19:16 - INFO - __main__ -     eval_mem_gpu_alloc_delta = 0
04/21/2021 06:19:16 - INFO - __main__ -     eval_mem_gpu_peaked_delta = 470077440
04/21/2021 06:19:16 - INFO - __main__ -     eval_runtime = 0.6429
04/21/2021 06:19:16 - INFO - __main__ -     eval_samples_per_second = 1356.355


Evaluation Results:{'eval_loss': 0.3460017144680023, 'eval_accuracy': 0.8451834862385321, 'eval_runtime': 0.6429, 'eval_samples_per_second': 1356.355, 'epoch': 25.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 470077440}
