In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import sys

# Make the local modules imported in the cells below resolvable.
# Guarded so that re-running this cell does not append duplicate entries.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

In [3]:
import os

# Process-wide settings, applied before the training libraries are imported
# in the cells below.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    # GPU selection is machine-specific: adjust the index for other hosts.
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
    # Disable weights and biases (if installed)
    "WANDB_DISABLED": "true",
})

In [6]:
from pathlib import Path

from dataset import load_data, get_dataloader
from generative.run_experiment import get_training_args, get_trainer, get_tokenizer

# Training

In [7]:
import hydra
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra

# Clear any global Hydra state left by a previous execution so this cell
# can be re-run without restarting the kernel.
GlobalHydra.instance().clear()
# config_path is passed as a plain relative string, the type documented for
# hydra.initialize(); a pathlib.Path is not part of that contract.
initialize(config_path='..', job_name='foo', version_base='1.1')
config = compose(config_name='experiment.yaml')

# Notebook-specific overrides of the composed experiment config.
config.model_name = "google/mt5-base"
config.metrics = ['exact_match', 'google_bleu']
config.learning_rate = 5e-5

In [8]:
# Build the training arguments from the experiment config.
# NOTE(review): report_to="none" is presumably forwarded to disable reporting
# integrations — confirm in generative.run_experiment.
training_args = get_training_args(config, report_to="none")
# The tokenizer is built from the same config object.
tokenizer = get_tokenizer(config)



In [9]:
# Both input files sit in the same directory, relative to this notebook.
base_path = Path('../../ggponc_annotation/notebooks/')
ccnfs_path = base_path / 'ggponc_ccnfs.tsv'
controls_path = base_path / 'ggponc_cnfs_controls_small.tsv'

# load_data returns the train / validation / test splits in that order.
train_df, val_df, test_df = load_data(ccnfs_path, controls_path)

In [10]:
# Ellipses: rows whose `controls` flag is False, counted for train / val / test
tuple((~split.controls).sum() for split in (train_df, val_df, test_df))

(2241, 462, 462)

In [11]:
# Controls: rows whose `controls` flag is True, counted for train / val / test
tuple(split.controls.sum() for split in (train_df, val_df, test_df))

(2269, 447, 449)

In [12]:
# Convert the three DataFrame splits into the dataset objects handed to the
# trainer; the tokenizer is passed through to the helper.
train_dataset, val_dataset, test_dataset = get_dataloader(train_df, val_df, test_df, tokenizer)

In [13]:
# Override the number of training epochs for this notebook run.
training_args.num_train_epochs = 2

In [14]:
# Assemble the trainer from the config, tokenizer, training arguments and the
# train / validation datasets.
trainer = get_trainer(config, tokenizer, training_args, train_dataset, val_dataset)

In [15]:
# Run training; the returned value is shown as the cell output.
trainer.train()

***** Running training *****
  Num examples = 4510
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1128
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Exact Match,Google Bleu
1,0.6656,0.252594,0.191419,0.694391
2,0.2789,0.11662,0.589659,0.741167


***** Running Evaluation *****
  Num examples = 909
  Batch size = 8
***** Running Evaluation *****
  Num examples = 909
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1128, training_loss=2.5238845356812716, metrics={'train_runtime': 662.3681, 'train_samples_per_second': 13.618, 'train_steps_per_second': 1.703, 'total_flos': 1888722478596096.0, 'train_loss': 2.5238845356812716, 'epoch': 2.0})

In [None]:
# Evaluate the trained model on the validation dataset.
eval_metrics = trainer.evaluate(val_dataset)
print(eval_metrics)

# Evaluate on the test dataset, passing 'test' as the metric key prefix.
test_metrics = trainer.evaluate(test_dataset, metric_key_prefix='test')
print(test_metrics)

# Error Analysis

In [16]:
from transformers import Text2TextGenerationPipeline
from evaluation import error_analysis, Metrics

In [17]:
# Text-to-text generation pipeline around the model held by the trainer.
pipeline = Text2TextGenerationPipeline(
    model=trainer.model,
    tokenizer=tokenizer,
    max_length=config.generation_max_length,
    # Device index 0; CUDA_VISIBLE_DEVICES is set at the top of the notebook.
    device=0,
)

In [18]:
from evaluation import error_analysis, get_scores

In [19]:
import pandas as pd

def calculate_errors(out, sample):
    """Run the error analysis for a batch of generations and display a summary.

    Args:
        out: Iterable of mappings, each providing a 'generated_text' entry.
        sample: Object exposing ``full_resolution`` and ``raw_sentence``
            attributes (in this notebook, one of the DataFrame splits).

    Returns:
        Whatever ``error_analysis`` returns. It must support ``len()`` and
        expose an ``error_type`` attribute with ``value_counts()``.

    Side effects:
        Displays a two-column table with the absolute count and the fraction
        of each error type.
    """
    gen_text = [o['generated_text'] for o in out]
    errors = error_analysis(gen_text, sample.full_resolution, sample.raw_sentence)
    # Count once and reuse. The fraction is relative to all rows, including
    # any whose error_type is missing.
    counts = errors.error_type.value_counts()
    # Explicit keys give the two columns distinct, self-explanatory labels.
    summary = pd.concat([counts, counts / len(errors)], axis=1, keys=['count', 'fraction'])
    display(summary)
    return errors

In [20]:
%%time
out_valid = pipeline(list(val_df.raw_sentence))

CPU times: user 7min 56s, sys: 161 ms, total: 7min 56s
Wall time: 7min 56s


In [None]:
# Error breakdown for the validation generations; also displays the summary table.
errors_valid = calculate_errors(out_valid, val_df)

In [None]:
%%time
# Scores computed from the validation error table.
# NOTE(review): the second argument presumably labels the split in the
# result — confirm in evaluation.get_scores.
scores = get_scores(errors_valid, "eval")
scores

In [None]:
%%time
out_test = pipeline(list(test_df.raw_sentence))

In [None]:
# Error breakdown for the test generations. out_test was produced from
# test_df.raw_sentence, so the references must come from the same test_df split.
errors_test = calculate_errors(out_test, test_df)

In [None]:
%%time
# Scores computed from the test error table. This rebinds `scores`, replacing
# the validation result assigned earlier.
scores = get_scores(errors_test, "test")
scores