In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import sys

# Make the modules under ../scripts (relative to the notebook's working
# directory) importable. The guard keeps the cell idempotent: re-running it
# does not keep appending duplicate entries to sys.path.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

In [3]:
import os
# Process-wide switches, set here before the project and ML imports in the
# following cells.
os.environ.update({
    # Turn off parallelism in the Hugging Face tokenizers library.
    "TOKENIZERS_PARALLELISM": "false",
    # Number CUDA devices by PCI bus ID and expose only device 2 to this kernel.
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
    # Disable weights and biases (if installed)
    "WANDB_DISABLED": "true",
})

In [4]:
from pathlib import Path

from dataset import load_data, get_dataloader
from generative.transformers_util import get_training_args, get_trainer, get_tokenizer

# Training

In [5]:
import hydra
from hydra import compose, initialize

# Drop any Hydra state left over from an earlier run of this cell so that
# initialize() can be called again in the same kernel.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# config_path is passed as a plain string, which is the type initialize()
# documents; it points at the directory one level above the notebook.
initialize(config_path='..', job_name='foo', version_base='1.1')
# Compose the experiment configuration used by every later cell.
config = compose(config_name='experiment.yaml')

In [6]:
# Build the training arguments and the tokenizer from the composed config.
# report_to="none" is forwarded to get_training_args (presumably to switch off
# experiment-tracker reporting — confirm in generative.transformers_util).
training_args = get_training_args(config, report_to="none")
tokenizer = get_tokenizer(config)



In [7]:
# The TSV paths in config.data are resolved against the directory one level
# above the notebook. load_data returns three frames: train, validation, test.
base_path = Path('..')
train_df, val_df, test_df = load_data(base_path / config.data.cnf_tsv_path, base_path / config.data.controls_tsv_path)

In [8]:
# Ellipses: number of rows per split (train, val, test) that are NOT controls.
# `~` negates the `controls` flag, so the sum counts the non-control rows
# (assumes `controls` is a boolean column — confirm in load_data).
tuple((~split_df.controls).sum() for split_df in (train_df, val_df, test_df))

(2317, 463, 488)

In [9]:
# Controls: number of rows per split (train, val, test) flagged as controls
# (assumes `controls` is a boolean column — confirm in load_data).
tuple(split_df.controls.sum() for split_df in (train_df, val_df, test_df))

(2239, 500, 529)

In [10]:
# Turn the three frames into model inputs using the tokenizer. Despite the
# helper's name, the results are used as datasets by the later cells (they are
# passed to get_trainer and trainer.evaluate).
train_dataset, val_dataset, test_dataset = get_dataloader(train_df, val_df, test_df, tokenizer)

In [11]:
# Override the epoch count on the training arguments built from the config.
# NOTE(review): this mutates training_args in place; consider moving the value
# into the experiment config instead of hard-coding it here.
training_args.num_train_epochs = 10

In [12]:
# Assemble the trainer from the config, tokenizer, training arguments and the
# train/validation datasets prepared above.
trainer = get_trainer(config, tokenizer, training_args, train_dataset, val_dataset)

In [None]:
# Run training with the arguments configured above.
trainer.train()

In [None]:
# Metrics on the validation split.
eval_metrics = trainer.evaluate(val_dataset)
print(eval_metrics)

# Metrics on the held-out test split; metric_key_prefix='test' is passed so the
# reported keys can be told apart from the validation ones (presumably it
# prefixes each metric name — confirm against the trainer in use).
test_metrics = trainer.evaluate(test_dataset, metric_key_prefix='test')
print(test_metrics)

# Error Analysis

In [None]:
from transformers import Text2TextGenerationPipeline
from evaluation import error_analysis, Metrics

In [None]:
# Wrap the trained model in a text-to-text generation pipeline for raw-string
# inference; the output length limit comes from config.generation_max_length.
# NOTE(review): device=0 is an index into the *visible* GPUs. This notebook
# sets CUDA_VISIBLE_DEVICES to "2", so index 0 should be that card — confirm
# if the environment cell is changed.
pipeline = Text2TextGenerationPipeline(model=trainer.model, tokenizer=tokenizer, max_length=config.generation_max_length, device=0)

In [None]:
from evaluation import error_analysis, get_scores, encode_decode
from notebook_util import show_errors

In [None]:
import pandas as pd

def calculate_errors(out, sample):
    """Run the error analysis for one data split and attach sentence provenance.

    Args:
        out: Pipeline output for the split; an iterable of mappings that each
            carry a 'generated_text' entry.
        sample: DataFrame for the same split. Its `full_resolution`,
            `raw_sentence`, `file` and `sentence_id` columns are read.

    Returns:
        The result of `error_analysis` concatenated column-wise with the
        `file` and `sentence_id` columns of `sample`. Because `reset_index()`
        is called without `drop=True`, the previous index of `sample` is kept
        as an additional column.
    """
    predictions = [item['generated_text'] for item in out]
    # Gold resolutions and input sentences are passed through encode_decode
    # before the comparison (presumably so they are normalised the same way as
    # the generated text — confirm in evaluation.encode_decode).
    references = encode_decode(sample.full_resolution)
    inputs = encode_decode(sample.raw_sentence)
    analysis = error_analysis(predictions, references, inputs)
    # Reset the index so the provenance rows line up positionally with the
    # analysis rows (assumes error_analysis returns a 0..n-1 index — confirm).
    provenance = sample[['file', 'sentence_id']].reset_index()
    return pd.concat([analysis, provenance], axis=1)

In [None]:
%%time
# Generate a prediction for every validation sentence. The result is consumed
# by calculate_errors, which reads the 'generated_text' entry of each item.
out_valid = pipeline(list(val_df.raw_sentence))

In [None]:
# Per-sentence error analysis of the validation predictions.
errors_valid = calculate_errors(out_valid, val_df)

In [None]:
%%time
# Aggregate the validation error table into scores. The second argument is
# presumably a prefix/label for the reported metric names — confirm in
# evaluation.get_scores.
scores = get_scores(errors_valid, "eval")
scores

In [None]:
# How often each error category occurs on the validation split.
errors_valid.error_type.value_counts()

In [None]:
# Inspect a random subset of the validation predictions that are not true
# positives ("tp"). The sample size is capped at the number of available rows
# because DataFrame.sample raises ValueError when asked for more rows than
# exist (sampling without replacement). random_state pins the draw so that
# re-running the notebook shows the same examples.
non_tp_valid = errors_valid[errors_valid.error_type != "tp"]
s = non_tp_valid.sample(n=min(20, len(non_tp_valid)), random_state=0)
show_errors(s)

In [None]:
%%time
# Generate a prediction for every test sentence. The result is consumed by
# calculate_errors, which reads the 'generated_text' entry of each item.
out_test = pipeline(list(test_df.raw_sentence))

In [None]:
# Per-sentence error analysis of the test predictions, followed by how often
# each error category occurs.
errors_test = calculate_errors(out_test, test_df)
errors_test.error_type.value_counts()

In [None]:
%%time
# Aggregate the test error table into scores.
# NOTE(review): this rebinds `scores`, replacing the validation scores computed
# in an earlier cell.
scores = get_scores(errors_test, "test")
scores

## Clean up

In [None]:
%%time
# Run the trained model over the training sentences as well, so that errors
# from all three splits can be pooled in the next cells.
out_train = pipeline(list(train_df.raw_sentence))

# Per-sentence error analysis of the training predictions.
errors_train = calculate_errors(out_train, train_df)

# How often each error category occurs on the training split.
errors_train.error_type.value_counts()

In [None]:
# Pool the error tables of the train, validation and test splits.
# NOTE(review): ignore_index is left at its default, so the row labels of the
# individual frames are kept and may repeat across splits; the split of origin
# is not recorded in the combined frame.
errors_all = pd.concat([errors_train, errors_valid, errors_test])

In [None]:
# How often each error category occurs across all three splits combined.
errors_all.error_type.value_counts()

In [None]:
# Display every pooled prediction that is not a true positive ("tp"), ordered
# by error category, then source file, then sentence id.
show_errors(
    errors_all
    .loc[errors_all.error_type != "tp"]
    .sort_values(['error_type', 'file', 'sentence_id'])
)