In [1]:
import sys
# Extend the import search path with ../scripts (relative to the notebook's working directory).
# Presumably this is where the `generative` and `evaluation` modules imported below live — verify.
sys.path.append('../scripts')

In [2]:
import os
# Set the tokenizers parallelism flag to "false" before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Enumerate GPUs in PCI bus order and expose only physical device 2 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Disable weights and biases (if installed)
# NOTE: the recorded output of the training-args cell reports WANDB_DISABLED as deprecated
# in favour of the `report_to` setting.
#os.environ["WANDB_PROJECT"] = "ggponc_ellipses"
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from pathlib import Path

from generative.dataset import EllipsesDataset
from generative.run_experiment import get_training_args, get_trainer, get_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Training

In [4]:
import hydra
from hydra import compose, initialize

# Reset the global Hydra instance first; presumably so this cell can be re-run in the
# same kernel without Hydra complaining that it is already initialized — verify.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Config files are looked up in the parent directory; `experiment.yaml` is the base config.
initialize(config_path=Path('..'), job_name='foo', version_base='1.1')
config = compose(config_name='experiment.yaml')
# Notebook-specific overrides applied on top of the composed config.
config.model_name = "google/mt5-base"
config.metrics = ['exact_match', 'google_bleu']
config.learning_rate = 5e-5

In [5]:
# Build the training arguments and the tokenizer from the composed config via the
# project helpers imported from generative.run_experiment.
training_args = get_training_args(config, report_to=None)
tokenizer = get_tokenizer(config)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [6]:
import pandas as pd
# Tab-separated annotation tables read from a sibling `ggponc_annotation` checkout
# (paths are relative to the notebook's working directory).
# Cells below read the columns `split`, `raw_sentence` and `full_resolution` from df_ellipses,
# and `split` from df_controls.
df_ellipses = pd.read_csv('../../ggponc_annotation/notebooks/ggponc_ccnfs.tsv', sep='\t')
df_controls = pd.read_csv('../../ggponc_annotation/notebooks/ggponc_cnfs_controls_small.tsv', sep='\t')

In [7]:
# Partition the ellipsis sentences by their predefined split label.
train_cnfs, valid_cnfs, test_cnfs = (
    df_ellipses[df_ellipses['split'] == label] for label in ('train', 'dev', 'test')
)

# Same partitioning for the control sentences.
train_controls, valid_controls, test_controls = (
    df_controls[df_controls['split'] == label] for label in ('train', 'dev', 'test')
)

# Row counts of all six subsets: ellipses (train, dev, test), then controls (train, dev, test).
tuple(
    len(subset)
    for subset in (
        train_cnfs, valid_cnfs, test_cnfs,
        train_controls, valid_controls, test_controls,
    )
)

(2241, 462, 462, 2269, 447, 449)

In [8]:
# Build one dataset per split from the ellipsis sentences only. Judging by the argument
# order, raw sentences are the inputs and full resolutions the targets — verify against EllipsesDataset.
# pd.concat over a one-element list only yields a new Series with the same content; the list
# form mirrors the commented-out variant in the next cell, which also mixes in control sentences.
train_data = EllipsesDataset(pd.concat([train_cnfs.raw_sentence]), pd.concat([train_cnfs.full_resolution]), tokenizer)
val_data = EllipsesDataset(pd.concat([valid_cnfs.raw_sentence]), pd.concat([valid_cnfs.full_resolution]), tokenizer)
test_data = EllipsesDataset(pd.concat([test_cnfs.raw_sentence]), pd.concat([test_cnfs.full_resolution]), tokenizer)

In [9]:
#train_data = EllipsesDataset(pd.concat([train_cnfs.raw_sentence, train_controls.raw_sentence]), pd.concat([train_cnfs.full_resolution, train_controls.raw_sentence]), tokenizer)
#val_data = EllipsesDataset(pd.concat([valid_cnfs.raw_sentence, valid_controls.raw_sentence]), pd.concat([valid_cnfs.full_resolution, valid_controls.raw_sentence]), tokenizer)

In [10]:
# Override the epoch count on the already-built training arguments (mutated in place,
# so this must run before the trainer is created).
training_args.num_train_epochs = 30

In [11]:
# Assemble the trainer via the project helper; training uses train_data, validation uses val_data.
trainer = get_trainer(config, tokenizer, training_args, train_data, val_data)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/Florian.Borchert/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/Florian.Borchert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/Florian.Borchert/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [12]:
# Long-running: the recorded run reports train_runtime of about 4806 s for 30 epochs.
trainer.train()

***** Running training *****
  Num examples = 2241
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8430
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Exact Match,Google Bleu
1,5.6074,0.650882,0.0,0.313107
2,0.5172,0.224983,0.194805,0.700083
3,0.2256,0.082987,0.569264,0.794333
4,0.1388,0.074682,0.638528,0.864628
5,0.1293,0.067145,0.638528,0.88182
6,0.0895,0.06264,0.675325,0.907652
7,0.0949,0.052798,0.727273,0.935579
8,0.0724,0.043203,0.774892,0.931939
9,0.0512,0.044705,0.772727,0.945208
10,0.0572,0.041364,0.777056,0.940287


***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  Num examples = 462
  Batch size = 8
***** Running Evaluation *****
  N

TrainOutput(global_step=8430, training_loss=0.3067725738950189, metrics={'train_runtime': 4805.7988, 'train_samples_per_second': 13.989, 'train_steps_per_second': 1.754, 'total_flos': 1.58202661833216e+16, 'train_loss': 0.3067725738950189, 'epoch': 30.0})

In [13]:
# Evaluate on the validation data, then on the test data (passing metric_key_prefix='test').
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt), so no metrics
# appear in the saved output below.
eval_metrics = trainer.evaluate(val_data)
print(eval_metrics)

test_metrics = trainer.evaluate(test_data, metric_key_prefix='test')
print(test_metrics)

***** Running Evaluation *****
  Num examples = 462
  Batch size = 8


KeyboardInterrupt: 

# Error Analysis

In [14]:
from transformers import Text2TextGenerationPipeline
from evaluation import error_analysis

In [15]:
# Wrap the trained model in a text2text generation pipeline (outputs capped at max_length=500).
# device=0 is the first *visible* GPU; CUDA_VISIBLE_DEVICES is set to "2" at the top of the
# notebook, so index 0 here refers to that physical device.
pipeline = Text2TextGenerationPipeline(model=trainer.model, tokenizer=tokenizer, max_length=500, device=0)

In [16]:
# Example input: first raw validation sentence (contains the elliptical "Tabak- oder Alkoholabusus").
valid_cnfs.raw_sentence.iloc[0]

'Hauptrisikofaktoren für das Auftreten eines Mundhöhlenkarzinoms sind chronischer Tabak- oder Alkoholabusus, wesentlich seltener auch andere Faktoren.'

In [17]:
# Gold resolution of the same sentence (ellipsis expanded to "Tabakabusus oder Alkoholabusus").
valid_cnfs.full_resolution.iloc[0]

'Hauptrisikofaktoren für das Auftreten eines Mundhöhlenkarzinoms sind chronischer Tabakabusus oder Alkoholabusus, wesentlich seltener auch andere Faktoren.'

In [28]:
from evaluation import error_analysis

In [31]:
def error_analysis(predictions, gt_resolutions, original_sentences):
    """Classify every prediction against its gold resolution and its input sentence.

    The three iterables are walked in parallel with ``zip``, so iteration stops
    at the shortest one.

    Values written to the ``error_type`` column:

    * ``'tp'``      -- the prediction equals the ground-truth resolution.
    * ``'fn'``      -- the prediction differs from the ground truth but equals the
      original sentence (i.e. the input was returned unchanged).
    * ``'insert'`` / ``'delete'`` / ``'replace'`` -- turning the ground truth into
      the prediction needs exactly one kind of character-level edit operation
      (``difflib.SequenceMatcher`` opcodes; ``'insert'`` means the prediction
      contains extra text, ``'delete'`` means it lacks text).
    * ``'complex'`` -- more than one kind of edit operation is needed.

    NOTE: this definition shadows the ``error_analysis`` imported from
    ``evaluation`` earlier in the notebook.

    Args:
        predictions: iterable of generated strings.
        gt_resolutions: iterable of ground-truth resolution strings.
        original_sentences: iterable of the unresolved input strings.

    Returns:
        pandas.DataFrame with the columns ``pred``, ``ground_truth``,
        ``original`` and ``error_type``, one row per input triple. Empty input
        yields an empty frame that still carries these columns.
    """
    # Imported locally: no cell in this notebook imports these names, so on a
    # fresh kernel the function would otherwise fail with a NameError.
    import difflib
    from collections import Counter

    columns = ['pred', 'ground_truth', 'original', 'error_type']
    res = []

    for pred_gen, true, sent in zip(predictions, gt_resolutions, original_sentences):
        entry = {'pred': pred_gen, 'ground_truth': true, 'original': sent}
        if pred_gen == true:
            entry['error_type'] = 'tp'
        elif pred_gen == sent:
            entry['error_type'] = 'fn'
        else:
            # Opcodes describe how to turn the ground truth into the prediction.
            # SequenceMatcher's default autojunk heuristic is kept on purpose so
            # previously reported numbers stay comparable.
            op_codes = difflib.SequenceMatcher(None, true, pred_gen).get_opcodes()
            kinds = Counter(tag for tag, *_ in op_codes if tag != 'equal')
            if len(kinds) > 1:
                entry['error_type'] = 'complex'
            else:
                # pred != true guarantees at least one non-equal opcode.
                entry['error_type'] = next(iter(kinds))
        res.append(entry)

    # Passing explicit columns keeps the schema stable even when there are no rows.
    return pd.DataFrame(res, columns=columns)

In [32]:
import pandas as pd

def calculate_errors(out, sample):
    """Run the error analysis for pipeline output and display a per-category summary.

    Args:
        out: iterable of dicts, each providing a ``'generated_text'`` entry.
        sample: object exposing ``full_resolution`` and ``raw_sentence``
            attributes, aligned with ``out``.

    Returns:
        The frame produced by ``error_analysis`` for these inputs.

    Side effect:
        Displays a two-column table: absolute count and share of each
        ``error_type`` value.
    """
    generated = [item['generated_text'] for item in out]
    errors = error_analysis(generated, sample.full_resolution, sample.raw_sentence)

    # Absolute counts per category and the same counts as a fraction of all rows.
    counts = errors['error_type'].value_counts()
    summary = pd.concat([counts, counts / len(errors)], axis=1)
    display(summary)
    return errors

In [19]:
%%time
# Generate resolutions for every validation sentence (about 4 min wall time in the recorded run).
out_valid = pipeline(list(valid_cnfs.raw_sentence))

CPU times: user 4min 14s, sys: 98.8 ms, total: 4min 14s
Wall time: 4min 14s


In [35]:
# Per-category error breakdown on the validation split (counts and fractions are displayed).
errors_valid = calculate_errors(out_valid, valid_cnfs)

Unnamed: 0,error_type,error_type.1
tp,382,0.82684
delete,27,0.058442
replace,18,0.038961
insert,16,0.034632
fn,10,0.021645
complex,9,0.019481


In [54]:
# nltk is not imported by any other cell in this notebook; import it here so the
# cell also runs on a fresh kernel instead of relying on leftover kernel state.
import nltk

# Short alias for the edit-distance function used by `metric`.
ed = nltk.edit_distance
def metric(p, g, o):
    """Edit-distance based score of a prediction relative to gold and original.

    Uses the module-level ``ed`` alias to compute three distances and returns
    ``1 - ed(p, g) / (ed(p, o) + ed(o, g))``, or ``1`` when the prediction
    equals the gold string (distance 0).

    Args:
        p: predicted string.
        g: gold (ground-truth) string.
        o: original, unresolved string.

    Returns:
        ``1`` (int) for an exact match, otherwise a float.
    """
    pred_to_gold = ed(p, g)
    pred_to_orig = ed(p, o)
    orig_to_gold = ed(o, g)
    # A zero denominator would require p == o == g, which the exact-match case covers.
    return 1 if pred_to_gold == 0 else 1 - (pred_to_gold / (pred_to_orig + orig_to_gold))

In [56]:
# Row-wise `metric` over the validation error frame; the cell displays its mean.
my_score = errors_valid.apply(lambda r: metric(r['pred'], r['ground_truth'], r['original']), axis=1)
my_score.mean()

0.9313555280996356

In [57]:
%%time
# Generate resolutions for every test sentence (about 4.7 min wall time in the recorded run).
out_test = pipeline(list(test_cnfs.raw_sentence))

CPU times: user 4min 42s, sys: 89.3 ms, total: 4min 42s
Wall time: 4min 42s


In [58]:
# Per-category error breakdown on the test split (counts and fractions are displayed).
errors_test = calculate_errors(out_test, test_cnfs)

Unnamed: 0,error_type,error_type.1
tp,376,0.813853
delete,26,0.056277
complex,20,0.04329
insert,16,0.034632
replace,13,0.028139
fn,11,0.02381


In [59]:
# Row-wise `metric` over the test error frame; note this rebinds `my_score`, replacing
# the validation scores computed in the earlier cell.
my_score = errors_test.apply(lambda r: metric(r['pred'], r['ground_truth'], r['original']), axis=1)
my_score.mean()

0.9269631059037321