### Import previous work

Import from our main project and also import the `reload` method if we decide to hot-reload a module


In [1]:
import os.path
import sys

from datasets import Dataset

sys.path.extend(['../server', '../client'])

import benchmark 
format_f1_scores = benchmark.format_f1_scores
f1_scores = benchmark.f1_scores

from importlib import reload

Perform training on GPU

In [2]:
import torch 
device = torch.device("cuda")

Import data:

In [3]:
import csv
from collections import defaultdict

from sklearn import metrics
from sklearn.model_selection import train_test_split

from random import sample

SEED = 1337

with open('../data/atis/train.tsv', 'rt') as f:
    reader = csv.reader(f, delimiter='\t')
    data = list(reader)
    data_x = [row[0] for row in data]
    data_y = [row[1] for row in data]

data_x_train, data_x_val, data_y_train, data_y_val = train_test_split(data_x, data_y, test_size=0.2, random_state=SEED)

def val_accuracy(f):
    print(metrics.accuracy_score(data_y_val, f(data_x_val)))

data_x_words = sorted({w for x in data_x_train for w in x.split(' ')})

by_label = defaultdict(list)
for x, y in zip(data_x_train, data_y_train):
    by_label[y].append(x)
labels = set(by_label)
base_labels = sorted({l for label in labels for l in label.split('+')})

sample_20 = sample(range(len(data_x_train)), 20)

SCORE_DATA = { 
    "train": (data_x_train, data_y_train), 
    "val": (data_x_val, data_y_val),
    "20": ([data_x_train[i] for i in sample_20], [data_y_train[i] for i in sample_20])
}

def confusion(model, on_data : tuple[list[str], list[str]]):
    cm = defaultdict(lambda: defaultdict(int))
    for x, y in zip(*on_data):
        cm[y][model(x)] += 1
    return cm

def print_f1_scores(model, data_key="train"):
    print(format_f1_scores(f1_scores(confusion(model, SCORE_DATA[data_key]))))

Here are the hypotheses we prepared last time:

In [4]:
base_hypotheses = [
    'This example asks to explain an abbreviation',
    'This example asks a question about an aircraft type',
    'This example asks about the cost of a flight ticket',
    'This example asks for a list of airlines',
    'This example asks for an airport name',
    'This example asks about the seating capacity of a plane',
    'This example asks about the cheapest fare in the database',
    'This example asks for a city name',
    'This example asks about the distance between two points',
    'This example asks to find flights',
    'This example asks to provide a flight number',
    'This example asks to provide a flight time',
    'This example asks for a rental car or taxi price',
    'This example asks for ground transportation options',
    'This example asks about available meal choice of the flight',
    'This example asks to provide a quantity as an answer',
    'This example asks about a restriction',   
]

assert len(base_hypotheses) == len(base_labels)

Test that we an use the Trainer:

In [8]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

Let's load our models:

In [9]:
tr_model_names = [
    "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33", 
    "pszemraj/xtremedistil-l12-h384-uncased-zeroshot-v1.1",
    "MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli", # 100m parameters
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", # 200m parameters
    "facebook/bart-large-mnli", # 400m parameters
    ]

auto_models = [AutoModelForSequenceClassification.from_pretrained(n) for n in tr_model_names]
auto_tokenizers = [AutoTokenizer.from_pretrained(n) for n in tr_model_names]


and our finetuning data:

In [42]:
base_hypotheses_by_label = dict(zip(base_labels, base_hypotheses))

def balanced_positive_negative_examples(sentence, label):
    positive_labels = label.split('+')
    negative_labels = set(base_labels).difference(positive_labels)
    factor = len(negative_labels) // len(positive_labels)

    for i in range(factor):
        for l in positive_labels:
            yield sentence, base_hypotheses_by_label[l], True
            
    for l in negative_labels:
        yield sentence, base_hypotheses_by_label[l], False
        
all_finetune_examples: list[tuple[str, str, bool]] = [
    example 
    for sentence, label in zip(*SCORE_DATA["train"])
    for example in balanced_positive_negative_examples(sentence, label)
]

train_finetune_examples, eval_finetune_examples = train_test_split(all_finetune_examples, 
                                                                   random_state=SEED, 
                                                                   train_size=10**5)
len(train_finetune_examples), len(eval_finetune_examples)

(100000, 18576)

### Implement datasets

In [44]:
from datasets import Dataset

def to_arrow_dataset(examples: list[str, str, bool]):
    return Dataset.from_dict({
        'sentence' : [ex[0] for ex in examples], 
        'hypothesis': [ex[1] for ex in examples],
        'correct': [ex[2] for ex in examples],
    })

train_finetune_dataset, eval_finetune_dataset = map(to_arrow_dataset, (train_finetune_examples, eval_finetune_examples))

Here's our EntailmentClassifier:

In [68]:
from datasets import Dataset


class EntailmentClassifier:
    def __init__(self, model_index, device=device):
        self.device = device
        self.tokenizer = auto_tokenizers[model_index]
        self.model = auto_models[model_index].to(self.device)
        self.entailment_label_id = next(ix for ix, v in self.model.config.id2label.items() if v == "entailment")

    def _simple_classify(self, sentence):
        def gen():
            for t, l in zip(base_hypotheses, base_labels):
                tokenized_sentence = self.tokenizer.tokenize([sentence, t], add_special_tokens=True)
                input_ids = self.tokenizer.encode([sentence, t], 
                                                  add_special_tokens=True, 
                                                  return_tensors='pt'
                                                  ).to(self.device)
                logits = self.model(input_ids)[0][0]
                probs = logits.softmax(dim=0)
                entail_prob = probs[self.entailment_label_id].item()
                yield entail_prob, l

        return sorted(list(gen()), reverse=True)

    def classify(self, sentence):
        input_ids = self.tokenizer.batch_encode_plus(
                                    [[sentence, t] for t in base_hypotheses], 
                                   add_special_tokens=True, 
                                   padding=True,
                                   truncation=True,
                                   return_tensors='pt').to(self.device)
        logits = self.model(**input_ids)['logits'][:, self.entailment_label_id]
        probs = logits.softmax(dim=0).tolist()
    
        assert len(probs) == len(base_hypotheses) == len(base_labels)
        return sorted(zip(probs, base_labels), reverse=True)
    
    def as_input_data(self, dataset: Dataset, max_examples=None):
            def tokenize_function(batch):
                assert self.entailment_label_id in {0, 1}
                
                def as_entailment_label(value: bool):
                    if value:
                        return self.entailment_label_id
                    else:
                        return 1 - self.entailment_label_id
                
                input_batch = self.tokenizer.batch_encode_plus(    
                        [(s, h) for s, h in zip(batch['sentence'], batch['hypothesis'])],
                        add_special_tokens=True, 
                        padding=True,
                        truncation=True,
                        return_tensors='pt'
                    ).to(self.device)
                
                input_batch['labels'] = torch.tensor([as_entailment_label(c) for c in batch['correct']]
                                                     ).to(self.device)
                input_batch['input_sentences'] = self.tokenizer.batch_decode(input_batch['input_ids'])
                
                return input_batch

            if max_examples is not None:
                dataset = dataset.select(range(max_examples))

            return dataset.map(tokenize_function, batched=True, remove_columns=['sentence', 'hypothesis', 'correct'])
    
    def create_finetune_trainer(self, args: TrainingArguments, max_examples=None, compute_metrics=None) -> Trainer:
        train_d = self.as_input_data(train_finetune_dataset, max_examples=max_examples)
        eval_d = self.as_input_data(eval_finetune_dataset, max_examples=max_examples)
        return Trainer(
            model = self.model,
            args = args,
            compute_metrics=compute_metrics,
            train_dataset = train_d,
            eval_dataset = eval_d,
            tokenizer = self.tokenizer,
        )
    
e, e1 = map(EntailmentClassifier, range(0,2))

Create the ec models and verify that they work the same as last time:

In [57]:
for sentence, answer in zip(SCORE_DATA["20"][0], SCORE_DATA["20"][1]):
    print(sentence)
    for v, label in e1.classify(sentence):
        print(" ", "*" if label == answer else " ", f"{v:0.2f} {label}")

i would like to fly from denver to pittsburgh on the cheapest flight possible
    0.15 airfare
    0.13 cheapest
    0.13 airline
    0.11 city
  * 0.10 flight
    0.07 abbreviation
    0.06 flight_no
    0.06 flight_time
    0.05 quantity
    0.04 distance
    0.03 airport
    0.03 aircraft
    0.02 ground_service
    0.01 restriction
    0.01 capacity
    0.01 ground_fare
    0.00 meal
show me all flights arriving at dallas
  * 0.23 flight
    0.21 airline
    0.13 flight_time
    0.13 aircraft
    0.10 flight_no
    0.04 airport
    0.03 ground_service
    0.03 abbreviation
    0.02 quantity
    0.02 city
    0.01 restriction
    0.01 airfare
    0.01 capacity
    0.01 distance
    0.00 cheapest
    0.00 meal
    0.00 ground_fare
i need a flight from st. petersburg to miami for thursday
    0.19 flight_time
    0.15 ground_service
    0.12 flight_no
  * 0.10 flight
    0.10 distance
    0.09 restriction
    0.05 airport
    0.05 city
    0.03 capacity
    0.03 abbreviation
    0.03 

In [55]:
train_finetune_dataset[0:5]

{'sentence': ['what does ground transportation look like in oakland',
  "hello i 'm trying to find a flight that goes from pittsburgh to denver after 2 pm",
  'how much does it cost to fly on american from dallas to baltimore',
  'how much does flight ua 281 from boston to denver cost',
  'what is the least expensive business class flight between atlanta and dallas'],
 'hypothesis': ['This example asks for a list of airlines',
  'This example asks to find flights',
  'This example asks about the cost of a flight ticket',
  'This example asks about the cost of a flight ticket',
  'This example asks to explain an abbreviation'],
 'correct': [False, True, True, True, False]}

In [69]:
e1.as_input_data(train_finetune_dataset, max_examples=5)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'input_sentences'],
    num_rows: 5
})

In [70]:
_['input_sentences']

['[CLS] what does ground transportation look like in oakland [SEP] this example asks for a list of airlines [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] hello i'm trying to find a flight that goes from pittsburgh to denver after 2 pm [SEP] this example asks to find flights [SEP]",
 '[CLS] how much does it cost to fly on american from dallas to baltimore [SEP] this example asks about the cost of a flight ticket [SEP] [PAD]',
 '[CLS] how much does flight ua 281 from boston to denver cost [SEP] this example asks about the cost of a flight ticket [SEP] [PAD] [PAD] [PAD]',
 '[CLS] what is the least expensive business class flight between atlanta and dallas [SEP] this example asks to explain an abbreviation [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]']

### Try a training

In [87]:
quick_training_args = TrainingArguments(
  output_dir = "models",                  # Output directory
  num_train_epochs = 1,              # Total number of training epochs
  per_device_train_batch_size = 32,  # Batch size per device during training
  per_device_eval_batch_size = 256,  # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.1,               # Strength of weight decay
)


In [71]:
qt1 = e1.create_finetune_trainer(quick_training_args, max_examples=10)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [72]:
qt1.train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'input_sentences'],
    num_rows: 10
})

In [73]:
qt1.train()

Step,Training Loss


TrainOutput(global_step=1, training_loss=0.7468543648719788, metrics={'train_runtime': 0.3901, 'train_samples_per_second': 25.633, 'train_steps_per_second': 2.563, 'total_flos': 43743609840.0, 'train_loss': 0.7468543648719788, 'epoch': 1.0})

In [75]:
qt1 = e1.create_finetune_trainer(quick_training_args)
qt1.train()

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18576 [00:00<?, ? examples/s]

Step,Training Loss
500,0.0923
1000,0.0254
1500,0.0157
2000,0.0099
2500,0.0096
3000,0.0073


TrainOutput(global_step=3125, training_loss=0.025843038120269776, metrics={'train_runtime': 143.401, 'train_samples_per_second': 697.345, 'train_steps_per_second': 21.792, 'total_flos': 808348973478144.0, 'train_loss': 0.025843038120269776, 'epoch': 1.0})

In [76]:
qt1.model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12,

In [77]:
auto_tokenizers.append(auto_tokenizers[1])
auto_models.append(qt1.model)
ec_qt1 = EntailmentClassifier(-1)

In [78]:
for sentence, answer in zip(SCORE_DATA["20"][0], SCORE_DATA["20"][1]):
    print(sentence)
    for v, label in ec_qt1.classify(sentence):
        print(" ", "*" if label == answer else " ", f"{v:0.2f} {label}")

i would like to fly from denver to pittsburgh on the cheapest flight possible
  * 1.00 flight
    0.00 ground_service
    0.00 abbreviation
    0.00 quantity
    0.00 restriction
    0.00 distance
    0.00 meal
    0.00 city
    0.00 cheapest
    0.00 ground_fare
    0.00 airfare
    0.00 flight_time
    0.00 aircraft
    0.00 airline
    0.00 airport
    0.00 capacity
    0.00 flight_no
show me all flights arriving at dallas
  * 1.00 flight
    0.00 abbreviation
    0.00 city
    0.00 airport
    0.00 ground_service
    0.00 airfare
    0.00 ground_fare
    0.00 airline
    0.00 aircraft
    0.00 meal
    0.00 cheapest
    0.00 flight_no
    0.00 capacity
    0.00 restriction
    0.00 flight_time
    0.00 distance
    0.00 quantity
i need a flight from st. petersburg to miami for thursday
  * 1.00 flight
    0.00 airline
    0.00 abbreviation
    0.00 airport
    0.00 city
    0.00 flight_no
    0.00 distance
    0.00 meal
    0.00 capacity
    0.00 ground_fare
    0.00 aircraft
    0

In [84]:
from timer import Timer

def convert_to_gpu_dataset(x):
    d = Dataset.from_dict({'data': x})
    return d.with_format("torch", device=device)

SCORE_DATASET = {
    key: [convert_to_gpu_dataset(d) for d in values] 
    for key, values in SCORE_DATA.items()
}  

def time_my_models(model_list, key:str="val"):
    with Timer() as t:
        for model in model_list:
            print(format_f1_scores(f1_scores(confusion_batch(model, SCORE_DATASET[key]))))
            t.lap()
      
        print("this took {} seconds".format(t.laps))


In [85]:
def ec_as_my_model(ec: EntailmentClassifier):
    def my_model(xs):
        return [ec.classify(x)[0][1] for x in xs]
    return my_model

def confusion_batch(model, on_data : tuple[Dataset, Dataset]):
    results = model(on_data[0]['data'])
    cm = defaultdict(lambda: defaultdict(int))
    for y, y_ in zip(on_data[1]['data'], results):
        cm[y][y_] += 1
    return cm

EntailmentClassifier.as_my_model = ec_as_my_model

In [86]:
time_my_models([e1.as_my_model(), ec_qt1.as_my_model()])

  abbreviation ([36m15[0m, [36m1[0m, [36m0[0m): [32m0.97[0m
  aircraft ([36m16[0m, [36m0[0m, [36m4[0m): [32m0.89[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m83[0m, [36m1[0m, [36m1[0m): [32m0.99[0m
  airline ([36m30[0m, [36m2[0m, [36m0[0m): [32m0.97[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  capacity ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  city ([36m3[0m, [36m1[0m, [36m0[0m): [32m0.86[0m
  distance ([36m7[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  flight ([36m667[0m, [36m9[0m, [36m4[0m): [32m0.99[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m3[0m, [36m2[0m, [36m0[0m): [32m0.75[0m
  flight_time ([36m6[0m, [36m0[0m, [36m2[0m): [32m0.86[0m
  ground_fare ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  ground_service ([36

### Try a full training

In [88]:
auto_models = [AutoModelForSequenceClassification.from_pretrained(n) for n in tr_model_names]
auto_tokenizers = [AutoTokenizer.from_pretrained(n) for n in tr_model_names]


In [89]:
def training_args(model_index):
    return TrainingArguments(
        output_dir = f"models/5ep-lr-{model_index}",                  # Output directory
        num_train_epochs = 5,              # Total number of training epochs
        per_device_train_batch_size = 128,  # Batch size per device during training
        per_device_eval_batch_size = 256,  # Batch size for evaluation
        warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
        weight_decay = 0.01,               # Strength of weight decay
        lr_scheduler_type="inverse_sqrt",
        save_steps = 1000,
        seed=SEED,
    )


for model_index in range(0, 2):
    entailment_model = EntailmentClassifier(model_index)
    args = training_args(model_index)
    trainer = entailment_model.create_finetune_trainer(args)
    time_my_models([entailment_model.as_my_model()])
    trainer.train()
    time_my_models([entailment_model.as_my_model()])
    

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18576 [00:00<?, ? examples/s]

  abbreviation ([36m9[0m, [36m2[0m, [36m6[0m): [32m0.69[0m
  aircraft ([36m16[0m, [36m37[0m, [36m4[0m): [32m0.44[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m33[0m, [36m11[0m, [36m51[0m): [32m0.52[0m
  airline ([36m14[0m, [36m72[0m, [36m16[0m): [32m0.24[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m4[0m, [36m17[0m, [36m1[0m): [32m0.31[0m
  capacity ([36m2[0m, [36m3[0m, [36m3[0m): [32m0.4[0m
  cheapest ([36m0[0m, [36m26[0m, [36m0[0m): [32m0.0[0m
  city ([36m3[0m, [36m41[0m, [36m0[0m): [32m0.13[0m
  distance ([36m3[0m, [36m3[0m, [36m4[0m): [32m0.46[0m
  flight ([36m420[0m, [36m30[0m, [36m251[0m): [32m0.75[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m2[0m, [36m8[0m, [36m1[0m): [32m0.31[0m
  flight_time ([36m2[0m, [36m96[0m, [36m6[0m): [32m0.038[0m
  ground_fa

Step,Training Loss
500,0.1176
1000,0.0187
1500,0.0113
2000,0.0081
2500,0.0057
3000,0.0055
3500,0.0034


  abbreviation ([36m14[0m, [36m1[0m, [36m1[0m): [32m0.93[0m
  aircraft ([36m16[0m, [36m0[0m, [36m4[0m): [32m0.89[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m83[0m, [36m4[0m, [36m1[0m): [32m0.97[0m
  airline ([36m30[0m, [36m1[0m, [36m0[0m): [32m0.98[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  capacity ([36m4[0m, [36m0[0m, [36m1[0m): [32m0.89[0m
  city ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  distance ([36m7[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  flight ([36m668[0m, [36m6[0m, [36m3[0m): [32m0.99[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m3[0m, [36m3[0m, [36m0[0m): [32m0.67[0m
  flight_time ([36m7[0m, [36m0[0m, [36m1[0m): [32m0.93[0m
  ground_fare ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  ground_service ([36

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18576 [00:00<?, ? examples/s]

  abbreviation ([36m10[0m, [36m6[0m, [36m5[0m): [32m0.65[0m
  aircraft ([36m19[0m, [36m155[0m, [36m1[0m): [32m0.2[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m26[0m, [36m10[0m, [36m58[0m): [32m0.43[0m
  airline ([36m19[0m, [36m46[0m, [36m11[0m): [32m0.4[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m4[0m, [36m0[0m, [36m1[0m): [32m0.89[0m
  capacity ([36m2[0m, [36m0[0m, [36m3[0m): [32m0.57[0m
  cheapest ([36m0[0m, [36m3[0m, [36m0[0m): [32m0.0[0m
  city ([36m0[0m, [36m8[0m, [36m3[0m): [32m0.0[0m
  distance ([36m5[0m, [36m9[0m, [36m2[0m): [32m0.48[0m
  flight ([36m315[0m, [36m10[0m, [36m356[0m): [32m0.63[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m2[0m, [36m29[0m, [36m1[0m): [32m0.12[0m
  flight_time ([36m2[0m, [36m95[0m, [36m6[0m): [32m0.038[0m
  ground_fare

Step,Training Loss
500,0.066
1000,0.0094
1500,0.0042
2000,0.0022
2500,0.0017
3000,0.0014
3500,0.0014


  abbreviation ([36m15[0m, [36m1[0m, [36m0[0m): [32m0.97[0m
  aircraft ([36m16[0m, [36m0[0m, [36m4[0m): [32m0.89[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m82[0m, [36m1[0m, [36m2[0m): [32m0.98[0m
  airline ([36m29[0m, [36m2[0m, [36m1[0m): [32m0.95[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  capacity ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  city ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  distance ([36m7[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  flight ([36m668[0m, [36m10[0m, [36m3[0m): [32m0.99[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m3[0m, [36m2[0m, [36m0[0m): [32m0.75[0m
  flight_time ([36m7[0m, [36m0[0m, [36m1[0m): [32m0.93[0m
  ground_fare ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  ground_service ([36

We forgot to save the model! Anyway, since both are good, but the first one is faster and has not yet stabilized the training loss, let's continue its training from the checkpoint: 

In [93]:
def training_args(model_index):
    return TrainingArguments(
        output_dir = f"models/5ep-lr-{model_index}",                  # Output directory
        num_train_epochs = 10,             # Total number of training epochs
        per_device_train_batch_size = 128, # Batch size per device during training
        per_device_eval_batch_size = 256,  # Batch size for evaluation
        warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
        weight_decay = 0.01,               # Strength of weight decay
        lr_scheduler_type="inverse_sqrt",
        save_strategy='epoch',
        seed=SEED,
    )

entailment_model = EntailmentClassifier(0)
args = training_args(0)

trainer = entailment_model.create_finetune_trainer(args)
time_my_models([entailment_model.as_my_model()])
trainer.train()
time_my_models([entailment_model.as_my_model()])

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18576 [00:00<?, ? examples/s]

  abbreviation ([36m14[0m, [36m1[0m, [36m1[0m): [32m0.93[0m
  aircraft ([36m16[0m, [36m0[0m, [36m4[0m): [32m0.89[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m83[0m, [36m3[0m, [36m1[0m): [32m0.98[0m
  airline ([36m30[0m, [36m1[0m, [36m0[0m): [32m0.98[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  capacity ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  city ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  distance ([36m7[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  flight ([36m669[0m, [36m8[0m, [36m2[0m): [32m0.99[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m3[0m, [36m3[0m, [36m0[0m): [32m0.67[0m
  flight_time ([36m6[0m, [36m0[0m, [36m2[0m): [32m0.86[0m
  ground_fare ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  ground_service ([36m

Step,Training Loss
500,0.0043
1000,0.0051
1500,0.003
2000,0.0023
2500,0.002
3000,0.0019
3500,0.0015
4000,0.0009
4500,0.001
5000,0.0011


  abbreviation ([36m15[0m, [36m1[0m, [36m0[0m): [32m0.97[0m
  aircraft ([36m17[0m, [36m0[0m, [36m3[0m): [32m0.92[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m82[0m, [36m4[0m, [36m2[0m): [32m0.96[0m
  airline ([36m30[0m, [36m1[0m, [36m0[0m): [32m0.98[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  capacity ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  city ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  distance ([36m7[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  flight ([36m667[0m, [36m7[0m, [36m4[0m): [32m0.99[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m3[0m, [36m3[0m, [36m0[0m): [32m0.67[0m
  flight_time ([36m6[0m, [36m0[0m, [36m2[0m): [32m0.86[0m
  ground_fare ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  ground_service ([36m

We see that the models are quite good, but we never implemented the multiclass labels.
Let's rectify that.

In [90]:
multi_indices = [ix for ix, v in enumerate(data_y_train) if '+' in v]
SCORE_DATA["train_multi_indices"] = ([SCORE_DATA["train"][0][ix] for ix in multi_indices],
        [SCORE_DATA["train"][1][ix] for ix in multi_indices])

SCORE_DATA["train_multi_indices"][0][:10]

['may i please see airlines and flight numbers from new york to toronto on the same date june seventeenth also arriving in toronto before noon thank you',
 'all flights and fares from atlanta to dallas round trip after 12 pm less than 1100 dollars',
 'give me the flights and fares on december twenty seventh from indianapolis to orlando',
 'show me all flights and fares from denver to san francisco',
 'all flights and fares from pittsburgh to dallas round trip after 12 pm less than 1100 dollars',
 'show all flights and fares from denver to san francisco',
 'please give me a list of all the flights between dallas and baltimore and their cost',
 'first flights and fares from pittsburgh to atlanta on a thursday',
 'show me the costs and times for flights from san francisco to atlanta',
 'all flights and fares from dallas']

In [95]:

for sentence, answer in zip(SCORE_DATA["train_multi_indices"][0], SCORE_DATA["train_multi_indices"][1]):
    print(sentence)
    base_answers = {a:ix for ix, a in enumerate(answer.split('+'), 1)}
    for v, label in entailment_model.classify(sentence):
        print(" ", base_answers.get(label, " "), f"{v:0.2f} {label}")

may i please see airlines and flight numbers from new york to toronto on the same date june seventeenth also arriving in toronto before noon thank you
  2 0.62 flight_no
  1 0.38 airline
    0.00 flight
    0.00 airfare
    0.00 flight_time
    0.00 ground_service
    0.00 restriction
    0.00 quantity
    0.00 abbreviation
    0.00 ground_fare
    0.00 aircraft
    0.00 capacity
    0.00 airport
    0.00 meal
    0.00 city
    0.00 distance
    0.00 cheapest
all flights and fares from atlanta to dallas round trip after 12 pm less than 1100 dollars
  1 0.58 flight
  2 0.42 airfare
    0.00 airport
    0.00 city
    0.00 capacity
    0.00 abbreviation
    0.00 flight_time
    0.00 ground_service
    0.00 distance
    0.00 aircraft
    0.00 cheapest
    0.00 ground_fare
    0.00 flight_no
    0.00 quantity
    0.00 meal
    0.00 restriction
    0.00 airline
give me the flights and fares on december twenty seventh from indianapolis to orlando
  1 0.53 flight
  2 0.47 airfare
    0.00 flig

In [97]:
MULTICLASS_PENALTY = 0.10
labels = list(labels)

multiclass_labels = [
    tuple(base_labels.index(l) for l in label.split('+')) for label in labels
]

multiclass_labels


[(12,),
 (11,),
 (13,),
 (7,),
 (0,),
 (5,),
 (10,),
 (8,),
 (2,),
 (1,),
 (14,),
 (15,),
 (3, 10),
 (9,),
 (16,),
 (3,),
 (4,),
 (9, 2),
 (2, 11),
 (6,)]

In [99]:
PROB_THRESHOLD = 0.2

def ec_classify_multi(self, sentence):
        input_ids = self.tokenizer.batch_encode_plus(
                                    [[sentence, t] for t in base_hypotheses], 
                                   add_special_tokens=True, 
                                   padding=True,
                                   truncation=True,
                                   return_tensors='pt').to(self.device)
        logits = self.model(**input_ids)['logits'][:, self.entailment_label_id]
        probs = logits.softmax(dim=0).tolist()    
        assert len(probs) == len(base_hypotheses) == len(base_labels)
        all_probs = [sum(probs[ix] - MULTICLASS_PENALTY for ix in mlabels) + MULTICLASS_PENALTY 
                     for mlabels in multiclass_labels]        

        return sorted(((p, l) for p, l in zip(all_probs, labels) if p >= PROB_THRESHOLD), reverse=True)

ec_classify_multi(entailment_model, "show me all flights and fares from dallas to san francisco")

[(0.8999246001243592, 'flight+airfare'),
 (0.5051178932189941, 'flight'),
 (0.494806706905365, 'airfare'),
 (0.3948105449711875, 'airfare+flight_time')]

In [101]:
EntailmentClassifier.classify_multi = ec_classify_multi

for sentence, answer in zip(SCORE_DATA["train_multi_indices"][0], SCORE_DATA["train_multi_indices"][1]):
    print(sentence)
    for v, label in entailment_model.classify_multi(sentence):
        print(" ", "*" if label == answer else " ", f"{v:0.2f} {label}")

may i please see airlines and flight numbers from new york to toronto on the same date june seventeenth also arriving in toronto before noon thank you
  * 0.90 airline+flight_no
    0.52 airline
    0.48 flight_no
all flights and fares from atlanta to dallas round trip after 12 pm less than 1100 dollars
  * 0.90 flight+airfare
    0.52 flight
    0.48 airfare
    0.38 airfare+flight_time
give me the flights and fares on december twenty seventh from indianapolis to orlando
  * 0.90 flight+airfare
    0.51 airfare
    0.49 flight
    0.41 airfare+flight_time
show me all flights and fares from denver to san francisco
  * 0.90 flight+airfare
    0.51 airfare
    0.49 flight
    0.41 airfare+flight_time
all flights and fares from pittsburgh to dallas round trip after 12 pm less than 1100 dollars
  * 0.90 flight+airfare
    0.58 flight
    0.42 airfare
    0.32 airfare+flight_time
show all flights and fares from denver to san francisco
  * 0.90 flight+airfare
    0.55 airfare
    0.45 flight

In [102]:
def ec_as_my_model_multi(ec: EntailmentClassifier):
    def my_model(xs):
        return [ec.classify_multi(x)[0][1] for x in xs]
    return my_model

EntailmentClassifier.as_my_model_multi = ec_as_my_model_multi

In [103]:
time_my_models([entailment_model.as_my_model_multi()])

  abbreviation ([36m15[0m, [36m1[0m, [36m0[0m): [32m0.97[0m
  aircraft ([36m17[0m, [36m0[0m, [36m3[0m): [32m0.92[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m83[0m, [36m1[0m, [36m1[0m): [32m0.99[0m
  airline ([36m29[0m, [36m1[0m, [36m1[0m): [32m0.97[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  capacity ([36m5[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  city ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  distance ([36m7[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  flight ([36m667[0m, [36m5[0m, [36m4[0m): [32m0.99[0m
  flight+airfare ([36m3[0m, [36m1[0m, [36m0[0m): [32m0.86[0m
  flight_no ([36m3[0m, [36m3[0m, [36m0[0m): [32m0.67[0m
  flight_time ([36m7[0m, [36m0[0m, [36m1[0m): [32m0.93[0m
  ground_fare ([36m3[0m, [36m0[0m, [36m0[0m): [32m1.0[0m
  ground_service ([36

In [106]:
trainer.save_model('../server/models/ml-xtremedistil-l6-h256-in-tune-1.0-10ep')

In [108]:
import os

save_path = '../server/models/ml-xtremedistil-l6-h256-in-tune-1.0-10ep'

with open(os.path.join(save_path, 'base_labels.tsv'), 'wt', encoding='utf-8') as f:
    w = csv.writer(f, delimiter='\t')
    w.writerows(sorted(base_hypotheses_by_label.items()))
    
with open(os.path.join(save_path, 'labels.txt'), 'wt', encoding='utf-8') as f:
    print(*labels, sep='\n', file=f)

In [109]:
entailment_model.classify_multi("what are the flights from san francisco to denver")

[(0.9998464584350586, 'flight'), (0.8998540645133289, 'flight+airfare')]

In [110]:
args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=