Import from our main project and also import the `reload` method if we decide to hot-reload a module

In [2]:
import sys
sys.path.extend(['../server', '../client'])

import benchmark 
format_f1_scores = benchmark.format_f1_scores
f1_scores = benchmark.f1_scores

from importlib import reload


Perform training on GPU

In [3]:
import torch 
device = torch.device("cuda")

Import data (do not use NumPy arrays this time)

In [4]:
import csv
from collections import defaultdict

from sklearn import metrics
from sklearn.model_selection import train_test_split

SEED = 1337

with open('../data/atis/train.tsv', 'rt') as f:
    reader = csv.reader(f, delimiter='\t')
    data = list(reader)
    data_x = [row[0] for row in data]
    data_y = [row[1] for row in data]

data_x_train, data_x_val, data_y_train, data_y_val = train_test_split(data_x, data_y, test_size=0.2, random_state=SEED)

def val_accuracy(f):
    print(metrics.accuracy_score(data_y_val, f(data_x_val)))

data_x_words = sorted({w for x in data_x_train for w in x.split(' ')})

by_label = defaultdict(list)
for x, y in zip(data_x_train, data_y_train):
    by_label[y].append(x)
labels = set(by_label)

SCORE_DATA = { "train": [data_x_train, data_y_train], "val": [data_x_val, data_y_val]}

def confusion(model, on_data : tuple[list[str], list[str]]):
    cm = defaultdict(lambda: defaultdict(int))
    for x, y in zip(*on_data):
        cm[y][model(x)] += 1
    return cm

def print_f1_scores(model, data_key="train"):
    print(format_f1_scores(f1_scores(confusion(model, SCORE_DATA[data_key]))))


Import the same transformers classifiers as the last time and use the GPU:

In [17]:
from transformers import pipeline

tr_model_names = [
    "MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli", # 100m parameters
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", # 200m parameters
    "facebook/bart-large-mnli", # 400m parameters
]

tr_models = [pipeline("zero-shot-classification", model=name, device=device) 
             for name in tr_model_names]

BATCH_SIZE = 128

def into_my_model(transformer_model):
    def as_model(xs):
        results = transformer_model(xs, batch_size=BATCH_SIZE, candidate_labels=sorted(labels))
        return [result['labels'][0] for result in results]
    return as_model

my_models = [into_my_model(model) for model in tr_models] 

Perform a quick test:

In [18]:
SCORE_DATA["2"] = ['fly to nyc', 'take a taxi to penn station'], ['flight', 'ground_service']
my_models[0](SCORE_DATA["2"][0])

['flight', 'capacity']

Let's use a more professional-looking context manager when measuring time:



In [23]:
import time

class Timer:
    def __enter__(self):
        self.start = time.time()
        self.laps = []
        return self

    def __exit__(self, *args):
        pass
    
    @property 
    def elapsed(self):
        return time.time() - self.start
    
    def lap(self):
        self.laps.append(self.elapsed)
        self.start += self.laps[-1]    

    @property 
    def average_lap(self):
        return sum(self.laps) / len(self.laps)

# Usage example:
with Timer() as t:
    assert t.elapsed < 0.1
    time.sleep(0.5)
    assert 0.4 < t.elapsed < 0.6
    t.lap()
    assert t.elapsed < 0.1
    assert len(t.laps) == 1
    t.lap()
    assert 0.2 < t.average_lap < 0.3


Convert our naive arrays to GPU tensors

In [12]:
from datasets import Dataset

def convert_to_gpu_dataset(x):
    d = Dataset.from_dict({'data': x})
    return d.with_format("torch", device=device)

SCORE_DATASET = {
    key: [convert_to_gpu_dataset(d) for d in values] 
    for key, values in SCORE_DATA.items()
}    

In [20]:
def confusion_batch(model, on_data : tuple[Dataset, Dataset]):
    results = model(on_data[0]['data'])
    cm = defaultdict(lambda: defaultdict(int))
    for y, y_ in zip(on_data[1]['data'], results):
        cm[y][y_] += 1
    return cm
        

Now that we use the GPU for inference and the datasets are preloaded,
we can check the inference time and the validation results.

In [24]:
with Timer() as t:
    for model in my_models:
        print(format_f1_scores(f1_scores(confusion_batch(model, SCORE_DATASET["train"]))))
        t.lap()
  
    print("this took {} seconds".format(t.laps))


  abbreviation ([36m56[0m, [36m115[0m, [36m37[0m): [32m0.42[0m
  aircraft ([36m30[0m, [36m7[0m, [36m28[0m): [32m0.63[0m
  airfare ([36m14[0m, [36m3[0m, [36m305[0m): [32m0.083[0m
  airfare+flight_time ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airline ([36m70[0m, [36m59[0m, [36m48[0m): [32m0.57[0m
  airline+flight_no ([36m0[0m, [36m1[0m, [36m1[0m): [32m0.0[0m
  airport ([36m12[0m, [36m65[0m, [36m1[0m): [32m0.27[0m
  capacity ([36m6[0m, [36m66[0m, [36m5[0m): [32m0.14[0m
  cheapest ([36m1[0m, [36m149[0m, [36m0[0m): [32m0.013[0m
  city ([36m5[0m, [36m3[0m, [36m10[0m): [32m0.43[0m
  distance ([36m7[0m, [36m153[0m, [36m6[0m): [32m0.081[0m
  flight ([36m2259[0m, [36m135[0m, [36m496[0m): [32m0.88[0m
  flight+airfare ([36m9[0m, [36m122[0m, [36m5[0m): [32m0.12[0m
  flight_no ([36m0[0m, [36m31[0m, [36m9[0m): [32m0.0[0m
  flight_time ([36m17[0m, [36m15[0m, [36m27[0m): [32m0.45[

In [22]:
with Timer() as t:
    for model in my_models:
        print(format_f1_scores(f1_scores(confusion_batch(model, SCORE_DATASET["val"]))))
        t.lap()
  
    print("this took {} seconds".format(t.laps))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  abbreviation ([36m8[0m, [36m24[0m, [36m7[0m): [32m0.34[0m
  aircraft ([36m12[0m, [36m2[0m, [36m8[0m): [32m0.71[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m6[0m, [36m0[0m, [36m78[0m): [32m0.13[0m
  airline ([36m18[0m, [36m15[0m, [36m12[0m): [32m0.57[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m21[0m, [36m0[0m): [32m0.32[0m
  capacity ([36m3[0m, [36m15[0m, [36m2[0m): [32m0.26[0m
  cheapest ([36m0[0m, [36m45[0m, [36m0[0m): [32m0.0[0m
  city ([36m2[0m, [36m3[0m, [36m1[0m): [32m0.5[0m
  distance ([36m1[0m, [36m50[0m, [36m6[0m): [32m0.034[0m
  flight ([36m548[0m, [36m38[0m, [36m123[0m): [32m0.87[0m
  flight+airfare ([36m2[0m, [36m31[0m, [36m1[0m): [32m0.11[0m
  flight_no ([36m0[0m, [36m8[0m, [36m3[0m): [32m0.0[0m
  flight_time ([36m2[0m, [36m1[0m, [36m6[0m): [32m0.36[0m
  ground_far

Let's also try this 33m parameter model:

In [25]:
small_model_name = "pszemraj/xtremedistil-l12-h384-uncased-zeroshot-v1.1"
small_model = into_my_model(pipeline("zero-shot-classification", device=device, model = small_model_name))

config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [28]:
def time_my_models(model_list, key:str="val"):
    with Timer() as t:
        for model in model_list:
            print(format_f1_scores(f1_scores(confusion_batch(model, SCORE_DATASET[key]))))
            t.lap()
      
        print("this took {} seconds".format(t.laps))

time_my_models([small_model, my_models[0]])

  abbreviation ([36m11[0m, [36m4[0m, [36m4[0m): [32m0.73[0m
  aircraft ([36m17[0m, [36m46[0m, [36m3[0m): [32m0.41[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m22[0m, [36m176[0m, [36m62[0m): [32m0.16[0m
  airfare+flight_time ([36m0[0m, [36m2[0m, [36m0[0m): [32m0.0[0m
  airline ([36m18[0m, [36m159[0m, [36m12[0m): [32m0.17[0m
  airline+flight_no ([36m0[0m, [36m6[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m8[0m, [36m0[0m): [32m0.56[0m
  capacity ([36m0[0m, [36m0[0m, [36m5[0m): [32m0.0[0m
  cheapest ([36m0[0m, [36m1[0m, [36m0[0m): [32m0.0[0m
  city ([36m0[0m, [36m18[0m, [36m3[0m): [32m0.0[0m
  distance ([36m4[0m, [36m23[0m, [36m3[0m): [32m0.24[0m
  flight ([36m326[0m, [36m14[0m, [36m345[0m): [32m0.64[0m
  flight+airfare ([36m0[0m, [36m12[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  fli

In [30]:
tr_model_names = [small_model_name, *tr_model_names]
my_models = [small_model, *tr_model_names]

Let's examine the models more closely:

In [40]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
auto_models = [AutoModelForSequenceClassification.from_pretrained(n) for n in tr_model_names]
auto_tokenizers = [AutoTokenizer.from_pretrained(n) for n in tr_model_names]

In [41]:
for m in auto_models:
    print(m)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12,

Add a new model with only 12m parameters:

In [43]:
very_small_model_name = "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33"
very_small_model = into_my_model(pipeline("zero-shot-classification", device=device, model = small_model_name))

time_my_models([very_small_model])

  abbreviation ([36m11[0m, [36m4[0m, [36m4[0m): [32m0.73[0m
  aircraft ([36m17[0m, [36m46[0m, [36m3[0m): [32m0.41[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m22[0m, [36m176[0m, [36m62[0m): [32m0.16[0m
  airfare+flight_time ([36m0[0m, [36m2[0m, [36m0[0m): [32m0.0[0m
  airline ([36m18[0m, [36m159[0m, [36m12[0m): [32m0.17[0m
  airline+flight_no ([36m0[0m, [36m6[0m, [36m1[0m): [32m0.0[0m
  airport ([36m5[0m, [36m8[0m, [36m0[0m): [32m0.56[0m
  capacity ([36m0[0m, [36m0[0m, [36m5[0m): [32m0.0[0m
  cheapest ([36m0[0m, [36m1[0m, [36m0[0m): [32m0.0[0m
  city ([36m0[0m, [36m18[0m, [36m3[0m): [32m0.0[0m
  distance ([36m4[0m, [36m23[0m, [36m3[0m): [32m0.24[0m
  flight ([36m326[0m, [36m14[0m, [36m345[0m): [32m0.64[0m
  flight+airfare ([36m0[0m, [36m12[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  fli

In [45]:
tr_model_names

['pszemraj/xtremedistil-l12-h384-uncased-zeroshot-v1.1',
 'MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli',
 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
 'facebook/bart-large-mnli']

In [46]:
tr_model_names = [very_small_model_name, *tr_model_names]

tr_models = [pipeline("zero-shot-classification", model=name, device=device) 
             for name in tr_model_names]
my_models = [into_my_model(model) for model in tr_models] 

auto_models = [AutoModelForSequenceClassification.from_pretrained(n) for n in tr_model_names]
auto_tokenizers = [AutoTokenizer.from_pretrained(n) for n in tr_model_names]

for t in auto_tokenizers:
    print(t)
    print()

for m in auto_models:
    print(m)
    print()

config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/25.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

BertTokenizerFast(name_or_path='pszemraj/xtremedistil-l12-h384-uncased-zeroshot-v1.1', vo

Save the model information into a file:

In [58]:
with open('analysis4-models.md', 'wt') as f:
    print("# Models", end="\n\n\n", file=f)
    for n, t, m  in zip(tr_model_names, auto_tokenizers, auto_models):
        print("## " + n, 
              '', 
              'Tokenizer:',
              '```', 
              t, 
              '```', 
              ''
              'Model:', 
              '```', 
              m, 
              '```', 
              ''
              'Output labels:', 
              *(f'  - {v}: `{t}`' for v, t in m.config.id2label.items()),
              '',
              f'Number of parameters: {m.num_parameters() // 10 ** 6}m',
            file=f, sep="\n", end="\n\n\n")
    

With the classification we pose one hypothesis for each label, so we prefer to use the base labels, of which there are fewer.

In [60]:
base_labels = sorted({l for label in labels for l in label.split('+')})
base_labels

['abbreviation',
 'aircraft',
 'airfare',
 'airline',
 'airport',
 'capacity',
 'cheapest',
 'city',
 'distance',
 'flight',
 'flight_no',
 'flight_time',
 'ground_fare',
 'ground_service',
 'meal',
 'quantity',
 'restriction']

This is the base template used in the NLI and also in the [transformers library](https://github.com/huggingface/transformers/blob/3d2900e829ab16757632f9dde891f1947cfc4be0/src/transformers/pipelines/zero_shot_classification.py#L47):

In [61]:
template = "This example is {}." 

But we might be able to do better for a specific task:

In [122]:
base_templates = [
    'This example asks to explain an abbreviation',
    'This example asks a question about an aircraft type',
    'This example asks about the cost of a flight ticket',
    'This example asks for a list of airlines',
    'This example asks for an airport name',
    'This example asks about the seating capacity of a plane',
    'This example asks about the cheapest fare in the database',
    'This example asks for a city name',
    'This example asks about the distance between two points',
    'This example asks to find flights',
    'This example asks to provide a flight number',
    'This example asks to provide a flight time',
    'This example asks for a rental car or taxi price',
    'This example asks for ground transportation options',
    'This example asks about available meal choice of the flight',
    'This example asks to provide a quantity as an answer',
    'This example asks about a restriction',   
]

assert len(base_templates) == len(base_labels)

Let's experiment with using entailment classifiers:

In [135]:
DEBUG = print

class EntailmentClassifier:
    def __init__(self, model_index, device=device):
        self.device = device
        self.tokenizer = auto_tokenizers[model_index]
        self.model = auto_models[model_index].to(self.device)
        self.entailment_label_id = next(ix for ix, v in self.model.config.id2label.items() if v == "entailment")

    def classify(self, sentence):
        def gen():
            for t, l in zip(base_templates, base_labels):
                tokenized_sentence = self.tokenizer.tokenize([sentence, t], add_special_tokens=True)
                DEBUG(tokenized_sentence)
                input_ids = self.tokenizer.encode([sentence, t], add_special_tokens=True, return_tensors='pt').to(self.device)
                logits = self.model(input_ids)[0][0]
                probs = logits.softmax(dim=0)
                DEBUG(t, logits, probs)
                entail_prob = probs[self.entailment_label_id].item()
                yield entail_prob, l
        return sorted(list(gen()), reverse=True)
    
e = EntailmentClassifier(0)

In [118]:
e.classify(SCORE_DATA["2"][0][0])

['[CLS]', 'fly', 'to', 'nyc', '[SEP]', 'this', 'example', 'asks', 'to', 'explain', 'an', 'abbreviation', '[SEP]']
This example asks to explain an abbreviation tensor([-1.6629,  1.7722], grad_fn=<SelectBackward0>) tensor([0.0312, 0.9688], grad_fn=<SoftmaxBackward0>)
['[CLS]', 'fly', 'to', 'nyc', '[SEP]', 'this', 'example', 'asks', 'a', 'question', 'about', 'an', 'aircraft', 'type', '[SEP]']
This example asks a question about an aircraft type tensor([-1.5458,  1.6580], grad_fn=<SelectBackward0>) tensor([0.0390, 0.9610], grad_fn=<SoftmaxBackward0>)
['[CLS]', 'fly', 'to', 'nyc', '[SEP]', 'this', 'example', 'asks', 'about', 'the', 'cost', 'of', 'a', 'flight', 'ticket', '[SEP]']
This example asks about the cost of a flight ticket tensor([-1.7072,  1.8266], grad_fn=<SelectBackward0>) tensor([0.0284, 0.9716], grad_fn=<SoftmaxBackward0>)
['[CLS]', 'fly', 'to', 'nyc', '[SEP]', 'this', 'example', 'asks', 'for', 'a', 'list', 'of', 'airlines', '[SEP]']
This example asks for a list of airlines tenso

[(0.03902369737625122, 'aircraft'),
 (0.031216135248541832, 'abbreviation'),
 (0.028365809470415115, 'airfare'),
 (0.025125566869974136, 'city'),
 (0.021541139110922813, 'quantity'),
 (0.02149055153131485, 'airport'),
 (0.020283926278352737, 'distance'),
 (0.01928526535630226, 'cheapest'),
 (0.01464210543781519, 'ground_service'),
 (0.014165692031383514, 'ground_fare'),
 (0.01314655039459467, 'capacity'),
 (0.012822638265788555, 'flight_time'),
 (0.012364466674625874, 'flight_no'),
 (0.011626241728663445, 'restriction'),
 (0.011517428793013096, 'meal'),
 (0.008578207343816757, 'flight'),
 (0.00818164274096489, 'airline')]

In [101]:
from random import sample
sample_20 = sample(range(len(SCORE_DATA["train"][0])), 20)

SCORE_DATA["20"] = [SCORE_DATA["train"][0][i] for i in sample_20], [SCORE_DATA["train"][1][i] for i in sample_20]

SCORE_DATA["20"][0]

['i live in washington and i would like to make a trip to san francisco can you tell me which airlines connect with those two cities',
 'do you have an early morning direct flight from philadelphia to pittsburgh',
 'show me the lowest fare for a round trip flight from baltimore to dallas',
 'show me the flights from los angeles to pittsburgh which arrive at pittsburgh on monday',
 'saturday flights from las vegas to phoenix',
 'show me the flights from chicago to indianapolis',
 'please show me the flights from washington dc to san francisco california',
 'flights on friday from newark to tampa',
 'which flights from pittsburgh to atlanta have the lowest coach class fares',
 'show me the cheapest flights from san francisco to philadelphia',
 "in pittsburgh i 'd like to rent a car",
 'is there a flight from nashville to houston in the evening tomorrow',
 'what is the earliest flight from atlanta to boston',
 'flights from las vegas to montreal',
 'what is the latest flight you have depa

In [123]:
DEBUG = print

for sentence, answer in zip(SCORE_DATA["20"][0], SCORE_DATA["20"][1]):
    print(sentence)
    for v, label in e.classify(sentence):
        print(" ", "*" if label == answer else " ", f"{v:0.2f} {label}")

i live in washington and i would like to make a trip to san francisco can you tell me which airlines connect with those two cities
['[CLS]', 'i', 'live', 'in', 'washington', 'and', 'i', 'would', 'like', 'to', 'make', 'a', 'trip', 'to', 'san', 'francisco', 'can', 'you', 'tell', 'me', 'which', 'airlines', 'connect', 'with', 'those', 'two', 'cities', '[SEP]', 'this', 'example', 'asks', 'to', 'explain', 'an', 'abbreviation', '[SEP]']
This example asks to explain an abbreviation tensor([-1.1124,  1.1643], grad_fn=<SelectBackward0>) tensor([0.0931, 0.9069], grad_fn=<SoftmaxBackward0>)
['[CLS]', 'i', 'live', 'in', 'washington', 'and', 'i', 'would', 'like', 'to', 'make', 'a', 'trip', 'to', 'san', 'francisco', 'can', 'you', 'tell', 'me', 'which', 'airlines', 'connect', 'with', 'those', 'two', 'cities', '[SEP]', 'this', 'example', 'asks', 'a', 'question', 'about', 'an', 'aircraft', 'type', '[SEP]']
This example asks a question about an aircraft type tensor([-1.0849,  1.1539], grad_fn=<SelectBack

In [124]:
e.model.config.id2label

{0: 'entailment', 1: 'not_entailment'}

In [138]:
DEBUG = lambda *x: None
e1 = EntailmentClassifier(1)

for sentence, answer in zip(SCORE_DATA["20"][0], SCORE_DATA["20"][1]):
    print(sentence)
    for v, label in e1.classify(sentence):
        print(" ", "*" if label == answer else " ", f"{v:0.2f} {label}")

i live in washington and i would like to make a trip to san francisco can you tell me which airlines connect with those two cities
    0.94 airline
  * 0.85 flight
    0.72 ground_service
    0.64 abbreviation
    0.44 city
    0.27 flight_no
    0.24 aircraft
    0.17 airport
    0.05 quantity
    0.03 flight_time
    0.02 distance
    0.01 ground_fare
    0.01 restriction
    0.01 airfare
    0.01 cheapest
    0.01 capacity
    0.00 meal
do you have an early morning direct flight from philadelphia to pittsburgh
  * 0.93 flight
    0.89 flight_time
    0.81 flight_no
    0.44 city
    0.34 ground_service
    0.28 airport
    0.24 aircraft
    0.19 abbreviation
    0.08 airline
    0.05 distance
    0.04 restriction
    0.03 quantity
    0.02 airfare
    0.02 ground_fare
    0.01 capacity
    0.01 cheapest
    0.01 meal
show me the lowest fare for a round trip flight from baltimore to dallas
  * 0.91 airfare
    0.69 flight
    0.63 cheapest
    0.47 flight_no
    0.38 quantity
    0.3

In [127]:
def ec_as_my_model(ec: EntailmentClassifier):
    def my_model(xs):
        return [ec.classify(x)[0][1] for x in xs]
    return my_model

time_my_models(ec_as_my_model(ec) for ec in [e, e1])
    

  abbreviation ([36m1[0m, [36m14[0m, [36m14[0m): [32m0.067[0m
  aircraft ([36m20[0m, [36m828[0m, [36m0[0m): [32m0.046[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m5[0m, [36m5[0m, [36m79[0m): [32m0.11[0m
  airline ([36m0[0m, [36m0[0m, [36m30[0m): [32m0.0[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m0[0m, [36m0[0m, [36m5[0m): [32m0.0[0m
  capacity ([36m0[0m, [36m0[0m, [36m5[0m): [32m0.0[0m
  city ([36m0[0m, [36m7[0m, [36m3[0m): [32m0.0[0m
  distance ([36m1[0m, [36m8[0m, [36m6[0m): [32m0.12[0m
  flight ([36m0[0m, [36m0[0m, [36m671[0m): [32m0.0[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_time ([36m0[0m, [36m0[0m, [36m8[0m): [32m0.0[0m
  ground_fare ([36m1[0m, [36m1[0m, [36m2[0m): [32m0.4[0m
  ground_service ([3

Let's fo back and fix the models to use GPU:

In [129]:
def move_to_device(ec: EntailmentClassifier):
    ec.model = ec.model.to(device)

EntailmentClassifier.to_device = move_to_device

In [130]:
e.to_device()
e1.to_device()

In [139]:
DEBUG = lambda *v:None

time_my_models(ec_as_my_model(ec) for ec in [e, e1])

  abbreviation ([36m1[0m, [36m14[0m, [36m14[0m): [32m0.067[0m
  aircraft ([36m20[0m, [36m828[0m, [36m0[0m): [32m0.046[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m5[0m, [36m5[0m, [36m79[0m): [32m0.11[0m
  airline ([36m0[0m, [36m0[0m, [36m30[0m): [32m0.0[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m0[0m, [36m0[0m, [36m5[0m): [32m0.0[0m
  capacity ([36m0[0m, [36m0[0m, [36m5[0m): [32m0.0[0m
  city ([36m0[0m, [36m7[0m, [36m3[0m): [32m0.0[0m
  distance ([36m1[0m, [36m8[0m, [36m6[0m): [32m0.12[0m
  flight ([36m0[0m, [36m0[0m, [36m671[0m): [32m0.0[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_time ([36m0[0m, [36m0[0m, [36m8[0m): [32m0.0[0m
  ground_fare ([36m1[0m, [36m1[0m, [36m2[0m): [32m0.4[0m
  ground_service ([3

But actually we should try all hypotheses and once and use the softmax

In [156]:
DEBUG=print

def better_classify(self: EntailmentClassifier, sentence):
    input_ids = self.tokenizer.batch_encode_plus(
                                [[sentence, t] for t in base_templates], 
                               add_special_tokens=True, 
                               padding=True,
                               truncation=True,
                               return_tensors='pt').to(self.device)
    DEBUG(input_ids)
    logits = self.model(**input_ids)['logits'][:, self.entailment_label_id]
    probs = logits.softmax(dim=0).tolist()

    DEBUG(sentence, logits, probs)
    assert len(probs) == len(base_templates) == len(base_labels)
    return sorted(zip(probs, base_labels), reverse=True)

better_classify(e1, SCORE_DATA["20"][0][0])

{'input_ids': tensor([[  101,  1045,  2444,  1999,  2899,  1998,  1045,  2052,  2066,  2000,
          2191,  1037,  4440,  2000,  2624,  3799,  2064,  2017,  2425,  2033,
          2029,  7608,  7532,  2007,  2216,  2048,  3655,   102,  2023,  2742,
          5176,  2000,  4863,  2019, 22498,   102,     0,     0,     0,     0],
        [  101,  1045,  2444,  1999,  2899,  1998,  1045,  2052,  2066,  2000,
          2191,  1037,  4440,  2000,  2624,  3799,  2064,  2017,  2425,  2033,
          2029,  7608,  7532,  2007,  2216,  2048,  3655,   102,  2023,  2742,
          5176,  1037,  3160,  2055,  2019,  2948,  2828,   102,     0,     0],
        [  101,  1045,  2444,  1999,  2899,  1998,  1045,  2052,  2066,  2000,
          2191,  1037,  4440,  2000,  2624,  3799,  2064,  2017,  2425,  2033,
          2029,  7608,  7532,  2007,  2216,  2048,  3655,   102,  2023,  2742,
          5176,  2055,  1996,  3465,  1997,  1037,  3462,  7281,   102,     0],
        [  101,  1045,  2444,  1999

[(0.27769771218299866, 'airline'),
 (0.20017322897911072, 'flight'),
 (0.12410442531108856, 'ground_service'),
 (0.08375998586416245, 'aircraft'),
 (0.08209830522537231, 'abbreviation'),
 (0.0681559294462204, 'flight_no'),
 (0.04532009735703468, 'airport'),
 (0.04302385076880455, 'city'),
 (0.026334844529628754, 'quantity'),
 (0.013139057904481888, 'flight_time'),
 (0.00914670154452324, 'distance'),
 (0.006013883743435144, 'restriction'),
 (0.005658971145749092, 'airfare'),
 (0.004408823326230049, 'capacity'),
 (0.0041976431384682655, 'ground_fare'),
 (0.0034918934106826782, 'cheapest'),
 (0.0032746463548392057, 'meal')]

In [157]:
DEBUG=lambda *_: None

_old_classify = EntailmentClassifier.classify
EntailmentClassifier.classify = better_classify

time_my_models(ec_as_my_model(ec) for ec in [e, e1])

  abbreviation ([36m9[0m, [36m2[0m, [36m6[0m): [32m0.69[0m
  aircraft ([36m16[0m, [36m37[0m, [36m4[0m): [32m0.44[0m
  aircraft+flight+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airfare ([36m33[0m, [36m11[0m, [36m51[0m): [32m0.52[0m
  airline ([36m14[0m, [36m72[0m, [36m16[0m): [32m0.24[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m4[0m, [36m17[0m, [36m1[0m): [32m0.31[0m
  capacity ([36m2[0m, [36m3[0m, [36m3[0m): [32m0.4[0m
  cheapest ([36m0[0m, [36m26[0m, [36m0[0m): [32m0.0[0m
  city ([36m3[0m, [36m41[0m, [36m0[0m): [32m0.13[0m
  distance ([36m3[0m, [36m3[0m, [36m4[0m): [32m0.46[0m
  flight ([36m420[0m, [36m30[0m, [36m251[0m): [32m0.75[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m3[0m): [32m0.0[0m
  flight_no ([36m2[0m, [36m8[0m, [36m1[0m): [32m0.31[0m
  flight_time ([36m2[0m, [36m96[0m, [36m6[0m): [32m0.038[0m
  ground_fa

In [158]:
time_my_models((ec_as_my_model(ec) for ec in [e, e1]), key="train")

  abbreviation ([36m57[0m, [36m14[0m, [36m36[0m): [32m0.7[0m
  aircraft ([36m46[0m, [36m119[0m, [36m12[0m): [32m0.41[0m
  airfare ([36m112[0m, [36m40[0m, [36m207[0m): [32m0.48[0m
  airfare+flight_time ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airline ([36m64[0m, [36m338[0m, [36m54[0m): [32m0.25[0m
  airline+flight_no ([36m0[0m, [36m0[0m, [36m1[0m): [32m0.0[0m
  airport ([36m7[0m, [36m53[0m, [36m6[0m): [32m0.19[0m
  capacity ([36m2[0m, [36m7[0m, [36m9[0m): [32m0.2[0m
  cheapest ([36m1[0m, [36m72[0m, [36m0[0m): [32m0.027[0m
  city ([36m4[0m, [36m184[0m, [36m11[0m): [32m0.039[0m
  distance ([36m8[0m, [36m15[0m, [36m5[0m): [32m0.44[0m
  flight ([36m1725[0m, [36m104[0m, [36m1030[0m): [32m0.75[0m
  flight+airfare ([36m0[0m, [36m0[0m, [36m14[0m): [32m0.0[0m
  flight_no ([36m7[0m, [36m24[0m, [36m2[0m): [32m0.35[0m
  flight_time ([36m21[0m, [36m392[0m, [36m23[0m): [32m0.092

Let's now prepare the finetune dataset:

In [161]:
base_template_by_label = dict(zip(base_labels, base_templates))

def balanced_positive_negative_examples(sentence, label):
    positive_labels = label.split('+')
    negative_labels = set(base_labels).difference(positive_labels)
    factor = len(negative_labels) // len(positive_labels)

    for i in range(factor):
        for l in positive_labels:
            yield sentence, base_template_by_label[l], True
            
    for l in negative_labels:
        yield sentence, base_template_by_label[l], False
        
all_finetune_examples: list[tuple[str, str, bool]] = [
    example 
    for sentence, label in zip(*SCORE_DATA["train"])
    for example in balanced_positive_negative_examples(sentence, label)
]

all_finetune_examples[:30]

[('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transportation from long beach airport to downtown',
  'This example asks for ground transportation options',
  True),
 ('can i see ground transpo

In [162]:
len(all_finetune_examples)

118576

In [164]:
train_finetune_examples, eval_finetune_examples = train_test_split(all_finetune_examples, random_state=SEED, train_size=10**5)
len(train_finetune_examples), len(eval_finetune_examples)

(100000, 18576)

In [167]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
  output_dir = ".",                  # Output directory
  num_train_epochs = 5,              # Total number of training epochs
  per_device_train_batch_size = 64,  # Batch size per device during training
  per_device_eval_batch_size = 256,  # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.01,               # Strength of weight decay
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [168]:
def as_dataset(self: EntailmentClassifier, examples:list[str, str, bool]):
        assert self.entailment_label_id in {0, 1}
        def as_entailment_label(value: bool):
            return self.entailment_label_id if value else 1-self.entailment_label_id
        
        dataset = self.tokenizer.batch_encode_plus(
                                [e[0:2] for e in examples], 
                               add_special_tokens=True, 
                               padding='max_length',
                               truncation=True,
                               return_tensors='pt').to(self.device)
        
        dataset['labels'] = torch.tensor([as_entailment_label(e[2]) for e in examples]).to(self.device)
        dataset['input_sentences'] = self.tokenizer.batch_decode(dataset['input_ids'])
        return dataset

EntailmentClassifier.as_dataset = as_dataset


In [174]:
def create_finetune_trainer(self: EntailmentClassifier, args: TrainingArguments) -> Trainer:
    return Trainer(
        model = self.model,
        args = training_args,
        train_dataset = self.as_dataset(train_finetune_examples),
        eval_dataset = self.as_dataset(eval_finetune_examples),
        tokenizer = self.tokenizer,
    )

EntailmentClassifier.create_finetune_trainer = create_finetune_trainer

quick_training_args = TrainingArguments(
  output_dir = ".",                  # Output directory
  num_train_epochs = 1,              # Total number of training epochs
  per_device_train_batch_size = 32,  # Batch size per device during training
  per_device_eval_batch_size = 256,  # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.1,               # Strength of weight decay
)

qt1 = e1.create_finetune_trainer(quick_training_args)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [172]:
import accelerate

In [184]:
import transformers
reload(transformers)
Trainer = transformers.Trainer
TrainingArguments = transformers.TrainingArguments

def create_finetune_trainer(self: EntailmentClassifier, args: TrainingArguments) -> Trainer:
    return Trainer(
        model = self.model,
        args = training_args,
        train_dataset = self.as_dataset(train_finetune_examples),
        eval_dataset = self.as_dataset(eval_finetune_examples),
        tokenizer = self.tokenizer,
    )

EntailmentClassifier.create_finetune_trainer = create_finetune_trainer

quick_training_args = TrainingArguments(
  output_dir = ".",                  # Output directory
  num_train_epochs = 1,              # Total number of training epochs
  per_device_train_batch_size = 32,  # Batch size per device during training
  per_device_eval_batch_size = 256,  # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.1,               # Strength of weight decay
)

qt1 = e1.create_finetune_trainer(quick_training_args)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [182]:
import sys

In [183]:
sys.path

['C:\\Users\\me\\AppData\\Local\\Programs\\PyCharm Professional\\plugins\\python\\helpers-pro\\jupyter_debug',
 'C:\\Users\\me\\AppData\\Local\\Programs\\PyCharm Professional\\plugins\\python\\helpers\\pydev',
 '/home/we/Code/intent-classifier/docs',
 '/home/we/Code/intent-classifier',
 '/home/we/.pyenv/versions/3.12.1/lib/python312.zip',
 '/home/we/.pyenv/versions/3.12.1/lib/python3.12',
 '/home/we/.pyenv/versions/3.12.1/lib/python3.12/lib-dynload',
 '',
 '/home/we/.virtualenvs/intent-classifier/lib/python3.12/site-packages',
 '../server',
 '../client',
 '/tmp/tmpdwrw4_tf']

In [185]:
accelerate

<module 'accelerate' from '/home/we/.virtualenvs/intent-classifier/lib/python3.12/site-packages/accelerate/__init__.py'>

We tried to train, but we encounted an issue and need to reload the transformers module.