In [None]:
# from huggingface_hub import notebook_login
# 
# notebook_login()

In [None]:
# from transformers.utils import send_example_telemetry
# send_example_telemetry("language_modeling_notebook_finetuning_nli", framework="tensorflow")

In [1]:
# in case of problems with the gpu memory
def clear_gpu_mem(): 
    from numba import cuda 
    device = cuda.get_current_device()
    device.reset()

clear_gpu_mem()

#### Load finetuning data

In [2]:
from sklearn.model_selection import train_test_split # for more convenient data splitting
import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict # to create Dataset objects
import pprint
import tensorflow as tf

import mlflow # for ml tracking

from string import Template # to template the premise and hypothesis for the NLI task

2023-12-19 15:15:13.158499: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-19 15:15:13.186034: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-19 15:15:13.186059: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-19 15:15:13.186076: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-19 15:15:13.191119: I tensorflow/core/platform/cpu_feature_g

In [3]:
pd.set_option("display.max_colwidth", None)
pd.set_option("colheader_justify", "left")

path = "../data/processed"
dataset_files = ["question_avoidance_preprocessed_dataset.parquet"]
finetuning_datasets = {}
for i in dataset_files:
    finetuning_datasets[i.split(".parquet")[0]] = pd.read_parquet(f"{path}/{i}", engine="pyarrow")

In [4]:
finetuning_datasets["question_avoidance_preprocessed_dataset"].sample(5)

Unnamed: 0,question,answer,label
279,"I'm a young guy with two weeks off and about $1,000 to spend, so I'm wondering what experiences have stuck with people most. What was your best adventure/program/vacation?",The first time I went to an actually nice beach after growing up going to the beaches on the Gulf of Mexico.,2
212,"It does not take Sherlock Holmes to work out that if a rich father employs accountants or financial advisers to produce figures that indicate that he does not have the financial wherewithal to pay maintenance to his children , there is a possibility that some fiddles will be going on . Where financial advisers and accountants are employed to suggest that a father can not afford to pay maintenance for his children , surely there should be another system to check whether something may be wrong ?","Family insecurity is now as important as economic insecurity as a force for poverty and disadvantage . We have new policies on child support , which we aim to implement as soon as possible . We are driving up the rate of payment—in terms of the cash that is now due , some 70 per cent . or more of maintenance is now paid . There remain some non - resident parents or absent fathers , however—perhaps 30 per cent.—who are not paying maintenance . The purpose of our new reforms is to spend less time on complicated assessment , in a move towards simplicity , and more time on enforcement . It is crucial that we undertake that as soon as possible .",0
340,"If the Home Secretary is not willing to consider handing the police in London over to an elected authority , will he consider handing them over to at least a partly elected authority , in line with the discussions going on about other powers in the context of the GLC abolition Bill being considered in the other place between Ministers and Conservative peers ?","Since returning from the United States , I have tried to play a full part in alerting the public to the problem of the glut of cocaine available in South America and likely to be diverted from the American to the British market . Already , a considerable response has been evident , notably in the formation of two customs teams specifically to deal with cocaine . Cocaine seizures this year are much higher than they were last year . That is a measure of success , but we intend to be diligent on this vital topic .",0
401,Do you think you're enjoying tennis again? It seemed after you won the French Open there was a lot of pressure; last year you had a tough match here. Are you feeling happier?,"I think, yes, yes. At least until beginning of 2013 until now I didn't waste the if I have match point, I didn't win the match, so... I think it's very tough, you know, because when I was playing Cirstea in Rod Laver, I was thinking about the match last year. Something you couldn't forget. Always in your mind, yeah.",2
375,"As the Gray report refers to major procurement activities , will the Minister tell me , and the House , what recent discussions he has had with commanders on the ground about the effectiveness of personal protection equipment for our troops in theatre - such as the Stourbridge war hero , 19-year - old Michelle Norris , who risked her life and was the first woman to gain the military cross for her work ?","Of course I noticed that rather startling figure when I read the Gray report myself . The right hon Gentleman , who has obviously read the report , will also have noticed that there is no evidential basis for that statement anywhere in it , nor is there an evidential basis for it anywhere else that I have ever come across . The very fact that the figure ranges between £ 1 billion and more than £ 2 billion shows , I think , how imprecise that statement inevitably is .",0


In [5]:
print("Available datasets:", list(finetuning_datasets.keys()))

Available datasets: ['question_avoidance_preprocessed_dataset']


#### Initialize mlflow

To launch the ui:

```shell
poetry run mlflow ui
```

In [33]:
mlflow.set_experiment("Question Dodging - Zero Shot MNLI")
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# autologging
mlflow.tensorflow.autolog()

#### Set up GPU

In [7]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try:
    print(gpus)
    tf.config.experimental.set_memory_growth(gpus[0], True)
    """
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    """;
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

2023-12-19 15:15:22.153930: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-19 15:15:22.170722: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


It's important to reformulate the premise and hypothesis fed into the model. Example:

#### Load zero-shot model

There is a number of zero-shot classification models that could be used. 

One example is [typeform/distilbert-base-uncased-mnli](https://huggingface.co/typeform/distilbert-base-uncased-mnli). It supports TF/Keras as well and performs okay-ish.

Other good options:
- https://huggingface.co/facebook/bart-large-mnli (for English only)
- https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli (outperforms other models)
- https://huggingface.co/joeddav/xlm-roberta-large-xnli (multilingual)

In [8]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig

In [9]:
# loading the model
model_name = "typeform/distilbert-base-uncased-mnli"
#model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
#model_name = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)
num_labels = len(config.id2label)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [10]:
config

DistilBertConfig {
  "_name_or_path": "typeform/distilbert-base-uncased-mnli",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "mnli",
  "hidden_dim": 3072,
  "id2label": {
    "0": "ENTAILMENT",
    "1": "NEUTRAL",
    "2": "CONTRADICTION"
  },
  "initializer_range": 0.02,
  "label2id": {
    "CONTRADICTION": 2,
    "ENTAILMENT": 0,
    "NEUTRAL": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.35.0",
  "vocab_size": 30522
}

In [11]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=config.id2label,
    label2id=config.label2id)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [12]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66955779 (255.42 MB)
Trainable params: 66955779 (255.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
model.layers[0]

<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertMainLayer at 0x7ff43c15feb0>

#### Load preprocesssed data

In [20]:
list_of_datasets = [finetuning_datasets[dataset] for dataset in finetuning_datasets]
data = pd.concat(list_of_datasets)

del finetuning_datasets
data.sample(3)

Unnamed: 0,question,answer,label
351,Does my hon Friend agree that the export tariff of 3p per kWh for households should be increased ?,"The coalition agreement commits the Government to a huge increase in energy from waste through anaerobic digestion , and to that end we brought the industry together in a meeting on 6 July , together with colleagues from Department for Environment , Food and Rural Affairs and the Department for Communities and Local Government , to drive the agenda forward . It is early days for the feed - in tariff scheme generally , and as we know it is a new scheme . I am fully aware of the specific problems with farm - based anaerobic digesters , which the hon Gentleman raised , and I am commissioning further technical work in my Department to try to deal with them .",0
170,MAZZTER WHATS UP BRO\n\nI know I've played a lot with you before but I don't remember where. Does the name Applebottom James or Bart Arkdukus Farnum ring a bell with you?,No. :(\n\nI have been on the 2fort2furious servers a lot though.\n\nMore recently they're a bit dead so I sometimes join a Lotus 24/7 2fort server for fun. Or random servers that look fun.,2
361,"Is it the case that the delivery unit monitors between four and six targets in each of the policy areas of health , education , transport and law and order , selected from the Departments ' public service agreements ? Will the Minister tell us which particular targets are selected at the moment ? Perhaps he could give us examples and put a note in the Library so that we can know what all the targets are and what the future priorities will be .","I am grateful to my hon Friend for those observations . Given my background in the Department of Trade and Industry , I am fully aware of the important work being done on getting services online . I am also aware that some of the most innovative and exciting work has been done at local government level . That is why I , with responsibility for e - transformation , will work closely with colleagues at the newly configured centre to ensure that we learn from what is best in local government and take the opportunity share best practice across the country .",0


In [21]:
example = tokenizer("Question: Are you here? Answer: I'm just an engineer", "This answer dodges the question", add_special_tokens=True, padding=True, truncation="only_first", return_attention_mask=True)

prediction = model.predict(example["input_ids"])
print(prediction.logits)
print(config.id2label[np.argmax(prediction.logits)])

[[-2.7766745  4.1372247 -1.8948877]]
NEUTRAL


One could use the `train_test_split` method from `datasets` ([source](https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/main_classes)) which readily splits a dataset object to a train and test set, but using the sklearn one makes it easier to get a train, test, and validation split. 

In [22]:
X = data[["question", "answer"]]
y = data[["label"]]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [23]:
train_dataset = pd.concat([X_train, y_train], axis=1)
test_dataset = pd.concat([X_test, y_test], axis=1)
val_dataset = pd.concat([X_val, y_val], axis=1)

In [24]:
train_dataset = Dataset.from_pandas(train_dataset, preserve_index=False)
test_dataset = Dataset.from_pandas(test_dataset, preserve_index=False)
val_dataset = Dataset.from_pandas(val_dataset, preserve_index=False)

In [None]:
#del data, X, y

In [25]:
dataset = DatasetDict({"train": train_dataset, "test": test_dataset, "val": val_dataset})

In [26]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'label'],
        num_rows: 253
    })
    test: Dataset({
        features: ['question', 'answer', 'label'],
        num_rows: 85
    })
    val: Dataset({
        features: ['question', 'answer', 'label'],
        num_rows: 85
    })
})

In [27]:
dataset["train"]["question"][0]

'On a slightly lighter note, what do you think makes a good definitive fist pump: the quiet steely determination or fullon adrenaline spin your arms around?'

In [28]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66955779 (255.42 MB)
Trainable params: 66955779 (255.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### Preprocessing the input sequence

In [31]:
#mlflow.end_run()

In [34]:
mlflow.start_run()

<ActiveRun: >

In [35]:
premise_template = Template("Question: $question. Answer: $answer")
hypothesis_template = Template("In this example, the answer evades or ignores the question.")

mlflow.log_params({
        "premise_template": premise_template.safe_substitute(),
        "hypothesis_template": hypothesis_template.safe_substitute(),
        "input_note": "passed as premise, hypothesis" # "passed into tokenizer as [premise, hypothesis]" 
    }
)

def preprocess_function(row, train=True):
    premise = premise_template.safe_substitute(question = row['question'], answer = row['answer'])
    hypothesis = hypothesis_template.safe_substitute()

    encoded = tokenizer(premise, hypothesis, add_special_tokens=True, padding=True, truncation="only_first", return_attention_mask=True)
    #encoded = tokenizer(premise, hypothesis, add_special_tokens=True,  truncation="only_first", return_attention_mask=True, return_tensors="tf")# padding=True,
    #return_tensors="np")
    if train:
        encoded["labels"] = row["label"]
    return encoded

Example before training

In [36]:
example = preprocess_function({"question": "Are you inside the house", "answer": "But I'm just an engineer"}, train=False)

print(example["input_ids"])
print(type(example["input_ids"]))
print(example)
#print(tokenizer.decode(example.input_ids))
#print(example.input_ids)

[101, 3160, 1024, 2024, 2017, 2503, 1996, 2160, 1012, 3437, 1024, 2021, 1045, 1005, 1049, 2074, 2019, 3992, 102, 1999, 2023, 2742, 1010, 1996, 3437, 26399, 2015, 2030, 26663, 1996, 3160, 1012, 102]
<class 'list'>
{'input_ids': [101, 3160, 1024, 2024, 2017, 2503, 1996, 2160, 1012, 3437, 1024, 2021, 1045, 1005, 1049, 2074, 2019, 3992, 102, 1999, 2023, 2742, 1010, 1996, 3437, 26399, 2015, 2030, 26663, 1996, 3160, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [38]:
tokenizer.decode(example["input_ids"])

"[CLS] question : are you inside the house. answer : but i'm just an engineer [SEP] in this example, the answer evades or ignores the question. [SEP]"

In [48]:
example_output = model(np.array(example["input_ids"]))
print(example_output.logits)
print(config.id2label[np.argmax(example_output.logits)])

tf.Tensor([[-2.0722811  4.369235  -3.2038815]], shape=(1, 3), dtype=float32)
NEUTRAL


In [49]:
encoded_dataset = dataset.map(preprocess_function, remove_columns=["question", "answer", "label"])

Map:   0%|          | 0/253 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

In [50]:
encoded_dataset["train"].features["labels"]

Value(dtype='int64', id=None)

In [51]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 253
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 85
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 85
    })
})

In [53]:
tokenizer.decode(encoded_dataset["train"]["input_ids"][0])

'[CLS] question : on a slightly lighter note, what do you think makes a good definitive fist pump : the quiet steely determination or fullon adrenaline spin your arms around?. answer : ( laughter. ) i think i have a few different versions. [SEP] in this example, the answer evades or ignores the question. [SEP]'

In [54]:
# a helper function to show the prediction results

def get_results(outputs, model, return_all_scores=True):
    scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
    if return_all_scores:
        return [
            [{"label": model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                for item in scores
            ]
    else:
        return [
            {"label": model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores
        ]

In [55]:
dataset_batch_size = 4 # 16

tf_train_dataset = model.prepare_tf_dataset(
    encoded_dataset["train"],
    shuffle=True,
    batch_size=dataset_batch_size,
    tokenizer=tokenizer
)

tf_validation_dataset = model.prepare_tf_dataset(
    encoded_dataset["val"],
    shuffle=False,
    batch_size=dataset_batch_size,
    tokenizer=tokenizer,
)

tf_test_dataset = model.prepare_tf_dataset(
    encoded_dataset["test"],
    shuffle=False,
    batch_size=dataset_batch_size,
    tokenizer=tokenizer,
)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [56]:
# now the dataset is ready to be fed into the model to fit
tf_train_dataset

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(4, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(4, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(4,), dtype=tf.int64, name=None))>

In [57]:
from transformers import create_optimizer

batch_size = 4
num_epochs = 10
number_of_training_examples = tf_train_dataset.cardinality().numpy()
batches_per_epoch = number_of_training_examples // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=2e-4, num_warmup_steps=0, num_train_steps=total_train_steps, weight_decay_rate=0.3
)

In [58]:
model.compile(optimizer=optimizer) # run_eagerly=True, 

In [60]:
# The issue was that BatchEncoding objects are not accepted, they need to be converted into a dict first
# https://github.com/huggingface/transformers/issues/20709


In [61]:
# evaluating loss before finetuning the model on our "target data"
before_finetuning_history = model.evaluate(tf_test_dataset)



In [62]:
# we are looking at Mean loss
print(model.metrics)
print(before_finetuning_history)

[<keras.src.metrics.base_metric.Mean object at 0x7ff337d1e4d0>]
3.3094873428344727


In [63]:
mlflow.log_metric("loss before finetuning", before_finetuning_history)

In [None]:
#import os
#os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

In [64]:
from evaluate import load

In [65]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard
from transformers.keras_callbacks import KerasMetricCallback

# remember to install git-lfs
# !apt install git-lfs

def compute_metrics(eval_predictions):
    predictions, labels = eval_predictions
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

#metric = model.metrics[0]

metric = load("glue", "mnli")
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=tf_validation_dataset
)

push_to_hub_model_id = "question-dodging-finetuned-distilbert-base-uncased-mnli"
tensorboard_callback = TensorBoard(log_dir="./text_classification_model_save/logs")

push_to_hub_callback = PushToHubCallback(
    output_dir="./text_classification_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)

callbacks = [metric_callback, tensorboard_callback] #, push_to_hub_callback]


/home/snek/a-politicians-answer/notebooks/text_classification_model_save is already a clone of https://huggingface.co/i-be-snek/question-dodging-finetuned-distilbert-base-uncased-mnli. Make sure you pull the latest changes with `repo.git_pull()`.


In [66]:
# clear_gpu_mem()

In [67]:
history = model.fit(
        tf_train_dataset,
        validation_data=tf_validation_dataset,
        epochs=num_epochs,
        #batch_size=2,
        verbose=1,
        callbacks=callbacks
    )



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: /tmp/tmp743_abzq/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp743_abzq/model/data/model/assets


In [69]:
after_finetuning_history = model.evaluate(tf_test_dataset)
after_finetuning_history



0.7330948114395142

In [70]:
mlflow.end_run()

In [72]:
example_output = model(np.array(example["input_ids"]))
print(example_output.logits)
print(config.id2label[np.argmax(example_output.logits)])

tf.Tensor([[-0.26290268 -0.4130244   0.45110038]], shape=(1, 3), dtype=float32)
CONTRADICTION


In [74]:
tokenizer.decode(example["input_ids"])

"[CLS] question : are you inside the house. answer : but i'm just an engineer [SEP] in this example, the answer evades or ignores the question. [SEP]"