## Checks and prep

In [1]:
# verify some package versions
import sklearn
import transformers
import torch
import pandas
import numpy

# Report each imported package as "<module>.__version__=<repr>", one per line.
for mod in (sklearn, transformers, torch, pandas, numpy):
    print(f"{mod.__name__}.__version__={mod.__version__!r}")

sklearn.__version__='1.5.2'
transformers.__version__='4.45.1'
torch.__version__='2.2.2+cu118'
pandas.__version__='2.2.3'
numpy.__version__='1.26.4'


In [2]:
# some variables
from pathlib import Path

# Every artifact location hangs off one shared root directory.
artifacts_dir = Path("../artifacts")
datasets_dir = artifacts_dir / "datasets" / "dataset_2024-10-01T12-00-00"
models_dir = artifacts_dir / "models"
model_training_dir = models_dir / "model_current"

In [3]:
# prep output dirs
# parents=True: the directories are nested (e.g. ../artifacts/datasets/<name>),
# so on a fresh checkout the intermediate folders do not exist yet and a plain
# mkdir() would raise FileNotFoundError. exist_ok=True keeps re-runs harmless.
for dir_path in [datasets_dir, models_dir, model_training_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

In [4]:
# verify torch, gpu, and cuda
print(f"{torch.cuda.is_available()=}")

if torch.cuda.is_available():
    print("GPU found")
    !nvidia-smi
    print(f"""{torch.cuda.device_count()=}
{torch.cuda.current_device()=}
{torch.torch.cuda.get_device_name(0)=}
""")

torch.cuda.is_available()=True
GPU found
Wed Oct  9 11:58:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10G                    Off | 00000000:00:1E.0 Off |                    0 |
|  0%   25C    P0              56W / 300W |      3MiB / 23028MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                           

In [5]:
# check input files exist
# Print each expected CSV path, then fail on the first one that is missing.
for fp in (datasets_dir / fname for fname in ["train.csv", "test.csv"]):
    print(fp)
    assert fp.exists(), fp

../artifacts/datasets/dataset_2024-10-01T12-00-00/train.csv
../artifacts/datasets/dataset_2024-10-01T12-00-00/test.csv


## Load and Process Data

In [6]:
# pre-process utilities
import pandas as pd
import csv

def blank2mask(x: str) -> str:
    """Return ``x`` with every literal "[BLANK]" replaced by "[MASK]"."""
    return x.replace("[BLANK]", "[MASK]")


def pre_process_df(df_name: "str | os.PathLike[str]"):
    """Read a dataset CSV and rewrite the placeholders in its ``text`` column.

    Args:
        df_name: Path of the CSV file (a ``str`` or ``pathlib.Path``). The
            first column is used as the index and the file is parsed with
            ``csv.QUOTE_NONNUMERIC`` quoting.

    Returns:
        The loaded ``DataFrame`` whose ``text`` values have "[BLANK]"
        replaced by "[MASK]".
    """
    df = pd.read_csv(df_name, index_col=0, quoting=csv.QUOTE_NONNUMERIC)
    # Item assignment (not attribute assignment) so this can only ever
    # touch the column named "text".
    df["text"] = df["text"].map(blank2mask)
    return df

In [7]:
# Load the processed train and test data
# Both splits go through the same pre-processing; train is read first.
df_train, df_test = (
    pre_process_df(datasets_dir / csv_name)
    for csv_name in ('train.csv', 'test.csv')
)
df_train.shape, df_test.shape

((114950, 6), (14721, 5))

In [8]:
# split into train and val set
from sklearn.model_selection import train_test_split

# Pin the shuffle seed so the 90/10 split is identical after every kernel
# restart. Without it, a run that resumes from a saved checkpoint would be
# validated on rows that an earlier run may already have trained on.
train_df, valid_df = train_test_split(df_train, test_size=0.1, random_state=42)

len(train_df), len(valid_df)

(103455, 11495)

## Training Code

In [9]:
from transformers import DistilBertTokenizerFast, DistilBertForMultipleChoice

# Name of the pretrained checkpoint both the tokenizer and the model load from.
model_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_checkpoint,
    clean_up_tokenization_spaces=True,
)
model = DistilBertForMultipleChoice.from_pretrained(model_checkpoint)

Some weights of DistilBertForMultipleChoice were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

# x is like 'choice3' -> get id -> 2
def choice2id(x):
    """Turn a label such as 'choice3' into its zero-based index (last character minus one)."""
    return int(x[-1]) - 1

# Expand one row into four sentences, one per answer option:
# [text_choice1, text_choice2, text_choice3, text_choice4]
# (choices 5 & 6 are skipped)
def process_text(x):
    """Return the row's ``text`` with '[MASK]' replaced by each of choice1..choice4.

    ``x`` is a row indexed by column name; the label slice
    ``'choice1':'choice4'`` selects the options in the row's own order.
    """
    template = x['text']
    options = x['choice1':'choice4'].tolist()
    return [template.replace('[MASK]', option) for option in options]

# Common for train, val and test
class MCDataset(Dataset):
    """Tokenized multiple-choice samples built from a dataframe.

    Each item is a dict of encoding fields, where every field holds the four
    per-choice sequences of one sample, plus a ``labels`` entry (the real
    label index when the frame has a ``label`` column, ``-100`` otherwise).
    """

    def __init__(self, df: pd.DataFrame):
        # One list of four filled-in sentences per dataframe row.
        per_row_variants = df.apply(process_text, axis=1).tolist()
        # The tokenizer takes a flat list -> [sent0*4, sent1*4 ...]
        flat_sentences = []
        for variants in per_row_variants:
            flat_sentences.extend(variants)
        tokenized = tokenizer(flat_sentences, truncation=True, max_length=168)
        # Regroup every field into chunks of four -> {k: [[sent0*4], [sent1*4] ...]}
        grouped = {}
        for key, rows in tokenized.items():
            grouped[key] = [rows[start:start + 4] for start in range(0, len(rows), 4)]
        self.encodings = grouped
        # Labels exist only when the frame carries a 'label' column.
        self.label_flag = 'label' in df.columns
        if self.label_flag:
            self.labels = df.label.map(choice2id).tolist()
            assert len(self.encodings['input_ids']) == len(self.labels)

    def __getitem__(self, idx: int):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        # dummy label (-100) when the frame had no labels, e.g. the test set
        item['labels'] = self.labels[idx] if self.label_flag else -100
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [11]:
# Flatten the per-row sentence lists of the training frame into one list
# and peek at a few entries.
text = []
for sample in df_train.apply(process_text, axis=1).tolist():
    text.extend(sample)
print(text[10:15])

['<! -- Beginning of the Coveo Searchbox section. Keep it here. -- >\n                    ', '<! -- Beginning of the Coveo Searchbox subdivision. Keep it here. -- >\n                    ', '<div class="CoveoFacet" data-title="Salesforce Owner " data-field="@sfownername" data-computed-field="@sfopportunityamountconverted"\n              data-sort-criteria="ComputedFieldDescending" data-number-of-values="7" data-tab="Salesforce"></div>\n             <div class="CoveoFacet" data-title="Opportunity Type" data-field="@sfopportunitytype " data-computed-field="@sfopportunityamountconverted"\n              data-sort-criteria="ComputedFieldDescending" data-number-of-values="5" data-tab="Salesforce"></div>\n             <div class="CoveoFacet" data-title="Opportunity Level" data-field="@sfopportunitystagename', '<div class="CoveoFacet" data-title="Salesforce Owner " data-field="@sfownername" data-computed-field="@sfopportunityamountconverted"\n              data-sort-criteria="ComputedFieldDesce

In [12]:
# check some encodings
# Two sentences tokenized with the same truncation settings MCDataset uses.
tokenizer(
    text[10:12],
    truncation=True,
    max_length=168,
)

{'input_ids': [[101, 1026, 999, 1011, 1011, 2927, 1997, 1996, 11821, 2080, 3945, 8758, 2930, 1012, 2562, 2009, 2182, 1012, 1011, 1011, 1028, 102], [101, 1026, 999, 1011, 1011, 2927, 1997, 1996, 11821, 2080, 3945, 8758, 12572, 1012, 2562, 2009, 2182, 1012, 1011, 1011, 1028, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
# process all encodings
# Tokenizes the whole flattened sentence list with the MCDataset settings.
encodings = tokenizer(
    text,
    truncation=True,
    max_length=168,
)

In [14]:
# A data collator for multiple choice
# taken from https://github.com/huggingface/notebooks/blob/master

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Calling the instance turns a list of per-sample feature dicts into one
    batch dict of tensors shaped ``(batch_size, num_choices, seq_len)`` plus
    an int64 ``labels`` tensor. The input dicts are left unmodified.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Pad and stack ``features`` (a list of dicts, one per sample).

        Each dict maps a field name to a list with one entry per choice and
        carries its label under ``"label"`` or ``"labels"``.
        """
        # features is a list of dicts with len = batch_sz
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read (do not pop) the labels: popping mutated the caller's dicts,
        # so collating the same features a second time raised KeyError.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (sample, choice), sample-major, built directly as a
        # flat list of "batch_size*num_choices" dicts. This replaces the
        # quadratic ``sum(list_of_lists, [])`` concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten from (batch_size*num_choice, max_len) -> (batch_size, num_choice, max_len)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [15]:
import numpy as np

def compute_metrics(eval_predictions):
    """Return ``{"accuracy": ...}`` for a ``(predictions, label_ids)`` pair.

    The predicted class of each row is the arg-max along axis 1; accuracy is
    the float32 mean of the element-wise matches, returned as a Python float.
    """
    scores, references = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == references).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [16]:
from transformers import  TrainingArguments, Trainer, IntervalStrategy
from transformers.trainer_utils import get_last_checkpoint

# Full-run settings (used when a GPU is available).
sz = train_df.shape[0]
num_train_epochs = 3
per_device_train_batch_size = 128
per_device_eval_batch_size = 128
auto_find_batch_size = True
resume_from_checkpoint = True

if not torch.cuda.is_available():
    print("⚠️ No GPU found. Running in debug mode. Will not train a sensible model. ⚠️")
    sz = 500  # reduced dataset size
    num_train_epochs = 1
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 8
    auto_find_batch_size = False
    resume_from_checkpoint = False

# initialize the datasets (both are capped at `sz` rows)
train_dataset = MCDataset(train_df.iloc[:sz])
valid_dataset = MCDataset(valid_df.iloc[:sz])

# Checkpoints and trainer state are written below this directory.
training_output_dir = model_training_dir / f"{model_checkpoint}-mcq" / "results"

# setup the trainer
args = TrainingArguments(
    output_dir=str(training_output_dir),
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    auto_find_batch_size=auto_find_batch_size,
    num_train_epochs=num_train_epochs,
    learning_rate=5e-5,
    weight_decay=0.001,
    warmup_steps=0,
    eval_strategy=IntervalStrategy.STEPS,
    eval_steps=50,
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

# Resolve the checkpoint explicitly. Passing a bare `True` to train() raises
# ValueError when the output directory holds no checkpoint (e.g. first run),
# so fall back to training from scratch in that case.
last_checkpoint = None
if resume_from_checkpoint and training_output_dir.is_dir():
    last_checkpoint = get_last_checkpoint(str(training_output_dir))
if last_checkpoint is not None:
    # NOTE(review): this resumes whatever run left the checkpoint behind, even
    # one made with different hyper-parameters; clear the directory to retrain.
    print(f"Resuming training from {last_checkpoint}")
elif resume_from_checkpoint:
    print(f"No checkpoint found in {training_output_dir}; training from scratch.")

# train
trainer.train(resume_from_checkpoint=last_checkpoint)

# final eval
trainer.evaluate()

	per_device_train_batch_size: 128 (from args) != 64 (from trainer_state.json)


Step,Training Loss,Validation Loss


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.05743272975087166,
 'eval_accuracy': 0.9805132746696472,
 'eval_runtime': 57.1028,
 'eval_samples_per_second': 201.304,
 'eval_steps_per_second': 1.576,
 'epoch': 3.7105751391465676}

In [17]:
# save the model
from datetime import datetime, timezone

# Timestamp the export in UTC. datetime.utcnow() is deprecated since
# Python 3.12; an aware "now" in UTC renders the same string with this format.
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
exported_model_fname = f"model_{date_str}"
trainer.save_model(models_dir / exported_model_fname)

# Verify the model

In [18]:
from transformers import DistilBertTokenizerFast, DistilBertForMultipleChoice
import torch
from transformers.modeling_outputs import MultipleChoiceModelOutput
from transformers.tokenization_utils_base import BatchEncoding

In [19]:
from pathlib import Path
# Directory the trained model was exported to (already a Path, no re-wrap needed).
path = models_dir / exported_model_fname

In [20]:
# Reload tokenizer and model from the exported directory; this rebinds the
# notebook-level `tokenizer` and `model` names used during training.
tokenizer: DistilBertTokenizerFast = (
    DistilBertTokenizerFast.from_pretrained(path)
)
model: DistilBertForMultipleChoice = (
    DistilBertForMultipleChoice.from_pretrained(path)
)

## Single example

In [21]:
question = (
    "About this course ... Presentation Training 101 is a four-part [MASK] to presenting that’s "
    "designed to help you to better develop, prepare and deliver presentations. ... This video "
    "series is ... Coveo Training Catalog Deflection Panel Salesforce Lightning Sign In ...")
choices_map = {
    "choice1": "founding",
    "choice2": "introduction",
    "choice3": "institution",
    "choice4": "unveiling",
}

choices = tuple(choices_map.values())

# Build the inputs the same way the training data was built (process_text):
# one full sentence per choice with [MASK] replaced by that choice. Encoding
# (question, choice) sentence pairs would give the model an input format it
# never saw during fine-tuning.
candidates = [question.replace("[MASK]", choice) for choice in choices]
encoding: BatchEncoding = tokenizer(
    candidates,
    return_tensors='pt',
    padding=True,
    truncation=True,  # same limits as MCDataset
    max_length=168,
)
# Add the batch dimension: (num_choices, seq_len) -> (1, num_choices, seq_len)
inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # input_ids, attention_mask

# Inference only: switch to eval mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    model_output: MultipleChoiceModelOutput = model(**inputs)

# logits hold one score per choice for the single sample in the batch
max_index = int(torch.argmax(model_output.logits, dim=-1).item())
predicted_choice = choices[max_index]

print(predicted_choice)

introduction
