In [1]:
import torch
# Show whether PyTorch can see a CUDA device. Later cells in this notebook
# train with fp16 enabled and run the inference pipeline on "cuda:0".
torch.cuda.is_available()

True

In [2]:
import numpy as np
import pandas as pd
import evaluate
from datasets import load_dataset, load_metric, list_metrics
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
# --- Experiment configuration ------------------------------------------------
# Data selection
version = 'TPrN'            # suffix of the data_<split>-<version>.csv files
feature_set = 'mergedText'  # dataset column that is tokenized
num_classes = 22            # number of output labels of the classifier

# Training hyperparameters
no_of_epochs = 5
batch_size = 32
lr = 2e-5
decay = 0.1                 # weight decay
dropout = 0.1               # recorded in the run name below
max_len = 512               # tokenizer max_length
fp16 = True                 # mixed-precision training flag

# Metric and base checkpoint
metric_name = 'f1'
model_checkpoint = "distilbert-base-uncased"

# Run identifier that spells out the hyperparameters above; it names the saved
# model directory and the results file written further down.
new_model_name = (
    f"distilbert-base-uncased-{version}"
    f"_bs{batch_size}-ep{no_of_epochs}-lr{lr}"
    f"-wd{decay}-dp{dropout}-ml{max_len}"
)
if fp16:
    new_model_name += '-fp16'
print(new_model_name)

distilbert-base-uncased-TPrN_bs32-ep5-lr2e-05-wd0.1-dp0.1-ml512-fp16


In [4]:
# Load the train/dev/test CSV splits for the selected data version.
# Forward slashes are used on purpose: a non-raw '.\data\...' literal only
# works because '\d' happens to be an unrecognised escape (newer Python
# versions warn about it), and backslash separators do not resolve on POSIX.
data_files = {
    split: f'./data/data_{split}-{version}.csv'
    for split in ('train', 'dev', 'test')
}
dataset = load_dataset('csv', data_files=data_files)

# `datasets.load_metric` is deprecated; the `evaluate` package imported at the
# top of the notebook loads the same metric with the same `compute` interface.
metric = evaluate.load(metric_name)

Found cached dataset csv (C:/Users/Admin/.cache/huggingface/datasets/csv/default-9a72842af8de3d23/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

  metric = load_metric(metric_name)


In [5]:
# Tokenizer that matches the pretrained checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def preprocess_function(examples):
    """Tokenize the `feature_set` column of a batch of examples.

    Sequences are truncated to `max_len` tokens and padded up to `max_len`,
    so every encoded example has the same length.
    """
    texts = examples[feature_set]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )


# batched=True passes many rows per call to preprocess_function.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\Admin\.cache\huggingface\datasets\csv\default-9a72842af8de3d23\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3a6959575b2dff4c.arrow


Map:   0%|          | 0/4306 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\Admin\.cache\huggingface\datasets\csv\default-9a72842af8de3d23\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9cd193ba6487483d.arrow


In [6]:
# Load the pretrained checkpoint with a classification head of `num_classes`
# outputs. The `dropout` hyperparameter is part of the run name, so it is
# actually applied here: DistilBERT's config calls these settings `dropout`
# and `attention_dropout` (BERT's `hidden_dropout_prob` /
# `attention_probs_dropout_prob` names do not exist on it). The classifier-head
# dropout (`seq_classif_dropout`) is left at the checkpoint's default.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_classes,
    dropout=dropout,
    attention_dropout=dropout,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [7]:
# Basename of the checkpoint id (text after the last "/"), used to name the
# directory that receives the per-epoch training checkpoints.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"./snapshots/{model_name}-finetuned",
    # Optimisation settings
    num_train_epochs=no_of_epochs,
    learning_rate=lr,
    weight_decay=decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=fp16,
    # Evaluate and checkpoint once per epoch, keeping at most 5 checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # After training, reload the checkpoint that scored best on `metric_name`
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [8]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the globally loaded `metric`.

    `eval_pred` unpacks into (model outputs, reference labels). The predicted
    class of each example is the argmax over the last axis of the outputs.
    Returns whatever `metric.compute` produces with micro averaging.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(
        references=labels,
        predictions=predicted_classes,
        average="micro",
    )

In [9]:
# Wire model, training arguments, data splits and metric callback together.
# The dev split is the one evaluated during training.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['dev'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [10]:
# Run fine-tuning. With the TrainingArguments configured above, the dev split
# is evaluated and a checkpoint is saved after every epoch, and the best
# checkpoint by `metric_name` is reloaded at the end. The returned TrainOutput
# is displayed as the cell's result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.936,0.527977,0.83372
2,0.4214,0.460878,0.846493
3,0.3244,0.45918,0.847887
4,0.2268,0.481282,0.849048
5,0.1875,0.480308,0.855318


TrainOutput(global_step=3140, training_loss=0.3839228891263342, metrics={'train_runtime': 753.5193, 'train_samples_per_second': 133.175, 'train_steps_per_second': 4.167, 'total_flos': 1.32978447277056e+16, 'train_loss': 0.3839228891263342, 'epoch': 5.0})

In [11]:
# Persist the trained model under ./models/<run name>.
saved_path = f"./models/{new_model_name}"
trainer.save_model(saved_path)

In [12]:
# Classify every text of the test split with the fine-tuned model on cuda:0,
# using the same truncation/padding settings as during training.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device="cuda:0",
)
test_texts = dataset['test'][feature_set]
results = classifier(
    test_texts,
    truncation=True,
    padding="max_length",
    max_length=max_len,
)

# Strip the 'LABEL_' prefix from the predicted labels, leaving the remainder.
dfResults = pd.DataFrame.from_dict(results)
dfResults['label'] = dfResults['label'].str.replace('LABEL_','')

# Write one predicted label per line, without header or index column.
out_file = f"results_{new_model_name}.csv"
dfResults['label'].to_csv(
    out_file,
    index=False,
    header=False,
    sep=',',
    encoding='utf-8',
)
print(out_file)

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


results_distilbert-base-uncased-TPrN_bs32-ep5-lr2e-05-wd0.1-dp0.1-ml512-fp16.csv


In [13]:
# Score the test-set predictions with micro- and macro-averaged F1.
# This cell needs gold labels, so it is not usable for a private test set
# that has no 'label' column.
#
# The metric is loaded once, under its own name: rebinding the global `metric`
# here would silently replace the object that `compute_metrics` reads if the
# trainer is evaluated again. `evaluate.load` is used because
# `datasets.load_metric` is deprecated.
test_f1_metric = evaluate.load('f1')

# The predictions were stored as strings (the 'LABEL_' prefix was stripped
# from the pipeline output); convert them to integers so they are compared
# with the reference labels as class ids rather than via implicit casting.
# NOTE(review): assumes the 'label' column holds integer class ids — confirm.
test_predictions = dfResults['label'].astype(int).tolist()
test_references = dataset['test']['label']

f1micro = test_f1_metric.compute(
    predictions=test_predictions, references=test_references, average="micro"
)
f1macro = test_f1_metric.compute(
    predictions=test_predictions, references=test_references, average="macro"
)

# Row format: <run name>, <micro F1>, <macro F1>
results_row = new_model_name + ', ' + str(f1micro['f1']) + ', ' + str(f1macro['f1']) + '\n'
print(results_row)

# Append, so successive runs accumulate in a single summary file.
with open("summary_results.csv", "a", encoding="utf-8") as myfile:
    myfile.write(results_row)

distilbert-base-uncased-TPrN_bs32-ep5-lr2e-05-wd0.1-dp0.1-ml512-fp16, 0.7711627906976745, 0.4800302223960525

