In [1]:
import torch
# Report whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
torch.cuda.is_available()

True

In [2]:
import numpy as np
import pandas as pd
import evaluate
from datasets import load_dataset, load_metric, list_metrics
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
# --- Run configuration -------------------------------------------------------
# Dataset variant and the text column fed to the tokenizer.
version = 'TPrN'
feature_set = 'mergedText'

# Training hyper-parameters.
no_of_epochs = 5
num_classes = 22
batch_size = 32
dropout = 0.1
decay = 0.1
lr = 2e-5
max_len = 512
fp16 = True

# Evaluation metric and base checkpoint.
metric_name = 'f1'
model_checkpoint = "bert-base-uncased"

# Encode the hyper-parameters in the saved model's name so runs can be told apart.
new_model_name = (
    f'bert-base-uncased-{version}_bs{batch_size}-ep{no_of_epochs}'
    f'-lr{lr}-wd{decay}-dp{dropout}-ml{max_len}'
)
if fp16:
    new_model_name += '-fp16'
print(new_model_name)

bert-base-uncased-TPrN_bs32-ep5-lr2e-05-wd0.1-dp0.1-ml512-fp16


In [4]:
# Load the train/dev/test CSV splits for the configured dataset `version`.
# Forward slashes are used on purpose: the previous '.\data\data_...' literals only
# worked because '\d' is an unrecognised escape sequence (deprecated, and a warning on
# recent Python versions), and they resolved to real paths on Windows only.
data_files = {
    split: f'./data/data_{split}-{version}.csv'
    for split in ('train', 'dev', 'test')
}
dataset = load_dataset('csv', data_files=data_files)

# `datasets.load_metric` is deprecated in favour of the `evaluate` library, which is
# already imported by this notebook; the returned object exposes the same `.compute()`.
metric = evaluate.load(metric_name)

Found cached dataset csv (C:/Users/Admin/.cache/huggingface/datasets/csv/default-9a72842af8de3d23/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

  metric = load_metric(metric_name)


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    """Tokenize the `feature_set` column of a batch of examples.

    Sequences are truncated and padded to exactly `max_len` tokens.
    """
    texts = examples[feature_set]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )

# Apply the tokenizer to every split, batch-wise.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/20070 [00:00<?, ? examples/s]

Map:   0%|          | 0/4306 [00:00<?, ? examples/s]

Map:   0%|          | 0/4300 [00:00<?, ? examples/s]

In [6]:
# Load the base checkpoint for sequence classification with `num_classes` labels;
# the same `dropout` value is used for hidden-state and attention dropout.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_classes,
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Keep only the last path component of the checkpoint id (drops any "org/" prefix).
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Evaluate and checkpoint once per epoch, keep at most five checkpoints, and reload
# the checkpoint with the best `metric_name` score when training finishes.
args = TrainingArguments(
    output_dir=f"./snapshots/{model_name}-finetuned",
    # schedule
    num_train_epochs=no_of_epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # optimisation
    learning_rate=lr,
    weight_decay=decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=fp16,
    # model selection
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [8]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`, micro-averaged.

    `eval_pred` unpacks into (model outputs, reference labels); the predicted class
    for each row is the arg-max over the last axis of the model outputs.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_ids, references=labels, average="micro")

In [9]:
# Wire model, arguments, data splits and metric callback together.
# Training uses the 'train' split; per-epoch evaluation uses the 'dev' split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['dev'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [10]:
# Fine-tune the model. With the settings in `args`, the dev split is evaluated and a
# checkpoint is saved after every epoch, and the best checkpoint (by `metric_name`)
# is reloaded when training ends. The returned TrainOutput is shown as the cell output.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.9665,0.550148,0.832791
2,0.4204,0.499525,0.835114
3,0.3198,0.471849,0.857408
4,0.2079,0.529189,0.849512
5,0.161,0.556496,0.852531


TrainOutput(global_step=3140, training_loss=0.37750635572299834, metrics={'train_runtime': 1485.7815, 'train_samples_per_second': 67.54, 'train_steps_per_second': 2.113, 'total_flos': 2.6407935677952e+16, 'train_loss': 0.37750635572299834, 'epoch': 5.0})

In [11]:
# Persist the trainer's current model under a directory named after the run configuration.
saved_path = f"./models/{new_model_name}"
trainer.save_model(saved_path)

In [None]:
# Classify the test split with the in-memory `model`, using the same truncation and
# padding settings as during preprocessing.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda:0")
test_texts = dataset['test'][feature_set]
results = classifier(
    test_texts,
    max_length=max_len,
    padding="max_length",
    truncation=True,
)

# Keep only the class id: strip the 'LABEL_' prefix from each predicted label.
dfResults = pd.DataFrame.from_dict(results)
dfResults['label'] = dfResults['label'].str.replace('LABEL_','')

# One predicted label per line, no header and no index column.
out_file = f'results_{new_model_name}.csv'
dfResults['label'].to_csv(out_file, sep=',', encoding='utf-8', index=False, header=False)
print(out_file)

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


In [None]:
# Score the test-set predictions with micro- and macro-averaged F1.
# This needs gold labels in the test split, so it is not available for the private test set.
#
# `evaluate.load` replaces the deprecated `datasets.load_metric`. The metric is loaded
# once under its own name, so the global `metric` used by `compute_metrics` is not
# rebound, and it is reused for both averages.
f1_metric = evaluate.load('f1')

# The predicted labels are stored as strings (e.g. '3'); convert them to integers
# explicitly instead of relying on the metric to coerce them.
test_predictions = dfResults['label'].astype(int).tolist()
test_references = dataset['test']['label']

f1micro = f1_metric.compute(predictions=test_predictions, references=test_references, average="micro")
f1macro = f1_metric.compute(predictions=test_predictions, references=test_references, average="macro")

# One line per run: "<model name>, <micro F1>, <macro F1>".
results_row = new_model_name + ', ' + str(f1micro['f1']) + ', ' + str(f1macro['f1']) + '\n'
print(results_row)

# Append to the running summary; the encoding is pinned to match the results CSV
# rather than depending on the platform default.
with open("summary_results.csv", "a", encoding="utf-8") as myfile:
    myfile.write(results_row)