In [1]:
# code ultimately from: https://huggingface.co/transformers/custom_datasets.html


# articles I read but didn't end up using as much
# https://towardsdatascience.com/bert-to-the-rescue-17671379687f
# https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
pip install transformers



In [4]:
import sys
import pandas as pd
import numpy as np
import random as rn
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [5]:
# check that we're using GPU - this is from the Chen notebook: https://colab.research.google.com/drive/1P4Hq0btDUDOTGkCHGzZbAx1lb0bTzMMa?usp=sharing
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [6]:
# set all seeds: https://github.com/shudima/notebooks/blob/master/BERT_to_the_rescue.ipynb
rn.seed(117)
np.random.seed(117)
torch.manual_seed(117)
torch.cuda.manual_seed(117)

In [7]:
# load train, val, test
data_path = "/content/drive/MyDrive/nlp_data/deep_learning_data/"
train = pd.read_csv(data_path + "train.csv")
test = pd.read_csv(data_path + "test.csv")
val = pd.read_csv(data_path + "val.csv")

In [8]:
# truncate to lengths that Colab RAM can handle
train_texts = train['defn'][0:50000]
train_labels = train['cat'][0:50000]

val_texts = val['defn'][0:12500]
val_labels = val['cat'][0:12500]

test_texts = test['defn'][0:12500]
test_labels = test['cat'][0:12500]

In [9]:
# instantiate BERT tokenizer and tokenize the data
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [10]:
# make a torch Dataset for the data
class UDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = UDDataset(train_encodings, train_labels)
val_dataset = UDDataset(val_encodings, val_labels)
test_dataset = UDDataset(test_encodings, test_labels)

In [None]:
# compute metrics is from: https://huggingface.co/transformers/training.html
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/nlp_data/bert_results2/final_model/checkpoints',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/nlp_data/bert_results2/final_model/logs/',            # directory for storing logs
    logging_steps=500,               #adjusted these steps to take less memory
    save_steps = 2500,
    save_total_limit = 10,
)

# must tell the model how many labels to expect, else assumes binary
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 3)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics)             

trainer.train()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Step,Training Loss
500,1.100668
1000,1.095058
1500,1.085775
2000,1.088925
2500,1.091675
3000,1.088419
3500,1.082031
4000,1.081734
4500,1.088106
5000,1.081945


In [15]:
# save the final model and tokenizer for reuse
trainer.save_model('/content/drive/MyDrive/nlp_data/bert_results2/final_model/model')
tokenizer.save_pretrained('/content/drive/MyDrive/nlp_data/bert_results2/final_model/tokenizer')


('/content/drive/MyDrive/nlp_data/bert_results2/final_model/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/nlp_data/bert_results2/final_model/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/nlp_data/bert_results2/final_model/tokenizer/vocab.txt',
 '/content/drive/MyDrive/nlp_data/bert_results2/final_model/tokenizer/added_tokens.json')

In [18]:
test_eval2 = trainer.predict(test_dataset)

In [28]:
train_eval = trainer.predict(train_dataset)

In [29]:
# metrics on train set
train_eval.metrics

{'eval_accuracy': 0.672,
 'eval_f1': 0.672,
 'eval_loss': 0.7674375176429749,
 'eval_precision': 0.672,
 'eval_recall': 0.672}

In [26]:
# metrics on test set
test_eval2.metrics

{'eval_accuracy': 0.40144,
 'eval_f1': 0.40143999999999996,
 'eval_loss': 1.2062667608261108,
 'eval_precision': 0.40144,
 'eval_recall': 0.40144}

In [20]:
# now we save out the results on the test set for comparison with other models
pred = test_eval2.predictions.argmax(-1)
results = pd.DataFrame({"pred": pred, "true": test_labels, "text" : test_texts})
results["match"] = results["pred"] == results["true"]

In [24]:
# check how the model performs by true category
results.groupby(["true", "match"]).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))

true  match
0     False    56.138614
      True     43.861386
1     False    55.308924
      True     44.691076
2     False    68.386308
      True     31.613692
dtype: float64

In [30]:
# save test results
results.to_csv("/content/drive/MyDrive/nlp_data/bert_results2/final_model/test_eval_final_bert.csv")

In [None]:
# example of loading a previously-trained model 
# original_model = DistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/nlp_data/bert_results2/first_batch', num_labels = 3)

# prediction syntax: https://github.com/huggingface/transformers
# https://stackoverflow.com/questions/55466298/pytorch-cant-call-numpy-on-variable-that-requires-grad-use-var-detach-num

# inputs = tokenizer(list(test_texts)[1:100], truncation = True, padding = True, return_tensors="pt")
# outputs = original_model(**inputs)

# outputs[0].detach().numpy().argmax(-1)