# Training a neural network model on the Alaska dataset

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

## Load the data

In [2]:
# Locations used throughout the notebook. The three sub-directories are
# always joined onto `root_dir` before use.
root_dir = '../../'
models_dir = 'data/models'  # parent directory of the trained model's folder
data_dir = 'data/corpus'    # directory containing the corpus file
src_dir = 'src'             # appended to sys.path so local modules can be imported

In [3]:
import os
import sys

In [4]:
# Make the project's source directory importable (the `training` module is
# imported from it further down). The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [5]:
# File name of the corpus to load; it is joined onto root_dir/data_dir when loading.
corpus_filename = 'alaska_corpus_noisy.json'

In [6]:
from training import TrainingCorpus

In [7]:
# Resolve the corpus location, then load it into a fresh TrainingCorpus.
corpus_path = os.path.join(root_dir, data_dir, corpus_filename)
corpus = TrainingCorpus()
corpus.load(corpus_path)

In [8]:
# Display the corpus size (presumably the number of documents: the recorded
# value equals the sum of the three split sizes printed later -- confirm).
corpus.size

2171

---

## Split the dataset into training, validation and test sets

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Single seed reused for both dataset splits and for the training arguments.
random_state = 3

In [11]:
# Both splits use the same hold-out size and the same seed.
split_options = dict(test_size=0.1, random_state=random_state)

# Step 1: separate the test set from the rest of the corpus.
X_ids, test_ids, X_labels, test_labels = corpus.get_train_test_data(**split_options)

# Step 2: carve the validation set out of what remains after step 1.
train_ids, val_ids, train_labels, val_labels = train_test_split(
    X_ids,
    X_labels,
    **split_options,
)

In [12]:
import numpy as np

In [13]:
def _to_class_ids(labels):
    """Return integer class ids for ``labels``.

    Labels with more than one dimension are reduced with ``argmax`` over
    axis 1 (presumably one-hot rows -- confirm against TrainingCorpus).
    Labels that are already one-dimensional are returned unchanged, so this
    cell can be re-run without raising an axis error on converted labels.
    """
    labels = np.asarray(labels)
    if labels.ndim == 1:
        return labels
    return np.argmax(labels, axis=1)


train_labels = _to_class_ids(train_labels)
val_labels = _to_class_ids(val_labels)
test_labels = _to_class_ids(test_labels)

In [14]:
# Size the label space from the largest class id seen in ANY split rather
# than from the distinct labels of the training split alone: if a rare class
# is missing from the training split, counting distinct training labels
# yields a classifier head too small for the largest label id.
# int(...) keeps num_labels a plain Python int, as len(...) did before.
num_labels = int(max(np.max(train_labels), np.max(val_labels), np.max(test_labels))) + 1
print(f'Training set size: {len(train_ids)}')
print(f'Validation set size: {len(val_ids)}')
print(f'Test set size: {len(test_ids)}')
print(f'No. of labels: {num_labels}')

Training set size: 1757
Validation set size: 196
Test set size: 218
No. of labels: 12


## Retrieve text from ids

In [15]:
def _texts_for(doc_ids):
    """Return one space-joined token string per document id, in input order."""
    return [' '.join(corpus.get_tokens(doc_id)) for doc_id in doc_ids]


train_texts = _texts_for(train_ids)
val_texts = _texts_for(val_ids)
test_texts = _texts_for(test_ids)

## Tokenize text

In [16]:
# Pretrained checkpoint used for both the tokenizer and the classification model.
model_name = 'distilbert-base-uncased'

In [17]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the chosen checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [18]:
# Training and validation texts are truncated only: no padding and no
# return_tensors are requested here. CustomDataset converts each item to a
# tensor on access; batch padding is presumably done by the Trainer, which
# receives the tokenizer -- confirm.
train_encodings = tokenizer(train_texts, truncation=True)
val_encodings = tokenizer(val_texts, truncation=True)
# The test texts are padded and returned as PyTorch tensors ('pt') because
# they are passed to the model directly, in one call, during evaluation.
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

## Define custom Dataset class

In [19]:
import torch
from torch.utils.data import Dataset

In [20]:
class CustomDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to a sequence indexable by sample
            position (its ``items()`` are iterated on every access).
        labels: sequence of labels indexable by sample position; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under
        the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

## Wrap the encodings in Dataset objects

In [21]:
# Pair each split's encodings with its labels. The test split is not wrapped:
# its encodings are passed to the model directly during evaluation.
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

## Load the pretrained model

In [22]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head sized to num_labels.
# The recorded output below reports the classification-head weights as newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

## Set up TrainingArguments and Trainer class for training the model

### Define the trained model's directory

In [23]:
# Folder name for this trained model, resolved under root_dir/models_dir.
# The resulting path is used as the training output directory and as the save/reload location.
model_dir_name = 'alaska_bert_noisy'
model_dir_path = os.path.join(root_dir, models_dir, model_dir_name)

In [24]:
# Create the model directory together with any missing parent directories.
# exist_ok=True makes the call a no-op when the directory is already there,
# replacing the separate existence check and its check-then-create race.
os.makedirs(model_dir_path, exist_ok=True)

### Set up training arguments

In [25]:
from transformers import Trainer, TrainingArguments

In [26]:
training_args = TrainingArguments(
    # --- output location and checkpointing ---
    output_dir=model_dir_path,
    overwrite_output_dir=True,
    save_steps=10000,
    save_total_limit=2,
    # --- optimisation ---
    seed=random_state,
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # --- evaluation and logging ---
    evaluation_strategy='epoch',  # evaluate on the validation dataset after every epoch
    logging_steps=10,
    disable_tqdm=False,  # False keeps the tqdm progress bar visible during training
)

In [27]:
trainer = Trainer(
    # what to train and how
    model=model,
    args=training_args,
    # data: fit on the training split, evaluate on the validation split
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer handed to the Trainer alongside the model
    tokenizer=tokenizer,
)

## Train the model

In [28]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.5948,1.227117,0.1676,1169.302
2,0.2544,0.161258,0.1198,1635.379
3,0.0691,0.038024,0.1179,1661.785
4,0.0362,0.021705,0.1163,1685.976
5,0.03,0.018742,0.1148,1707.144


TrainOutput(global_step=275, training_loss=0.5627439991994337, metrics={'train_runtime': 30.4223, 'train_samples_per_second': 9.039, 'total_flos': 134828860748400, 'epoch': 5.0})

### Save the trained model

In [29]:
# Persist the fine-tuned model to the model directory. The evaluation section
# reloads both the model and the tokenizer from this path, so the tokenizer
# is presumably saved here as well -- confirm.
trainer.save_model(model_dir_path)

## Evaluate the model

### Reload the model from file

Empty the GPU cache before reloading the model.

In [30]:
# Release cached GPU memory held by PyTorch before the model is reloaded.
torch.cuda.empty_cache()

In [31]:
# Reload tokenizer and model from the saved directory, rebinding the names
# `tokenizer` and `model` used during training.
tokenizer = AutoTokenizer.from_pretrained(model_dir_path)
model = AutoModelForSequenceClassification.from_pretrained(model_dir_path)

### Move model and data to the selected device

In [32]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [33]:
# The model and its inputs must live on the same device for the forward pass.
model = model.to(device)
test_encodings = test_encodings.to(device)

### Get predictions

In [34]:
# Forward pass over the whole test set in one call, with gradient tracking off.
with torch.no_grad():
    logits = model(**test_encodings).logits

# Bring the logits back to the CPU and take the highest-scoring class per sample.
preds = np.argmax(logits.cpu().numpy(), axis=1)

### Compute metrics

In [35]:
from sklearn.metrics import classification_report

In [36]:
# Per-class precision, recall and F1 on the test set. print() is needed here
# because classification_report returns a pre-formatted text table.
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        32
           4       1.00      1.00      1.00        21
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00        31
           7       1.00      1.00      1.00        27
           8       1.00      1.00      1.00         6
           9       1.00      1.00      1.00        27
          10       1.00      1.00      1.00        16
          11       1.00      1.00      1.00        10

    accuracy                           1.00       218
   macro avg       1.00      1.00      1.00       218
weighted avg       1.00      1.00      1.00       218



---