# Training a neural network model on the Alaska dataset (Test only)

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Load the data

In [2]:
# Path components; they are combined with os.path.join further down the notebook.
root_dir = '../../'  # prefix joined in front of every other path component
src_dir = 'src'  # joined with root_dir and appended to sys.path for local imports
data_dir = 'data/corpus'  # directory the corpus JSON file is loaded from
models_dir = 'data/models'  # directory under which the trained model directory is created

In [3]:
import os
import sys

In [4]:
sys.path.append(os.path.join(root_dir, src_dir))

In [5]:
dataset_name = 'alaska'

In [6]:
corpus_filename = f'{dataset_name}_corpus.json'

In [7]:
from training import TrainingCorpus

In [8]:
corpus = TrainingCorpus()
corpus.load(os.path.join(root_dir, data_dir, corpus_filename))

In [9]:
corpus.size

2171

---

## Split the dataset into training, validation and test sets

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
random_state = 3

In [12]:
# Two-stage hold-out: first set aside 10% of the corpus as the test set, then set
# aside 10% of what remains as the validation set. Both splits use the same
# random_state so the partition is reproducible.
X_ids, test_ids, X_labels, test_labels = corpus.get_train_test_data(test_size=0.1, random_state=random_state)
train_ids, val_ids, train_labels, val_labels = train_test_split(X_ids, X_labels, test_size=0.1, random_state=random_state)

In [13]:
import numpy as np

In [14]:
def _to_class_indices(labels):
    """Return integer class indices for ``labels``.

    Two-dimensional input is reduced with ``argmax`` over axis 1. Input that
    is already one-dimensional is returned unchanged, so re-running this cell
    on already-converted labels no longer raises an axis error.
    """
    labels = np.asarray(labels)
    return np.argmax(labels, axis=1) if labels.ndim > 1 else labels


train_labels = _to_class_indices(train_labels)
val_labels = _to_class_indices(val_labels)
test_labels = _to_class_indices(test_labels)

In [15]:
# Derive the number of classes from the largest label index found in any split
# instead of counting the distinct labels of the training split only: if a class
# happened to be missing from the training split, the count would be too small
# and higher label indices would fall outside the classification head.
num_labels = int(np.max(np.concatenate([train_labels, val_labels, test_labels]))) + 1
print(f'Training set size: {len(train_ids)}')
print(f'Validation set size: {len(val_ids)}')
print(f'Test set size: {len(test_ids)}')
print(f'No. of labels: {num_labels}')

Training set size: 1757
Validation set size: 196
Test set size: 218
No. of labels: 20


## Retrieve text from ids

In [16]:
# Rebuild one whitespace-joined string per document for each of the three splits.
train_texts, val_texts, test_texts = (
    [' '.join(corpus.get_tokens(doc_id)) for doc_id in split_ids]
    for split_ids in (train_ids, val_ids, test_ids)
)

## Tokenize text

In [17]:
model_name = 'distilbert-base-uncased'

In [18]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [19]:
# Training and validation texts are truncated but not padded, and no tensor type
# is requested; they are wrapped in CustomDataset and handed to the Trainer below.
# NOTE(review): presumably the Trainer pads each batch dynamically because it is
# given the tokenizer — confirm against the installed transformers version.
train_encodings = tokenizer(train_texts, truncation=True)
val_encodings = tokenizer(val_texts, truncation=True)
# The test texts are padded to a common length and returned as PyTorch tensors
# because they are later moved to the device and passed to the model directly.
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

## Define custom Dataset class

In [20]:
import torch
from torch.utils.data import Dataset

In [21]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Parameters
    ----------
    encodings : mapping
        Maps each field name to an indexable collection holding one entry
        per sample. Entries may be Python lists or tensors.
    labels : sequence
        One label per sample, index-aligned with the entries of ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a new tensor.

        Tensors are copied with ``clone().detach()``; calling ``torch.tensor``
        on an existing tensor produces the same copy but emits a UserWarning
        for every item. Anything else goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``, including a 'labels' entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

## Encode texts

In [22]:
# Wrap each split's encodings and class-index labels in a CustomDataset.
# The training and validation datasets are the ones passed to the Trainer.
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
# NOTE(review): in the cells shown here the predictions are computed from
# test_encodings directly, so test_dataset appears to be unused — confirm.
test_dataset = CustomDataset(test_encodings, test_labels)

## Load the pretrained model

In [23]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

## Set up TrainingArguments and Trainer class for training the model

### Define trained model's directory

In [24]:
model_dir_name = f'{dataset_name}_bert_test'
model_dir_path = os.path.join(root_dir, models_dir, model_dir_name)
model_dir_path

'../../data/models/alaska_bert_test'

In [25]:
# Create the model directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely and no separate existence check is needed.
os.makedirs(model_dir_path, exist_ok=True)

### Set up training arguments

In [26]:
from transformers import Trainer, TrainingArguments

In [27]:
# Hyperparameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # Checkpoints and outputs go to the model directory defined above.
    output_dir=model_dir_path,
    overwrite_output_dir=True,
    # NOTE(review): the logged run ended at global_step=275, far below 10000, so
    # presumably no intermediate checkpoint is written during training — confirm.
    save_steps=10000,
    save_total_limit=2,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=5e-5,
    evaluation_strategy='epoch', # run the model on the val dataset after each epoch
    # Reuse the seed that was used for the dataset split.
    seed=random_state,
    disable_tqdm=False, # whether to disable tqdm progress bar during training
    # Logging interval, in training steps.
    logging_steps=10
)

In [28]:
# Bundle the model, its training arguments and the training/validation datasets.
# NOTE(review): the tokenizer is later reloaded from the saved model directory,
# so passing it here presumably makes the Trainer save it with the model — confirm.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

## Train the model

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.179497,1.668714
2,0.485151,0.296551
3,0.139253,0.074506
4,0.063668,0.042448
5,0.056001,0.036581


TrainOutput(global_step=275, training_loss=0.779420670596036)

### Save the trained model

In [30]:
trainer.save_model(model_dir_path)

## Evaluate the model

### Reload the model from file

Empty GPU cache

In [31]:
torch.cuda.empty_cache()

In [32]:
tokenizer = AutoTokenizer.from_pretrained(model_dir_path)
model = AutoModelForSequenceClassification.from_pretrained(model_dir_path)

### Move model and data to the selected device

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [34]:
model = model.to(device)
test_encodings = test_encodings.to(device)

### Get predictions

In [35]:
import torch.nn.functional as F

In [36]:
# Run inference in mini-batches so that the whole padded test set never has to
# go through the model in a single forward pass; this keeps peak memory bounded
# regardless of the test set size.
eval_batch_size = 64  # same value as per_device_eval_batch_size in the training arguments
num_test_samples = test_encodings['input_ids'].shape[0]

model.eval()  # make sure dropout is disabled whatever mode the model was left in
batch_logits = []
with torch.no_grad():
    for start in range(0, num_test_samples, eval_batch_size):
        # Slice every encoded field (input ids, attention mask, ...) consistently.
        batch = {key: val[start:start + eval_batch_size] for key, val in test_encodings.items()}
        # Move each batch's logits to the CPU right away to free device memory.
        batch_logits.append(model(**batch).logits.cpu())

# Predicted class = index of the largest logit for each sample.
preds = torch.cat(batch_logits).numpy()
preds = np.argmax(preds, axis=1)

### Compute metrics

In [37]:
from sklearn.metrics import classification_report

In [38]:
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        11
           4       1.00      1.00      1.00        17
           5       1.00      1.00      1.00        11
           6       0.91      1.00      0.95        10
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00         4
          10       1.00      0.91      0.95        11
          11       1.00      1.00      1.00        20
          12       1.00      1.00      1.00         6
          13       1.00      1.00      1.00        10
          14       1.00      1.00      1.00        10
          15       1.00      1.00      1.00         6
          16       1.00      1.00      1.00         7
          17       1.00    

---