In [1]:
import numpy as np
import pandas as pd
import torch
import transformers

from src import custom_dataset, custom_label, custom_model

In [2]:
BERT = 'bert-base-cased'
BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 1e-6
MAX_LENGTH = 512

TEXT_COL = 'text'
LABEL_COL = 'category'

In [3]:
datapath = f'data/bbc-text.csv'
df = pd.read_csv(datapath)

In [4]:
label_encoder = custom_label.CustomLabelEncoder(df[LABEL_COL])

In [5]:
df = custom_label.encode_labels(label_encoder=label_encoder, df=df, label_col=LABEL_COL)

In [6]:
n_classes = label_encoder.n_classes
n_classes, label_encoder.get_int2label_dict()

(5, {0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'})

### Brief look at a Bert model

In [7]:
tokenizer = transformers.BertTokenizer.from_pretrained(BERT)

In [8]:
model = transformers.BertModel.from_pretrained(BERT)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
f"Each token vector is of {model.embeddings.word_embeddings.embedding_dim}-dim"

'Each token vector is of 768-dim'

### Train, validate, test datasets

In [10]:
np.random.seed(112)
df_train, df_val, df_test = np.split(
    df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

len(df_train),len(df_val), len(df_test)

(1780, 222, 223)

In [11]:
train_dataset = custom_dataset.CustomDataset(
    tokenizer=tokenizer, 
    df=df_train,
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    max_length=MAX_LENGTH)

val_dataset = custom_dataset.CustomDataset(
    tokenizer=tokenizer, 
    df=df_val,
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    max_length=MAX_LENGTH)

len(train_dataset), len(val_dataset)

(1780, 222)

In [12]:
model = custom_model.BertClassifier(n_classes=n_classes, bert_model_name=BERT)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
model = custom_model.train(
    model=model, 
    train_dataset=train_dataset, 
    val_dataset=val_dataset,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS
)

100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [01:15<00:00,  2.95it/s]


Epoch 1 | Train Loss: 0.1966 | Val Loss: 0.1793


100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [01:14<00:00,  2.99it/s]


Epoch 2 | Train Loss: 0.1725 | Val Loss: 0.1520


100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [01:14<00:00,  3.00it/s]


Epoch 3 | Train Loss: 0.1268 | Val Loss: 0.0982


100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [01:14<00:00,  2.99it/s]


Epoch 4 | Train Loss: 0.0782 | Val Loss: 0.0603


100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [01:13<00:00,  3.02it/s]


Epoch 5 | Train Loss: 0.0511 | Val Loss: 0.0405


In [14]:
test_dataset = custom_dataset.CustomDataset(
    tokenizer=tokenizer, 
    df=df_test,
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    max_length=MAX_LENGTH)

In [20]:
import numpy as np

def predict(model, test_dataset):
    test_dataloader = torch.utils.data.DataLoader(test_dataset)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    with torch.no_grad():
        predictions = []
        for i, (X,Y) in enumerate(test_dataloader):
            Y = Y.to(device)
            mask = X['attention_mask'].to(device)
            input_ids = X['input_ids'].squeeze(1).to(device)
            output = model(input_ids, mask)
            predictions.append(output)
    
    predictions = torch.cat(predictions, axis=0)
    predictions = predictions.cpu().numpy()
    predictions = np.argmax(predictions, axis=1)
    
    return predictions

In [27]:
test_pred = predict(model, test_dataset)

In [28]:
test_metrics = custom_model.compute_metrics(df_test[LABEL_COL], pred)

In [29]:
test_metrics

{'accuracy': 0.9820627802690582, 'f1': 0.9818536446656315}

In [30]:
train_pred = predict(model, train_dataset)
train_metrics = custom_model.compute_metrics(df_train[LABEL_COL], train_pred)
train_metrics

{'accuracy': 0.9792134831460674, 'f1': 0.9791825981562116}

In [31]:
val_pred = predict(model, val_dataset)
val_metrics = custom_model.compute_metrics(df_val[LABEL_COL], val_pred)
val_metrics

{'accuracy': 0.9864864864864865, 'f1': 0.9864917259495571}