In [1]:
import numpy as np
import pandas as pd
import torch
import transformers

from src import custom_dataset, custom_label, custom_model

In [2]:
BERT = 'bert-base-cased'
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-6
MAX_LENGTH = 512

TEXT_COL = 'text'
LABEL_COL = 'category'

In [3]:
datapath = f'data/bbc-text.csv'
df = pd.read_csv(datapath)

In [4]:
label_encoder = custom_label.CustomLabelEncoder(df[LABEL_COL])

In [5]:
df = custom_label.encode_labels(label_encoder=label_encoder, df=df, label_col=LABEL_COL)

In [6]:
n_classes = label_encoder.n_classes
n_classes, label_encoder.get_int2label_dict()

(5, {0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'})

### Brief look at a Bert model

In [7]:
tokenizer = transformers.BertTokenizer.from_pretrained(BERT)

In [8]:
model = transformers.BertModel.from_pretrained(BERT)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
f"Each token vector is of {model.embeddings.word_embeddings.embedding_dim}-dim"

'Each token vector is of 768-dim'

### Train, validate, test datasets

In [10]:
np.random.seed(112)
df_train, df_val, df_test = np.split(
    df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

len(df_train),len(df_val), len(df_test)

(1780, 222, 223)

In [11]:
train_dataset = custom_dataset.CustomDataset(
    tokenizer=tokenizer, 
    df=df_train,
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    max_length=MAX_LENGTH)

val_dataset = custom_dataset.CustomDataset(
    tokenizer=tokenizer, 
    df=df_val,
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    max_length=MAX_LENGTH)

len(train_dataset), len(val_dataset)

(1780, 222)

In [12]:
model = custom_model.BertClassifier(n_classes=n_classes, bert_model_name=BERT)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
model = custom_model.train(
    model=model, 
    train_dataset=train_dataset, 
    val_dataset=val_dataset,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS
)

  0%|                                                                                                                                        | 0/56 [00:02<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 16.00 GiB total capacity; 14.23 GiB already allocated; 0 bytes free; 14.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
test_dataset = custom_dataset.CustomDataset(
    tokenizer=tokenizer, 
    df=df_test,
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    max_length=MAX_LENGTH)

In [None]:
pred = custom_model.predict(model, test_dataset)

In [None]:
metrics = custom_model.compute_metrics(df_test[LABEL_COL], pred)

In [None]:
metrics