In [1]:
import datetime


import numpy as np
import pandas as pd
import torch
import torch
import torchmetrics
import transformers
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model.to(device)

def tokenize_function(examples, max_length):
    return tokenizer(examples, max_length=max_length, padding="max_length", truncation=True, return_tensors='pt')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [3]:
for param in model.bert.parameters():
    param.requires_grad = False

In [4]:
import torchsummary

torchsummary.summary(model);

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               (22,268,928)
|    |    └─Embedding: 3-2               (393,216)
|    |    └─Embedding: 3-3               (1,536)
|    |    └─LayerNorm: 3-4               (1,536)
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              (85,054,464)
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  (590,592)
|    |    └─Tanh: 3-8                    --
├─Dropout: 1-2                           --
├─Linear: 1-3                            1,538
Total params: 108,311,810
Trainable params: 1,538
Non-trainable params: 108,310,272


In [5]:
df = pd.read_csv('data/train.csv')
df['len'] = df['text'].apply(lambda x: len(x))
df[df['location'].notnull()]

Unnamed: 0,id,keyword,location,text,target,len
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,55
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,67
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,82
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,34
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,76
...,...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0,51
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0,107
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0,107
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0,93


# Classifier on BERT's embeddings

### work with the data

In [25]:
config = {
    'd': [128, 64],
    'n_components': None,
    'batch_size': 2048,
}

In [7]:
x = tokenize_function(df['text'].tolist(), df['len'].max())

In [8]:
X = []
batch_size = 512

num_batches = len(df)//batch_size
if len(df)%batch_size != 0:
    num_batches+= 1
    
model.eval()
with torch.no_grad():
    for batch in range(num_batches):
        inputs = {
            k: v[batch*batch_size:(batch+1)*batch_size].to(device) 
                  for k, v in x.items()
        }
        
        outputs = model.bert(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            token_type_ids=inputs['token_type_ids'],
        )
        
        X.append(outputs.pooler_output.cpu())

In [9]:
torch.cuda.empty_cache()
X = torch.cat(X)
Y = df['target'].values

In [10]:
class EmbeddingDataset(Dataset):
    def __init__(self, x, y, **kwargs):
        self.x = x
        self.y = y

    def shape(self):
        return self.x.shape
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        return x, y

In [11]:
Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size=0.1, random_state=69)

In [12]:
train_dataset = EmbeddingDataset(Xtrain, Ytrain)
val_dataset = EmbeddingDataset(Xval, Yval)

In [15]:
class Trainer:
    def __init__(self, 
        model, loss_fn, optimizer, 
        stop_batch=None, metric=None,
        device='cuda', fp16=False, 
        **kwargs):
        self.model: nn.Module = model
        self.device = device
        self.metric = metric
        
        self.stop_batch = 100**100 if stop_batch is None else stop_batch
        self.loss_fn = loss_fn
        self.optimizer = optimizer  
        
        self.fp16 = fp16
        if fp16:
            self.scaler = torch.cuda.amp.GradScaler()        
        
        
    def checkpoint(self) -> dict:
        cpoint =  {
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
        }
        if self.fp16:
            cpoint["scaler"] = self.scaler.state_dict()
            
        return cpoint
    
        
    def train(self, dataset, epoch) -> float:
        self.model.train()
        running_loss = 0
        for idx, batch in enumerate(dataset):
            X, Y = batch
            X, Y = X.to(self.device), Y.to(self.device)
            running_loss+= self.__train(X, Y)
            if idx >= self.stop_batch:
                break
                
        return running_loss
            
            
    def val(self, dataset) -> list[torch.Tensor]:
        self.model.eval()
        val_pred, val_true = [], []
        with torch.inference_mode():
            for batch in dataset:
                X, Y = batch
                val_pred+= [self.model(X.to(self.device)).cpu()]
                val_true+= [Y]
        return torch.cat(val_pred), torch.cat(val_true)
                
            
    def __train(self, X, Y) -> float:
        self.optimizer.zero_grad()
        if self.fp16:
            with torch.cuda.amp.autocast(enabled=True):
                outputs = self.model(X).softmax(1)
                loss = self.loss_fn(outputs, Y)
                
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()

        else:
            outputs = self.model(X).softmax(1)
            loss = self.loss_fn(outputs, Y)    
            loss.backward()
            self.optimizer.step()
            
        if self.metric:
            self.metric(outputs, Y)
            
        return loss.item()

In [16]:
class Model(nn.Module):
    def __init__(self, input_dim=32, output_dim=2, d=[64, 64], callable_=None, **kwargs):
        super().__init__()
        
        seq = []
        d = [input_dim] + d + [output_dim]
        for i in range(len(d)-1):
            seq.append(
                nn.Linear(d[i], d[i+1])
            )
            seq.append(nn.Dropout(p=0.5))
            if i != len(d)-2:
                seq.append(nn.GELU())
                
        self.callable_ = callable_
        self.seq = nn.Sequential(*seq)

        
    def forward(self, x: torch.Tensor):
        if self.callable_ is not None:
            device = x.device
            x = torch.from_numpy(self.callable_(x.cpu()).astype('float32'))
            x = x.to(device)
            
        return self.seq(x)

In [27]:
n_components = config['n_components']
if n_components is not None:
    pca = PCA(n_components=n_components)
    pca.fit(Xtrain)
    
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config['batch_size'])
val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'])

classification_model = Model(
    input_dim=n_components if n_components is not None else train_dataset.shape()[1], 
    callable_=None if n_components is None else pca.transform,
    output_dim=2,
    d=config['d'],
).to(device)

trainer = Trainer(
    model=classification_model,
    metric=torchmetrics.AUROC(num_classes=2),
    loss_fn=nn.CrossEntropyLoss(reduce=True),
    optimizer=torch.optim.AdamW(classification_model.parameters(), lr=3e-4),
)

acc = torchmetrics.Accuracy(num_classes=2)
auc = torchmetrics.AUROC(num_classes=2)

In [28]:
name = 'BERT embeddings + {0} + {1} '.format(
    f"{config['n_components']} pca", 
    f"{config['d']} d"
)
board_name = name + datetime.datetime.now().strftime("%Y.%m.%d - %H-%M-%S")

log_dir = f"logs/fit/{board_name}"
writer = SummaryWriter(log_dir)

In [29]:
try:
    wait = 0
    patience = 100

    epoch = 0
    best_loss = -np.inf
    while wait < patience:
        train_loss = trainer.train(train_dataloader, epoch)

        val_pred, val_true = trainer.val(val_dataloader)
        val_pred = val_pred.softmax(1)
        metrics = {
            'AUC': auc(val_pred, val_true),
            'ACC': acc(val_pred, val_true),
        }
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('AUC/train', trainer.metric.compute(), epoch)
        writer.add_scalar('AUC/val', metrics['AUC'], epoch)
        writer.add_scalar('ACC/val', metrics['ACC'], epoch)


        wait+=1
        epoch+=1
        if metrics['AUC'] > best_loss:
            checkpoint = trainer.checkpoint()
            torch.save(checkpoint, f'Models/W/{name}.torch')
            best_loss = metrics['AUC']
            wait = 0

except KeyboardInterrupt:
    print("KeyboardInterrupt")

In [30]:
print(name)

checkpoint = torch.load(f'Models/W/{name}.torch')
classification_model.load_state_dict(checkpoint['model'])

val_pred, val_true = trainer.val(val_dataloader)
val_pred = val_pred.softmax(1)
print('AUC:', auc(val_pred, val_true))
print('ACC:', acc(val_pred, val_true))

BERT embeddings + None pca + [128, 64] d
AUC: tensor(0.8180)
ACC: tensor(0.7677)


In [24]:
print(name)

checkpoint = torch.load(f'Models/W/{name}.torch')
classification_model.load_state_dict(checkpoint['model'])

val_pred, val_true = trainer.val(val_dataloader)
val_pred = val_pred.softmax(1)
print('AUC:', auc(val_pred, val_true))
print('ACC:', acc(val_pred, val_true))

BERT embeddings + None pca + [256, 256] d
AUC: tensor(0.8111)
ACC: tensor(0.7598)


In [160]:
print(name)

checkpoint = torch.load(f'Models/W/{name}.torch')
classification_model.load_state_dict(checkpoint['model'])

val_pred, val_true = trainer.val(val_dataloader)
val_pred = val_pred.softmax(1)
print('AUC:', auc(val_pred, val_true))
print('ACC:', acc(val_pred, val_true))

BERT embeddings + pca + linears
AUC: tensor(0.8246)
ACC: tensor(0.7795)


In [27]:
checkpoint = torch.load(f'Models/W/{name}.torch')
classification_model.load_state_dict(checkpoint['model'])

val_pred, val_true = trainer.val(val_dataloader)
val_pred = val_pred.softmax(1)
print('AUC:', auc(val_pred, val_true))
print('ACC:', acc(val_pred, val_true))

AUC: tensor(0.8505)
ACC: tensor(0.7720)


# Tune BERT

w.i.p.

In [51]:
class NLPDataset(Dataset):
    def __init__(self, df, tokenizer, batch_size, **kwargs):
        self.df = df.reset_index()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.values = df[['text', 'target']].values
        
        
    def tokenize_function(self, examples):
        return tokenizer(examples, padding="max_length", truncation=True, return_tensors='pt')

    
    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, idx):
        idx = random.choices(range(len(self)), k=self.batch_size)
        x = self.tokenize_function(self.values[idx, 0].tolist())
        y = self.values[idx, 1].astype('int32')
        
        x['labels'] = torch.LongTensor(y)
        return x

In [9]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = transformers.get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [11]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(
            labels=batch['labels'],
            input_ids=batch['input_ids'][0],
            attention_mask=batch['attention_mask'][0],
            token_type_ids=batch['token_type_ids'][0],
        )
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  1%|▍                                                                          | 120/19839 [06:35<18:25:30,  3.36s/it]

KeyboardInterrupt: 

# End to end transformer classifier

w.i.p.