In [26]:
import re
import random
import datetime


import numpy as np
import pandas as pd
import torch
import torch
import torchmetrics
import transformers
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import utils

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model.to(device)

def tokenize_function(examples, max_length):
    return tokenizer(examples, max_length=max_length, padding="max_length", truncation=True, return_tensors='pt')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [3]:
import torchsummary

torchsummary.summary(model);

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               22,268,928
|    |    └─Embedding: 3-2               393,216
|    |    └─Embedding: 3-3               1,536
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              85,054,464
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Dropout: 1-2                           --
├─Linear: 1-3                            1,538
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0


In [4]:
df = pd.read_csv('data/train.csv')
df['len'] = df['text'].apply(lambda x: len(x))
df[df['location'].notnull()]

Unnamed: 0,id,keyword,location,text,target,len
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,55
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,67
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,82
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,34
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,76
...,...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0,51
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0,107
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0,107
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0,93


In [5]:
# Remove links
df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+', '', x))

In [6]:
x = tokenize_function(df['text'].tolist(), df['len'].max())

# Classifier on BERT's embeddings

### work with the data

In [61]:
config = {
    'd': [64, 32],
    'dropout': 0.5,
    'n_components': 384,
    'batch_size': 1024,
}

In [63]:
X = []
batch_size = 512

num_batches = len(df)//batch_size
if len(df)%batch_size != 0:
    num_batches+= 1
    
model.eval()
with torch.no_grad():
    for batch in range(num_batches):
        inputs = {
            k: v[batch*batch_size:(batch+1)*batch_size].to(device) 
                  for k, v in x.items()
        }
        
        outputs = model.bert(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            token_type_ids=inputs['token_type_ids'],
        )
        
        X.append(outputs.pooler_output.cpu())

In [64]:
torch.cuda.empty_cache()
X = torch.cat(X)
Y = df['target'].values

In [65]:
Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size=0.1, random_state=69)

In [66]:
train_dataset = utils.EmbeddingDataset(Xtrain, Ytrain)
val_dataset = utils.EmbeddingDataset(Xval, Yval)

In [67]:
def validate(name):    
    print(name)

    checkpoint = torch.load(f'Models/W/{name}.torch')
    classification_model.load_state_dict(checkpoint['model'])

    val_pred, val_true = trainer.val(val_dataloader)
    val_pred = val_pred.softmax(1)
    print('AUC:', auc(val_pred, val_true))
    print('ACC:', acc(val_pred, val_true))

In [68]:
n_components = config['n_components']
if n_components is not None:
    pca = PCA(n_components=n_components)
    pca.fit(Xtrain)
    
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config['batch_size'])
val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'])

classification_model = utils.ClassifierModel(
    input_dim=n_components if n_components is not None else train_dataset.shape()[1], 
    callable_=None if n_components is None else pca.transform,
    output_dim=2,
    **config,
).to(device)

trainer = utils.Trainer(
    model=classification_model,
    metric=torchmetrics.AUROC(num_classes=2),
    loss_fn=nn.CrossEntropyLoss(reduce=True),
    optimizer=torch.optim.AdamW(classification_model.parameters(), lr=3e-4),
)

acc = torchmetrics.Accuracy(num_classes=2)
auc = torchmetrics.AUROC(num_classes=2)



In [69]:
name = 'BERT embeddings + {0} + {1} + {2} + {3} '.format(
    f"{config['n_components']} pca", 
    f"{config['d']} d",
    f"{config['dropout']} dropout",
    f"{config['batch_size']} batchsize",
)
board_name = name + datetime.datetime.now().strftime("%Y.%m.%d - %H-%M-%S")

log_dir = f"logs/fit/{board_name}"
writer = SummaryWriter(log_dir)

In [70]:
try:
    wait = 0
    patience = 150

    epoch = 0
    best_loss = -np.inf
    while wait < patience:
        train_loss = trainer.train(train_dataloader, epoch)

        val_pred, val_true = trainer.val(val_dataloader)
        val_pred = val_pred.softmax(1)
        metrics = {
            'AUC': auc(val_pred, val_true),
            'ACC': acc(val_pred, val_true),
        }
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('AUC/train', trainer.metric.compute(), epoch)
        writer.add_scalar('AUC/val', metrics['AUC'], epoch)
        writer.add_scalar('ACC/val', metrics['ACC'], epoch)


        wait+=1
        epoch+=1
        if metrics['AUC'] > best_loss:
            checkpoint = trainer.checkpoint()
            torch.save(checkpoint, f'Models/W/{name}.torch')
            best_loss = metrics['AUC']
            wait = 0

except KeyboardInterrupt:
    print("KeyboardInterrupt")

In [71]:
validate(name)

BERT embeddings + 384 pca + [64, 32] d + 0.5 dropout + 1024 batchsize 
AUC: tensor(0.8397)
ACC: tensor(0.7887)


In [46]:
# With links in text
validate(name)

BERT embeddings + 384 pca + [64, 32] d + 0.5 dropout + 1024 batchsize 
AUC: tensor(0.8214)
ACC: tensor(0.7717)


In [38]:
# With links in text
validate(name)

BERT embeddings + 512 pca + [64, 32] d + 0.15 dropout + 1024 batchsize 
AUC: tensor(0.8257)
ACC: tensor(0.7730)


In [33]:
# With links in text
# 2048 batch
validate(name)

BERT embeddings + 256 pca + [] d + 0.15 dropout 
AUC: tensor(0.8196)
ACC: tensor(0.7743)


In [28]:
# With links in text
# 2048 batch
validate(name)

BERT embeddings + 256 pca + [64, 32] d + 0.15 dropout 
AUC: tensor(0.8231)
ACC: tensor(0.7808)


In [23]:
# With links in text
# 2048 batch
validate(name)

BERT embeddings + 256 pca + [128, 128, 64] d + 0.15 dropout 
AUC: tensor(0.8211)
ACC: tensor(0.7835)


In [18]:
# With links in text
# 2048 batch
validate(name)

BERT embeddings + 256 pca + [128, 64] d + 0.15 dropout 
AUC: tensor(0.8247)
ACC: tensor(0.7782)


In [42]:
# With links in text
# 2048 batch
# 0.5 dropout
validate(name)

BERT embeddings + 256 pca + [128, 64] d 
AUC: tensor(0.8223)
ACC: tensor(0.7769)


In [37]:
# With links in text
# 2048 batch
# 0.5 dropout
validate(name)

BERT embeddings + 512 pca + [128, 64] d 
AUC: tensor(0.8231)
ACC: tensor(0.7769)


In [32]:
# With links in text
# 2048 batch
# 0.5 dropout
validate(name)

BERT embeddings + None pca + [128, 64] d
AUC: tensor(0.8180)
ACC: tensor(0.7677)


# Tune BERT

w.i.p.

In [45]:
class config:
    batch_size = 16
    accum_iter = 4  

In [38]:
class NLPDataset(Dataset):
    def __init__(self, x, y, batch_size, **kwargs):
        self.x = x
        self.y = torch.LongTensor(y)
        self.batch_size = batch_size
        
    
    def __len__(self):
        return len(self.y)
    
    
    def __getitem__(self, idx):
        idx = random.choices(range(len(self)), k=self.batch_size)
        
        x = {k: v[idx] for k, v in self.x.items()}
        x['labels'] = self.y[idx]
        return x

In [39]:
Y = df['target'].values

In [40]:
IDXtrain, IDXval, Ytrain, Yval = train_test_split(np.arange(len(Y)), Y, test_size=0.1, random_state=69)

In [41]:
Xtrain = {k: v[IDXtrain] for k, v in x.items()}
Xval = {k: v[IDXval] for k, v in x.items()}

In [42]:
train_dataset = NLPDataset(Xtrain, Ytrain, batch_size=config.batch_size)
val_dataset = NLPDataset(Xval, Yval, batch_size=config.batch_size)

In [44]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
val_dataloader = DataLoader(val_dataset, batch_size=1)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = transformers.get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=5
)

In [56]:
progress_bar = tqdm(range(len(train_dataloader)))

model.train()
for batch_idx, batch in enumerate(train_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(
        labels=batch['labels'],
        input_ids=batch['input_ids'][0],
        attention_mask=batch['attention_mask'][0],
        token_type_ids=batch['token_type_ids'][0],
    )
    loss = outputs.loss
    loss.backward()

    lr_scheduler.step()
    progress_bar.update(1)

    if ((batch_idx + 1) % config.accum_iter == 0) or (batch_idx + 1 == len(train_dataloader)):
        optimizer.step()
        optimizer.zero_grad()
        
    if batch_idx >= len(train_dataset)//config.batch_size:
        optimizer.step()
        optimizer.zero_grad()
        break

100%|██████████████████████████████████████████████████████████████████████████████| 6851/6851 [59:08<00:00,  1.36it/s]

# End to end transformer classifier

w.i.p.