In [1]:
import re
import datetime


import numpy as np
import pandas as pd
import torch
import torchmetrics
import transformers
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import utils

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Pretrained checkpoint that supplies both the tokenizer and the classifier backbone.
base_model_name = "bert-large-cased"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Sequence-classification model with a two-way output, moved onto `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2,
)
model.to(device)

def tokenize_function(examples, max_length):
    """Tokenize `examples` with the notebook-level `tokenizer`.

    Every sequence is padded to `max_length` and truncated at `max_length`;
    the tokenizer is asked to return PyTorch tensors.

    Args:
        examples: Input passed straight through to `tokenizer`.
        max_length: Target length used for both padding and truncation.

    Returns:
        Whatever `tokenizer` returns for these options.
    """
    encode_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )
    return tokenizer(examples, **encode_options)

Downloading: 100%|██████████████████████████████████████████████████████████████████| 29.0/29.0 [00:00<00:00, 9.83kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 208k/208k [00:00<00:00, 296kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 426k/426k [00:01<00:00, 400kB/s]
Downloading:  15%|█████████▊                                                        | 189M/1.25G [07:29<52:32, 362kB/s]

In [3]:
# NOTE(review): this import sits mid-notebook instead of in the import cell at the top.
import torchsummary

# Print a per-layer parameter summary of `model`; the trailing `;` keeps the
# call's return value from being echoed as the cell output.
# NOTE(review): the recorded output totals 108,311,810 parameters, which looks
# smaller than expected for "bert-large-cased" — presumably stale output from an
# earlier checkpoint; confirm by re-running.
torchsummary.summary(model);

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               22,268,928
|    |    └─Embedding: 3-2               393,216
|    |    └─Embedding: 3-3               1,536
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              85,054,464
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Dropout: 1-2                           --
├─Linear: 1-3                            1,538
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0


In [4]:
# Load the training split.
df = pd.read_csv('data/train.csv')

# A bare `df[df['location'].notnull()]` used to follow here. Its result was
# neither assigned nor displayed (it was not the cell's last expression), so it
# had no effect and has been removed; every row is still used downstream.

# Remove links (vectorised equivalent of applying re.sub to each row).
df['text'] = df['text'].str.replace(r'http\S+', '', regex=True)
# Character count of each link-stripped text.
df['len'] = df['text'].str.len()

In [19]:
# Sample string (emoticon, emoji and a bare URL) used to eyeball the preprocessor.
text = r'It was fire :), freash meat 🔥🔥 www.site.com'
# Preprocessor built with its default arguments; the optional `pkl` argument is
# left commented out. `text_processor` is reused later when fine-tuning.
text_processor = utils.TextPreprocessor(
    # pkl='data/emoticons.pkl'
)
# Last expression of the cell, so the processed string is displayed.
text_processor(text)

'It was fire  freash meat  '

In [6]:
# `df['len']` counts characters, not tokens, so it is only a rough bound on the
# tokenized length. Cap it at the tokenizer's own limit so an unusually long
# text cannot request sequences longer than the model accepts, and cast the
# numpy integer to a plain int before handing it to the tokenizer.
embed_max_length = min(int(df['len'].max()), tokenizer.model_max_length)
x = tokenize_function(df['text'].tolist(), embed_max_length)
# Binary labels, in the same row order as `x`.
Y = df['target'].values

# Classifier on BERT's embeddings

In [7]:
# Settings for the classifier trained on frozen BERT embeddings.
# NOTE: a later cell rebinds the name `config` to a class, so this cell must be
# re-run before re-running the classifier section.
config = {
    # Forwarded to utils.ClassifierModel via **config
    # (presumably hidden layer widths — confirm in utils).
    'd': [64, 32],
    'dropout': 0.5,
    # Number of PCA components applied to the embeddings; None skips PCA.
    'n_components': 384,
    # Batch size of the classifier's DataLoaders.
    'batch_size': 1024,
    # File stem of a fine-tuned BERT state dict under Models/W/, or None to keep
    # the pretrained weights.
    'tuned_model_name': 'BERT Tune ',
}

if config['tuned_model_name'] is not None:
    # map_location lets a checkpoint saved from a CUDA model load on a CPU-only
    # machine too; tensors are placed on the notebook's `device`.
    state_dict = torch.load(
        f'Models/W/{config["tuned_model_name"]}.torch',
        map_location=device,
    )
    model.load_state_dict(state_dict)

### work with the data

In [63]:
# Collect BERT pooled outputs for every row of `x`, one slice at a time.
X = []
batch_size = 512

model.eval()
with torch.no_grad():
    # Stepping by batch_size visits the same slices as counting batches by hand,
    # including the final partial one.
    for start in range(0, len(df), batch_size):
        stop = start + batch_size
        # Only the current slice of each tokenizer output is moved to the device.
        inputs = {key: tensor[start:stop].to(device) for key, tensor in x.items()}

        # Call the encoder directly (model.bert), bypassing the classification head.
        outputs = model.bert(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            token_type_ids=inputs['token_type_ids'],
        )

        # Store the pooled output on the CPU.
        X.append(outputs.pooler_output.cpu())

In [64]:
# Ask torch to release cached GPU memory.
torch.cuda.empty_cache()
# Concatenate the per-batch tensors into one tensor.
# NOTE: `X` is rebound from a list to a tensor, so this cell is not idempotent —
# re-running it alone would call torch.cat on the already-concatenated tensor.
X = torch.cat(X)
# Labels in the same row order as the embeddings.
Y = df['target'].values

In [65]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size=0.1, random_state=69)

In [66]:
# Wrap the embedding/label splits in the project's dataset class for use with DataLoader.
train_dataset = utils.EmbeddingDataset(Xtrain, Ytrain)
val_dataset = utils.EmbeddingDataset(Xval, Yval)

### train

In [67]:
def validate(name):
    """Reload the classifier checkpoint saved under `name` and print validation AUC/ACC.

    Reads `Models/W/{name}.torch`, restores its `'model'` entry into the
    notebook-level `classification_model`, then scores `val_dataloader`
    through `trainer.val`.

    Args:
        name: Checkpoint file stem; also printed as a header line.
    """
    print(name)

    # map_location keeps the load working when the checkpoint was written on a
    # different device than the one in use now.
    checkpoint = torch.load(f'Models/W/{name}.torch', map_location=device)
    classification_model.load_state_dict(checkpoint['model'])

    val_pred, val_true = trainer.val(val_dataloader)
    # Turn logits into class probabilities before scoring.
    val_pred = val_pred.softmax(1)
    print('AUC:', auc(val_pred, val_true))
    print('ACC:', acc(val_pred, val_true))

    # Calling a metric object also accumulates its inputs in the metric's state;
    # clear it so repeated validate() calls do not keep growing that state.
    auc.reset()
    acc.reset()

In [68]:
n_components = config['n_components']
if n_components is not None:
    # Seeded so the fitted projection is reproducible if sklearn selects a
    # randomized SVD solver. The PCA is fitted on the training split only.
    pca = PCA(n_components=n_components, random_state=69)
    pca.fit(Xtrain)

# Only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config['batch_size'])
val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'])

# Input width is the PCA output size when PCA is enabled, otherwise the raw
# embedding width reported by the dataset.
# NOTE(review): **config forwards every config key (including 'batch_size' and
# 'tuned_model_name') to ClassifierModel — presumably it accepts or ignores the
# extras; confirm in utils.
classification_model = utils.ClassifierModel(
    input_dim=n_components if n_components is not None else train_dataset.shape()[1],
    callable_=None if n_components is None else pca.transform,
    output_dim=2,
    **config,
).to(device)

trainer = utils.Trainer(
    model=classification_model,
    metric=torchmetrics.AUROC(num_classes=2),
    # `reduce=True` is a deprecated spelling; `reduction='mean'` is its equivalent.
    loss_fn=nn.CrossEntropyLoss(reduction='mean'),
    optimizer=torch.optim.AdamW(classification_model.parameters(), lr=3e-4),
)

# Validation metrics used by validate() and by the training loop.
acc = torchmetrics.Accuracy(num_classes=2)
auc = torchmetrics.AUROC(num_classes=2)



In [69]:
# Pieces of the run name, taken from the active hyper-parameters. The last
# entry is empty when no fine-tuned BERT checkpoint is configured.
name_fields = (
    f"{config['n_components']} pca",
    f"{config['d']} d",
    f"{config['dropout']} dropout",
    f"{config['batch_size']} batchsize",
    f"tuned" if config['tuned_model_name'] is not None else "",
)
# The run name doubles as the checkpoint file stem used by the training loop.
name = 'BERT {4} embeddings + {0} + {1} + {2} + {3} '.format(*name_fields)

# A timestamp suffix gives every run its own TensorBoard directory.
run_stamp = datetime.datetime.now().strftime("%Y.%m.%d - %H-%M-%S")
board_name = name + run_stamp

log_dir = f"logs/fit/{board_name}"
writer = SummaryWriter(log_dir)

In [70]:
# Train the embedding classifier with early stopping on validation AUC.
# Interrupting the kernel stops training without raising.
try:
    # Epochs since the last improvement, and how many of those are tolerated.
    wait = 0
    patience = 150

    epoch = 0
    # Despite the name, this holds the best validation AUC so far (higher is better).
    best_loss = -np.inf
    while wait < patience:
        train_loss = trainer.train(train_dataloader, epoch)

        val_pred, val_true = trainer.val(val_dataloader)
        val_pred = val_pred.softmax(1)
        metrics = {
            'AUC': auc(val_pred, val_true),
            'ACC': acc(val_pred, val_true),
        }
        writer.add_scalar('Loss/train', train_loss, epoch)
        # NOTE(review): trainer.metric is not reset in this loop — presumably
        # utils.Trainer handles that per epoch; confirm.
        writer.add_scalar('AUC/train', trainer.metric.compute(), epoch)
        writer.add_scalar('AUC/val', metrics['AUC'], epoch)
        writer.add_scalar('ACC/val', metrics['ACC'], epoch)

        # Clear the accumulated metric state each epoch, as the fine-tuning loop
        # does; otherwise it grows for as long as training runs.
        acc.reset()
        auc.reset()

        wait += 1
        epoch += 1
        if metrics['AUC'] > best_loss:
            # New best: save a checkpoint and restart the patience counter.
            checkpoint = trainer.checkpoint()
            torch.save(checkpoint, f'Models/W/{name}.torch')
            best_loss = metrics['AUC']
            wait = 0

except KeyboardInterrupt:
    print("KeyboardInterrupt")
finally:
    # Push any buffered scalars to disk, including after an interrupt.
    writer.flush()

### results

In [54]:
# Reload and score the checkpoint for the current `name`. The recorded output
# below is from the run with tuned embeddings, no PCA and d=[128, 64].
validate(name)

BERT tuned embeddings + None pca + [128, 64] d + 0.5 dropout + 1024 batchsize 
AUC: tensor(0.8849)
ACC: tensor(0.8281)


In [18]:
# Recorded output below: tuned embeddings, 384 PCA components, d=[64, 32].
validate(name)

BERT tuned embeddings + 384 pca + [64, 32] d + 0.5 dropout + 1024 batchsize 
AUC: tensor(0.8805)
ACC: tensor(0.8294)


In [71]:
# Recorded output below: untuned embeddings, 384 PCA components, d=[64, 32].
validate(name)

BERT embeddings + 384 pca + [64, 32] d + 0.5 dropout + 1024 batchsize 
AUC: tensor(0.8397)
ACC: tensor(0.7887)


In [18]:
# Recorded with links still present in the text and a batch size of 2048
# (the printed run name does not include the batch size).
validate(name)

BERT embeddings + 256 pca + [128, 64] d + 0.15 dropout 
AUC: tensor(0.8247)
ACC: tensor(0.7782)


In [37]:
# Recorded with links still present in the text, a batch size of 2048 and
# dropout 0.5 (the printed run name includes neither setting).
validate(name)

BERT embeddings + 512 pca + [128, 64] d 
AUC: tensor(0.8231)
ACC: tensor(0.7769)


In [32]:
# Recorded with links still present in the text, a batch size of 2048 and
# dropout 0.5 (the printed run name includes neither setting).
validate(name)

BERT embeddings + None pca + [128, 64] d
AUC: tensor(0.8180)
ACC: tensor(0.7677)


# Tune BERT

In [7]:
class config:
    # Settings for fine-tuning BERT. This rebinds the name `config`, which an
    # earlier cell used for the embedding-classifier dict.
    # The effective batch size is accum_iter * batch_size.
    batch_size = 96 # lowered from 128 to 96 because preprocessing increases the maximum sequence length
    # Number of batches whose gradients are accumulated before each optimizer step.
    accum_iter = 4
    # When True, texts go through `text_processor` before tokenization.
    preproc = True
    # Passed to utils.NLPDataset(rand=...) for the training set only.
    rand = True

### work with data

In [8]:
if config.preproc:
    # Preprocess a copy so the link-stripped `df` stays as it was.
    new_df = df.copy()
    new_df['text'] = new_df['text'].apply(text_processor)
    new_df['len'] = new_df['text'].apply(len)

    # Re-tokenize the processed texts with a fixed padded length of 260.
    processed_texts = new_df['text'].tolist()
    x = tokenize_function(processed_texts, 260)
    Y = df['target'].values

In [9]:
# Split row positions rather than tensors, so the same index sets can slice
# every tokenizer output consistently.
row_ids = np.arange(len(Y))
IDXtrain, IDXval, Ytrain, Yval = train_test_split(
    row_ids,
    Y,
    test_size=0.1,
    random_state=69,
)

Xtrain = {field: tensor[IDXtrain] for field, tensor in x.items()}
Xval = {field: tensor[IDXval] for field, tensor in x.items()}

# `rand` is passed for the training set only.
train_dataset = utils.NLPDataset(Xtrain, Ytrain, rand=config.rand)
val_dataset = utils.NLPDataset(Xval, Yval)

### train

In [10]:
def train_epoch():
    """Run one fine-tuning pass over `train_dataloader` with gradient accumulation.

    Uses the notebook-level `model`, `optimizer`, `device`, `config`,
    `train_acc` and `train_auc`. Each batch is unpacked as
    (labels, input_ids, attention_mask, token_type_ids). The optimizer steps
    every `config.accum_iter` batches and once more on the final batch, so a
    trailing partial group is not dropped. Training accuracy and AUROC are
    accumulated in `train_acc` / `train_auc`.
    """
    model.train()
    # Start from clean gradients in case a previous epoch was interrupted
    # between backward() and the next optimizer step.
    optimizer.zero_grad()

    num_batches = len(train_dataloader)
    # Wrapping the loader lets tqdm advance and finish the bar on its own.
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        labels, input_ids, attention_mask, token_type_ids = [
            item.to(device) for item in batch
        ]

        outputs = model(
            labels=labels,
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # NOTE(review): the loss is not divided by accum_iter, so gradients are
        # summed (not averaged) over the accumulated batches.
        outputs.loss.backward()

        is_last_batch = batch_idx + 1 == num_batches
        if (batch_idx + 1) % config.accum_iter == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Detach before scoring so the metrics never hold on to the autograd graph.
        probs = outputs.logits.detach().cpu().softmax(1)
        targets = labels.cpu()
        train_acc(probs, targets)
        train_auc(probs, targets)

In [11]:
def evaluate():
    """Run `model` over `val_dataloader` without gradients.

    Each batch is unpacked as (labels, input_ids, attention_mask,
    token_type_ids) after being moved to `device`.

    Returns:
        (val_pred, val_true): the concatenated logits and the concatenated
        labels, both on the CPU.
    """
    model.eval()
    logit_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in val_dataloader:
            labels, input_ids, attention_mask, token_type_ids = (
                item.to(device) for item in batch
            )

            # Labels are not passed to the model here, only the inputs.
            result = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            label_chunks.append(labels.cpu())
            logit_chunks.append(result.logits.cpu())

    return torch.cat(logit_chunks), torch.cat(label_chunks)

In [12]:
def validate(name):
    """Reload the fine-tuned BERT weights saved under `name` and print AUC/ACC/F1.

    Reads `Models/W/{name}.torch` as a state dict for the notebook-level
    `model`, scores the validation set through `evaluate()`, and prints the
    three metrics. This definition replaces the earlier `validate` used for
    the embedding classifier.

    Args:
        name: Checkpoint file stem; also printed as a header line.
    """
    print(name)

    # map_location keeps the load working when the weights were saved on a
    # different device than the one in use now.
    state_dict = torch.load(f'Models/W/{name}.torch', map_location=device)
    model.load_state_dict(state_dict)

    val_pred, val_true = evaluate()
    # Turn logits into class probabilities before scoring.
    val_pred = val_pred.softmax(1)
    print('AUC:', auc(val_pred, val_true))
    print('ACC:', acc(val_pred, val_true))
    print('F1:', f1(val_pred, val_true))

    # Calling a metric object also accumulates its inputs in the metric's state;
    # clear it so repeated validate() calls do not keep growing that state.
    auc.reset()
    acc.reset()
    f1.reset()

In [13]:
# Training batches are shuffled; validation order is fixed. Both loaders request pinned memory.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config.batch_size, pin_memory=True)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=config.batch_size, pin_memory=True)

# Adam over every parameter of the BERT classifier.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Validation metrics, shared by validate() and the training loop.
# NOTE(review): the recorded results print identical ACC and F1 values —
# presumably the default F1 averaging reduces to accuracy here; confirm.
acc = torchmetrics.Accuracy(num_classes=2)
auc = torchmetrics.AUROC(num_classes=2)
f1 = torchmetrics.F1Score(num_classes=2)

# Separate metric objects that train_epoch() updates batch by batch.
train_acc = torchmetrics.Accuracy(num_classes=2)
train_auc = torchmetrics.AUROC(num_classes=2)



In [14]:
# Optional tags describing which fine-tuning options are switched on; a tag is
# empty when its option is off.
name_fields = (
    'rand' if config.rand else '',
    'preproc' if config.preproc else '',
)
# The run name doubles as the checkpoint file stem used by the training loop.
name = 'BERT Tune {0} {1}'.format(*name_fields)

# A timestamp suffix gives every run its own TensorBoard directory.
run_stamp = datetime.datetime.now().strftime("%Y.%m.%d - %H-%M-%S")
board_name = name + run_stamp

log_dir = f"logs/fit/{board_name}"
writer = SummaryWriter(log_dir)

In [15]:
# Ask torch to release cached GPU memory before fine-tuning starts.
torch.cuda.empty_cache()
# Turn on gradient checkpointing for the model before training.
model.gradient_checkpointing_enable()

In [18]:
# Fine-tune BERT with early stopping on validation AUC.
# Interrupting the kernel stops training without raising.
try:
    # Epochs since the last improvement, and how many of those are tolerated.
    wait = 0
    patience = 5

    epoch = 0
    # Despite the name, this holds the best validation AUC so far (higher is better).
    best_loss = -np.inf
    while wait < patience:
        train_epoch()

        val_pred, val_true = evaluate()
        val_pred = val_pred.softmax(1)
        metrics = {
            'AUC': auc(val_pred, val_true),
            'ACC': acc(val_pred, val_true),
            'F1': f1(val_pred, val_true),
        }

        writer.add_scalar('ACC/train', train_acc.compute(), epoch)
        writer.add_scalar('AUC/train', train_auc.compute(), epoch)
        writer.add_scalar('AUC/val', metrics['AUC'], epoch)
        writer.add_scalar('ACC/val', metrics['ACC'], epoch)
        # F1 was computed but never logged; record it alongside the other metrics.
        writer.add_scalar('F1/val', metrics['F1'], epoch)

        # Clear every metric for the next epoch. `f1` was previously left out,
        # so its state kept accumulating across epochs.
        acc.reset()
        auc.reset()
        f1.reset()
        train_acc.reset()
        train_auc.reset()

        wait += 1
        epoch += 1
        if metrics['AUC'] > best_loss:
            # New best: save the weights and restart the patience counter.
            torch.save(model.state_dict(), f'Models/W/{name}.torch')
            best_loss = metrics['AUC']
            wait = 0

except KeyboardInterrupt:
    print("KeyboardInterrupt")
finally:
    # Push any buffered scalars to disk, including after an interrupt.
    writer.flush()

100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [13:59<00:00, 11.66s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [14:51<00:00, 12.38s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [13:29<00:00, 11.24s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [13:14<00:00, 11.04s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [11:51<00:00,  9.88s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [13:22<00:00, 11.15s/it]


### results

In [17]:
# Disappointing result: the "rand preproc" run recorded below scores lower than
# the plain "BERT Tune" run recorded further down (AUC 0.8695 vs 0.8837).
validate(name)

BERT Tune rand preproc
AUC: tensor(0.8695)
ACC: tensor(0.8018)
F1: tensor(0.8018)


In [17]:
# Recorded output below: fine-tune with `rand` on and preprocessing off.
validate(name)

BERT Tune rand
AUC: tensor(0.8831)
ACC: tensor(0.8150)
F1: tensor(0.8150)


In [29]:
# Recorded output below: plain fine-tune with neither `rand` nor preprocessing.
validate(name)

BERT Tune 
AUC: tensor(0.8837)
ACC: tensor(0.8228)
F1: tensor(0.8228)


# End to end transformer

Work in progress.

In [56]:
# Ask torch to release cached GPU memory.
torch.cuda.empty_cache()