In [1]:
import pandas as pd
from disaster_prediction.dataset import load_raw_train_df, load_raw_val_df

# Load the raw training and validation splits through the project's dataset helpers.
train_df, val_df = load_raw_train_df(), load_raw_val_df()

# Number of rows in each split (shown as the cell output).
tuple(frame.shape[0] for frame in (train_df, val_df))

(6852, 761)

In [2]:
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset

# Checkpoint identifier passed to `from_pretrained`; the tokenizer and the
# classifier in this notebook are both loaded from it so they stay in sync.
MODEL_NAME = 'bert-base-uncased'

def create_dataset(df: pd.DataFrame, include_labels=True) -> TensorDataset:
    """Tokenize the ``text`` column of ``df`` and pack the tensors into a TensorDataset.

    Args:
        df: Frame with a ``text`` column; a ``target`` column is also required
            when ``include_labels`` is true.
        include_labels: When true, the ``target`` column is appended as a third
            tensor after the input ids and the attention mask.

    Returns:
        ``TensorDataset(input_ids, attention_mask[, labels])``.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
    texts = df['text'].tolist()
    # Padding and truncation are enabled so the batch comes back as PyTorch tensors.
    encoding = bert_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    tensors = [encoding['input_ids'], encoding['attention_mask']]
    if include_labels:
        tensors.append(torch.tensor(df['target'].tolist()))
    return TensorDataset(*tensors)

# Build labelled tensor datasets for the training split first, then the validation split.
train_dataset, val_dataset = (create_dataset(split_df) for split_df in (train_df, val_df))

In [3]:
# Sanity check: number of examples in each tensor dataset.
tuple(len(ds) for ds in (train_dataset, val_dataset))

(6852, 761)

In [4]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

BATCH_SIZE = 32

# Training batches are drawn by a random sampler; validation uses a sequential one.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [5]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    backend = 'cuda'
elif torch.backends.mps.is_available():
    backend = 'mps'
else:
    backend = 'cpu'

device = torch.device(backend)
print(f'Using {backend}')

Using mps


In [6]:
from transformers import BertForSequenceClassification
from torchinfo import summary

# Load the checkpoint with a two-label classification head and move it to the
# selected device in one step, then show the per-layer parameter summary.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
summary(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                       Param #
BertForSequenceClassification                                --
├─BertModel: 1-1                                             --
│    └─BertEmbeddings: 2-1                                   --
│    │    └─Embedding: 3-1                                   23,440,896
│    │    └─Embedding: 3-2                                   393,216
│    │    └─Embedding: 3-3                                   1,536
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─BertEncoder: 2-2                                      --
│    │    └─ModuleList: 3-6                                  85,054,464
│    └─BertPooler: 2-3                                       --
│    │    └─Linear: 3-7                                      590,592
│    │    └─Tanh: 3-8                                        --
├─Dropout: 1-2                                               --
├─L

In [7]:
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import numpy as np

NUM_EPOCHS = 4
LEARNING_RATE = 5e-5

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per training batch, so its horizon is the
# total number of batches across all epochs.
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

# Validation metrics, one entry appended per epoch.
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for epoch in range(NUM_EPOCHS):
    # ---- Training pass -------------------------------------------------
    model.train()
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels), as built by create_dataset.
        batch = [t.to(device) for t in batch]
        outputs = model(batch[0], attention_mask=batch[1], labels=batch[2])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- Validation pass -----------------------------------------------
    model.eval()
    full_predictions = []
    full_references = []
    # No gradients are needed for evaluation; disabling autograd avoids building
    # and retaining a graph for every validation batch.
    with torch.no_grad():
        for batch in val_dataloader:
            batch = [t.to(device) for t in batch]
            # Labels are not passed: only the logits are used here, so there is
            # no reason to compute a loss.
            outputs = model(batch[0], attention_mask=batch[1])
            logits = outputs.logits.to('cpu').numpy()
            predictions = np.argmax(logits, axis=-1).tolist()
            full_predictions += predictions
            full_references += batch[2].to('cpu').tolist()

    accuracy = accuracy_score(y_pred=full_predictions, y_true=full_references)
    f1 = f1_score(y_pred=full_predictions, y_true=full_references)
    recall = recall_score(y_pred=full_predictions, y_true=full_references)
    precision = precision_score(y_pred=full_predictions, y_true=full_references)

    print(f'Epoch: {epoch+1}')
    print(f'Accuracy: {accuracy}')
    print(f'f1: {f1}')
    print(f'Recall: {recall}')
    print(f'Precision: {precision}')

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

  0%|          | 0/860 [00:00<?, ?it/s]

Epoch: 1
Accuracy: 0.8226018396846255
f1: 0.7969924812030075
Recall: 0.7748538011695907
Precision: 0.8204334365325078
Epoch: 2
Accuracy: 0.8291721419185283
f1: 0.7936507936507936
Recall: 0.7309941520467836
Precision: 0.8680555555555556
Epoch: 3
Accuracy: 0.8160315374507228
f1: 0.7910447761194029
Recall: 0.7748538011695907
Precision: 0.8079268292682927
Epoch: 4
Accuracy: 0.8212877792378449
f1: 0.7957957957957958
Recall: 0.7748538011695907
Precision: 0.8179012345679012


In [8]:
import os

MODEL_PATH = '../models/bert-base-uncased-on-text.pt'

# torch.save does not create missing directories; make sure the target folder
# exists so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Serializes the entire model object (not just a state dict); it is read back
# later in this notebook with torch.load.
torch.save(model, MODEL_PATH)

In [13]:
from disaster_prediction.dataset import load_raw_test_df

def evaluate_df(df: pd.DataFrame) -> pd.DataFrame:
    """Predict a label for every row of ``df`` using the model saved at ``MODEL_PATH``.

    Args:
        df: Frame with a ``text`` column (tokenized via ``create_dataset``) and an
            ``id`` column that is carried through to the result.

    Returns:
        A frame with columns ``id`` (taken from ``df``) and ``target`` (the argmax
        of the model's logits for each row, in the same row order as ``df``).
    """
    # SECURITY: the checkpoint is a fully pickled model object, so loading it can
    # execute arbitrary code. Only load files produced by this notebook.
    # `weights_only=False` is stated explicitly because a full-model pickle cannot
    # be loaded in weights-only mode; `map_location` lets a checkpoint saved on one
    # accelerator load on whatever device is active now.
    model = torch.load(MODEL_PATH, map_location=device, weights_only=False)
    model.to(device)
    model.eval()

    dataset = create_dataset(df, include_labels=False)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, sampler=SequentialSampler(dataset))

    predictions = []
    # Inference only: keep autograd disabled for the whole loop.
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            logits = outputs.logits.to('cpu').numpy()
            predictions += np.argmax(logits, axis=-1).tolist()
    return pd.DataFrame({'id': df['id'], 'target': predictions})

# Run inference on the raw test split and display the resulting predictions.
test_df = load_raw_test_df()
results = evaluate_df(test_df)
results

  model = torch.load(MODEL_PATH)


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [14]:
# Distribution of predicted labels across the test set.
predicted_targets = results['target']
predicted_targets.value_counts()

target
0    1908
1    1355
Name: count, dtype: int64

In [16]:
import os

SUBMISSION_PATH = '../data/submissions/bert-base-uncased-on-text-only.csv'

# to_csv does not create missing directories; make sure the submissions folder
# exists so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)

# index=False keeps only the `id` and `target` columns in the file.
results.to_csv(SUBMISSION_PATH, index=False)