In [12]:
import pandas as pd
import torch
from torch import nn
from transformers import BertModel, BertTokenizer, get_scheduler
from disaster_prediction.utils import KeyedDataset
from disaster_prediction.dataset import load_raw_train_df, load_raw_val_df, load_raw_test_df
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
import os

In [2]:
class Model(nn.Module):
    """Two-encoder BERT classifier over a text field and a keyword field.

    The text is encoded with ``bert-large-uncased`` and the keyword with
    ``bert-base-uncased``. The two pooled outputs are concatenated and passed
    through a small MLP head that produces two class logits.
    """

    def __init__(self):
        super().__init__()
        self.text_bert = BertModel.from_pretrained('google-bert/bert-large-uncased')
        # Pooled-output width, read from the checkpoint config rather than hard-coded.
        self.text_bert_no_hidden = self.text_bert.config.hidden_size

        self.keyword_bert = BertModel.from_pretrained('google-bert/bert-base-uncased')
        self.keyword_bert_no_hidden = self.keyword_bert.config.hidden_size

        # Both input widths are known up front, so use concrete Linear layers:
        # the head's parameters exist (and are initialised) at construction time
        # instead of being materialised lazily on the first forward pass.
        self.head = nn.Sequential(
            nn.Linear(self.text_bert_no_hidden + self.keyword_bert_no_hidden, 512),
            nn.ReLU(),
            nn.Linear(512, 2))

    def forward(self, text_input_ids: torch.Tensor,
                text_attention_mask: torch.Tensor,
                keyword_input_ids: torch.Tensor,
                keyword_attention_mask: torch.Tensor,
                labels: torch.Tensor = None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            text_input_ids: Token ids for the text encoder.
            text_attention_mask: Attention mask matching ``text_input_ids``.
            keyword_input_ids: Token ids for the keyword encoder.
            keyword_attention_mask: Attention mask matching ``keyword_input_ids``.
            labels: Optional class indices; when provided a loss is returned too.

        Returns:
            ``{'logits': ...}`` without labels, or ``{'loss': ..., 'logits': ...}``
            with labels.
        """
        text_outputs = self.text_bert(input_ids=text_input_ids, attention_mask=text_attention_mask)
        keyword_outputs = self.keyword_bert(input_ids=keyword_input_ids, attention_mask=keyword_attention_mask)

        # Join the two pooled representations along the feature axis.
        full_hidden_layer = torch.cat(
            (text_outputs['pooler_output'], keyword_outputs['pooler_output']), dim=1)

        logits = self.head(full_hidden_layer)

        if labels is None:
            return {'logits': logits}

        loss = nn.functional.cross_entropy(logits, labels)
        return {'loss': loss, 'logits': logits}

In [3]:
def create_dataset(df: pd.DataFrame, include_labels=True) -> KeyedDataset:
    """Tokenize the ``text`` and ``keyword`` columns of ``df`` into a ``KeyedDataset``.

    The input frame is not modified: missing keywords are replaced by empty
    strings on a local copy of the column only.

    Args:
        df: Frame with ``text`` and ``keyword`` columns, plus ``target`` when
            ``include_labels`` is true.
        include_labels: Whether to add a ``labels`` tensor built from ``target``.

    Returns:
        A ``KeyedDataset`` keyed by ``text_input_ids``, ``text_attention_mask``,
        ``keyword_input_ids``, ``keyword_attention_mask`` and, optionally,
        ``labels``.
    """
    # fillna returns a new Series, so the caller's DataFrame stays untouched.
    keywords = df['keyword'].fillna('').tolist()

    text_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-large-uncased', do_lower_case=True)
    text_encoding = text_tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors='pt')

    keyword_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased', do_lower_case=True)
    keyword_encoding = keyword_tokenizer(keywords, padding=True, truncation=True, return_tensors='pt')

    tensors = {
        'text_input_ids': text_encoding['input_ids'],
        'text_attention_mask': text_encoding['attention_mask'],
        'keyword_input_ids': keyword_encoding['input_ids'],
        'keyword_attention_mask': keyword_encoding['attention_mask'],
    }
    if include_labels:
        tensors['labels'] = torch.tensor(df['target'].tolist())

    return KeyedDataset(**tensors)

In [4]:
def train(model: nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          eval_dataloader: torch.utils.data.DataLoader = None,
          device: torch.device = torch.device('cpu'),
          lr: float = 5e-5,
          epochs: int = 4):
    """Fine-tune ``model`` with AdamW and a 'linear' schedule without warm-up.

    Args:
        model: Module whose forward returns a dict with ``'loss'`` and ``'logits'``.
        train_dataloader: Yields dict batches that are passed to the model as kwargs.
        eval_dataloader: Optional loader; when given, the F1 score is printed after
            every epoch. Its batches must contain a ``'labels'`` entry.
        device: Device the model and every batch are moved to.
        lr: Initial learning rate for AdamW.
        epochs: Number of passes over ``train_dataloader``.
    """
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    num_training_steps = len(train_dataloader) * epochs
    lr_scheduler = get_scheduler('linear', optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # The context manager closes the progress bar even if a training step raises.
    with tqdm(range(num_training_steps)) as progress:
        for epoch in range(epochs):
            model.train()
            for batch in train_dataloader:
                batch = {key: value.to(device) for key, value in batch.items()}
                loss = model(**batch)['loss']
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress.update()
                progress.set_postfix({'loss': loss.item()})

            if eval_dataloader is not None:
                f1 = _evaluate_f1(model, eval_dataloader, device)
                print(f'EPOCH {epoch + 1}/{epochs} F1: {f1}')


def _evaluate_f1(model, dataloader, device):
    """Return the F1 score of ``model``'s argmax predictions over ``dataloader``."""
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch)['logits']
            predicted.extend(logits.argmax(dim=1).tolist())
            expected.extend(batch['labels'].tolist())
    return f1_score(expected, predicted)

In [5]:
# Seed torch's global RNG before anything random is created, so the shuffle
# order of the training loader and the weight initialisation are repeatable.
SEED = 42
torch.manual_seed(SEED)

train_df = load_raw_train_df()
val_df = load_raw_val_df()

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)

# Only the training loader shuffles; validation order stays fixed.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

model = Model()

# Pick the accelerator: MPS first, then CUDA, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print('Using mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using cuda')
else:
    device = torch.device('cpu')
    print('Using cpu')

Using mps


In [6]:
# Fine-tune on the training split; the validation loader is passed for evaluation.
train(model, train_dataloader, val_dataloader,
      device=device,
      lr=5e-5,
      epochs=4)

  0%|          | 0/860 [00:00<?, ?it/s]

EPOCH 1/4 F1: 0.8016997167138811
EPOCH 2/4 F1: 0.806697108066971
EPOCH 3/4 F1: 0.8145896656534954
EPOCH 4/4 F1: 0.8190184049079755


In [7]:
def save_model(model: nn.Module, path: str):
    """Write ``model``'s ``state_dict`` to ``path``, creating parent directories.

    Args:
        model: Module whose parameters and buffers are saved.
        path: Destination file; any missing parent directory is created first.
    """
    parent = os.path.dirname(path)
    if parent:
        # torch.save does not create directories, so make sure the target exists.
        os.makedirs(parent, exist_ok=True)
    torch.save(model.state_dict(), path)

In [9]:
# Checkpoint file for the fine-tuned weights, relative to the working directory.
model_path = os.path.join(
    '../models',
    'bert-large-on-text-and-base-on-keyword.pt',
)

save_model(model=model, path=model_path)

In [14]:
def predict_df(df: pd.DataFrame, model: nn.Module, model_path: str) -> pd.DataFrame:
    """Load weights from ``model_path`` into ``model`` and predict a class per row.

    Inference runs on the device the model's parameters already live on, so the
    function does not depend on any notebook-level ``device`` variable. The
    input frame is left unmodified.

    Args:
        df: Frame with an ``id`` column plus the columns ``create_dataset`` needs.
        model: Module to load the checkpoint into; its forward must return a
            dict containing ``'logits'``.
        model_path: Path of a ``state_dict`` checkpoint.

    Returns:
        A new frame with columns ``id`` and ``target`` sharing ``df``'s index.
    """
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # map_location keeps loading robust when the checkpoint was saved from another device.
    model.load_state_dict(torch.load(model_path, map_location=model_device, weights_only=True))

    # Pass a copy: dataset construction may fill missing keywords in place.
    dataset = create_dataset(df.copy(), include_labels=False)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)

    predictions = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            batch = {key: value.to(model_device) for key, value in batch.items()}
            logits = model(**batch)['logits']
            predictions.extend(logits.argmax(dim=1).tolist())

    # assign() builds a new frame instead of adding a column to the caller's df.
    return df[['id']].assign(target=predictions)

In [15]:
# Predict on the raw test split using the checkpoint written to `model_path`.
test_df = load_raw_test_df()
results = predict_df(df=test_df, model=model, model_path=model_path)
results

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [16]:
# Write the id/target pairs as the submission file, without the index column.
submission_path = os.path.join('../data', 'submissions', 'bert-large-on-text-and-base-on-keyword.csv')
results.to_csv(submission_path, index=False)