In [1]:
import torch
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from transformers import BertConfig, BertModel, BertTokenizer
import numpy as np
import pandas as pd

class NewsDataset(torch.utils.data.Dataset):
    """Dataset that turns one CSV row of texts into BERT sentence embeddings.

    Each row of ``../data/{split}_data.csv`` (indexed by its ``Date`` column)
    is read as: every column except the last three holds a text, and the last
    column holds the label. A text is embedded by averaging the BERT
    last-hidden-state vectors over its tokens.

    Args:
        model_name: Name or path of the pretrained BERT checkpoint to load.
        split: Prefix of the CSV file to read (``"{split}_data.csv"``).
    """

    def __init__(
        self,
        model_name="bert-base-cased",
        split='train'
    ):
        self._device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self._config = BertConfig.from_pretrained(model_name)
        self._bert_model = BertModel.from_pretrained(model_name, config=self._config)
        self._bert_model.eval()
        # Move the encoder to its device once, instead of on every text.
        self._bert_model = self._bert_model.to(self._device)
        # NOTE(review): do_lower_case=True lowercases the input although the
        # default checkpoint name is a cased model - confirm this is intended.
        self._bert_tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
        self._data_df = pd.read_csv(f"../data/{split}_data.csv", index_col="Date")

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self._data_df.index)

    def __getitem__(self, index):
        """Embed the texts of row ``index``.

        Returns:
            A tuple ``(features, label)`` where ``features`` is a float64
            tensor with one row per string-valued text cell (in column order)
            and ``label`` is the tensor of the row's last column. Cells that
            are not strings (e.g. NaN) are skipped, so ``features`` may have
            fewer rows than there are text columns, or zero rows.
        """
        row = self._data_df.iloc[index]
        # Explicit positional access: plain ``row[-1]`` on a label-indexed
        # Series relies on a deprecated positional fallback.
        label = row.iloc[-1]
        text_series = row.iloc[:-3]

        # Collect embeddings in a list so that a missing cell in the middle of
        # the row neither shifts later rows nor overruns a preallocated buffer.
        embeddings = []
        with torch.no_grad():  # inference only; no autograd graph needed
            for text in text_series:
                if not isinstance(text, str):
                    continue
                # truncation=True keeps over-long texts within the model limit.
                tokens = self._bert_tokenizer(text, return_tensors='pt', truncation=True)
                output = self._bert_model(tokens.input_ids.to(self._device))
                # Average the token vectors of the single sequence in the batch.
                mean_vector = output.last_hidden_state[0].mean(dim=0)
                embeddings.append(mean_vector.cpu().numpy())

        if embeddings:
            day_text_matrix = np.stack(embeddings).astype(np.float64)
        else:
            day_text_matrix = np.empty((0, self._config.hidden_size), dtype=np.float64)

        return (
            torch.tensor(day_text_matrix),
            torch.tensor(label)
        )
        


[nltk_data] Downloading package punkt to /home/jorgenv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import torch
from torch import nn

class Model(nn.Module):
    """LSTM binary classifier over sequences of 768-dimensional vectors.

    The LSTM output at the last time step is passed through a linear layer
    and a sigmoid, yielding one probability per sequence.

    Args:
        device: Device identifier kept on the instance as ``_device``. The
            hidden state is allocated wherever the module's parameters live,
            so the module keeps working after being moved with ``.to(...)``.
    """

    def __init__(self, device):
        super(Model, self).__init__()
        self.lstm_size = 768
        self.num_layers = 1
        self._device = device

        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            # nn.LSTM applies dropout only between stacked layers; with a
            # single layer a non-zero value has no effect and raises a
            # UserWarning, so it is only passed when layers are stacked.
            # NOTE(review): if regularisation was intended for the one-layer
            # model, an explicit nn.Dropout before `fc` would be needed.
            dropout=0.6 if self.num_layers > 1 else 0.0,
            batch_first=True
        )
        self.fc = nn.Linear(self.lstm_size, 1)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, lstm_size)`` (batch first).

        Returns:
            ``(output, state)`` where ``output`` has shape ``(batch, 1)`` with
            sigmoid probabilities and ``state`` is the LSTM's final
            ``(h_n, c_n)`` tuple.
        """
        h0, c0 = self.init_hidden(x.size(0))
        output, state = self.lstm(x, (h0, c0))
        # Only the last time step feeds the classifier head.
        output = self.fc(output[:, -1, :])
        output = torch.sigmoid(output)
        return output, state

    def init_hidden(self, sequence_length):
        """Create zero-initialised ``(h0, c0)`` states.

        Args:
            sequence_length: Size of the second dimension of the states.
                ``forward`` passes the batch size here (the name is kept for
                backward compatibility).

        Returns:
            Two zero tensors of shape
            ``(num_layers, sequence_length, lstm_size)`` on the same device
            and with the same dtype as the module's parameters.
        """
        # Follow the parameters rather than the stored device string, which
        # may be stale if the module was moved after construction.
        reference = self.fc.weight
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return reference.new_zeros(shape), reference.new_zeros(shape)

In [3]:
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

def _count_correct(y_pred, y_true):
    """Return how many rounded predictions equal the rounded targets."""
    matches = torch.round(y_pred.detach()) == torch.round(y_true.detach())
    return int(matches.sum().item())


def _ratio_or_nan(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when nothing was seen."""
    return numerator / denominator if denominator else float("nan")


def train(train_dataset, val_dataset, model, device, batch_size=32, max_epochs=100):
    """Train a binary classifier and report per-epoch metrics.

    Args:
        train_dataset: Dataset yielding ``(x, y)`` pairs used for optimisation.
        val_dataset: Dataset yielding ``(x, y)`` pairs used for evaluation only.
        model: Module whose call returns ``(probabilities, state)``; the
            probabilities must be shaped ``(batch, 1)`` and lie in ``[0, 1]``
            because they are fed to ``nn.BCELoss``.
        device: Device the model and every batch are moved to.
        batch_size: Batch size for both data loaders.
        max_epochs: Number of passes over the training data.

    Returns:
        Dict with the lists ``"epoch"``, ``"train_loss"``,
        ``"train_accuracy"`` and ``"val_accuracy"``, one entry per epoch.
        Loss and accuracies are averaged per sample, so a smaller final batch
        does not skew the epoch value. An empty dataset yields NaN.
    """
    model.to(device)

    # NOTE(review): neither loader shuffles, so batches follow dataset order.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
    )
    # Built once; the loader does not change between epochs.
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
    )

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    results = {
        "epoch": [],
        "train_loss": [],
        "train_accuracy": [],
        "val_accuracy": []
    }

    for epoch in range(max_epochs):
        results["epoch"].append(epoch)

        train_loss_sum = 0.0
        train_correct = 0
        train_seen = 0

        model.train()
        for x, y_true in train_dataloader:
            optimizer.zero_grad()
            x = x.to(device)
            y_true = y_true.to(device).reshape((-1, 1)).float()
            y_pred, _ = model(x.float())
            loss = criterion(y_pred, y_true)

            loss.backward()
            optimizer.step()

            batch_count = y_true.size(0)
            # BCELoss returns the batch mean; re-weight by batch size so the
            # epoch value is a true per-sample mean.
            train_loss_sum += loss.item() * batch_count
            train_correct += _count_correct(y_pred, y_true)
            train_seen += batch_count

        train_loss = _ratio_or_nan(train_loss_sum, train_seen)
        results["train_loss"].append(train_loss)
        train_accuracy = _ratio_or_nan(train_correct, train_seen)
        results["train_accuracy"].append(train_accuracy)

        val_correct = 0
        val_seen = 0

        model.eval()
        with torch.no_grad():
            for x, y_true in val_dataloader:
                x = x.to(device)
                y_true = y_true.to(device).reshape((-1, 1)).float()
                y_pred, _ = model(x.float())

                val_correct += _count_correct(y_pred, y_true)
                val_seen += y_true.size(0)

        val_accuracy = _ratio_or_nan(val_correct, val_seen)
        results["val_accuracy"].append(val_accuracy)
        print({ 'epoch': epoch, 'train_loss': train_loss, 'train_accuracy': train_accuracy, 'val_accuracy': val_accuracy })

    return results
        


In [4]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# One dataset per split; each reads "../data/{split}_data.csv".
train_dataset = NewsDataset(split='train')
val_dataset = NewsDataset(split='val')

model = Model(device=device)

# Kept as the cell's last expression so the returned results dict is displayed.
train(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model=model,
    device=device,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.

{'epoch': 0, 'train_loss': 0.6958538862791929, 'train_accuracy': 0.5498251748251749, 'val_accuracy': 0.5493055555555555}
{'epoch': 1, 'train_loss': 0.6801002364267003, 'train_accuracy': 0.5615166083916084, 'val_accuracy': 0.5524305555555555}
{'epoch': 2, 'train_loss': 0.6606309576468035, 'train_accuracy': 0.5984484265734266, 'val_accuracy': 0.5166666666666667}
{'epoch': 3, 'train_loss': 0.6584962369366125, 'train_accuracy': 0.6111778846153846, 'val_accuracy': 0.5260416666666667}
{'epoch': 4, 'train_loss': 0.6162110600959171, 'train_accuracy': 0.6612215909090909, 'val_accuracy': 0.5010416666666667}
{'epoch': 5, 'train_loss': 0.5959938751025633, 'train_accuracy': 0.6789772727272727, 'val_accuracy': 0.5055555555555555}
{'epoch': 6, 'train_loss': 0.5686613853004846, 'train_accuracy': 0.7088068181818182, 'val_accuracy': 0.5399305555555556}
{'epoch': 7, 'train_loss': 0.5296902781860395, 'train_accuracy': 0.7350852272727273, 'val_accuracy': 0.5243055555555556}
{'epoch': 8, 'train_loss': 0.507

KeyboardInterrupt: 

https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

After training the binary classification model, try using a tanh activation function at the output. The output would then be a tanh value, but use a function to transform it to logits:

https://stats.stackexchange.com/a/221905
https://stackoverflow.com/questions/3985619/how-to-calculate-a-logistic-sigmoid-function-in-python/36440463#36440463