In [45]:
import torch
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from transformers import BertConfig, BertModel, BertTokenizer
import numpy as np
import pandas as pd

class NewsDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        model_name="bert-base-cased",
        split='train'
    ):
        self._device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self._config = BertConfig.from_pretrained(model_name)
        self._bert_model = BertModel.from_pretrained(model_name, config=self._config)
        self._bert_model.eval()
        self._bert_tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
        self._data_df = pd.read_csv(f"../data/{split}_data.csv", index_col="Date")

    def __len__(self):
        return len(self._data_df.index)

    def __getitem__(self, index):
        row = self._data_df.iloc[index]
        label = row[-1]
        text_series = row[:-3]
        nan_count = text_series.isna().sum()
        day_text_matrix = np.empty((text_series.size - nan_count, 768))
        for index, text in enumerate(text_series):
            if isinstance(text, str):
                tokens = self._bert_tokenizer(text, return_tensors='pt')
                self._bert_model.to(self._device)
                output = self._bert_model(tokens.input_ids.to(self._device))
                latent_matrix = output.last_hidden_state.to('cpu').detach().numpy()[0]
                mean_vector = np.mean(latent_matrix, axis=0).reshape((1, latent_matrix.shape[1]))
                day_text_matrix[index, :] = mean_vector


[nltk_data] Downloading package punkt to /home/jorgenv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
dataset = NewsDataset()
dataset.__getitem__(1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'pandas.core.series.Series'>
(25, 768)


In [44]:
dataset.__getitem__(2)

<class 'pandas.core.series.Series'>
(25, 768)
