In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset, Features, Sequence, Value, Dataset
import torch

In [2]:
tmp = Dataset.from_pandas(pd.read_csv('train.csv').fillna('').head(100))
data = tmp.map(lambda ex: {'labels': ex['target']}, batched=True).remove_columns(['target', 'standard_error', 'id'])

  0%|          | 0/1 [00:00<?, ?ba/s]

In [3]:
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [4]:
MAX_LENGTH = 512
data = data.map(lambda e: tokenizer(e['excerpt'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

  0%|          | 0/1 [00:00<?, ?ba/s]

In [34]:
dataloader = torch.utils.data.DataLoader(data, batch_size=32)

In [48]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x15355d820>

In [39]:
class AttentionHead(torch.nn.Module):
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = torch.nn.Linear(in_features, hidden_dim)
        self.V = torch.nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        att = torch.tanh(self.W(features))

        score = self.V(att)

        attention_weights = torch.softmax(score, dim=1)

        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

In [45]:
class Model(torch.nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.head = AttentionHead(768,768,1)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        x = self.head(xb[0])
        return x

In [47]:
model = Model()

embeddings = list()
with torch.no_grad():
    for i, inputs in enumerate(dataloader):
        inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
        outputs = model(**inputs)
        outputs = outputs.detach().cpu().numpy()
        embeddings.extend(outputs)

AttributeError: 'list' object has no attribute 'reshape'

In [5]:
# Compute metrics is an argument of the Trainer
def compute_metrics(pred_results):
    """For computing RMSE inside the training loop"""
    y_pred = pred_results.predictions.squeeze()
    y_true = pred_results.label_ids
    return {"rmse": rmse(y_true, y_pred)}

def rmse(y_true, y_pred):
    return np.sqrt(((y_true - y_pred) ** 2).mean().item())