# Exercise 6 (solution)

In [None]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from transformers import logging

logging.set_verbosity_error()

## Task 1: Masked calculations in numpy

Calculate the mean over all valid entries in `a` and `b`. 

For `a`, `valid` directly shows which entries are valid. For `b`, `valid` defines which rows are valid. 

In [None]:
# Toy inputs for the masking exercise: a vector, a matrix with one row per
# vector entry, and one boolean validity flag per entry/row.
a = np.arange(0, 4)
b = np.reshape(np.arange(0, 12), (4, 3))
valid = np.array([True, False, True, False], dtype=bool)

In [None]:
# A numpy mask marks the entries to *ignore*, so the validity flags are negated.
masked_a = np.ma.array(a, mask=np.logical_not(valid))
masked_a

In [None]:
# Mean over the unmasked (valid) entries of `a` only.
masked_a.mean()

In [None]:
# `valid` flags whole rows of `b`, so stretch it to one flag per element.
# The column count is read from `b` rather than hard-coded, so the cell keeps
# working if the shape of `b` changes.
valid_mat = valid.reshape(-1, 1).repeat(b.shape[1], axis=1)
# A numpy mask marks the entries to *ignore*, hence the negation.
masked_b = np.ma.array(b, mask=~valid_mat)
masked_b

In [None]:
# Mean over all elements in the valid (unmasked) rows of `b`.
masked_b.mean()

## Task 2: Extract last hidden states

1. Create a model instance using `AutoModel`
2. Define a batch of the tokenized data that helps you prototype a function that can be used with `DatasetDict.map` (in batched mode)
3. Extract the `input_ids` and `attention_mask` from the batch and convert them to `torch.tensor`s. 
4. Use the tensors to extract the last hidden states of the model
5. Convert the last hidden states to a numpy array

In [None]:
# Raw emotion dataset ("split" configuration).
emotions = load_dataset(
    "dair-ai/emotion",
    name="split",
)

# One checkpoint name, reused for every `from_pretrained` call in this notebook.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)


def tokenize(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with the notebook's tokenizer.

    Padding and truncation are both switched on, so all sequences produced
    by one call share the same length.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# Per the `datasets` docs, batch_size=None hands each split to `tokenize` as a
# single batch, so padding=True pads all examples of a split to one length.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
emotions_encoded

In [None]:
from transformers import AutoModel

# Load the pretrained model for the same checkpoint name the tokenizer uses.
model = AutoModel.from_pretrained(model_name)

In [None]:
# Two-example slice of the training split, used to prototype the batched function.
batch = emotions_encoded["train"][:2]

In [None]:
# Convert the two model inputs from the batch's nested lists into tensors.
input_ids, attention_mask = (
    torch.tensor(batch[column]) for column in ("input_ids", "attention_mask")
)

In [None]:
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Pass the inputs by keyword: the position of `attention_mask` in
    # `forward` differs between architectures, so a positional call is only
    # correct for models where it happens to be the second parameter.
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    lhs = output.last_hidden_state.cpu().numpy()

lhs.shape

## Task 3: Average last hidden states

Take the mean over the middle dimension of the last hidden states, but only take elements into account for which the attention mask takes the value 1. 

In [None]:
# Boolean view of the attention mask: True marks tokens to include in the mean.
valid = np.asarray(batch["attention_mask"], dtype=bool)
valid.shape

In [None]:
(batch_size, n_tokens, hidden_dim) = lhs.shape
# Expand the per-token flags to one flag per hidden-state entry so the mask
# has exactly the same shape as `lhs`.
valid = np.repeat(
    np.reshape(valid, (batch_size, n_tokens, 1)),
    hidden_dim,
    axis=-1,
)
valid.shape

In [None]:
# Average over the token axis, ignoring every position whose flag is False;
# `.data` then drops the mask and leaves a plain ndarray.
masked_mean = (
    np.ma.array(lhs, mask=np.logical_not(valid))
    .mean(axis=1)
    .data
)
masked_mean.shape

## Task 4: Now the same using map

1. Write a function called `extract_states` that takes `batch` and `model` as arguments and extracts and averages the last hidden states. This really just means copy-pasting all the steps we did in the last two tasks into one function and saving the result in the batch. 
2. Test the function on your practice batch
3. Apply the function to the encoded emotions dataset using `.map` with the following settings:
    - `batched=True`
    - `batch_size=1000`
    - `fn_kwargs={"model": model}`

**Note**: Step 3 will take a while, let it run while I show the solution and discuss the next steps. If you want, you can use `num_proc=...` to run this step on more than one core. If so, you should set it to the number of physical cores in your computer. 

In [None]:
def extract_states(batch, model):
    """Placeholder for the exercise; does nothing and returns ``None``."""
    return None

In [None]:
def extract_states(batch, model):  # noqa: F811
    """Attach attention-masked mean hidden states to a tokenized batch.

    Args:
        batch: Mapping with equally shaped, padded ``"input_ids"`` and
            ``"attention_mask"`` entries (nested lists of ints).
        model: Callable that accepts ``input_ids`` and ``attention_mask``
            keyword tensors and returns an object whose ``last_hidden_state``
            is a tensor of shape ``(batch, tokens, hidden)``.

    Returns:
        The same ``batch`` object (modified in place) with an additional
        ``"hidden_state"`` entry of shape ``(batch, hidden)``: the mean of the
        last hidden states over the tokens whose attention mask is 1.
    """
    input_ids = torch.tensor(batch["input_ids"])
    attention_mask = torch.tensor(batch["attention_mask"])

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        # Pass the inputs by keyword: the position of `attention_mask` in
        # `forward` differs between architectures, so a positional call is
        # only correct when it happens to be the second parameter.
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        lhs = output.last_hidden_state.cpu().numpy()

    batch_size, n_tokens, hidden_dim = lhs.shape

    # Expand the per-token flags to one flag per hidden-state entry so the
    # mask has exactly the same shape as `lhs`.
    valid = np.array(batch["attention_mask"]).astype(bool)
    valid = valid.reshape(batch_size, n_tokens, 1).repeat(hidden_dim, axis=-1)

    # A numpy mask marks entries to *ignore*, hence the negation; `.data`
    # drops the mask from the result and leaves a plain ndarray.
    masked_mean = np.ma.array(lhs, mask=~valid).mean(axis=1).data

    batch["hidden_state"] = masked_mean
    return batch

In [None]:
# Try the function on the two-example practice batch; note that it also adds
# the "hidden_state" entry to `batch` itself.
extract_states(batch, model)

In [None]:
# Run the extraction over the encoded dataset in batches of 1000 examples;
# `fn_kwargs` forwards the model as the `model` argument of `extract_states`.
last_states = emotions_encoded.map(
    extract_states,
    batched=True,
    batch_size=1000,
    fn_kwargs={"model": model},
)

## Task 5: Use the last hidden states in sklearn

1. Extract the arrays `X_train`, `X_test`, `y_train` and `y_test`
2. Run a logistic regression using sklearn (at default values)
3. Calculate the accuracy score

In [None]:
# Features: the averaged hidden states of each split.
X_train, X_test = (
    np.array(last_states[split]["hidden_state"]) for split in ("train", "test")
)
# Targets: the labels of each split.
y_train, y_test = (
    np.array(last_states[split]["label"]) for split in ("train", "test")
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Default-configured classifier; `fit` returns the estimator itself, so
# construction and training can be chained.
logit = LogisticRegression().fit(X_train, y_train)
# Mean accuracy on the held-out test split.
logit.score(X_test, y_test)