# Probing Workshop - 33. TaCoS
Katja Konermann & Mikhail Sonkin

Let's explore probing together! Make sure you connect to a GPU runtime in this notebook.

First, we will have to install the relevant packages

In [None]:
# install
!pip3 install datasets scikit-learn transformers

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/542.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.7

## Importing the code
Next, we will have to import our code from the Github repository.


In [None]:
!git clone https://github.com/katjakon/probing_workshop

Cloning into 'probing_workshop'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 55 (delta 29), reused 38 (delta 15), pack-reused 0[K
Receiving objects: 100% (55/55), 27.09 KiB | 5.42 MiB/s, done.
Resolving deltas: 100% (29/29), done.


Let's import all the relevant packages and our code!

In [None]:
# import statements
from datasets import Dataset, DatasetDict, load_dataset, Features, Sequence, Value, ClassLabel
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# our own code
from probing_workshop.dataset_processor import DatasetProcessor
from probing_workshop.probes import ClassifierProbe, ControlTaskProbe, MajorityBaseline, RandomProbe

## Datasets

### How Datasets operate

We will be working with Huggingface's Dataset class, as well as the DatasetProcessor class that we wrote for this workshop to have a more intuitive framework for probing.

Let's look at the WikiANN dataset, which contains labeled named-entity data from Wikipedia.

In [None]:
# Load the English ("en") configuration of the WikiANN dataset from the Hugging Face Hub
example_datasetdict = load_dataset("wikiann", "en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/748k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/748k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
# Display the loaded object: its splits, their feature names and row counts
example_datasetdict

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

This is a **DatasetDict** object, which stores three **Dataset** objects: train, validation, and test.

Each dataset has the same four features. Let's look at one entry of the train split:



In [None]:
# Take the training split and look at its third entry (index 2)
example_dataset = example_datasetdict['train']
example_entry = example_dataset[2]
# Bare last expression: displays the entry as a dict of its feature columns
example_entry

{'tokens': ['Karl', 'Ove', 'Knausgård', '(', 'born', '1968', ')'],
 'ner_tags': [1, 2, 2, 0, 0, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en'],
 'spans': ['PER: Karl Ove Knausgård']}

These numbers in 'ner_tags' don't really mean anything to us humans. Each number corresponds to a NER tag. How do we figure out which is which? That's where **features** come in:

In [None]:
# Display the feature schema of the training split (one entry per column)
example_dataset.features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None),
 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In the WikiANN dataset, a 'ner_tags' entry is a **Sequence** of **ClassLabels**, the possible **names** of which are:
* O (null tag)
* B-PER (beginning of Person)
* I-PER (inside Person)
* B-ORG (beginning of Organisation)
* I-ORG (inside Organisation)
* B-LOC (beginning of Location)
* I-LOC (inside Location)

Here's how we can extract those names:

In [None]:
# 'ner_tags' is a Sequence whose inner feature is a ClassLabel; .names lists the tag names
example_dataset.features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

With the `.int2str` method we can convert a single tag id into its name:

In [None]:
# Look up the tag name that belongs to id 0
example_dataset.features['ner_tags'].feature.int2str(0)

'O'

We can map the numbers to the actual labels:

In [None]:
# int2str also accepts a whole sequence of ids and returns the list of matching names
example_labels = example_dataset.features['ner_tags'].feature.int2str(example_entry['ner_tags'])
example_labels

['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O']

In [None]:
# Pair every token with its readable NER label and print one pair per line
token_label_pairs = zip(example_entry['tokens'], example_labels)
for token, label in token_label_pairs:
  print(f"token:\t{token};\tlabel:\t{label}")

token:	Karl;	label:	B-PER
token:	Ove;	label:	I-PER
token:	Knausgård;	label:	I-PER
token:	(;	label:	O
token:	born;	label:	O
token:	1968;	label:	O
token:	);	label:	O


### Toy Dataset

Let's create a DatasetDict from scratch. We'll use a couple of entries from the Universal Dependencies en_pronouns dataset for that:

In [None]:
# Tokenized training sentences: one list of tokens per sentence
train_sentences = [
    ['It', 'is', 'hers', '.'],
    ['The', 'car', 'is', "n't", 'yours', '.'],
    ['Dealers', 'like', 'cleaning', 'his', '.'],
    ['Mine', "'ll", 'do', '.'],
    ['One', 'of', 'mine', 'was', 'cleaned', '.'],
    ['Dealers', 'like', 'seeing', 'cars', ',', 'especially', 'theirs', '.']
]

# Test sentences: the same six constructions as above with the pronouns swapped
test_sentences = [
    ['It', 'is', 'his', '.'],
    ['The', 'car', 'is', "n't", 'hers', '.'],
    ['Dealers', 'like', 'cleaning', 'mine', '.'],
    ['Hers', "'ll", 'do', '.'],
    ['One', 'of', 'theirs', 'was', 'cleaned', '.'],
    ['Dealers', 'like', 'seeing', 'cars', ',', 'especially', 'yours', '.']
]

# One tag id per token. Each id indexes into the 'upos' ClassLabel names that are
# declared in the following cell (e.g. 11 = PRON, 17 = AUX, 1 = PUNCT).
train_tags = [
    [11, 17, 11, 1],
    [8, 0, 17, 7, 11, 1],
    [0, 16, 16, 11, 1],
    [11, 17, 17, 1],
    [3, 2, 11, 17, 16, 1],
    [0, 16, 16, 0, 1, 14, 11, 1]
]

# Same tag sequences as train_tags: the test sentences mirror the training
# sentences token for token, only the pronoun differs.
test_tags = [
    [11, 17, 11, 1],
    [8, 0, 17, 7, 11, 1],
    [0, 16, 16, 11, 1],
    [11, 17, 17, 1],
    [3, 2, 11, 17, 16, 1],
    [0, 16, 16, 0, 1, 14, 11, 1]
]

In [None]:
# Schema of the toy data: string tokens plus one UPOS class label per token.
# The position of a name in `names` is the integer id used in train_tags / test_tags.
toy_features = Features(
    {
        "tokens": Sequence(feature=Value(dtype='string')),
        "upos": Sequence(
            feature=ClassLabel(
                names=[
                    'NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ',
                    'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON',
                    'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX',
                ]
            )
        ),
    }
)
toy_features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'upos': Sequence(feature=ClassLabel(names=['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX'], id=None), length=-1, id=None)}

In [None]:
# Build one Dataset per split from the plain Python lists, using the shared schema
toy_dataset_train = Dataset.from_dict(
    {"tokens": train_sentences, "upos": train_tags},
    features=toy_features,
)

toy_dataset_test = Dataset.from_dict(
    {"tokens": test_sentences, "upos": test_tags},
    features=toy_features,
)

# Bundle both splits under their usual names
toy_dataset = DatasetDict(
    {
        "train": toy_dataset_train,
        "test": toy_dataset_test,
    }
)

In [None]:
# Display the toy DatasetDict: two splits with the 'tokens' and 'upos' columns
toy_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'upos'],
        num_rows: 6
    })
    test: Dataset({
        features: ['tokens', 'upos'],
        num_rows: 6
    })
})

Now let's wrap the dataset in our DatasetProcessor class, which makes it easier to use with our probe models. To initialize it, we specify the dataset, the model we will use, and the names of the token and label columns.

dataset_type is set to "tokens" just in case we will ever develop one for sentences :)

In [None]:
model_name = "google-bert/bert-base-cased"
# Build the processor from the raw train/test splits rather than from
# `toy_dataset` itself: the name is rebound to a DatasetProcessor here, so
# wrapping `toy_dataset` would feed a DatasetProcessor back into
# DatasetProcessor whenever this cell is executed a second time.
toy_dataset = DatasetProcessor(
    DatasetDict({"train": toy_dataset_train, "test": toy_dataset_test}),
    model_name,
    "tokens",  # name of the tokens column
    "upos",    # name of the labels column
    dataset_type="tokens",
)



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Creating dict for probing...
Embeddings for train


100%|██████████| 6/6 [00:04<00:00,  1.48it/s]


Embeddings for test


100%|██████████| 6/6 [00:00<00:00,  9.78it/s]


# Probing

## Classifier Probe
Different types of probes and baselines take different arguments. Let's start with the most relevant one: the classifier probe!

In [None]:
# Print the class documentation: constructor arguments and the fit / predict methods
help(ClassifierProbe)

Help on class ClassifierProbe in module probing_workshop.probes:

class ClassifierProbe(builtins.object)
 |  ClassifierProbe(data_set, clf, clf_kwargs: dict = None) -> None
 |  
 |  Methods defined here:
 |  
 |  __init__(self, data_set, clf, clf_kwargs: dict = None) -> None
 |      Initialize a probing classifier.
 |      
 |      Args:
 |          data_set (Custom Dataset type): Should have attributes for embeddings, labels & strings.
 |          clf (scikit-learn classifier): For instance, SGDClassifier or MLPClassifier
 |          clf_kwargs (dict): Keyword arguments to be given to clf
 |  
 |  fit(self)
 |      Fit the given probe to the given classifier.
 |  
 |  predict(self, embeddings)
 |      Predict given instances.
 |      
 |      Args:
 |          embeddings (matrix-like): Predict labels based on given embeddings.
 |      
 |      Returns:
 |          1-d array: Predicted labels.
 |  
 |  ----------------------------------------------------------------------
 |  Data desc

The probe is initialized with a data set and a `scikit-learn` classifier like `MLPClassifier`. Optionally, you can specify keyword arguments for the classifier.

You can look at the documentation of `scikit-learn` to find out which hyperparameters you can adjust:

- [SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)
- [MLPClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)

In [None]:
# Initialize Probe
# clf_kwargs is forwarded to the classifier; fixing random_state makes the
# SGD training (and therefore the score reported below) reproducible across runs.
probe = ClassifierProbe(data_set=toy_dataset, clf=SGDClassifier, clf_kwargs={"random_state": 42})


With the method `fit`, you can train the probing classifier on the given data set.

In [None]:
# Fit: train the probing classifier on the data set it was initialized with
probe.fit()

After we have fitted the probing classifier, we can use it to predict and evaluate!

For **predicting**, we have to give the probing classifier the embeddings of the instances we want to predict.

In [None]:
# Predict
# Embeddings of the test split, named as in the control-task cell further down
test_embeds = toy_dataset["test"]["embeddings"]
predictions = probe.predict(test_embeds)

Let's evaluate how well our probe performs by comparing its predictions on the test set with the gold labels.

In [None]:
# Evaluate with gold labels: share of test tokens whose predicted label equals the gold label
accuracy_score(y_true=toy_dataset["test"]["labels"], y_pred=predictions)

0.8181818181818182

But what does that mean? Let's also compare our probe to some baselines:

## Random Initialization Baseline

Initialising the random baseline works the same as before. Internally, the random initialization baseline generates a new random embedding for each token. Can a probe still extract information out of this?

In [None]:
# Initialize and fit
# Same arguments as for the classifier probe: the data set and a scikit-learn classifier class
rand_probe = RandomProbe(data_set=toy_dataset, clf=SGDClassifier)

rand_probe.fit()

When predicting, we have to give the probe the token ids instead of the embeddings of the instances we want to classify.

In [None]:
# Predict with token ids (this probe takes ids, not embeddings)
test_ids = toy_dataset["test"]["ids"]
predictions = rand_probe.predict(token_ids=test_ids)

Now let's evaluate: What do we expect?

In [None]:
# Evaluate the random-initialization probe against the gold labels of the test split
accuracy_score(y_true=toy_dataset["test"]["labels"], y_pred=predictions)

1.0

In [None]:
# Inspect the predicted labels themselves (one label string per test token)
predictions

array(['PRON', 'AUX', 'PRON', 'PUNCT', 'DET', 'NOUN', 'AUX', 'PART',
       'PRON', 'PUNCT', 'NOUN', 'VERB', 'VERB', 'PRON', 'PUNCT', 'PRON',
       'AUX', 'AUX', 'PUNCT', 'NUM', 'ADP', 'PRON', 'AUX', 'VERB',
       'PUNCT', 'NOUN', 'VERB', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON',
       'PUNCT'], dtype='<U5')

## Majority Baseline
This is a very simple concept: For each token, assign it the label it is most frequently associated with. Tokens that were not seen during training are assigned the most common label overall.

We initialize it by giving it a data set. No need to specify a classifier type.

In [None]:

# Initialize: only the data set is passed, no classifier type
majority_baseline = MajorityBaseline(data_set=toy_dataset)
# Fit
majority_baseline.fit()

Again, predict and evaluate. For prediction, this baseline needs the token ids.

In [None]:

# Predict from the token ids of the test split
test_ids = toy_dataset["test"]["ids"]
predictions = majority_baseline.predict(test_ids)

# Evaluate against the gold labels of the test split
accuracy_score(y_true=toy_dataset["test"]["labels"], y_pred=predictions)

1.0

## Control Task Probing
For a control task, each word type gets assigned a randomly sampled label from a label set with the same cardinality as the real one. Ideally, a probe should perform poorly on a control task and well on the actual probing task. If a probe performs very well on a control task, it is able to simply memorize the word types.

We initialize and fit it exactly like the classifier probe.

In [None]:
# Initialize with the data set and a scikit-learn classifier class
control_task_probe = ControlTaskProbe(data_set=toy_dataset, clf=SGDClassifier)
# Fit
control_task_probe.fit()

Again, let's evaluate:

In [None]:
# Embeddings of the test instances, used as input for the prediction below
test_embeds = toy_dataset["test"]["embeddings"]

# Map the tokens to their control labels
y_true_control = control_task_probe.ids2control(toy_dataset["test"]["ids"])

# Predict
predictions = control_task_probe.predict(test_embeds)
# Evaluation against the control labels (not the gold labels of the data set)
accuracy_score(y_true=y_true_control , y_pred=predictions)

0.6666666666666666

## Probing Experiments
Now your turn! Choose one of the tasks below and perform some probing experiments and baselines.
We have already specified the data set. Copy and adjust code from above to run your own probing experiments.
Some things to try out and think about:
- Choose between `MLPClassifier` and `SGDClassifier`
- Adjust different hyperparameters
- Evaluate the classifiers and compare them to the baselines
- What do you conclude from this? Has the BERT model learned knowledge of these tasks?

### Part-of-Speech Tagging

In [None]:
# Data Set
model_name = "google-bert/bert-base-cased"
MAX_SENTENCES = 1000  # sentences kept per split, for the sake of time

pos_raw = load_dataset("universal_dependencies", "en_lines")

# Keep every split except 'validation' (no need :)) and trim each one.
# min() guards against a split with fewer rows than MAX_SENTENCES, for which
# select(range(MAX_SENTENCES)) would raise an out-of-range error.
pos_trimmed = DatasetDict({
    split: pos_raw[split].select(range(min(MAX_SENTENCES, len(pos_raw[split]))))
    for split in pos_raw.keys()
    if split != "validation"
})

# convert it
pos_dataset = DatasetProcessor(pos_trimmed, model_name, "tokens", "upos")

Downloading data:   0%|          | 0.00/837k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/296k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/272k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1032 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1035 [00:00<?, ? examples/s]



Creating dict for probing...
Embeddings for train


100%|██████████| 1000/1000 [02:14<00:00,  7.44it/s]


Embeddings for test


100%|██████████| 1000/1000 [02:13<00:00,  7.49it/s]


In [None]:
# Create Probing Classifier

# Predict the test instances

# Evaluate

In [None]:
# Try out some baselines!

### Named Entity Recognition

In [None]:
# Data Set
model_name = "google-bert/bert-base-cased"
MAX_SENTENCES = 1000  # sentences kept per split, for the sake of time

ner_raw = load_dataset("wikiann", "en")

# Keep every split except 'validation' and trim each one.
# min() guards against a split with fewer rows than MAX_SENTENCES, for which
# select(range(MAX_SENTENCES)) would raise an out-of-range error.
ner_trimmed = DatasetDict({
    split: ner_raw[split].select(range(min(MAX_SENTENCES, len(ner_raw[split]))))
    for split in ner_raw.keys()
    if split != "validation"
})

# Convert it: 'tokens' is the token column, 'ner_tags' the label column
ner_dataset = DatasetProcessor(ner_trimmed, model_name, "tokens", "ner_tags")



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Creating dict for probing...
Embeddings for test


100%|██████████| 10/10 [00:07<00:00,  1.41it/s]


Embeddings for train


100%|██████████| 10/10 [00:02<00:00,  3.93it/s]


In [None]:
# Create Probing Classifier

# Predict the test instances

# Evaluate

In [None]:
# Try out some baselines!