# Analysis of BERT's knowledge of the Italian Subjunctive

### Imports

In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer
import re
from datasets import load_dataset

### Functions for corpus creation and extraction of sentences with specific words

In [3]:
# Returns a list of strings created from the opus100 dataset
def create_corpus():
    corpus = []
    for entry in dataset['train']:
        corpus.append(entry['translation']['it'])


    sentence_pattern = re.compile(r'[^.!?]*[.!?]')
    sentences = []
    for entry in corpus:
        sents = sentence_pattern.findall(entry)
        for sent in sents:
            sentences.append(sent.strip())
    return sentences


# Given a list of strings, returns a list of all the strings that contain the word
def find_instances(corpus, word):
    instances = []
    for sentence in corpus:
        pattern = re.compile(fr'\b{re.escape(word)}\b') #, flags=re.IGNORECASE
        if pattern.findall(sentence):
            instances.append(sentence.strip())
    for sentence in instances:
        if word.upper() in sentence:
            instances.remove(sentence)
    return instances


# Returns tokenized sentence, replacing specified token with mask token id
def tokenize_and_mask(sentence, word):
    
    mapping = tokenizer(sentence, return_tensors='pt')
    tokens = mapping['input_ids'][0]
    
    mask_id = 103
    word_id_lower = tokenizer.convert_tokens_to_ids(word.lower())
    word_id_cap = tokenizer.convert_tokens_to_ids(word.capitalize())
    
    for index, value in enumerate(tokens):
        if value.item() == word_id_lower or value.item() == word_id_cap:
            tokens[index] = mask_id
            break
    mapping['input_ids'][0] = tokens
    return mapping



In [4]:
dataset = load_dataset("opus100", "en-it")
corpus = create_corpus()

In [63]:
len(find_instances(corpus, 'sia')), len(find_instances(corpus, 'è'))

(12587, 76032)

In [10]:
class Bert():
    
    def __init__(self, name, mode, tokenizer):
        self.name = name
        self.model = model
        self.tokenizer = tokenizer
        self.mask_id = tokenizer.mask_token_id
        self.mask = tokenizer.mask_token
        
    def tokenize(self, sentence):
        tokenized_input = tokenizer(sentence, return_tensors='pt')
        return tokenized_input
    
    def forward(self, tokens):
        output = self.model(**tokens)
        return output
    
    def logits(self, output, mask_location):
        logits = F.softmax(output.logits[:,mask_location,:], dim=1).squeeze()
        return logits
    
    def token_prob(self, logits, token):
        return logits[tokenizer.convert_tokens_to_ids(token)].item()

In [11]:
model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
bert = Bert("bert", model, tokenizer)
mask_id = bert.tokenizer.mask_token_id

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# Given a sentence, this function masks a specified token and returns the
# probability assigned to a token for that position

def test_sentence(sentence, word_to_mask, word_to_test):
    tokenized = tokenize_and_mask(sentence, word_to_mask)
    try:
        mask_location = list(tokenized['input_ids'][0]).index(mask_id)
    except:
        return None
    return bert.token_prob(bert.logits(bert.forward(tokenized), mask_location), word_to_test)



In [39]:
test_sentence('Non è necessario che [MASK] così.', 'sia', 'è')

0.20595484972000122

In [40]:
test_sentence('Non è necessario che [MASK] così.', 'sia', 'sia')

0.072059266269207

In [43]:
test_sentence('Non credo che veramente tedesco.', 'sia', 'sia')

In [26]:
test_sentence('Non credo che sia veramente tedesco.', 'sia', 'è')

0.7472403049468994

In [27]:
a = bert.forward(bert.tokenize('Non credo che [MASK] veramente tedesco.'))
b = bert.logits(a, 5)
c = bert.token_prob(b, 'sia')


In [28]:
bert.token_prob(b, 'sia')

0.019019730389118195

In [29]:
bert.token_prob(b, 'è')

0.7472403049468994

In [30]:
test_sentence('Non credo che sia veramente tedesco.', 'sia', 'sia')

0.019019730389118195

## Experiment 1: Quantitative Analysis

Comparison of accuracy for word pairs of contrasting grammatical mood

In [68]:
import random

def replace_compare(corpus, n, word_1, word_2):

    corp_1 = find_instances(corpus, word_1)
    corp_1 = random.sample(corp_1, n)
    corp_1_n_correct = 0
    
    odds = 0
    odds_correct = 0
    odds_false = 0
    
    congruent_probs = 0
    incongruent_probs = 0
    
    correct_prob_congruent = 0
    correct_prob_incongruent = 0
    false_prob_congruent = 0
    false_prob_incongruent = 0
    
    samples = []

    for sentence in corp_1:
        correct_prob = test_sentence(sentence, word_1, word_1)
        false_prob = test_sentence(sentence, word_1, word_2)
        if correct_prob == None:
            print(sentence)
            continue
        if false_prob == None:
            print(sentence)
            continue

        congruent_probs += correct_prob
        incongruent_probs += false_prob
        
        odd = correct_prob / false_prob
        odds += odd

        if correct_prob >= false_prob:
            corp_1_n_correct += 1
            odds_correct += odd
            correct_prob_congruent += correct_prob
            false_prob_congruent += false_prob
        else:
            samples.append(sentence)
            odds_false += odd
            correct_prob_incongruent += correct_prob
            false_prob_incongruent += false_prob
            
    corp_1_n_false = len(corp_1)-corp_1_n_correct
    if corp_1_n_false == 0:
        corp_1_n_false = 0.0000001
        
    print(f"### {word_1} was replaced ###")       
    print(f'Accuracy when replacing "{word_1}": {100*corp_1_n_correct/len(corp_1)}%')
    print(f'The average odds is: {odds/len(corp_1)}\n')
    
    print(f'The average odds when predicting correctly is: {odds_correct/corp_1_n_correct}')
    print(f'The average odds when predicting falsely is: {odds_false/corp_1_n_false}\n')
    
    print(f'The average probability for {word_1} was: {congruent_probs/n}')
    print(f'The average probability for {word_2} was: {incongruent_probs/n}\n')
    
    print(f'The average probability for {word_1} when predicting correctly is: {correct_prob_congruent/corp_1_n_correct}')
    print(f'The average probability for {word_2} when predicting correctly is: {false_prob_congruent/corp_1_n_correct}')
    print(f'The average probability for {word_1} when predicting falsely is: {correct_prob_incongruent/corp_1_n_false}')
    print(f'The average probability for {word_2} when predicting falsely is: {false_prob_incongruent/corp_1_n_false}\n\n')

    if len(samples) > 5:
        return samples[:5]
    return samples
    
    

def experiment_1(corpus, word_1, word_2, n):
    samples = []
    samples.append(replace_compare(corpus, n, word_1, word_2))
    samples.append(replace_compare(corpus, n, word_2, word_1))
    print(samples)
    

In [127]:
experiment_1(corpus, 'sia', 'è', 500)

### sia was replaced ###
Accuracy when replacing "sia": 35.0%
The average odds is: 2065.6235220069034

The average odds when predicting correctly is: 5901.331832324423
The average odds when predicting falsely is: 0.24212414362338394

The average probability for sia was: 0.22937348860322762
The average probability for è was: 0.2905929743952131

The average probability for sia when predicting correctly is: 0.5176749309159331
The average probability for è when predicting correctly is: 0.08616592725202987
The average probability for sia when predicting falsely is: 0.07413425043484774
The average probability for è when predicting falsely is: 0.40066907670308094


### è was replaced ###
Accuracy when replacing "è": 98.2%
The average odds is: 3872.287146190136

The average odds when predicting correctly is: 3943.2568439179477
The average odds when predicting falsely is: 0.49585903949949794

The average probability for è was: 0.6171230605177229
The average probability for sia was: 0.0178996978

In [82]:
experiment_1(corpus, 'sono', 'siano', 500)

### sono was replaced ###
Accuracy when replacing "sono": 99.8%
The average odds is: 902.499674573009

The average odds when predicting correctly is: 904.306779757173
The average odds when predicting falsely is: 0.7541876751871859

The average probability for sono was: 0.4724690455650628
The average probability for siano was: 0.008801220344894339

The average probability for sono when predicting correctly is: 0.4728989485126519
The average probability for siano when predicting correctly is: 0.008133446691860825
The average probability for sono when predicting falsely is: 0.25794747471809387
The average probability for siano when predicting falsely is: 0.34202027320861816


### siano was replaced ###
Accuracy when replacing "siano": 27.2%
The average odds is: 6.943140354481208

The average odds when predicting correctly is: 24.96537005337328
The average odds when predicting falsely is: 0.20956002742263588

The average probability for siano was: 0.21949496043055083
The average probabilit

In [83]:
experiment_1(corpus, 'erano', 'fossero', 500)

### erano was replaced ###
Accuracy when replacing "erano": 96.8%
The average odds is: 4789.918643201053

The average odds when predicting correctly is: 4948.241735659056
The average odds when predicting falsely is: 0.6450963464581804

The average probability for erano was: 0.21596862357176289
The average probability for fossero was: 0.006847677124260147

The average probability for erano when predicting correctly is: 0.22195900099920945
The average probability for fossero when predicting correctly is: 0.00524275290184653
The average probability for erano when predicting falsely is: 0.03475970639150461
The average probability for fossero when predicting falsely is: 0.055396634852272086


### fossero was replaced ###
Accuracy when replacing "fossero": 52.4%
The average odds is: 9.671214116278168

The average odds when predicting correctly is: 18.152567811046676
The average odds when predicting falsely is: 0.33459786405400954

The average probability for fossero was: 0.07874987738008259


In [198]:
for item in list(topk_indices):
    print(tokenizer.convert_ids_to_tokens(item.item()), logits[item.item()].item())

Non 0.5619245767593384
non 0.2179812639951706
è 0.017772043123841286
no 0.009306162595748901
No 0.008768565952777863


In [109]:
import random

def replace_compare_2(corpus, n, word_1, word_2, x):

    corp_1 = find_instances(corpus, word_1)
    corp_1 = random.sample(corp_1, n)
    corp_1_n_correct = 0
    
    samples = []

    for sentence in corp_1:
        correct_prob = test_sentence(sentence, word_1, word_1)
        false_prob = test_sentence(sentence, word_1, word_2)
        if correct_prob == None:
            print(sentence)
            continue
        if false_prob == None:
            print(sentence)
            continue


        if x * correct_prob >= false_prob:
            corp_1_n_correct += 1
            
    corp_1_n_false = len(corp_1)-corp_1_n_correct
    if corp_1_n_false == 0:
        corp_1_n_false = 0.0000001
     
    print(f'Accuracy when replacing "{word_1}": {100*corp_1_n_correct/len(corp_1)}%')
    
    if len(samples) > 5:
        return samples[:5]
    return samples
    
    

def experiment_2(corpus, word_1, word_2, n, x):
    samples = []
    samples.append(replace_compare_2(corpus, n, word_1, word_2, 1/x))
    samples.append(replace_compare_2(corpus, n, word_2, word_1, x))
    print(samples)

In [88]:
experiment_2(corpus, 'sono', 'siano', 10, 10)

### sono was replaced ###
Accuracy when replacing "sono": 100.0%
The average odds is: 1157.1675375276545

The average odds when predicting correctly is: 1157.1675375276545
The average odds when predicting falsely is: 0.0

The average probability for sono was: 0.4579587295591773
The average probability for siano was: 0.00396752415610365

The average probability for sono when predicting correctly is: 0.4579587295591773
The average probability for siano when predicting correctly is: 0.00396752415610365
The average probability for sono when predicting falsely is: 0.0
The average probability for siano when predicting falsely is: 0.0


### siano was replaced ###
Accuracy when replacing "siano": 95.0%
The average odds is: 2.3453953988131233

The average odds when predicting correctly is: 2.4685262204107725
The average odds when predicting falsely is: 0.005909788457789384

The average probability for siano was: 0.188029572701865
The average probability for sono was: 0.4312951381831078

The ave

In [89]:
experiment_2(corpus, 'erano', 'fossero', 100)

### erano was replaced ###
Accuracy when replacing "erano": 100.0%
The average odds is: 3657.332465502502

The average odds when predicting correctly is: 3657.332465502502
The average odds when predicting falsely is: 0.0

The average probability for erano was: 0.1887540190281468
The average probability for fossero was: 0.007481184508735399

The average probability for erano when predicting correctly is: 0.1887540190281468
The average probability for fossero when predicting correctly is: 0.007481184508735399
The average probability for erano when predicting falsely is: 0.0
The average probability for fossero when predicting falsely is: 0.0


### fossero was replaced ###
Accuracy when replacing "fossero": 99.0%
The average odds is: 6.815849145987573

The average odds when predicting correctly is: 6.884647669100133
The average odds when predicting falsely is: 0.00479535784401859

The average probability for fossero was: 0.08284206437712918
The average probability for erano was: 0.05442053

In [93]:
experiment_2(corpus, 'sono', 'siano', 10, 500)

### sono was replaced ###
Accuracy when replacing "sono": 100.0%
The average odds is: 444.5960143193952

The average odds when predicting correctly is: 444.5960143193952
The average odds when predicting falsely is: 0.0

The average probability for sono was: 0.44315625164308586
The average probability for siano was: 0.005199900197476381

The average probability for sono when predicting correctly is: 0.44315625164308586
The average probability for siano when predicting correctly is: 0.005199900197476381
The average probability for sono when predicting falsely is: 0.0
The average probability for siano when predicting falsely is: 0.0


### siano was replaced ###
Accuracy when replacing "siano": 100.0%
The average odds is: 11.861151553190883

The average odds when predicting correctly is: 11.861151553190883
The average odds when predicting falsely is: 0.0

The average probability for siano was: 0.1770077091103303
The average probability for sono was: 0.31462434413842855

The average probabi

In [99]:
experiment_2(corpus, 'sono', 'siano', 50, 10)

### sono was replaced ###
Accuracy when replacing "sono": 100.0%
The average odds is: 1070.6849533909729

The average odds when predicting correctly is: 1070.6849533909729
The average odds when predicting falsely is: 0.0

The average probability for sono was: 0.5017051460250513
The average probability for siano was: 0.019917086907566953

The average probability for sono when predicting correctly is: 0.5017051460250513
The average probability for siano when predicting correctly is: 0.019917086907566953
The average probability for sono when predicting falsely is: 0.0
The average probability for siano when predicting falsely is: 0.0


### siano was replaced ###
Accuracy when replacing "siano": 74.0%
The average odds is: 8.467180789604717

The average odds when predicting correctly is: 11.426849577503251
The average odds when predicting falsely is: 0.04350808558580299

The average probability for siano was: 0.27010721675604654
The average probability for sono was: 0.3316563809957975

The a

In [100]:
experiment_2(corpus, 'sono', 'siano', 50, 0.1)

### sono was replaced ###
Accuracy when replacing "sono": 92.0%
The average odds is: 323.50092208056054

The average odds when predicting correctly is: 351.41105461265704
The average odds when predicting falsely is: 2.534397961450886

The average probability for sono was: 0.31675642627999523
The average probability for siano was: 0.014397617363618167

The average probability for sono when predicting correctly is: 0.33506020045024343
The average probability for siano when predicting correctly is: 0.003277118584366156
The average probability for sono when predicting falsely is: 0.10626302332214088
The average probability for siano when predicting falsely is: 0.1422833533250163


### siano was replaced ###
Accuracy when replacing "siano": 10.0%
The average odds is: 5.8085528994795705

The average odds when predicting correctly is: 50.31306123330175
The average odds when predicting falsely is: 0.8636075290548848

The average probability for siano was: 0.21563974740267855
The average probab

In [106]:
experiment_2(corpus, 'sono', 'siano', 50, 1/15)

### sono was replaced ###
Accuracy when replacing "sono": 90.0%
The average odds is: 900.9235939409925

The average odds when predicting correctly is: 999.9841444973056
The average odds when predicting falsely is: 9.378638934175594

The average probability for sono was: 0.37969800011720506
The average probability for siano was: 0.007068398124346799

The average probability for sono when predicting correctly is: 0.3900635900338077
The average probability for siano when predicting correctly is: 0.00382853117750162
The average probability for sono when predicting falsely is: 0.2864076908677816
The average probability for siano when predicting falsely is: 0.03622720064595342


### siano was replaced ###
Accuracy when replacing "siano": 12.0%
The average odds is: 12.069433474466841

The average odds when predicting correctly is: 89.09440286585601
The average odds when predicting falsely is: 1.5660285574592274

The average probability for siano was: 0.27474085523618635
The average probabilit

In [126]:
experiment_2(corpus, 'sono', 'siano', 50, 15)

Accuracy when replacing "sono": 84.0%
Accuracy when replacing "siano": 70.0%
[[], []]


In [134]:
for i in [1, 2, 3, 4, 5]:
    experiment_2(corpus, 'è', 'sia', 50, i)

Accuracy when replacing "è": 98.0%
Accuracy when replacing "sia": 36.0%
[[], []]
Accuracy when replacing "è": 94.0%
Accuracy when replacing "sia": 42.0%
[[], []]
Accuracy when replacing "è": 94.0%
Accuracy when replacing "sia": 58.0%
[[], []]
Accuracy when replacing "è": 88.0%
Accuracy when replacing "sia": 60.0%
[[], []]
Accuracy when replacing "è": 92.0%
Accuracy when replacing "sia": 60.0%
[[], []]


In [136]:
for i in [5,7, 10,12, 15]:
    experiment_2(corpus, 'sono', 'siano', 100, i)

Accuracy when replacing "sono": 100.0%
Accuracy when replacing "siano": 47.0%
[[], []]
Accuracy when replacing "sono": 95.0%
Accuracy when replacing "siano": 52.0%
[[], []]
Accuracy when replacing "sono": 94.0%
Accuracy when replacing "siano": 70.0%
[[], []]
Accuracy when replacing "sono": 88.0%
Accuracy when replacing "siano": 69.0%
[[], []]
Accuracy when replacing "sono": 93.0%
Accuracy when replacing "siano": 75.0%
[[], []]


In [114]:
for i in [1,1.5,2,2.5,3,3.5]:
    experiment_2(corpus, 'erano', 'fossero', 300, i)

Accuracy when replacing "erano": 97.0%
Accuracy when replacing "fossero": 51.666666666666664%
[[], []]
Accuracy when replacing "erano": 93.33333333333333%
Accuracy when replacing "fossero": 63.333333333333336%
[[], []]
Accuracy when replacing "erano": 92.66666666666667%
Accuracy when replacing "fossero": 67.33333333333333%
[[], []]
Accuracy when replacing "erano": 91.0%
Accuracy when replacing "fossero": 72.66666666666667%
[[], []]
Accuracy when replacing "erano": 92.0%
Accuracy when replacing "fossero": 73.0%
[[], []]
Accuracy when replacing "erano": 88.66666666666667%
Accuracy when replacing "fossero": 77.66666666666667%
[[], []]


In [135]:
experiment_2(corpus, 'è', 'sia', 50, 2.5)
experiment_2(corpus, 'sono', 'siano', 50, 2.5)
experiment_2(corpus, 'erano', 'fossero', 50, 2.5)

Accuracy when replacing "è": 96.0%
Accuracy when replacing "sia": 42.0%
[[], []]
Accuracy when replacing "sono": 96.0%
Accuracy when replacing "siano": 48.0%
[[], []]
Accuracy when replacing "erano": 90.0%
Accuracy when replacing "fossero": 68.0%
[[], []]


In [137]:
experiment_2(corpus, 'è', 'sia', 150, 2.5)
experiment_2(corpus, 'sono', 'siano', 150, 2.5)
experiment_2(corpus, 'erano', 'fossero', 150, 2.5)

Accuracy when replacing "è": 96.0%
Accuracy when replacing "sia": 40.666666666666664%
[[], []]
Accuracy when replacing "sono": 97.33333333333333%
Accuracy when replacing "siano": 41.333333333333336%
[[], []]
Accuracy when replacing "erano": 92.0%
Accuracy when replacing "fossero": 68.66666666666667%
[[], []]
