In [None]:
!pip3 install conllu
!pip3 install transformers
!pip3 install torch
!pip3 install datasets

## Imports

In [5]:
import copy

import pandas as pd
from datasets import load_dataset
from transformers import pipeline, BertTokenizer, BertConfig

from modules.probe_bert import BertForMaskedLMProbing

In [6]:
# Checkpoint name shared by the tokenizer, the base config and the probing model.
MODEL_NAME = 'bert-base-uncased'

## Model Inference

In [None]:
# Tokenizer matching MODEL_NAME; handed to the fill-mask pipeline in predictProbe().
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Base configuration for MODEL_NAME; predictProbe() uses it as the starting
# point for the settings of each probing model it builds.
config = BertConfig.from_pretrained(MODEL_NAME)

In [8]:
# Example sentence for subject-verb number agreement; the [MASK] slot is where
# the candidate verbs ('is' / 'are') are scored.
sentence = 'The author that the teachers admire [MASK] happy'

In [9]:
def predictProbe(
    sentence,
    token_idx,
    alpha,
    probe_layers,
    k,
    topK
):
    """Run fill-mask prediction with a probing-configured BERT model.

    Builds a fresh ``BertForMaskedLMProbing`` from ``MODEL_NAME`` whose config
    carries the probe settings, wraps it in a ``fill-mask`` pipeline together
    with the notebook-level ``tokenizer`` and returns the predictions made
    for ``sentence``.

    Args:
        sentence: Text containing the mask token (e.g. ``[MASK]``).
        token_idx: Stored in the model config as ``TOKEN_IDX``. Presumably the
            position of the token the probe acts on -- TODO confirm against
            ``modules.probe_bert`` (including whether special tokens count).
        alpha: Stored in the model config as ``ALPHA``.
        probe_layers: Stored in the model config as ``PROBE_LAYERS``.
        k: Stored in the model config as ``K``.
        topK: Number of candidates requested from the pipeline (``top_k``).

    Returns:
        list: The pipeline's predictions for ``sentence``.
    """
    # Work on a private copy so that the probe settings of one call never
    # leak into the shared notebook-level `config` object.
    probe_config = copy.deepcopy(config)
    probe_config.update({
        'TOKEN_IDX': token_idx,
        'ALPHA': alpha,
        'PROBE_LAYERS': probe_layers,
        'K': k,
    })

    # The model is rebuilt on every call because the probe settings are passed
    # to it through the config given at load time.
    probe_model = BertForMaskedLMProbing.from_pretrained(
        MODEL_NAME, config=probe_config
    )

    unmasker = pipeline('fill-mask',
                        model=probe_model,
                        tokenizer=tokenizer,
                        top_k=topK)

    return list(unmasker(sentence))

In [12]:
def getNumberProbs(masks,
                   candidates=['is', 'are']):
    """Keep only the predictions whose token is one of ``candidates``.

    Args:
        masks: Iterable of prediction records (dicts) that each carry a
            ``'token_str'`` key.
        candidates: Token strings to keep. The list default is only read,
            never mutated.

    Returns:
        pandas.DataFrame: One row per retained record, in input order. When
        nothing matches, the frame is empty and has no columns.
    """
    selected = [entry for entry in masks if entry['token_str'] in candidates]
    return pd.DataFrame.from_records(selected)

# Score the singular and plural verb forms at the mask position.
number_probs = getNumberProbs(
    predictProbe(
        sentence='The author that the teachers admire [MASK] happy',
        token_idx=1,
        alpha=5,
        probe_layers=[0],
        k=(0, 50),
        topK=200
    ),
    candidates=['is', 'are'])

# Test for "no candidate among the top-k predictions" explicitly. An empty
# frame has no 'score' column, so sorting it would raise KeyError; catching
# that exception would also hide unrelated KeyErrors raised inside the model
# or the pipeline.
if number_probs.empty:
    print('No Results have found')
else:
    res = number_probs.sort_values(by='score', ascending=False).to_string()
    print(res)

  lambda_i = torch.tensor([sum(h_i.T * h_i[ix]) for ix in range(h_i.size(0))])


      score  token token_str                                       sequence
0  0.004827   2024       are  the author that the teachers admire are happy
1  0.002319   2003        is   the author that the teachers admire is happy


In [87]:
# Concord example: the masked determiner ('this' / 'these') precedes 'men'.
sent = '[MASK] men are good'
# Print every token with its position, presumably to pick token_idx for predictProbe().
# NOTE(review): tokenize() does not add [CLS]/[SEP]; confirm that the probe's
# TOKEN_IDX uses the same indexing.
print(list(enumerate(tokenizer.tokenize(sent))))

[(0, '[MASK]'), (1, 'men'), (2, 'are'), (3, 'good')]


In [88]:
# Collect the fill-mask candidates for `sent` and rank them by score.
predictions = predictProbe(
    sentence=sent,
    token_idx=0,
    alpha=-5,
    probe_layers=[0],
    k=(0, 50),
    topK=1500,
)
res = pd.DataFrame.from_records(predictions).sort_values(by='score', ascending=False)

# Show only the two demonstratives being compared.
mask = res['token_str'].isin(['these', 'this'])
print(res[mask])

        score  token token_str            sequence
115  0.000550   2023      this   this men are good
531  0.000178   2122     these  these men are good


## Dataset

1. Number Agreement (Subject - Verb)
    1. Replication of English number
    1. Binding
        - They saw itself / themselves
        - I saw myself / ourselves
    1. Concord
        - These men / This men
        - These men / these man
        - Five kids / five kid
    1. Polite, honorific plurality, assoc (For Turkish)

2. Replicate Number for Turkish
    - Geldi - geldiler		(pure number)
    - geldim - geldim-ler - geldik	()

3. Person
    - Ben, sen, o, biz, siz, onlar


In [18]:
# Treebank identifiers of interest, grouped by language prefix; presumably
# configuration names of the "universal_dependencies" dataset -- TODO confirm.
dataset_ids = [
    # English
    'en_esl', 'en_ewt', 'en_gum', 'en_gumreddit',
    'en_lines', 'en_partut', 'en_pronouns', 'en_pud',
    # Turkish
    'tr_boun', 'tr_gb', 'tr_imst', 'tr_pud',
]

In [38]:
# Load the "en_pud" configuration of the "universal_dependencies" dataset.
dataset = load_dataset("universal_dependencies", "en_pud")

In [89]:
# Turn the 'test' split into a DataFrame with one row per record.
data = pd.DataFrame.from_records(dataset['test'])

In [91]:
# Select the rows whose token list contains the exact token 'is'.
# This displays the whole filtered frame; append .head() to keep the output short.
data[data.tokens.apply(lambda x: 'is' in x)]

Unnamed: 0,idx,text,tokens,lemmas,upos,xpos,feats,head,deprel,deps,misc
0,n01001011,“While much of the digital transition is unpre...,"[“, While, much, of, the, digital, transition,...","[“, while, much, of, the, digital, transition,...","[1, 5, 6, 2, 8, 6, 0, 17, 6, 2, 8, 10, 10, 1, ...","[``, IN, JJ, IN, DT, JJ, NN, VBZ, JJ, IN, DT, ...","[None, None, {'Degree': 'Pos'}, None, {'Defini...","[20, 9, 9, 7, 7, 7, 3, 9, 20, 13, 13, 13, 9, 2...","[punct, mark, nsubj, case, det, amod, nmod, co...","[[('punct', 20)], [('mark', 9)], [('nsubj', 9)...","[{'SpaceAfter': 'No'}, None, None, None, None,..."
4,n01002042,The new spending is fueled by Clinton’s large ...,"[The, new, spending, is, fueled, by, Clinton, ...","[the, new, spending, be, fuel, by, Clinton, ’s...","[8, 6, 0, 17, 16, 2, 10, 7, 6, 0, 0, 1]","[DT, JJ, NN, VBZ, VBN, IN, NNP, POS, JJ, NN, N...","[{'Definite': 'Def', 'PronType': 'Art'}, {'Deg...","[3, 3, 5, 5, 0, 11, 11, 7, 11, 11, 5, 5]","[det, amod, nsubj:pass, aux:pass, root, case, ...","[[('det', 3)], [('amod', 3)], [('nsubj:pass', ...","[None, None, None, None, None, None, {'SpaceAf..."
18,n01009027,"Today, Khanzir may be a lonely pig, but he isn...","[Today, ,, Khanzir, may, be, a, lonely, pig, ,...","[today, ,, Khanzir, may, be, a, lonely, pig, ,...","[0, 1, 10, 17, 17, 8, 6, 0, 1, 9, 11, 17, 14, ...","[NN, ,, NNP, MD, VB, DT, JJ, NN, ,, CC, PRP, V...","[{'Number': 'Sing'}, None, {'Number': 'Sing'},...","[8, 8, 8, 8, 8, 8, 8, 0, 8, 15, 15, 15, 15, 15...","[obl:tmod, punct, nsubj, aux, cop, det, amod, ...","[[('obl:tmod', 8)], [('punct', 8)], [('nsubj',...","[{'SpaceAfter': 'No'}, None, None, None, None,..."
28,n01015033,It's most obvious when a celebrity's name is i...,"[It, 's, most, obvious, when, a, celebrity, 's...","[it, be, most, obvious, when, a, celebrity, 's...","[11, 17, 14, 6, 14, 8, 0, 7, 0, 17, 14, 14, 6, 1]","[PRP, VBZ, RBS, JJ, WRB, DT, NN, POS, NN, VBZ,...","[{'Case': 'Nom', 'Gender': 'Neut', 'Number': '...","[4, 4, 4, 0, 13, 7, 9, 7, 13, 13, 13, 13, 4, 4]","[nsubj, cop, advmod, root, advmod, det, nmod:p...","[[('nsubj', 4)], [('cop', 4)], [('advmod', 4)]...","[{'SpaceAfter': 'No'}, None, None, None, None,..."
31,n01016019,Several analysts have suggested Huawei is best...,"[Several, analysts, have, suggested, Huawei, i...","[several, analyst, have, suggest, Huawei, be, ...","[6, 0, 17, 16, 10, 17, 14, 16, 7, 16, 2, 10, 7...","[JJ, NNS, VBP, VBN, NNP, VBZ, RBS, VBN, TO, VB...","[{'Degree': 'Pos'}, {'Number': 'Plur'}, {'Mood...","[2, 4, 4, 0, 8, 8, 8, 4, 10, 8, 14, 14, 12, 10...","[amod, nsubj, aux, root, nsubj:pass, aux:pass,...","[[('amod', 2)], [('nsubj', 4)], [('aux', 4)], ...","[None, None, None, None, None, None, None, Non..."
...,...,...,...,...,...,...,...,...,...,...,...
958,w04004005,The inner part of the region is flat and hilly...,"[The, inner, part, of, the, region, is, flat, ...","[the, inner, part, of, the, region, be, flat, ...","[8, 6, 0, 2, 8, 0, 17, 6, 9, 6, 1, 2, 8, 6, 0,...","[DT, JJ, NN, IN, DT, NN, VBZ, JJ, CC, JJ, ,, I...","[{'Definite': 'Def', 'PronType': 'Art'}, {'Deg...","[3, 3, 8, 6, 6, 3, 8, 0, 10, 8, 8, 15, 15, 15,...","[det, amod, nsubj, case, det, nmod, cop, root,...","[[('det', 3)], [('amod', 3)], [('nsubj', 8), (...","[None, None, None, None, None, None, None, Non..."
969,w04009042,The man told him that a war between the two un...,"[The, man, told, him, that, a, war, between, t...","[the, man, tell, he, that, a, war, between, th...","[8, 0, 16, 11, 5, 8, 0, 2, 8, 3, 0, 17, 16, 1,...","[DT, NN, VBD, PRP, IN, DT, NN, IN, DT, CD, NNS...","[{'Definite': 'Def', 'PronType': 'Art'}, {'Num...","[2, 3, 0, 3, 13, 7, 13, 11, 11, 11, 7, 13, 3, ...","[det, nsubj, root, iobj, mark, det, nsubj, cas...","[[('det', 2)], [('nsubj', 3)], [('root', 0)], ...","[None, None, None, None, None, None, None, Non..."
974,w04010031,"During their various collaborations, it is not...","[During, their, various, collaborations, ,, it...","[during, they, various, collaboration, ,, it, ...","[2, 11, 6, 0, 1, 11, 17, 7, 6, 11, 2, 8, 3, 16...","[IN, PRP$, JJ, NNS, ,, PRP, VBZ, RB, JJ, WP, I...","[None, {'Number': 'Plur', 'Person': '3', 'Poss...","[4, 4, 4, 9, 9, 9, 9, 9, 0, 14, 13, 13, 10, 9,...","[case, nmod:poss, amod, obl, punct, expl, cop,...","[[('case', 4)], [('nmod:poss', 4)], [('amod', ...","[None, None, None, {'SpaceAfter': 'No'}, None,..."
983,w05004031,It is the portion from this second boundary up...,"[It, is, the, portion, from, this, second, bou...","[it, be, the, portion, from, this, second, bou...","[11, 17, 8, 0, 2, 8, 6, 0, 2, 2, 8, 6, 0, 1, 8...","[PRP, VBZ, DT, NN, IN, DT, JJ, NN, RB, IN, DT,...","[{'Case': 'Nom', 'Gender': 'Neut', 'Number': '...","[4, 4, 4, 0, 8, 8, 8, 4, 13, 9, 13, 13, 4, 17,...","[nsubj, cop, det, root, case, det, amod, nmod,...","[[('nsubj', 4)], [('cop', 4)], [('det', 4)], [...","[None, None, None, None, None, None, None, Non..."
