In [None]:
!pip3 install conllu
!pip3 install transformers
!pip3 install torch
!pip3 install datasets

## Imports

In [5]:
import copy

import pandas as pd
from datasets import load_dataset
from transformers import pipeline, BertTokenizer, BertConfig

from probe_bert import BertForMaskedLMProbing

In [6]:
# Pretrained checkpoint name shared by the tokenizer, the config, and the probe model.
MODEL_NAME = 'bert-base-uncased'

## Model Inference

In [7]:
# Load the pretrained tokenizer and the base model configuration for MODEL_NAME.
# `config` is the base configuration that predictProbe() extends with probe settings.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Example sentence for the probe; [MASK] is the slot the fill-mask pipeline predicts.
sentence = 'The author that the teachers admire [MASK] happy'

In [9]:
def predictProbe(
    sentence,
    token_idx,
    alpha,
    probe_layers,
    k,
    topK
):
    """Run the fill-mask pipeline on ``sentence`` with a freshly built probe model.

    The probe settings are written onto a private copy of the module-level
    ``config`` under the keys 'TOKEN_IDX', 'ALPHA', 'PROBE_LAYERS' and 'K';
    that copy is passed to ``BertForMaskedLMProbing.from_pretrained``. How the
    model interprets those values is defined in ``probe_bert``, not here.

    Args:
        sentence: Text handed to the fill-mask pipeline.
        token_idx: Value stored under 'TOKEN_IDX'.
        alpha: Value stored under 'ALPHA'.
        probe_layers: Value stored under 'PROBE_LAYERS'.
        k: Value stored under 'K'.
        topK: Forwarded to the pipeline as ``top_k``.

    Returns:
        list: The pipeline's output for ``sentence``, materialised as a list.
    """
    # Work on a copy so that probe settings from one call never leak into the
    # shared module-level config (and from there into later calls or cells).
    probe_config = copy.deepcopy(config)
    probe_config.update({
        'TOKEN_IDX': token_idx,
        'ALPHA': alpha,
        'PROBE_LAYERS': probe_layers,
        'K': k,
    })

    # The model is rebuilt on every call because the probe settings live on
    # the config it is constructed from.
    probe_model = BertForMaskedLMProbing.from_pretrained(
        MODEL_NAME, config=probe_config
    )

    unmasker = pipeline('fill-mask',
                        model=probe_model,
                        tokenizer=tokenizer,
                        top_k=topK)

    return list(unmasker(sentence))

In [12]:
def getNumberProbs(masks,
                   candidates=('is', 'are')):
    """Keep only the prediction records whose token is one of ``candidates``.

    Args:
        masks: Iterable of dict-like prediction records, each with a
            'token_str' key.
        candidates: Container of token strings to keep. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        pandas.DataFrame: One row per kept record, in input order. When
        nothing matches, the frame is empty and has no columns.
    """
    kept = [record for record in masks if record['token_str'] in candidates]
    return pd.DataFrame.from_records(kept)

# Probe the example sentence and keep only the two candidate verb forms.
number_probs = getNumberProbs(
    predictProbe(
        sentence=sentence,
        token_idx=1,
        alpha=5,
        probe_layers=[0],
        k=(0, 50),
        topK=200
    ),
    candidates=['is', 'are'])

# Test for "no matches" explicitly rather than catching the KeyError that
# sorting an empty, column-less frame raises: a blanket `except KeyError`
# would also hide genuine KeyErrors raised inside the model or pipeline code.
if number_probs.empty:
    print('No Results have found')
else:
    res = number_probs.sort_values(by='score', ascending=False).to_string()
    print(res)

  lambda_i = torch.tensor([sum(h_i.T * h_i[ix]) for ix in range(h_i.size(0))])


      score  token token_str                                       sequence
0  0.004827   2024       are  the author that the teachers admire are happy
1  0.002319   2003        is   the author that the teachers admire is happy


## Dataset

1. Number Agreement (Subject - Verb)
    1. Replication of English number
    1. Binding
        - They saw itself / themselves
        - I saw myself / ourselves
    1. Concord
        - These men / This men
        - These men / these man
        - Five kids / five kid
    1. Polite/honorific plurality and associative plurals (for Turkish)

2. Replicate Number for Turkish
    - Geldi - geldiler		(pure number)
    - geldim - geldim-ler - geldik	()

3. Person
    - Ben, sen, o, biz, siz, onlar


In [18]:
# Dataset configuration names of interest, grouped by language prefix.
# 'en_pud' is the configuration passed to load_dataset() further down.
dataset_ids = [
    # en_*
    'en_esl', 'en_ewt', 'en_gum', 'en_gumreddit',
    'en_lines', 'en_partut', 'en_pronouns', 'en_pud',
] + [
    # tr_*
    'tr_boun', 'tr_gb', 'tr_imst', 'tr_pud',
]

In [38]:
# Load the "en_pud" configuration of the "universal_dependencies" dataset.
dataset = load_dataset("universal_dependencies", "en_pud")

In [42]:
# Show the 'test' split as a DataFrame; as the cell's last expression it is rendered as the output.
pd.DataFrame.from_records(dataset['test'])

Unnamed: 0,idx,text,tokens,lemmas,upos,xpos,feats,head,deprel,deps,misc
0,n01001011,“While much of the digital transition is unpre...,"[“, While, much, of, the, digital, transition,...","[“, while, much, of, the, digital, transition,...","[1, 5, 6, 2, 8, 6, 0, 17, 6, 2, 8, 10, 10, 1, ...","[``, IN, JJ, IN, DT, JJ, NN, VBZ, JJ, IN, DT, ...","[None, None, {'Degree': 'Pos'}, None, {'Defini...","[20, 9, 9, 7, 7, 7, 3, 9, 20, 13, 13, 13, 9, 2...","[punct, mark, nsubj, case, det, amod, nmod, co...","[[('punct', 20)], [('mark', 9)], [('nsubj', 9)...","[{'SpaceAfter': 'No'}, None, None, None, None,..."
1,n01001013,For those who follow social media transitions ...,"[For, those, who, follow, social, media, trans...","[for, those, who, follow, social, media, trans...","[2, 11, 11, 16, 6, 0, 0, 2, 10, 10, 1, 11, 17,...","[IN, DT, WP, VBP, JJ, NN, NNS, IN, NN, NN, ,, ...","[None, {'Number': 'Plur', 'PronType': 'Dem'}, ...","[2, 17, 4, 2, 6, 7, 4, 10, 10, 7, 17, 17, 17, ...","[case, obl, nsubj, acl:relcl, amod, compound, ...","[[('case', 2)], [('nsubj', 4), ('obl:for', 17)...","[None, None, None, None, None, None, None, Non..."
2,n01002017,But in a break from his past rhetoric about cu...,"[But, in, a, break, from, his, past, rhetoric,...","[but, in, a, break, from, he, past, rhetoric, ...","[9, 2, 8, 0, 2, 11, 6, 0, 2, 16, 0, 1, 8, 10, ...","[CC, IN, DT, NN, IN, PRP$, JJ, NN, IN, VBG, NN...","[None, None, {'Definite': 'Ind', 'PronType': '...","[16, 4, 4, 16, 8, 8, 8, 4, 10, 8, 10, 16, 15, ...","[cc, case, det, obl, case, nmod:poss, amod, nm...","[[('cc', 16)], [('case', 4)], [('det', 4)], [(...","[None, None, None, None, None, None, None, Non..."
3,n01002032,"“So I hate to put a little pressure on you, bu...","[“, So, I, hate, to, put, a, little, pressure,...","[“, so, I, hate, to, put, a, little, pressure,...","[1, 14, 11, 16, 7, 16, 8, 6, 0, 2, 11, 1, 9, 8...","[``, RB, PRP, VBP, TO, VB, DT, JJ, NN, IN, PRP...","[None, None, {'Case': 'Nom', 'Number': 'Sing',...","[4, 4, 4, 0, 6, 4, 9, 9, 6, 11, 6, 4, 19, 15, ...","[punct, advmod, nsubj, root, mark, xcomp, det,...","[[('punct', 4)], [('advmod', 4)], [('nsubj', 4...","[{'SpaceAfter': 'No'}, None, None, None, None,..."
4,n01002042,The new spending is fueled by Clinton’s large ...,"[The, new, spending, is, fueled, by, Clinton, ...","[the, new, spending, be, fuel, by, Clinton, ’s...","[8, 6, 0, 17, 16, 2, 10, 7, 6, 0, 0, 1]","[DT, JJ, NN, VBZ, VBN, IN, NNP, POS, JJ, NN, N...","[{'Definite': 'Def', 'PronType': 'Art'}, {'Deg...","[3, 3, 5, 5, 0, 11, 11, 7, 11, 11, 5, 5]","[det, amod, nsubj:pass, aux:pass, root, case, ...","[[('det', 3)], [('amod', 3)], [('nsubj:pass', ...","[None, None, None, None, None, None, {'SpaceAf..."
...,...,...,...,...,...,...,...,...,...,...,...
995,w05010023,Pompey took command of two legions in Capua an...,"[Pompey, took, command, of, two, legions, in, ...","[Pompey, take, command, of, two, legion, in, C...","[10, 16, 0, 2, 3, 0, 2, 10, 9, 16, 7, 16, 0, 1...","[NNP, VBD, NN, IN, CD, NNS, IN, NNP, CC, VBD, ...","[{'Number': 'Sing'}, {'Mood': 'Ind', 'Tense': ...","[2, 0, 2, 6, 6, 2, 8, 2, 10, 2, 12, 10, 12, 12...","[nsubj, root, obj, case, nummod, obl, case, ob...","[[('nsubj', 2), ('nsubj', 10), ('nsubj:xsubj',...","[None, None, None, None, None, None, None, Non..."
996,w05010024,César was personally informed of Pompey's acti...,"[César, was, personally, informed, of, Pompey,...","[César, be, personally, inform, of, Pompey, 's...","[10, 17, 14, 16, 2, 10, 7, 0, 2, 10, 1, 11, 2,...","[NNP, VBD, RB, VBN, IN, NNP, POS, NNS, IN, NNP...","[{'Number': 'Sing'}, {'Mood': 'Ind', 'Number':...","[4, 4, 4, 0, 8, 8, 6, 4, 10, 4, 10, 17, 15, 15...","[nsubj:pass, aux:pass, advmod, root, case, nmo...","[[('nsubj:pass', 4)], [('aux:pass', 4)], [('ad...","[None, None, None, None, None, {'SpaceAfter': ..."
997,w05010025,"Meanwhile, his place in tribune was occupied b...","[Meanwhile, ,, his, place, in, tribune, was, o...","[meanwhile, ,, he, place, in, tribune, be, occ...","[14, 1, 11, 0, 2, 0, 17, 16, 2, 10, 10, 1, 11,...","[RB, ,, PRP$, NN, IN, NN, VBD, VBN, IN, NNP, N...","[None, None, {'Gender': 'Masc', 'Number': 'Sin...","[8, 8, 4, 8, 6, 4, 8, 0, 10, 8, 10, 14, 14, 10...","[advmod, punct, nmod:poss, nsubj:pass, case, n...","[[('advmod', 8)], [('punct', 8)], [('nmod:poss...","[{'SpaceAfter': 'No'}, None, None, None, None,..."
998,w05010026,But when the Senate answered him definitively ...,"[But, when, the, Senate, answered, him, defini...","[but, when, the, senate, answer, he, definitiv...","[9, 14, 8, 10, 16, 11, 14, 5, 16, 11, 5, 16, 2...","[CC, WRB, DT, NN, VBD, PRP, RB, IN, VBG, PRP, ...","[None, {'PronType': 'Int'}, {'Definite': 'Def'...","[33, 5, 4, 5, 33, 5, 5, 9, 5, 9, 12, 9, 15, 15...","[cc, advmod, det, nsubj, advcl, obj, advmod, m...","[[('cc', 33)], [('advmod', 5)], [('det', 4)], ...","[None, None, None, None, None, None, None, Non..."
