In [5]:
%reset
### Basic logistical, wrangling, & visualization imports

import os
import glob
import torch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm.notebook import tqdm

### Model imports
from transformers import BertModel, BertConfig

In [6]:
### Define useful functions

def make_sentences_df(dct_noun,df_noun):
    """Creates sentence templates masking what should be a definite plural or singular article
    INPUTS: dct_noun: dictionary, noun:article pairs, the article is lowercase, singular, and gendered--this 
                      was hand-annotated from native Caribbean Spanish speaker intuition and checks in online
                      dictionary of the Real Academia Española; if there are any concerns about gendered article 
                      errors, check here first
            df_noun: pandas dataframe, cols: whole_word (plural), root (becomes our lemma), affix (s or es)--
                     this df is drawn from a noun subset of AnCora Tree Bank"""
    
    gather_df = []
    for noun in tqdm(dct_noun.keys()): 
        
        subdf = df_noun[df_noun['whole_word'] == noun]

        ### set up singular & plural template components
        sing_art = dct_noun[noun]
        sing_art = str(sing_art[0].upper() + sing_art[1])

        sing_subj = subdf['root'].values[0]

        plur_subj = noun

        ### set up target definite articles
        if sing_art == 'El':

            plur_art= 'Los'


        elif sing_art == 'La': 

            plur_art = 'Las'

        ### set up target singluar and plural verbs

        sing_verb = "es"

        plur_verb = "son"

        ### set up composite template components
        affix = subdf['affix'].values[0]
        comp_subj = sing_subj +'##'+ affix


        ### create all sentence templates

        sing_sentence_template = sing_art + ' ' + sing_subj + ' [MASK]'

        plur_sentence_template = plur_art + ' ' + plur_subj + ' [MASK]'

        comp_sentence_template = plur_art + ' ' + comp_subj + ' [MASK]'

        ### set up all the column/row entries you will store per lemma

        all_wordforms = [sing_subj,
                         plur_subj,
                         comp_subj]

        all_word_number = ['sing',
                           'plur',
                           'plur'] 

        all_n_tokens = [len(tokenizer.encode(sing_subj,add_special_tokens=False)),
                        len(tokenizer.encode(plur_subj,add_special_tokens=False)),
                        len(tokenizer.encode(sing_subj,add_special_tokens=False)) + len(tokenizer.encode(['##'+affix],add_special_tokens=False))                           ]

        all_sentence_templates = [sing_sentence_template,
                                  plur_sentence_template,
                                  comp_sentence_template]
            
        #this is a super important column! distinguishes the default tokenization from 
        #our artificially imposed tokenization scheme
        all_tokenization_types = ['default',
                                  'default',
                                  'artificial']

        d = {'lemma': np.repeat(sing_subj,len(all_sentence_templates)),
             'word_form': all_wordforms,
             'word_number': all_word_number,
             'n_tokens': all_n_tokens,
             'tokenization_type': all_tokenization_types,
             'sentence': all_sentence_templates,
             'target_ART_sing': sing_art, 
             'target_ART_plur': plur_art,
             'target_VERB_sing': sing_verb,
             'target_VERB_plur': plur_verb,
             'affix': np.repeat(affix,len(all_sentence_templates))
             }

        gather_df.append(pd.DataFrame(d))

    sentence_df = pd.concat(gather_df,ignore_index=True)
    return sentence_df


### Model Predictions 

def find_sublist_index(list, sublist):
    """Find the first occurence of sublist in list.
    Return the start and end indices of sublist in list.
    Used to find index of [MASK] in template sentences.

    h/t GPT-3-codex for writing this."""

    for i in range(len(list)):
        if list[i] == sublist[0] and list[i:i+len(sublist)] == sublist:
            return i, i+len(sublist)
    return None


def get_article_predictions(df,data_source): 
    """Predict the likelihood of target definite/indefinite and singular/plural articles
       Will assume you've already loaded and defined your tokenizer and model.
       Will iterate row by row in your dataframe `df`, containing cols for masked sentences,
       the corresponding lemma and plural forms being tested. When it comes across a row
       with column `tokenizer_type` label `artificial`, will shunt you to a process that creates
       the inputs to the model by hand (if there are any issues/concerns with how the artificial
       tokenization proceeds and leads to predictions, check the following `if` statement: 
       `if row['tokenization_type'] == 'artificial'`)
       
       INPUTS: df, pandas dataframe, cols for lemma, word_number (plural, singular), tokenization_type
                   (artificial/default), masked sentence, target singular and plural articles to 
                   get probabilities for filling in the [MASK], and others"""

    gather_df = []
    gather_debug = []
    for (_,row) in tqdm(df.iterrows(),total=df.shape[0]):

        if row['word_number'] == 'sing':
            target_article = row['target_ART_sing']
        else:
            target_article = row['target_ART_plur']


        #tokens for each article type 
        token_article= tokenizer.encode(target_article,
                                              add_special_tokens=False
                                         )

                                    
        #token for mask
        token_mask = tokenizer.encode('[MASK]',
                                      add_special_tokens=False
                                     )

        target_verb_sing = row['target_VERB_sing']
        target_verb_plur = row['target_VERB_plur']

        #tokens for each verb number 
        token_verb_sing = tokenizer.encode(target_verb_sing,
                                              add_special_tokens=False
                                         )
        token_verb_plur = tokenizer.encode(target_verb_plur,
                                              add_special_tokens=False
                                         )

        ### Set up your representation of the sentence for the 
        #.  model

        if row['tokenization_type'] == 'artificial': 

            if '##es' in row['word_form']:

                token_affix = tokenizer.convert_tokens_to_ids(["##es"])


            elif '##s' in row['word_form']: 

                token_affix = tokenizer.convert_tokens_to_ids(["##s"])

            #token for singular form
            lemma = row['lemma']
            token_lemma = tokenizer.encode(lemma,
                                           add_special_tokens=False
                                          )
            print(token_lemma)
            #TODO: Check for source and if it's non-morphemic, combine the tokens that the lemma gets broken into
 
            #token for special start
            start = '[CLS]'
            token_start = tokenizer.encode(start,
                                           add_special_tokens=False)
            

            #token for special end
            ending = '[SEP]'
            token_end = tokenizer.encode(ending,
                                         add_special_tokens=False)
            
            
            ### Collect your tokens into a list that you will then flatten
            #   prior to converting to tensor
            bulky_token_list = [token_start,
                                token_article,
                                token_lemma,
                                token_affix,
                                token_mask,
                                token_end
                                ]
            flat_token_list = [item for sublist in bulky_token_list for item in sublist]
            token_idx = torch.tensor([flat_token_list])
            

            inputs = {'input_ids': token_idx,
                      'token_type_ids': torch.zeros_like(token_idx),
                      'attention_mask': torch.ones_like(token_idx)
                     }

        elif row['tokenization_type']=='default': 

            inputs = tokenizer(row['sentence'],
                               return_tensors='pt',
                               add_special_tokens=True
                              )
            
        model_token_inputs = tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0])
        model_token_inputs = ' , '.join(model_token_inputs)

        ### Predict the item that should fill in the mask!

        outputs = model(**inputs)

        #find the index of the mask in sentence
        midx = find_sublist_index(inputs["input_ids"][0].tolist(),
                                  token_mask)
                                 
        masked_token_logits = outputs.logits[0][midx[0]]
        masked_token_probs = torch.softmax(masked_token_logits, dim=0)


        # prob_article_sing = masked_token_probs[token_article_sing].item()
        # prob_article_plur = masked_token_probs[token_article_plur].item()

        # prob_list = [prob_article_sing, prob_article_plur]
        # article_list = [target_article_sing, target_article_plur]
    

        prob_verb_sing = masked_token_probs[token_verb_sing].item()
        prob_verb_plur = masked_token_probs[token_verb_plur].item()

        prob_list = [prob_verb_sing, prob_verb_plur]
        verb_list = [target_verb_sing, target_verb_plur]

    
        ### Store your results

        d = {'lemma': np.repeat(row['lemma'],len(prob_list)),
             'word_form': np.repeat(row['word_form'],len(prob_list)),
             'word_number': np.repeat(row['word_number'],len(prob_list)),
             'n_tokens': np.repeat(row['n_tokens'],len(prob_list)),
             'tokenization_type': np.repeat(row['tokenization_type'],len(prob_list)),
             'verb_probs': prob_list,
             'article_number': ['singular','plural'],
             'verb_number': ['singular','plural'],
             'verb': verb_list,
             'affix': row['affix'],
             'sentence': row['sentence'],
             'model_token_inputs': model_token_inputs,
             'source': data_source
            }
        gather_df.append(pd.DataFrame(d))


        debug_d = {'lemma': [row['lemma']],
                   'word_form': [row['word_form']],
                   'tokenized_sentence': [inputs["input_ids"][0].tolist()],
                   'mask_index': [midx[0]]
                  }
        gather_debug.append(pd.DataFrame(debug_d))


    probs_df = pd.concat(gather_df,ignore_index=True)
    debug_df = pd.concat(gather_debug,ignore_index=True)
    
    return probs_df,debug_df

In [7]:
### Import the necessary

import torch
from transformers import BertForMaskedLM, BertTokenizer

### Create the tokenizer and the model

tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [8]:
# Path to grab dataframes containing nouns from the AnCora Tree Banks
datapath = 'datasets/'

# This line generates 3 dictionaries with hand-annotated noun:article pairs
# dct_noun_singletok
# dct_noun_multitok_morph
# dct_noun_multitok_nonmorph
%run -i 'create_annotated_dicts.py'

### Load the datasets one by one

# For default SINGLE-TOKEN plurals
filename = 'nounlist_single-token-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_singletok
df_singletok = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'single-token' 
probs_df_singletok,_ = get_article_predictions(df_singletok,data_source)


# For default MULTI-TOKEN, MORPHEMIC plurals
filename = 'nounlist_multi-token-morph-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_multitok_morph
df_multitok_morph = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'morphemic' 
probs_df_multitok_morph,_ = get_article_predictions(df_multitok_morph,data_source)


# For default MULTI-TOKEN, NONMORPHEMIC plurals
filename = 'nounlist_multi-token-nonmorph-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_multitok_nonmorph
df_multitok_nonmorph = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'non_morphemic' 
probs_df_multitok_nonmorph,_ = get_article_predictions(df_multitok_nonmorph,data_source)



  0%|          | 0/1247 [00:00<?, ?it/s]

original list len: 1363 select list len: 1247


  0%|          | 0/3741 [00:00<?, ?it/s]

[5850]
[14089]
[7388]
[5911]
[9823]
[3785]
[2378]
[13651]
[17022]
[1944]
[14878]
[4772]
[7746]
[2274]
[5411]
[3921]
[2784]
[2957]
[7043]
[2395]
[5721]
[2966]
[1814]
[3325]
[4443]
[2116]
[12032]
[4149]
[3455]
[2209]
[11381]
[12653]
[5444]
[7473]
[1890]
[6560]
[8222]
[7076]
[5186]
[4627]
[8316]
[4598]
[4379]
[27438]
[15018]
[6013]
[2709]
[4244]
[2307]
[5651]
[12500]
[3179]
[14955]
[2981]
[22553]
[11623]
[9200]
[5070]
[4840]
[1730]
[6835]
[1560]
[7172]
[3434]
[9454]
[9342]
[6114]
[4660]
[11171]
[15643]
[3305]
[1708]
[8955]
[3219]
[7984]
[4436]
[5705]
[3918]
[3630]
[2353]
[6067]
[9866]
[2483]
[7958]
[5029]
[4792]
[12078]
[6289]
[5529]
[24137]
[2153]
[3038]
[10884]
[12112]
[9574]
[3531]
[4196]
[10508]
[11554]
[2984]
[15951]
[2743]
[5076]
[6599]
[14627]
[3889]
[2444]
[18803]
[4198]
[4837]
[6876]
[12175]
[3599]
[3428]
[1726]
[27125]
[9291]
[6290]
[3661]
[5161]
[12049]
[9738]
[4137]
[7412]
[11551]
[15817]
[17802]
[2785]
[7484]
[3740]
[9712]
[13820]
[2181]
[5247]
[3912]
[3165]
[4320]
[30563]
[6

  0%|          | 0/508 [00:00<?, ?it/s]

original list len: 515 select list len: 508


  0%|          | 0/1524 [00:00<?, ?it/s]

[16287]
[9469]
[26549]
[25370]
[13881]
[16715]
[12884]
[18276]
[18873]
[25913]
[14178]
[26590]
[15324]
[19947]
[28904]
[25112]
[14983]
[10548]
[28631]
[19809]
[24443]
[16648]
[5646]
[26808]
[13071]
[21953]
[24950]
[16517]
[5630]
[30330]
[17936]
[18658]
[23903]
[22641]
[11769]
[22329]
[20061]
[24504]
[18817]
[25422]
[13189]
[13205]
[30712]
[30473]
[11941]
[28715]
[11053]
[21570]
[9039]
[22834]
[18894]
[26004]
[26745]
[20740]
[10468]
[29842]
[11331]
[12006]
[30476]
[30327]
[26629]
[26452]
[10925]
[23712]
[21281]
[12790]
[24780]
[14384]
[16480]
[15646]
[22952]
[25377]
[30079]
[18132]
[12495]
[3245]
[30588]
[11943]
[14545]
[19962]
[23589]
[9840]
[13254]
[19292]
[13302]
[28090]
[25041]
[14363]
[10865]
[30751]
[28006]
[28276]
[7285]
[9328]
[26842]
[24051]
[11594]
[15421]
[27341]
[7039]
[10040]
[30308]
[13795]
[13355]
[18835]
[26982]
[20205]
[4252]
[24357]
[16987]
[11535]
[27905]
[8020]
[27554]
[8236]
[13391]
[25716]
[28637]
[16466]
[18235]
[25306]
[21936]
[30798]
[24808]
[28331]
[11421]
[115

  0%|          | 0/627 [00:00<?, ?it/s]

original list len: 646 select list len: 627


  0%|          | 0/1881 [00:00<?, ?it/s]

[7385, 2471]
[26376, 30932]
[982, 3915]
[13280, 2230]
[13377, 1630]
[4521, 2442]
[28282, 30936]
[1098, 9483]
[1404, 3862]
[8616, 1071]
[8111, 1629]
[1664, 13212]
[26021, 30933]
[28578, 30936]
[4046, 1629]
[24833, 30932]
[1627, 1103]
[2870, 1532]
[23162, 1324]
[1475, 9534]
[18532, 1459]
[11623, 1224]
[15227, 1650]
[982, 20847]
[1411, 1077]
[24418, 1142]
[14675, 1743]
[13923, 1066]
[11724, 1630]
[4873, 1093]
[24743, 2767]
[10907, 1441]
[3107, 6007]
[4934, 1255]
[3129, 1441]
[2914, 1095]
[9743, 1406]
[26216, 1476]
[17026, 2288]
[25850, 1441]
[4979, 1228]
[1334, 1180]
[2891, 6602]
[1574, 13879]
[27383, 15508]
[1620, 1361]
[18554, 5213]
[10616, 30938]
[1292, 12825]
[8654, 1280]
[3783, 1462]
[1884, 1284, 7166]
[4806, 1006]
[8222, 3726]
[19466, 4940]
[15694, 9995]
[24691, 1095]
[6231, 1071]
[2007, 1169]
[1985, 1806]
[10233, 30933]
[26962, 1024]
[4034, 1485]
[3275, 17023]
[29429, 30950]
[26994, 8767]
[25481, 1101]
[11130, 30148]
[24234, 1081]
[4436, 1806]
[8969, 1025]
[29080, 1280]
[25998, 987

In [9]:
probs_df_singletok['surprisal'] = probs_df_singletok['verb_probs'].apply(lambda x: -np.log(x))
probs_df_multitok_morph['surprisal'] = probs_df_multitok_morph['verb_probs'].apply(lambda x: -np.log(x))
probs_df_multitok_nonmorph['surprisal'] = probs_df_multitok_nonmorph['verb_probs'].apply(lambda x: -np.log(x))

In [10]:
### Pause to check that the default tokenizations for plurals are, in fact, morphemic when you expect them to be

non_morphemic_list = []
for (_,row) in df_multitok_morph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif affix_tokid in plur_tokids:
        check_morph = "morphemic"

    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"

        print(sing,plural_tokens,check_morph)
        
        non_morphemic_list.append(sing)

print(non_morphemic_list)

[]


In [11]:
### Pause to check that the default tokenizations for plurals are, in fact, nonmorphemic when you expect them to be

morphemic_list = []
for (_,row) in df_multitok_nonmorph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif (affix_tokid in plur_tokids) & (row['tokenization_type']=='default'):
        check_morph = "morphemic"
        
        morphemic_list.append(sing)
        print(sing,plural_tokens,check_morph)


    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"
        
print(morphemic_list)

[]


In [14]:
### Save each of these to a dataframe!

savepath = 'results_unmasked_verb-agreement/'

# if ~os.path.exists(savepath): 
os.mkdir(savepath)
    

probs_df_singletok.to_csv(os.path.join(savepath,'results_singletok.csv'))
probs_df_multitok_morph.to_csv(os.path.join(savepath,'results_multitok_morph.csv'))
probs_df_multitok_nonmorph.to_csv(os.path.join(savepath,'results_multitok_nonmorph.csv'))