In [1]:
# %reset
### Basic logistical, wrangling, & visualization imports

import os
import glob
import torch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm.notebook import tqdm

### Model imports
from transformers import BertModel, BertConfig

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
### Define useful functions

def make_sentences_df(dct_noun,df_noun):
    """Creates sentence templates masking what should be a definite plural or singular article
    INPUTS: dct_noun: dictionary, noun:article pairs, the article is lowercase, singular, and gendered--this 
                      was hand-annotated from native Caribbean Spanish speaker intuition and checks in online
                      dictionary of the Real Academia Española; if there are any concerns about gendered article 
                      errors, check here first
            df_noun: pandas dataframe, cols: whole_word (plural), root (becomes our lemma), affix (s or es)--
                     this df is drawn from a noun subset of AnCora Tree Bank"""
    
    gather_df = []
    for noun in tqdm(dct_noun.keys()): 
        
        subdf = df_noun[df_noun['whole_word'] == noun]

        ### set up singular & plural template components
        sing_art = dct_noun[noun]
        sing_art = str(sing_art[0].upper() + sing_art[1])

        sing_subj = subdf['root'].values[0]

        plur_subj = noun

        ### set up target definite articles
        if sing_art == 'El':

            plur_art= 'Los'


        elif sing_art == 'La': 

            plur_art = 'Las'

        ### set up target singluar and plural adjs

        sing_adj = "grande"

        plur_adj = "grandes"

        ### set up composite template components
        affix = subdf['affix'].values[0]
        comp_subj = sing_subj +'##'+ affix


        ### create all sentence templates

        sing_sentence_template = sing_art + ' ' + sing_subj + ' [MASK]'

        plur_sentence_template = plur_art + ' ' + plur_subj + ' [MASK]'

        comp_sentence_template = plur_art + ' ' + comp_subj + ' [MASK]'

        ### set up all the column/row entries you will store per lemma

        all_wordforms = [sing_subj,
                         plur_subj,
                         comp_subj]

        all_word_number = ['sing',
                           'plur',
                           'plur'] 

        all_n_tokens = [len(tokenizer.encode(sing_subj,add_special_tokens=False)),
                        len(tokenizer.encode(plur_subj,add_special_tokens=False)),
                        len(tokenizer.encode(sing_subj,add_special_tokens=False)) + len(tokenizer.encode(['##'+affix],add_special_tokens=False))                           ]

        all_sentence_templates = [sing_sentence_template,
                                  plur_sentence_template,
                                  comp_sentence_template]
            
        #this is a super important column! distinguishes the default tokenization from 
        #our artificially imposed tokenization scheme
        all_tokenization_types = ['default',
                                  'default',
                                  'artificial']

        d = {'lemma': np.repeat(sing_subj,len(all_sentence_templates)),
             'word_form': all_wordforms,
             'word_number': all_word_number,
             'n_tokens': all_n_tokens,
             'tokenization_type': all_tokenization_types,
             'sentence': all_sentence_templates,
             'target_ART_sing': sing_art, 
             'target_ART_plur': plur_art,
             'target_adj_sing': sing_adj,
             'target_adj_plur': plur_adj,
             'affix': np.repeat(affix,len(all_sentence_templates))
             }

        gather_df.append(pd.DataFrame(d))

    sentence_df = pd.concat(gather_df,ignore_index=True)
    return sentence_df


### Model Predictions 

def find_sublist_index(list, sublist):
    """Find the first occurence of sublist in list.
    Return the start and end indices of sublist in list.
    Used to find index of [MASK] in template sentences.

    h/t GPT-3-codex for writing this."""

    for i in range(len(list)):
        if list[i] == sublist[0] and list[i:i+len(sublist)] == sublist:
            return i, i+len(sublist)
    return None


def get_article_predictions(df,data_source): 
    """Predict the likelihood of target definite/indefinite and singular/plural articles
       Will assume you've already loaded and defined your tokenizer and model.
       Will iterate row by row in your dataframe `df`, containing cols for masked sentences,
       the corresponding lemma and plural forms being tested. When it comes across a row
       with column `tokenizer_type` label `artificial`, will shunt you to a process that creates
       the inputs to the model by hand (if there are any issues/concerns with how the artificial
       tokenization proceeds and leads to predictions, check the following `if` statement: 
       `if row['tokenization_type'] == 'artificial'`)
       
       INPUTS: df, pandas dataframe, cols for lemma, word_number (plural, singular), tokenization_type
                   (artificial/default), masked sentence, target singular and plural articles to 
                   get probabilities for filling in the [MASK], and others"""

    gather_df = []
    gather_debug = []
    for (_,row) in tqdm(df.iterrows(),total=df.shape[0]):

        if row['word_number'] == 'sing':
            target_article = row['target_ART_sing']
        else:
            target_article = row['target_ART_plur']


        #tokens for each article type 
        token_article= tokenizer.encode(target_article,
                                              add_special_tokens=False
                                         )

                                    
        #token for mask
        token_mask = tokenizer.encode('[MASK]',
                                      add_special_tokens=False
                                     )

        target_adj_sing = row['target_adj_sing']
        target_adj_plur = row['target_adj_plur']

        #tokens for each adj number 
        token_adj_sing = tokenizer.encode(target_adj_sing,
                                              add_special_tokens=False
                                         )
        token_adj_plur = tokenizer.encode(target_adj_plur,
                                              add_special_tokens=False
                                         )

        ### Set up your representation of the sentence for the 
        #.  model

        if row['tokenization_type'] == 'artificial': 

            if '##es' in row['word_form']:

                token_affix = tokenizer.convert_tokens_to_ids(["##es"])


            elif '##s' in row['word_form']: 

                token_affix = tokenizer.convert_tokens_to_ids(["##s"])

            #token for singular form
            lemma = row['lemma']
            token_lemma = tokenizer.encode(lemma,
                                           add_special_tokens=False
                                          )
            print(token_lemma)
            #TODO: Check for source and if it's non-morphemic, combine the tokens that the lemma gets broken into
 
            #token for special start
            start = '[CLS]'
            token_start = tokenizer.encode(start,
                                           add_special_tokens=False)
            

            #token for special end
            ending = '[SEP]'
            token_end = tokenizer.encode(ending,
                                         add_special_tokens=False)
            
            
            ### Collect your tokens into a list that you will then flatten
            #   prior to converting to tensor
            bulky_token_list = [token_start,
                                token_article,
                                token_lemma,
                                token_affix,
                                token_mask,
                                token_end
                                ]
            flat_token_list = [item for sublist in bulky_token_list for item in sublist]
            token_idx = torch.tensor([flat_token_list])
            

            inputs = {'input_ids': token_idx,
                      'token_type_ids': torch.zeros_like(token_idx),
                      'attention_mask': torch.ones_like(token_idx)
                     }

        elif row['tokenization_type']=='default': 

            inputs = tokenizer(row['sentence'],
                               return_tensors='pt',
                               add_special_tokens=True
                              )
            
        model_token_inputs = tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0])
        model_token_inputs = ' , '.join(model_token_inputs)

        ### Predict the item that should fill in the mask!

        outputs = model(**inputs)

        #find the index of the mask in sentence
        midx = find_sublist_index(inputs["input_ids"][0].tolist(),
                                  token_mask)
                                 
        masked_token_logits = outputs.logits[0][midx[0]]
        masked_token_probs = torch.softmax(masked_token_logits, dim=0)

        # print("masked token probs: ", masked_token_probs)
        # print("token_adj_plur: ", token_adj_plur)
        # print("token_adj_sing: ", token_adj_sing)

        # print("not item: ", masked_token_probs[token_adj_plur])
        # print("item: ", masked_token_probs[token_adj_plur].item())


        # prob_article_sing = masked_token_probs[token_article_sing].item()
        # prob_article_plur = masked_token_probs[token_article_plur].item()

        # prob_list = [prob_article_sing, prob_article_plur]
        # article_list = [target_article_sing, target_article_plur]
    

        prob_adj_sing = masked_token_probs[token_adj_sing].item()
        prob_adj_plur = masked_token_probs[token_adj_plur].item()

        prob_list = [prob_adj_sing, prob_adj_plur]
        adj_list = [target_adj_sing, target_adj_plur]

    
        ### Store your results

        d = {'lemma': np.repeat(row['lemma'],len(prob_list)),
             'word_form': np.repeat(row['word_form'],len(prob_list)),
             'word_number': np.repeat(row['word_number'],len(prob_list)),
             'n_tokens': np.repeat(row['n_tokens'],len(prob_list)),
             'tokenization_type': np.repeat(row['tokenization_type'],len(prob_list)),
             'adj_probs': prob_list,
             'article_number': ['singular','plural'],
             'adj_number': ['singular','plural'],
             'adj': adj_list,
             'affix': row['affix'],
             'sentence': row['sentence'],
             'model_token_inputs': model_token_inputs,
             'source': data_source
            }
        gather_df.append(pd.DataFrame(d))


        debug_d = {'lemma': [row['lemma']],
                   'word_form': [row['word_form']],
                   'tokenized_sentence': [inputs["input_ids"][0].tolist()],
                   'mask_index': [midx[0]]
                  }
        gather_debug.append(pd.DataFrame(debug_d))


    probs_df = pd.concat(gather_df,ignore_index=True)
    debug_df = pd.concat(gather_debug,ignore_index=True)
    
    return probs_df,debug_df

In [7]:
### Import the necessary

import torch
from transformers import BertForMaskedLM, BertTokenizer

### Create the tokenizer and the model

tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [12]:
# Path to grab dataframes containing nouns from the AnCora Tree Banks
datapath = 'datasets/'

# This line generates 3 dictionaries with hand-annotated noun:article pairs
# dct_noun_singletok
# dct_noun_multitok_morph
# dct_noun_multitok_nonmorph
%run -i 'create_annotated_dicts.py'

### Load the datasets one by one

# For default SINGLE-TOKEN plurals
filename = 'nounlist_single-token-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_singletok
df_singletok = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'single-token' 
probs_df_singletok,_ = get_article_predictions(df_singletok,data_source)


# For default MULTI-TOKEN, MORPHEMIC plurals
filename = 'nounlist_multi-token-morph-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_multitok_morph
df_multitok_morph = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'morphemic' 
probs_df_multitok_morph,_ = get_article_predictions(df_multitok_morph,data_source)


# For default MULTI-TOKEN, NONMORPHEMIC plurals
filename = 'nounlist_multi-token-nonmorph-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_multitok_nonmorph
df_multitok_nonmorph = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'non_morphemic' 
probs_df_multitok_nonmorph,_ = get_article_predictions(df_multitok_nonmorph,data_source)



  0%|          | 0/1247 [00:00<?, ?it/s]

original list len: 1363 select list len: 1247


  0%|          | 0/3741 [00:00<?, ?it/s]

masked token probs:  tensor([1.1701e-07, 1.9988e-08, 7.9700e-09,  ..., 4.8630e-07, 4.3188e-09,
        3.8110e-07], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([1.1479e-07], grad_fn=<IndexBackward0>)
item:  1.1478942951725912e-07
masked token probs:  tensor([3.0208e-08, 8.7491e-09, 6.5344e-09,  ..., 6.1598e-08, 2.7895e-09,
        1.0036e-07], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([3.4855e-06], grad_fn=<IndexBackward0>)
item:  3.4855384001275524e-06
[5850]
masked token probs:  tensor([7.8074e-08, 7.5959e-09, 6.2411e-09,  ..., 5.2004e-08, 3.0052e-09,
        3.5625e-08], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([1.3696e-05], grad_fn=<IndexBackward0>)
item:  1.3696097084903158e-05
masked token probs:  tensor([9.7298e-08, 5.1684e-09, 2.9240e-08,  ..., 1.9817e-07, 1.8987e-08,
        5.6552e-08], grad_fn=<SoftmaxBackward0>)
token_adj

  0%|          | 0/508 [00:00<?, ?it/s]

original list len: 515 select list len: 508


  0%|          | 0/1524 [00:00<?, ?it/s]

masked token probs:  tensor([2.0568e-07, 2.7193e-08, 1.0498e-08,  ..., 4.1757e-08, 1.5263e-08,
        1.9241e-07], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([4.8095e-08], grad_fn=<IndexBackward0>)
item:  4.8095298410544274e-08
masked token probs:  tensor([6.8685e-08, 1.1996e-08, 4.5917e-09,  ..., 8.4446e-09, 4.3478e-09,
        5.1430e-08], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([6.2088e-06], grad_fn=<IndexBackward0>)
item:  6.2087619880912825e-06
[16287]
masked token probs:  tensor([6.8685e-08, 1.1996e-08, 4.5917e-09,  ..., 8.4446e-09, 4.3478e-09,
        5.1430e-08], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([6.2088e-06], grad_fn=<IndexBackward0>)
item:  6.2087619880912825e-06
masked token probs:  tensor([2.8465e-07, 5.1379e-08, 1.4742e-08,  ..., 1.6892e-07, 2.0406e-08,
        2.4936e-07], grad_fn=<SoftmaxBackward0>)
token_ad

  0%|          | 0/627 [00:00<?, ?it/s]

original list len: 646 select list len: 627


  0%|          | 0/1881 [00:00<?, ?it/s]

masked token probs:  tensor([1.1588e-07, 2.6986e-09, 2.9135e-09,  ..., 1.1715e-07, 1.6415e-09,
        5.4873e-08], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([6.4923e-08], grad_fn=<IndexBackward0>)
item:  6.492314241768327e-08
masked token probs:  tensor([2.2350e-08, 4.6624e-09, 1.9343e-09,  ..., 2.7049e-08, 5.2589e-10,
        1.6317e-08], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([4.7342e-07], grad_fn=<IndexBackward0>)
item:  4.7341976028292265e-07
[7385, 2471]
masked token probs:  tensor([6.0421e-08, 4.3206e-09, 2.9208e-09,  ..., 2.7851e-08, 1.4008e-09,
        2.0819e-08], grad_fn=<SoftmaxBackward0>)
token_adj_plur:  [3037]
token_adj_sing:  [3411]
not item:  tensor([1.2769e-05], grad_fn=<IndexBackward0>)
item:  1.2769255590683315e-05
masked token probs:  tensor([1.4712e-07, 6.1208e-08, 2.4150e-08,  ..., 1.0554e-07, 1.1835e-08,
        1.5791e-07], grad_fn=<SoftmaxBackward0>)
toke

In [13]:
probs_df_singletok['surprisal'] = probs_df_singletok['adj_probs'].apply(lambda x: -np.log(x))
probs_df_multitok_morph['surprisal'] = probs_df_multitok_morph['adj_probs'].apply(lambda x: -np.log(x))
probs_df_multitok_nonmorph['surprisal'] = probs_df_multitok_nonmorph['adj_probs'].apply(lambda x: -np.log(x))

In [14]:
### Pause to check that the default tokenizations for plurals are, in fact, morphemic when you expect them to be

non_morphemic_list = []
for (_,row) in df_multitok_morph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif affix_tokid in plur_tokids:
        check_morph = "morphemic"

    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"

        print(sing,plural_tokens,check_morph)
        
        non_morphemic_list.append(sing)

print(non_morphemic_list)

[]


In [15]:
### Pause to check that the default tokenizations for plurals are, in fact, nonmorphemic when you expect them to be

morphemic_list = []
for (_,row) in df_multitok_nonmorph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif (affix_tokid in plur_tokids) & (row['tokenization_type']=='default'):
        check_morph = "morphemic"
        
        morphemic_list.append(sing)
        print(sing,plural_tokens,check_morph)


    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"
        
print(morphemic_list)

[]


In [17]:
### Save each of these to a dataframe!

savepath = 'results_unmasked_adj-agreement/'

# if ~os.path.exists(savepath): 
# os.mkdir(savepath)
    

probs_df_singletok.to_csv(os.path.join(savepath,'results_singletok.csv'))
probs_df_multitok_morph.to_csv(os.path.join(savepath,'results_multitok_morph.csv'))
probs_df_multitok_nonmorph.to_csv(os.path.join(savepath,'results_multitok_nonmorph.csv'))