In [1]:
%reset
### Basic logistical, wrangling, & visualization imports

import os
import glob
import torch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm.notebook import tqdm

### Model imports
from transformers import BertModel, BertConfig

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [9]:
### Define useful functions

def make_sentences_df(dct_noun,df_noun):
    """Creates sentence templates masking what should be a definite or indefinite, plural or singular article
    INPUTS: dct_noun: dictionary, noun:article pairs, the article is lowercase, singular, and gendered--this 
                      was hand-annotated from native Caribbean Spanish speaker intuition and checks in online
                      dictionary of the Real Academia Española; if there are any concerns about gendered article 
                      errors, check here first
            df_noun: pandas dataframe, cols: whole_word (plural), root (becomes our lemma), affix (s or es)--
                     this df is drawn from a noun subset of AnCora Tree Bank"""
    
    gather_df = []
    for noun in tqdm(dct_noun.keys()): 
        
        subdf = df_noun[df_noun['whole_word'] == noun]

        ### set up singular & plural template components
        sing_art_def = dct_noun[noun]
        sing_art_def = str(sing_art_def[0].upper() + sing_art_def[1])

        sing_subj = subdf['root'].values[0]

        plur_subj = noun

        ### set up target definite and indefinite articles
        if sing_art_def == 'El':

            plur_art_def = 'Los'
            plur_art_indef = 'Unos'

            sing_art_indef = 'Un'


        elif sing_art_def == 'La': 

            plur_art_def = 'Las'
            plur_art_indef = 'Unas'

            sing_art_indef = 'Una'


        ### set up composite template components
        affix = subdf['affix'].values[0]
        comp_subj = sing_subj +'##'+ affix


        ### create all sentence templates
        article_types = ['definite','indefinite']
        for article_type in article_types: 

            sing_sentence_template = '[MASK] ' + sing_subj 

            plur_sentence_template = '[MASK] ' + plur_subj 

            comp_sentence_template = '[MASK] ' + comp_subj 


            ### set up all the column/row entries you will store per lemma

            all_wordforms = [sing_subj,
                             plur_subj,
                             comp_subj]

            all_word_number = ['sing',
                               'plur',
                               'plur'] 

            all_n_tokens = [len(tokenizer.encode(sing_subj,add_special_tokens=False)),
                            len(tokenizer.encode(plur_subj,add_special_tokens=False)),
                            len(tokenizer.encode(sing_subj,add_special_tokens=False)) + len(tokenizer.encode(['##'+affix],add_special_tokens=False))
                           ]

            all_sentence_templates = [sing_sentence_template,
                                      plur_sentence_template,
                                      comp_sentence_template]
            
            #this is a super important column! distinguishes the default tokenization from 
            #our artificially imposed tokenization scheme
            all_tokenization_types = ['default',
                                      'default',
                                      'artificial'
                                     ]

            if article_type == 'definite': 
                sing_art = sing_art_def
                plur_art = plur_art_def

            elif article_type == 'indefinite': 
                sing_art = sing_art_indef
                plur_art = plur_art_indef

            d = {'lemma': np.repeat(sing_subj,len(all_sentence_templates)),
                 'word_form': all_wordforms,
                 'word_number': all_word_number,
                 'n_tokens': all_n_tokens,
                 'tokenization_type': all_tokenization_types,
                 'sentence': all_sentence_templates,
                 'target_ART_sing': sing_art,
                 'target_ART_plur': plur_art,
                 'article_type': article_type,
                 'affix': np.repeat(affix,len(all_sentence_templates))
                }

            gather_df.append(pd.DataFrame(d))

    sentence_df = pd.concat(gather_df,ignore_index=True)
    return sentence_df


### Model Predictions 

def find_sublist_index(list, sublist):
    """Find the first occurence of sublist in list.
    Return the start and end indices of sublist in list.
    Used to find index of [MASK] in template sentences.

    h/t GPT-3-codex for writing this."""

    for i in range(len(list)):
        if list[i] == sublist[0] and list[i:i+len(sublist)] == sublist:
            return i, i+len(sublist)
    return None


def get_article_predictions(df,data_source): 
    """Predict the likelihood of target definite/indefinite and singular/plural articles
       Will assume you've already loaded and defined your tokenizer and model.
       Will iterate row by row in your dataframe `df`, containing cols for masked sentences,
       the corresponding lemma and plural forms being tested. When it comes across a row
       with column `tokenizer_type` label `artificial`, will shunt you to a process that creates
       the inputs to the model by hand (if there are any issues/concerns with how the artificial
       tokenization proceeds and leads to predictions, check the following `if` statement: 
       `if row['tokenization_type'] == 'artificial'`)
       
       INPUTS: df, pandas dataframe, cols for lemma, word_number (plural, singular), tokenization_type
                   (artificial/default), masked sentence, target singular and plural articles to 
                   get probabilities for filling in the [MASK], and others"""

    gather_df = []
    gather_debug = []
    for (_,row) in tqdm(df.iterrows(),total=df.shape[0]):

        target_article_sing = row['target_ART_sing']
        target_article_plur = row['target_ART_plur']


        #tokens for each article type 
        token_article_sing = tokenizer.encode(target_article_sing,
                                              add_special_tokens=False
                                         )
        token_article_plur = tokenizer.encode(target_article_plur,
                                              add_special_tokens=False
                                         )
        #token for mask
        token_mask = tokenizer.encode('[MASK]',
                                      add_special_tokens=False
                                     )


        ### Set up your representation of the sentence for the 
        #.  model

        if row['tokenization_type'] == 'artificial': 

            if '##es' in row['word_form']:

                token_affix = tokenizer.convert_tokens_to_ids(["##es"])


            elif '##s' in row['word_form']: 

                token_affix = tokenizer.convert_tokens_to_ids(["##s"])

            #token for singular form
            lemma = row['lemma']
            token_lemma = tokenizer.encode(lemma,
                                           add_special_tokens=False
                                          )
            print(token_lemma)
            #TODO: Check for source and if it's non-morphemic, combine the tokens that the lemma gets broken into
 
            #token for special start
            start = '[CLS]'
            token_start = tokenizer.encode(start,
                                           add_special_tokens=False)
            

            #token for special end
            ending = '[SEP]'
            token_end = tokenizer.encode(ending,
                                         add_special_tokens=False)
            
            
            ### Collect your tokens into a list that you will then flatten
            #   prior to converting to tensor
            bulky_token_list = [token_start,
                                token_mask,
                                token_lemma,
                                token_affix,
                                token_end
                                ]
            flat_token_list = [item for sublist in bulky_token_list for item in sublist]
            token_idx = torch.tensor([flat_token_list])
            

            inputs = {'input_ids': token_idx,
                      'token_type_ids': torch.zeros_like(token_idx),
                      'attention_mask': torch.ones_like(token_idx)
                     }

        elif row['tokenization_type']=='default': 

            inputs = tokenizer(row['sentence'],
                               return_tensors='pt',
                               add_special_tokens=True
                              )
            
        model_token_inputs = tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0])
        model_token_inputs = ' , '.join(model_token_inputs)

        ### Predict the item that should fill in the mask!

        outputs = model(**inputs)

        #find the index of the mask in sentence
        midx = find_sublist_index(inputs["input_ids"][0].tolist(),
                                  token_mask)
                                 
        masked_token_logits = outputs.logits[0][midx[0]]
        masked_token_probs = torch.softmax(masked_token_logits, dim=0)


        prob_article_sing = masked_token_probs[token_article_sing].item()
        prob_article_plur = masked_token_probs[token_article_plur].item()

        prob_list = [prob_article_sing, prob_article_plur]
        article_list = [target_article_sing, target_article_plur]
    

    
        ### Store your results

        d = {'lemma': np.repeat(row['lemma'],len(prob_list)),
             'word_form': np.repeat(row['word_form'],len(prob_list)),
             'word_number': np.repeat(row['word_number'],len(prob_list)),
             'n_tokens': np.repeat(row['n_tokens'],len(prob_list)),
             'tokenization_type': np.repeat(row['tokenization_type'],len(prob_list)),
             'article_probs': prob_list,
             'article_number': ['singular','plural'],
             'article_type': np.repeat(row['article_type'],len(prob_list)),
             'article': article_list,
             'affix': row['affix'],
             'sentence': row['sentence'],
             'model_token_inputs': model_token_inputs,
             'source': data_source
            }
        gather_df.append(pd.DataFrame(d))


        debug_d = {'lemma': [row['lemma']],
                   'word_form': [row['word_form']],
                   'tokenized_sentence': [inputs["input_ids"][0].tolist()],
                   'mask_index': [midx[0]]
                  }
        gather_debug.append(pd.DataFrame(debug_d))


    probs_df = pd.concat(gather_df,ignore_index=True)
    debug_df = pd.concat(gather_debug,ignore_index=True)
    
    return probs_df,debug_df

In [3]:
### Import the necessary

import torch
from transformers import BertForMaskedLM, BertTokenizer

### Create the tokenizer and the model

tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [12]:
# Path to grab dataframes containing nouns from the AnCora Tree Banks
datapath = 'datasets/'

# This line generates 3 dictionaries with hand-annotated noun:article pairs
# dct_noun_singletok
# dct_noun_multitok_morph
# dct_noun_multitok_nonmorph
%run -i 'create_annotated_dicts.py'

### Load the datasets one by one

# For default SINGLE-TOKEN plurals
filename = 'nounlist_single-token-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_singletok
df_singletok = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'single-token' 
probs_df_singletok,_ = get_article_predictions(df_singletok,data_source)


# For default MULTI-TOKEN, MORPHEMIC plurals
filename = 'nounlist_multi-token-morph-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_multitok_morph
df_multitok_morph = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'morphemic' 
probs_df_multitok_morph,_ = get_article_predictions(df_multitok_morph,data_source)


# For default MULTI-TOKEN, NONMORPHEMIC plurals
filename = 'nounlist_multi-token-nonmorph-plurals.csv'
df_noun = pd.read_csv(os.path.join(datapath,filename))
dct_noun = dct_noun_multitok_nonmorph
df_multitok_nonmorph = make_sentences_df(dct_noun,df_noun)

print('original list len:',df_noun.shape[0],'select list len:', len(dct_noun))

data_source = 'non_morphemic' 
probs_df_multitok_nonmorph,_ = get_article_predictions(df_multitok_nonmorph,data_source)



  0%|          | 0/1247 [00:00<?, ?it/s]

original list len: 1363 select list len: 1247


  0%|          | 0/7482 [00:00<?, ?it/s]

  0%|          | 0/508 [00:00<?, ?it/s]

original list len: 515 select list len: 508


  0%|          | 0/3048 [00:00<?, ?it/s]

  0%|          | 0/627 [00:00<?, ?it/s]

original list len: 646 select list len: 627


  0%|          | 0/3762 [00:00<?, ?it/s]

In [13]:
probs_df_singletok['surprisal'] = probs_df_singletok['article_probs'].apply(lambda x: -np.log(x))
probs_df_multitok_morph['surprisal'] = probs_df_multitok_morph['article_probs'].apply(lambda x: -np.log(x))
probs_df_multitok_nonmorph['surprisal'] = probs_df_multitok_nonmorph['article_probs'].apply(lambda x: -np.log(x))

In [15]:
### Pause to check that the default tokenizations for plurals are, in fact, morphemic when you expect them to be

non_morphemic_list = []
for (_,row) in df_multitok_morph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif affix_tokid in plur_tokids:
        check_morph = "morphemic"

    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"

        print(sing,plural_tokens,check_morph)
        
        non_morphemic_list.append(sing)

print(emic_listnon_morph)

[]


In [7]:
### Pause to check that the default tokenizations for plurals are, in fact, nonmorphemic when you expect them to be

morphemic_list = []
for (_,row) in df_multitok_nonmorph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif (affix_tokid in plur_tokids) & (row['tokenization_type']=='default'):
        check_morph = "morphemic"
        
        morphemic_list.append(sing)
        print(sing,plural_tokens,check_morph)


    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"
        
print(morphemic_list)

NameError: name 'df_multitok_nonmorph' is not defined

In [18]:
### Save each of these to a dataframe!

savepath = 'results_article-agreement/'

if ~os.path.exists(savepath): 
    os.mkdir(savepath)
    

probs_df_singletok.to_csv(os.path.join(savepath,'results_singletok.csv'))
probs_df_multitok_morph.to_csv(os.path.join(savepath,'results_multitok_morph.csv'))
probs_df_multitok_nonmorph.to_csv(os.path.join(savepath,'results_multitok_nonmorph.csv'))