In [2]:
import numpy as np 
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from tqdm.notebook import tqdm
import os
import random

In [3]:
import random
import torch
from create_annotated_dicts import dct_noun_singletok as st, dct_noun_multitok_morph as mm, dct_noun_multitok_nonmorph as nmm


# Load the cased Spanish BERT tokenizer and its masked-LM model from the
# "dccuchile/bert-base-spanish-wwm-cased" pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
# Put the model in evaluation mode; it is only used for inference below.
model.eval()
# Seed Python's global RNG, which introduce_typo draws from.
random.seed(42)

def introduce_typo(word):
    """Replace one randomly chosen letter of ``word`` with a different letter.

    Only characters whose lowercase form is in the unaccented Spanish alphabet
    (a-z plus ñ) are candidates for replacement. The position is drawn from
    those candidates only, so a typo is always produced whenever the word
    contains at least one such letter (previously the word was returned
    unchanged if the random position landed on an accented vowel, a digit or
    an uppercase letter). The case of the replaced character is preserved.

    Parameters
    ----------
    word : str
        The word to perturb.

    Returns
    -------
    str
        A string of the same length differing from ``word`` in exactly one
        position, or ``word`` itself if it has no replaceable letter
        (including the empty string).

    Notes
    -----
    Draws from Python's global ``random`` state; seed it for reproducibility.
    """
    alphabet = 'abcdefghijklmnñopqrstuvwxyz'
    letters = set(alphabet)

    # Restrict the draw to positions that can actually be replaced.
    positions = [pos for pos, char in enumerate(word) if char.lower() in letters]
    if not positions:
        return word

    idx = random.choice(positions)
    original_char = word[idx]

    # Never substitute a letter with itself, otherwise no typo is introduced.
    substitutes = [letter for letter in alphabet if letter != original_char.lower()]
    random_letter = random.choice(substitutes)
    if original_char.isupper():
        random_letter = random_letter.upper()

    return word[:idx] + random_letter + word[idx + 1:]

# Build typo'd copies of the three noun -> article dictionaries, processed in
# the order single-token, multi-token morphemic, multi-token non-morphemic.
# The typo'd noun becomes the key, so two nouns that happen to receive the same
# typo'd spelling collapse into one entry (the later one wins).
dct_noun_singletok, dct_noun_multi_morph, dct_noun_multi_nonmorph = (
    {introduce_typo(noun): article for noun, article in source.items()}
    for source in (st, mm, nmm)
)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [4]:
import random
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM
from tqdm import tqdm
from create_annotated_dicts import dct_noun_singletok as st, dct_noun_multitok_morph as mm, dct_noun_multitok_nonmorph as nmm

# Load the cased Spanish BERT tokenizer and its masked-LM model from the
# "dccuchile/bert-base-spanish-wwm-cased" pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
# Put the model in evaluation mode; it is only used for inference below.
model.eval()
# introduce_typo draws from Python's global RNG. Seed it in this cell so the
# typos generated below are reproducible regardless of what ran before.
random.seed(42)

def introduce_typo(word):
    """Replace one randomly chosen letter of ``word`` with a different letter.

    Candidate positions are those whose lowercase character is in the
    unaccented Spanish alphabet (a-z plus ñ). The position is sampled from the
    candidates only, so the result always differs from ``word`` when at least
    one candidate exists. Previously a draw that landed on an accented vowel,
    digit or punctuation mark returned the word untouched, leaving clean words
    in the "typo" condition. The replacement keeps the case of the character
    it replaces.

    Parameters
    ----------
    word : str
        The word to perturb.

    Returns
    -------
    str
        A same-length string with exactly one letter changed, or ``word``
        unchanged if it contains no replaceable letter (e.g. ``""``).

    Notes
    -----
    Draws from Python's global ``random`` state; seed it for reproducibility.
    """
    alphabet = 'abcdefghijklmnñopqrstuvwxyz'
    letters = set(alphabet)

    # Only positions holding a replaceable letter take part in the draw.
    positions = [pos for pos, char in enumerate(word) if char.lower() in letters]
    if not positions:
        return word

    idx = random.choice(positions)
    original_char = word[idx]

    # Exclude the original letter so the substitution is a real change.
    substitutes = [letter for letter in alphabet if letter != original_char.lower()]
    random_letter = random.choice(substitutes)
    # Preserve the case of the original character.
    if original_char.isupper():
        random_letter = random_letter.upper()

    return word[:idx] + random_letter + word[idx + 1:]

# Use the annotated noun -> article dictionaries unchanged under the names the
# rest of this cell expects.
dct_noun_singletok, dct_noun_multi_morph, dct_noun_multi_nonmorph = st, mm, nmm

def create_dataframe_with_typos(dct_noun):
    """Build a per-noun frame with the typo'd form and a root/affix split.

    Parameters
    ----------
    dct_noun : dict
        Mapping whose keys are the (plural) nouns; values are not used here.

    Returns
    -------
    pandas.DataFrame
        One row per key with columns, in order:
        ``whole_word`` (the key), ``whole_word_typo`` (``introduce_typo`` of
        the key), ``root`` and ``affix``.

    Notes
    -----
    The affix is the final ``'s'`` when the word ends in ``'s'``, otherwise the
    last two characters. The root is whatever precedes the affix, so
    ``root + affix == whole_word`` always holds (previously the root always
    dropped exactly one character and overlapped a two-character affix).

    NOTE(review): a plural formed with "-es" is still split as root + "s"
    because the singular form is not available here — confirm this is intended.
    """
    words = list(dct_noun.keys())
    df_noun = pd.DataFrame({'whole_word': words})
    df_noun['whole_word_typo'] = df_noun['whole_word'].apply(introduce_typo)

    affixes = [word[-1] if word.endswith('s') else word[-2:] for word in words]
    # Strip exactly the affix so root and affix never overlap.
    roots = [word[:len(word) - len(affix)] for word, affix in zip(words, affixes)]

    df_noun['root'] = roots
    df_noun['affix'] = affixes
    return df_noun

# One noun frame per dictionary, built in the order single-token, multi-token
# morphemic, multi-token non-morphemic (the order matters for the RNG stream).
df_noun_s, df_noun_mm, df_noun_nm = (
    create_dataframe_with_typos(noun_dict)
    for noun_dict in (dct_noun_singletok, dct_noun_multi_morph, dct_noun_multi_nonmorph)
)


def make_sentences_df(dct_noun, df_noun):
    """Create '[MASK] <noun>' templates where the mask stands for an article.

    For every noun and for both article types (definite, indefinite) three
    rows are produced: the root with default tokenization, the typo'd whole
    word with default tokenization, and ``root + '##' + affix`` marked as
    artificially tokenized.

    Parameters
    ----------
    dct_noun : dict
        Maps each ``whole_word`` to its singular definite article
        (``'el'``/``'la'``, first letter in any case).
    df_noun : pandas.DataFrame
        Needs the columns ``whole_word``, ``whole_word_typo``, ``root`` and
        ``affix``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``lemma``, ``word_form``, ``word_number``, ``n_tokens``,
        ``tokenization_type``, ``sentence``, ``target_ART_sing``,
        ``target_ART_plur``, ``article_type``, ``affix``. Empty (with these
        columns) when ``df_noun`` has no rows.

    Raises
    ------
    ValueError
        If a noun's article is neither 'El' nor 'La' after capitalisation.
        Previously such a noun silently inherited the article forms of the
        preceding row.
    """
    # Capitalised singular definite article ->
    # (singular indefinite, plural definite, plural indefinite).
    article_forms = {
        'El': ('Un', 'Los', 'Unos'),
        'La': ('Una', 'Las', 'Unas'),
    }
    columns = [
        'lemma', 'word_form', 'word_number', 'n_tokens', 'tokenization_type',
        'sentence', 'target_ART_sing', 'target_ART_plur', 'article_type', 'affix',
    ]

    records = []
    for _, row in tqdm(df_noun.iterrows(), total=df_noun.shape[0]):
        noun = row['whole_word']
        typoed_noun = row['whole_word_typo']
        affix = row['affix']
        sing_subj = row['root']

        sing_art_def = dct_noun[noun]
        sing_art_def = str(sing_art_def[0].upper() + sing_art_def[1:])
        if sing_art_def not in article_forms:
            raise ValueError(
                "Unsupported article %r for noun %r; expected 'el' or 'la'." % (sing_art_def, noun)
            )
        sing_art_indef, plur_art_def, plur_art_indef = article_forms[sing_art_def]

        comp_subj = sing_subj + '##' + affix

        # Token counts do not depend on the article type, so compute them once per noun.
        n_tokens_sing = len(tokenizer.encode(sing_subj, add_special_tokens=False))
        n_tokens_typo = len(tokenizer.encode(typoed_noun, add_special_tokens=False))
        n_tokens_comp = n_tokens_sing + len(tokenizer.encode(['##' + affix], add_special_tokens=False))

        # (word form, grammatical number, token count, tokenization type)
        variants = [
            (sing_subj, 'sing', n_tokens_sing, 'default'),
            (typoed_noun, 'plur', n_tokens_typo, 'default'),  # Use typoed word
            (comp_subj, 'plur', n_tokens_comp, 'artificial'),
        ]
        article_sets = [
            ('definite', sing_art_def, plur_art_def),
            ('indefinite', sing_art_indef, plur_art_indef),
        ]
        for article_type, sing_art, plur_art in article_sets:
            for word_form, word_number, n_tokens, tokenization_type in variants:
                records.append({
                    'lemma': sing_subj,
                    'word_form': word_form,
                    'word_number': word_number,
                    'n_tokens': n_tokens,
                    'tokenization_type': tokenization_type,
                    'sentence': '[MASK] ' + word_form,
                    'target_ART_sing': sing_art,
                    'target_ART_plur': plur_art,
                    'article_type': article_type,
                    'affix': affix,
                })

    # Build the frame once; passing columns keeps the schema for empty input.
    return pd.DataFrame(records, columns=columns)

### Model Predictions 

def find_sublist_index(lst, sublist):
    """Locate the first place where ``sublist`` appears contiguously in ``lst``.

    Returns a ``(start, end)`` pair such that ``lst[start:end] == sublist``,
    or ``None`` when there is no such position.
    """
    width = len(sublist)
    matching_starts = (start for start in range(len(lst)) if lst[start:start + width] == sublist)
    first = next(matching_starts, None)
    if first is None:
        return None
    return first, first + width

def get_article_predictions(df, data_source):
    """Predict the likelihood of the target singular and plural articles.

    For each sentence row the model is run once and the softmax probability at
    the ``[MASK]`` position is read off for the singular and the plural target
    article, giving two output rows per input row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``lemma``, ``word_form``, ``word_number``,
        ``n_tokens``, ``tokenization_type`` ('default' or 'artificial'),
        ``sentence``, ``target_ART_sing``, ``target_ART_plur``,
        ``article_type`` and ``affix``.
    data_source : str
        Label copied into the ``source`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns: ``lemma``, ``word_form``, ``word_number``, ``n_tokens``,
        ``tokenization_type``, ``article_probs``, ``article_number``,
        ``article_type``, ``article``, ``affix``, ``sentence``,
        ``model_token_inputs``, ``source``. Empty (with these columns) when
        ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``tokenization_type`` is not recognised (previously the inputs of
        the preceding row were silently reused), or if no ``[MASK]`` token is
        found in the model input.
    """
    columns = [
        'lemma', 'word_form', 'word_number', 'n_tokens', 'tokenization_type',
        'article_probs', 'article_number', 'article_type', 'article', 'affix',
        'sentence', 'model_token_inputs', 'source',
    ]

    # Special-token ids are the same for every row; look them up once.
    token_mask = tokenizer.encode('[MASK]', add_special_tokens=False)
    token_start = tokenizer.encode('[CLS]', add_special_tokens=False)
    token_end = tokenizer.encode('[SEP]', add_special_tokens=False)

    records = []
    for (_, row) in tqdm(df.iterrows(), total=df.shape[0]):
        target_article_sing = row['target_ART_sing']
        target_article_plur = row['target_ART_plur']

        token_article_sing = tokenizer.encode(target_article_sing, add_special_tokens=False)
        token_article_plur = tokenizer.encode(target_article_plur, add_special_tokens=False)

        tokenization_type = row['tokenization_type']
        if tokenization_type == 'artificial':
            # Hand-assemble [CLS] [MASK] <lemma tokens> ##affix [SEP] so the
            # affix is forced to be its own word-piece.
            token_affix = tokenizer.convert_tokens_to_ids(["##" + row['affix']])
            token_lemma = tokenizer.encode(row['lemma'], add_special_tokens=False)
            flat_token_list = token_start + token_mask + token_lemma + token_affix + token_end
            token_idx = torch.tensor([flat_token_list])
            inputs = {
                'input_ids': token_idx,
                'token_type_ids': torch.zeros_like(token_idx),
                'attention_mask': torch.ones_like(token_idx),
            }
        elif tokenization_type == 'default':
            inputs = tokenizer(row['sentence'], return_tensors='pt', add_special_tokens=True)
        else:
            raise ValueError("Unknown tokenization_type: %r" % (tokenization_type,))

        input_ids = inputs['input_ids'][0].tolist()
        model_token_inputs = ' , '.join(tokenizer.convert_ids_to_tokens(input_ids))

        midx = find_sublist_index(input_ids, token_mask)
        if midx is None:
            raise ValueError("No [MASK] token found in model input: %s" % model_token_inputs)

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = model(**inputs)

        masked_token_logits = outputs.logits[0][midx[0]]
        masked_token_probs = torch.softmax(masked_token_logits, dim=0)
        prob_article_sing = masked_token_probs[token_article_sing].item()
        prob_article_plur = masked_token_probs[token_article_plur].item()

        per_article = [
            ('singular', target_article_sing, prob_article_sing),
            ('plural', target_article_plur, prob_article_plur),
        ]
        for article_number, article, article_prob in per_article:
            records.append({
                'lemma': row['lemma'],
                'word_form': row['word_form'],
                'word_number': row['word_number'],
                'n_tokens': row['n_tokens'],
                'tokenization_type': tokenization_type,
                'article_probs': article_prob,
                'article_number': article_number,
                'article_type': row['article_type'],
                'article': article,
                'affix': row['affix'],
                'sentence': row['sentence'],
                'model_token_inputs': model_token_inputs,
                'source': data_source,
            })

    # Build the frame once; passing columns keeps the schema for empty input.
    return pd.DataFrame(records, columns=columns)

# SINGLE-TOKEN plurals: build the masked sentences, then score the articles.
df_singletok = make_sentences_df(dct_noun=dct_noun_singletok, df_noun=df_noun_s)
probs_df_singletok = get_article_predictions(df=df_singletok, data_source='single-token')

# MULTI-TOKEN, MORPHEMIC plurals: same two steps.
df_multitok_morph = make_sentences_df(dct_noun=dct_noun_multi_morph, df_noun=df_noun_mm)
probs_df_multitok_morph = get_article_predictions(df=df_multitok_morph, data_source='morphemic')

# MULTI-TOKEN, NONMORPHEMIC plurals: same two steps.
df_multitok_nonmorph = make_sentences_df(dct_noun=dct_noun_multi_nonmorph, df_noun=df_noun_nm)
probs_df_multitok_nonmorph = get_article_predictions(df=df_multitok_nonmorph, data_source='nonmorphemic')


100%|██████████| 1247/1247 [00:00<00:00, 4387.88it/s]
100%|██████████| 3741/3741 [02:10<00:00, 28.71it/s]
100%|██████████| 508/508 [00:00<00:00, 4127.20it/s]
100%|██████████| 1524/1524 [00:52<00:00, 28.82it/s]
100%|██████████| 627/627 [00:00<00:00, 4051.49it/s]
100%|██████████| 1881/1881 [01:02<00:00, 30.29it/s]


In [6]:
# Add a surprisal column (negative natural log of the article probability)
# to each of the three result frames.
for _probs_df in (probs_df_singletok, probs_df_multitok_morph, probs_df_multitok_nonmorph):
    _probs_df['surprisal'] = _probs_df['article_probs'].apply(lambda prob: -np.log(prob))

In [None]:
### Pause to check that the default tokenizations for plurals are, in fact, morphemic when you expect them to be
# Walks every row of df_multitok_morph, tokenizes its word form and collects
# (and prints) the lemmas whose multi-token form does not contain the "##affix"
# word-piece.
# NOTE(review): make_sentences_df puts the typo'd noun in 'word_form' for the
# default plural rows and also emits singular rows, so this loop inspects
# typo'd and singular forms as well — confirm that is what is wanted here.

non_morphemic_list = []
for (_,row) in df_multitok_morph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    # Artificial rows: lemma plus the affix passed as an explicit token list.
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    # Default rows: let the tokenizer split the word form itself.
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    # Id of the "##affix" word-piece, used for the membership test below.
    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    # sing_tokens is not used further in this cell.
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    elif affix_tokid in plur_tokids:
        check_morph = "morphemic"

    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"

        # Report and record only the unexpected (non-morphemic) cases.
        print(sing,plural_tokens,check_morph)
        
        non_morphemic_list.append(sing)

print(non_morphemic_list)

In [None]:
### Pause to check that the default tokenizations for plurals are, in fact, nonmorphemic when you expect them to be
# Walks every row of df_multitok_nonmorph, tokenizes its word form and collects
# (and prints) the lemmas whose default multi-token form does contain the
# "##affix" word-piece.
# NOTE(review): make_sentences_df puts the typo'd noun in 'word_form' for the
# default plural rows and also emits singular rows, so this loop inspects
# typo'd and singular forms as well — confirm that is what is wanted here.

morphemic_list = []
for (_,row) in df_multitok_nonmorph.iterrows(): 
    
    sing = row['lemma']
    plur = row['word_form']
    affix = row['affix']
    mod_affix = "##" + affix
    
    sing_tokids = tokenizer.encode(sing,add_special_tokens=False)
    
    # Artificial rows: lemma plus the affix passed as an explicit token list.
    if row['tokenization_type'] == 'artificial': 
        plur_tokids = tokenizer.encode([sing]+[mod_affix],add_special_tokens=False)
        
    # Default rows: let the tokenizer split the word form itself.
    elif row['tokenization_type'] == 'default': 
        plur_tokids = tokenizer.encode(plur,add_special_tokens=False)


    # Id of the "##affix" word-piece, used for the membership test below.
    affix_tokid = tokenizer.encode([mod_affix],add_special_tokens=False)[0]

    n_plural_tokens = len(plur_tokids)
    
    # sing_tokens is not used further in this cell.
    sing_tokens = tokenizer.convert_ids_to_tokens(sing_tokids)
    plural_tokens = tokenizer.convert_ids_to_tokens(plur_tokids)
    
    if n_plural_tokens == 1:
        check_morph = "singular single"
        
    # Bitwise & on two parenthesised booleans; artificial rows are excluded
    # here because they contain the affix token by construction.
    elif (affix_tokid in plur_tokids) & (row['tokenization_type']=='default'):
        check_morph = "morphemic"
        
        # Report and record only the unexpected (morphemic) cases.
        morphemic_list.append(sing)
        print(sing,plural_tokens,check_morph)


    ### Multi-token, non-morphemic
    else:
        check_morph = "non_morphemic"
        
print(morphemic_list)

In [None]:
### Save each result dataframe to its own CSV file

savepath = 'results_noisy_article-agreement/'

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of os.path.exists + os.mkdir; makedirs
# also creates missing parent directories if savepath is ever nested.
os.makedirs(savepath, exist_ok=True)

# The DataFrame index is written as the first CSV column (to_csv default).
probs_df_singletok.to_csv(os.path.join(savepath,'results_singletok.csv'))
probs_df_multitok_morph.to_csv(os.path.join(savepath,'results_multitok_morph.csv'))
probs_df_multitok_nonmorph.to_csv(os.path.join(savepath,'results_multitok_nonmorph.csv'))