In [1]:
import numpy as np 
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from tqdm.notebook import tqdm
import os
import random

In [None]:
import random
import torch
from create_annotated_dicts import dct_noun_singletok as st, dct_noun_multitok_morph as mm, dct_noun_multitok_nonmorph as nmm


# Tokenizer and masked-LM head are loaded from the same checkpoint name.
SPANISH_BERT_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-cased"

tokenizer = BertTokenizer.from_pretrained(
    SPANISH_BERT_CHECKPOINT,
    do_lower_case=False,
)
model = BertForMaskedLM.from_pretrained(SPANISH_BERT_CHECKPOINT)
model.eval()  # switch the model to evaluation mode before scoring
random.seed(42)  # fixes the sequence of typos drawn further down in this cell

def introduce_typo(word):
    """Return ``word`` with exactly one letter replaced by a different random letter.

    Only characters whose lowercase form is one of the 27 unaccented Spanish
    letters (a-z plus ñ) can be replaced; the replacement keeps the case of the
    character it replaces. The position is sampled among replaceable characters
    only, so a word that contains at least one such letter always comes back
    modified. Words with no replaceable character (including the empty string)
    are returned unchanged.

    Args:
        word: The string to corrupt.

    Returns:
        A string of the same length as ``word`` differing in at most one position.
    """
    spanish_letters = list('abcdefghijklmnñopqrstuvwxyz')
    # Restrict sampling to positions that can actually be substituted; picking an
    # accented vowel, digit or hyphen would otherwise leave the word untouched.
    candidate_positions = [
        pos for pos, char in enumerate(word) if char.lower() in spanish_letters
    ]
    if not candidate_positions:
        return word
    idx = random.choice(candidate_positions)
    original_char = word[idx]
    # Exclude the original letter so the result is guaranteed to differ.
    letters = [letter for letter in spanish_letters if letter != original_char.lower()]
    random_letter = random.choice(letters)
    # Preserve the case of the original character
    if original_char.isupper():
        random_letter = random_letter.upper()
    return word[:idx] + random_letter + word[idx + 1:]

# Re-key each imported noun -> article dictionary by a typo'd version of the noun,
# processing the three sources in the order st, mm, nmm.
# NOTE(review): two nouns that receive the same typo'd spelling collapse into a
# single key, and these three names are rebound later in the notebook.
dct_noun_singletok, dct_noun_multi_morph, dct_noun_multi_nonmorph = (
    {introduce_typo(noun): article for noun, article in source.items()}
    for source in (st, mm, nmm)
)


In [None]:
import random
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM
from tqdm import tqdm
from create_annotated_dicts import dct_noun_singletok as st, dct_noun_multitok_morph as mm, dct_noun_multitok_nonmorph as nmm

# Initialize tokenizer and model (both from the same checkpoint)
SPANISH_BERT_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = BertTokenizer.from_pretrained(SPANISH_BERT_CHECKPOINT, do_lower_case=False)
model = BertForMaskedLM.from_pretrained(SPANISH_BERT_CHECKPOINT)
model.eval()  # switch the model to evaluation mode before scoring

# Seed in this cell too: the typos drawn below would otherwise depend on how much
# randomness earlier cells consumed (or be unseeded if this cell is run on its own).
random.seed(42)

def introduce_typo(word):
    """Return ``word`` with exactly one letter replaced by a different random letter.

    A character is replaceable when its lowercase form is one of the 27
    unaccented Spanish letters (a-z plus ñ). The position to corrupt is sampled
    among replaceable characters only, so any word containing at least one such
    letter is guaranteed to change; previously a draw that landed on an accented
    vowel, digit or punctuation mark returned the word untouched. The replacement
    letter keeps the case of the character it replaces.

    Args:
        word: The string to corrupt.

    Returns:
        A string of the same length as ``word``. It differs from ``word`` in one
        position, or equals ``word`` when nothing is replaceable (for example the
        empty string).
    """
    spanish_letters = list('abcdefghijklmnñopqrstuvwxyz')
    replaceable = [
        pos for pos, char in enumerate(word) if char.lower() in spanish_letters
    ]
    if not replaceable:
        return word
    idx = random.choice(replaceable)
    original_char = word[idx]
    # Draw from every letter except the original so the output always differs.
    letters = spanish_letters.copy()
    letters.remove(original_char.lower())
    random_letter = random.choice(letters)
    # Preserve the case of the original character
    if original_char.isupper():
        random_letter = random_letter.upper()
    return word[:idx] + random_letter + word[idx + 1:]

# The imported noun -> article dictionaries are used as-is (no typos in the keys);
# typos are only added to the dataframe columns built below.
dct_noun_singletok, dct_noun_multi_morph, dct_noun_multi_nonmorph = st, mm, nmm

# Introduce typos in words for dataframes
def create_dataframe_with_typos(dct_noun):
    """Build a per-noun dataframe with a typo'd spelling and a root/affix split.

    Args:
        dct_noun: Mapping whose keys are the (plural) noun strings; only the keys
            are used here.

    Returns:
        A DataFrame with one row per key and the columns ``whole_word``,
        ``whole_word_typo`` (result of ``introduce_typo``), ``root`` and ``affix``.
        ``root + affix`` always reproduces ``whole_word``.
    """
    def split_affix(word):
        # Affix is the final 's' when the word ends in 's', otherwise the last two
        # characters; the root is whatever precedes the affix, so both parts
        # always partition the word (the root used to drop one character only).
        # NOTE(review): '-es' plurals also end in 's', so they are split as
        # root+'e' / 's' -- the spelling alone cannot tell 'padre|s' from 'flor|es'.
        affix = word[-1] if word.endswith('s') else word[-2:]
        return word[:len(word) - len(affix)], affix

    words = list(dct_noun.keys())
    parts = [split_affix(word) for word in words]

    df_noun = pd.DataFrame({'whole_word': words})
    df_noun['whole_word_typo'] = df_noun['whole_word'].apply(introduce_typo)
    df_noun['root'] = [root for root, _ in parts]
    df_noun['affix'] = [affix for _, affix in parts]
    return df_noun

# One typo-augmented dataframe per noun dictionary, built in the order
# single-token, multi-token morphemic, multi-token non-morphemic.
df_noun_s, df_noun_mm, df_noun_nm = (
    create_dataframe_with_typos(noun_dict)
    for noun_dict in (dct_noun_singletok, dct_noun_multi_morph, dct_noun_multi_nonmorph)
)

### Define useful functions

def make_sentences_df(dct_noun, df_noun):
    """Creates sentence templates masking what should be a singular or plural verb.

    For every row of ``df_noun`` three sentence variants are produced:

    * the root (``sing``) with default tokenization,
    * the typo'd whole word (``plur``) with default tokenization,
    * ``root + '##' + affix`` (``plur``) flagged as ``artificial`` tokenization.

    Args:
        dct_noun: Mapping from the un-typo'd noun (``whole_word``) to its singular
            article; after capitalising the first letter it must be 'El' or 'La'.
        df_noun: DataFrame with the columns ``whole_word``, ``whole_word_typo``,
            ``root`` and ``affix``.

    Returns:
        A DataFrame with three rows per input row and the columns ``lemma``,
        ``word_form``, ``word_number``, ``n_tokens``, ``tokenization_type``,
        ``sentence``, ``target_VERB_sing``, ``target_VERB_plur``, ``article`` and
        ``affix``. An empty ``df_noun`` yields an empty frame with these columns.

    Raises:
        ValueError: If a noun's article is neither 'El' nor 'La'. Previously the
            plural article was left unbound or silently reused from the previous row.
    """
    plural_article_for = {'El': 'Los', 'La': 'Las'}
    columns = [
        'lemma', 'word_form', 'word_number', 'n_tokens', 'tokenization_type',
        'sentence', 'target_VERB_sing', 'target_VERB_plur', 'article', 'affix',
    ]

    # Set up target singular and plural verbs
    sing_verb = 'es'   # Singular form of 'ser'
    plur_verb = 'son'  # Plural form of 'ser'

    records = []
    for _, row in tqdm(df_noun.iterrows(), total=df_noun.shape[0]):
        noun = row['whole_word']
        typoed_noun = row['whole_word_typo']
        affix = row['affix']
        sing_subj = row['root']

        sing_art = dct_noun[noun]
        sing_art = str(sing_art[0].upper() + sing_art[1:])
        if sing_art not in plural_article_for:
            raise ValueError(
                f"Unsupported article {sing_art!r} for noun {noun!r}; expected 'El' or 'La'"
            )
        plur_art = plural_article_for[sing_art]

        # Create composite subject for artificial tokenization
        comp_subj = sing_subj + '##' + affix

        n_tokens_root = len(tokenizer.encode(sing_subj, add_special_tokens=False))
        n_tokens_typo = len(tokenizer.encode(typoed_noun, add_special_tokens=False))
        # The affix is looked up as a ready-made word piece, the same way the
        # artificial input is assembled in get_verb_predictions.
        n_tokens_affix = len(tokenizer.convert_tokens_to_ids(['##' + affix]))

        # (word_form, word_number, n_tokens, tokenization_type, sentence, article)
        # NOTE(review): the artificial variant is labelled 'plur' but is paired with
        # the singular article -- confirm this is intended and not meant to be plur_art.
        variants = [
            (sing_subj, 'sing', n_tokens_root, 'default',
             sing_art + ' ' + sing_subj + ' [MASK]', sing_art),
            (typoed_noun, 'plur', n_tokens_typo, 'default',
             plur_art + ' ' + typoed_noun + ' [MASK]', plur_art),  # Use typoed word
            (comp_subj, 'plur', n_tokens_root + n_tokens_affix, 'artificial',
             sing_art + ' ' + comp_subj + ' [MASK]', sing_art),
        ]
        for word_form, word_number, n_tokens, tokenization_type, sentence, article in variants:
            records.append({
                'lemma': sing_subj,
                'word_form': word_form,
                'word_number': word_number,
                'n_tokens': n_tokens,
                'tokenization_type': tokenization_type,
                'sentence': sentence,
                'target_VERB_sing': sing_verb,
                'target_VERB_plur': plur_verb,
                'article': article,
                'affix': affix,
            })

    # Build the frame once instead of concatenating one small frame per noun.
    return pd.DataFrame(records, columns=columns)

### Model Predictions 

def find_sublist_index(lst, sublist):
    """Locate the first slice of ``lst`` that equals ``sublist``.

    Returns a ``(start, stop)`` pair such that ``lst[start:stop] == sublist``,
    or ``None`` when no start position in ``lst`` matches.
    """
    width = len(sublist)
    matches = (
        start for start in range(len(lst))
        if lst[start:start + width] == sublist
    )
    first = next(matches, None)
    if first is None:
        return None
    return first, first + width

def get_verb_predictions(df, data_source):
    """Predict the likelihood of target singular and plural verbs.

    For each sentence row the model is run once and the softmax probability at the
    ``[MASK]`` position is read off for the singular and the plural target verb.

    Args:
        df: DataFrame with the columns ``lemma``, ``word_form``, ``word_number``,
            ``n_tokens``, ``tokenization_type``, ``sentence``, ``target_VERB_sing``,
            ``target_VERB_plur``, ``article`` and ``affix``.
            ``tokenization_type`` must be 'default' (the sentence is tokenized
            normally) or 'artificial' (the input ids are assembled by hand as
            [CLS] article lemma ##affix [MASK] [SEP]).
        data_source: Label copied into the ``source`` column of every output row.

    Returns:
        A DataFrame with two rows per input row (singular verb first, then plural)
        and the columns ``lemma``, ``word_form``, ``word_number``, ``n_tokens``,
        ``tokenization_type``, ``verb_probs``, ``verb_number``, ``verb``, ``affix``,
        ``sentence``, ``model_token_inputs`` and ``source``. An empty ``df`` yields
        an empty frame with these columns.

    Raises:
        ValueError: If ``tokenization_type`` is unknown, a target verb does not map
            to exactly one token id, or no mask token is found in the model input.
    """
    columns = [
        'lemma', 'word_form', 'word_number', 'n_tokens', 'tokenization_type',
        'verb_probs', 'verb_number', 'verb', 'affix', 'sentence',
        'model_token_inputs', 'source',
    ]

    # Special-token ids do not depend on the row, so encode them once.
    token_mask = tokenizer.encode('[MASK]', add_special_tokens=False)
    token_start = tokenizer.encode('[CLS]', add_special_tokens=False)
    token_end = tokenizer.encode('[SEP]', add_special_tokens=False)

    records = []
    for (_, row) in tqdm(df.iterrows(), total=df.shape[0]):
        target_verb_sing = row['target_VERB_sing']
        target_verb_plur = row['target_VERB_plur']

        # Tokens for each verb
        token_verb_sing = tokenizer.encode(target_verb_sing, add_special_tokens=False)
        token_verb_plur = tokenizer.encode(target_verb_plur, add_special_tokens=False)
        for verb, verb_ids in ((target_verb_sing, token_verb_sing),
                               (target_verb_plur, token_verb_plur)):
            # A single masked position can only be scored against a single id.
            if len(verb_ids) != 1:
                raise ValueError(
                    f"Target verb {verb!r} maps to {len(verb_ids)} token ids; expected exactly 1"
                )

        tokenization_type = row['tokenization_type']
        if tokenization_type == 'artificial':
            # Tokens for article and noun
            token_article = tokenizer.encode(row['article'], add_special_tokens=False)
            token_affix = tokenizer.convert_tokens_to_ids(["##" + row['affix']])
            token_lemma = tokenizer.encode(row['lemma'], add_special_tokens=False)

            # Construct tokens manually
            bulky_token_list = [token_start, token_article, token_lemma, token_affix, token_mask, token_end]
            flat_token_list = [item for sublist in bulky_token_list for item in sublist]
            token_idx = torch.tensor([flat_token_list])
            inputs = {
                'input_ids': token_idx,
                'token_type_ids': torch.zeros_like(token_idx),
                'attention_mask': torch.ones_like(token_idx),
            }
        elif tokenization_type == 'default':
            inputs = tokenizer(row['sentence'], return_tensors='pt', add_special_tokens=True)
        else:
            # Previously fell through and reused (or lacked) the inputs of the last row.
            raise ValueError(f"Unknown tokenization_type {tokenization_type!r}")

        input_ids = inputs['input_ids'][0].tolist()
        model_token_inputs = ' , '.join(tokenizer.convert_ids_to_tokens(input_ids))

        midx = find_sublist_index(input_ids, token_mask)
        if midx is None:
            raise ValueError(f"No mask token found in model input for sentence {row['sentence']!r}")

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        masked_token_logits = outputs.logits[0][midx[0]]
        masked_token_probs = torch.softmax(masked_token_logits, dim=0)
        prob_verb_sing = masked_token_probs[token_verb_sing].item()
        prob_verb_plur = masked_token_probs[token_verb_plur].item()

        scored_verbs = [
            (prob_verb_sing, 'singular', target_verb_sing),
            (prob_verb_plur, 'plural', target_verb_plur),
        ]
        for verb_prob, verb_number, verb in scored_verbs:
            records.append({
                'lemma': row['lemma'],
                'word_form': row['word_form'],
                'word_number': row['word_number'],
                'n_tokens': row['n_tokens'],
                'tokenization_type': tokenization_type,
                'verb_probs': verb_prob,
                'verb_number': verb_number,
                'verb': verb,
                'affix': row['affix'],
                'sentence': row['sentence'],
                'model_token_inputs': model_token_inputs,
                'source': data_source,
            })

    # Build the frame once instead of concatenating one small frame per sentence.
    return pd.DataFrame(records, columns=columns)

def _sentences_and_probs(dct_noun, df_noun, data_source):
    """Build the sentence frame for one noun category, then score it with the model."""
    sentences = make_sentences_df(dct_noun, df_noun)
    return sentences, get_verb_predictions(sentences, data_source=data_source)


# SINGLE-TOKEN plurals
df_singletok, probs_df_singletok = _sentences_and_probs(
    dct_noun_singletok, df_noun_s, 'single-token'
)

# MULTI-TOKEN, MORPHEMIC plurals
df_multitok_morph, probs_df_multitok_morph = _sentences_and_probs(
    dct_noun_multi_morph, df_noun_mm, 'morphemic'
)

# MULTI-TOKEN, NONMORPHEMIC plurals
df_multitok_nonmorph, probs_df_multitok_nonmorph = _sentences_and_probs(
    dct_noun_multi_nonmorph, df_noun_nm, 'nonmorphemic'
)


In [None]:
# Surprisal = negative natural log of the verb probability, computed column-wise.
probs_df_singletok['surprisal'] = -np.log(probs_df_singletok['verb_probs'])
probs_df_multitok_morph['surprisal'] = -np.log(probs_df_multitok_morph['verb_probs'])
probs_df_multitok_nonmorph['surprisal'] = -np.log(probs_df_multitok_nonmorph['verb_probs'])

In [None]:
### Save each results dataframe to its own CSV file

savepath = 'results_noisy_verb-agreement/'

# exist_ok=True replaces the check-then-create pair: no race between the check and
# the mkdir, and missing parent directories are created as well.
os.makedirs(savepath, exist_ok=True)

probs_df_singletok.to_csv(os.path.join(savepath, 'results_singletok.csv'))
probs_df_multitok_morph.to_csv(os.path.join(savepath, 'results_multitok_morph.csv'))
probs_df_multitok_nonmorph.to_csv(os.path.join(savepath, 'results_multitok_nonmorph.csv'))