In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from collections import Counter
import math
import random
import statistics

import matplotlib.pyplot as plt
from unidecode import unidecode
import numpy as np
from scipy.stats import norm
from tqdm import tqdm
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

from masked_models.utils import sentence_logprob, tokenize

In [None]:
# Parallel lists: sentences[k] is the sample text, stereotypes[k] its integer
# stereotype id. Each line of the samples file is whitespace-separated words
# with the stereotype id as the last token.
sentences = []
stereotypes = []

# Use a context manager so the file handle is closed deterministically.
with open('./data/samples.txt') as samples_file:
    for line in samples_file:
        words = line.split()
        if not words:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        sentences.append(' '.join(words[:-1]))
        stereotypes.append(int(words[-1]))

# Human-readable stereotype names, one per line. The plotting code labels
# stereotype id i (1-based) with stereo_names[i - 1].
stereo_names = """
Emotional
Gentle
Empathetic
Neat
Social
Weak
Beautiful
Tough and rough
Self-confident
Professional
Rational
Providers
Leaders
Childish
Sexual
Strong
""".strip().split('\n')

In [None]:
from translators.google_translate import GoogleTranslate
from translators.amazon_translate import AmazonTranslate
from translators.deepl import DeepL

# One translator per provider, all targeting Slovak ('sk') with the API
# switched off. NOTE(review): presumably `.load()` reads previously cached
# translations from `data_path` and returns the translator itself -- confirm
# against the translators package.
translators = [
    translator_cls(
        data_path=cache_path,
        target_language='sk',
        enable_api=False,
    ).load()
    for translator_cls, cache_path in (
        (AmazonTranslate, './cache/translations/amazon_translate'),
        (DeepL, './cache/translations/deepl'),
        (GoogleTranslate, './cache/translations/google_translate'),
    )
]

In [None]:
from translators.helpers import gender_translate

# (male_suffix, female_suffix) pairs: a male/female word pair is accepted when
# both words share the same stem and end with one of these suffix pairs.
suffixes = [
    ('', 'a'),      # robil > robila
    ('ol', 'la'),   # mohol > mohla
    ('ý', 'á'),     # pekný > pekná
    ('y', 'a'),     # odvážny > odvážna
    ('i', 'a'),     # ohromujúci > ohromujúca
    ('í', 'ia'),    # lepší > lepšia
    ('iel', 'la'),  # išiel > išla
    ('ým', 'ou'),   # šťastným > šťastnou
    ('', 'ka'),     # amatér > amatérka
    ('om', 'kou'),  # víťazom > víťazkou
    ('k', 'čka'),   # odborník > odborníčka
    ('ného', 'nú'), # neschopného > neschopnú
    ('ím', 'ou'),   # šťastnejším > šťastnejšou
    ('í', 'ie'),    # efektívnejší > efektívnejšie
    ('í', 'é'),     # zlí > zlé
    ('rád', 'rada'),
    ('sám', 'sama'),
]

# Punctuation that may be glued to a word after splitting on whitespace.
_WORD_PUNCTUATION = '.,?!'


def match_gender(male, female):
    """
    Check whether two words differ only by a known male/female suffix pair.

    Both words are lower-cased and stripped of surrounding punctuation
    (the same set for both, regardless of order) before being compared
    against every pair in `suffixes`.

    Args:
        male: word taken from the male variant of a sentence.
        female: word taken from the female variant of a sentence.

    Returns:
        True if, for at least one (male_suffix, female_suffix) pair, `male`
        ends with male_suffix, `female` ends with female_suffix and the
        remaining stems are equal; False otherwise.
    """
    male = male.lower().strip(_WORD_PUNCTUATION)
    female = female.lower().strip(_WORD_PUNCTUATION)

    return any(
        male.endswith(male_suffix)
        and female.endswith(female_suffix)
        # `or None` handles an empty suffix: str[:-0] would be the empty
        # string, whereas str[:None] is the entire string.
        and male[:-len(male_suffix) or None] == female[:-len(female_suffix) or None]
        for male_suffix, female_suffix in suffixes
    )

# Collect unique (male_sentence, female_sentence, stereotype) triples where the
# male and female translations have the same number of words and differ in
# exactly one word, and that word pair is a recognised gender inflection.
candidates = set()
for sentence, stereotype in zip(sentences, stereotypes):
    for translator in translators:
        m = gender_translate(sentence, translator, 'male')
        f = gender_translate(sentence, translator, 'female')
        # A missing variant (None) means this translator cannot provide a pair.
        if m is None or f is None:
            continue
        m_words, f_words = m.split(), f.split()
        if len(m_words) != len(f_words):
            continue
        differing = [(mw, fw) for mw, fw in zip(m_words, f_words) if mw != fw]
        if len(differing) == 1 and match_gender(*differing[0]):
            candidates.add((m, f, stereotype))

len(candidates)

In [None]:
from collections import Counter

# Number of candidate pairs per stereotype id (third element of each triple),
# most frequent first.
stereotype_counts = Counter(stereotype for _, _, stereotype in candidates)
stereotype_counts.most_common(16)

In [None]:
def model_init(model_name):
    """
    Load a masked language model together with its tokenizer.

    Args:
        model_name: identifier passed unchanged to both `from_pretrained` calls.

    Returns:
        (model, tokenizer); the model is moved to 'cuda:0' when CUDA is
        available, otherwise it is returned as loaded.
    """
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if not torch.cuda.is_available():
        return model, tokenizer
    return model.to('cuda:0'), tokenizer

def tokenize(sen, tokenizer, only_ids=False, **kwargs):
    """
    Return the token ids of `sen` as a plain Python list.

    NOTE(review): this definition shadows the `tokenize` imported from
    `masked_models.utils` at the top of the notebook.

    Args:
        sen: text handed to the tokenizer.
        tokenizer: callable invoked as `tokenizer(sen, return_tensors="pt", **kwargs)`.
        only_ids: accepted but not used by this implementation.
        **kwargs: forwarded to the tokenizer call.

    Returns:
        `input_ids[0]` of the tokenizer output, converted with `.tolist()`.
    """
    encoded = tokenizer(sen, return_tensors="pt", **kwargs)
    first_row = encoded['input_ids'][0]
    return first_row.tolist()

def bootstrap_ci(scores, alpha=0.95, n_resamples=1000, seed=None):
    """
    Bootstrapping based estimate.

    Resamples `scores` with replacement `n_resamples` times, takes the mean of
    each resample, fits a normal distribution to those means and reads the
    confidence interval off that fitted distribution.

    Args:
        scores: non-empty iterable of numbers.
        alpha: confidence level of the interval (0.95 gives a 95% interval).
        n_resamples: number of bootstrap resamples to draw.
        seed: optional seed for a private random generator, making the result
            reproducible. With None the module-level `random` generator is
            used, as before.

    Returns:
        Tuple (mean, lower, upper): the mean of `scores` followed by the lower
        and upper bound of the confidence interval -- note the mean comes first.
    """
    # A private generator keeps seeded calls from disturbing global random state.
    rng = random if seed is None else random.Random(seed)

    # Materialise once so generators and arrays can be resampled repeatedly.
    scores = list(scores)
    n = len(scores)

    # norm.fit returns (loc, scale); only the location (the mean) is reported.
    loc, _ = norm.fit(scores)
    bootstrap = [sum(rng.choices(scores, k=n)) / n for _ in range(n_resamples)]
    lower, upper = norm.interval(alpha, *norm.fit(bootstrap))

    return loc, lower, upper

In [None]:
def set_size(w,h, ax=None):
    """
    Resize the figure of `ax` so the region spanned by its subplot parameters
    measures `w` by `h` inches. Used to size the figures with subplots.

    Falls back to the current axes (`plt.gca()`) when `ax` is not given.

    https://stackoverflow.com/questions/44970010/axes-class-set-explicitly-size-width-height-of-axes-in-given-units
    """
    target = ax if ax else plt.gca()
    figure = target.figure
    params = figure.subplotpars

    # Fractions of the figure width/height covered by the subplot region.
    width_fraction = params.right - params.left
    height_fraction = params.top - params.bottom

    figure.set_size_inches(float(w) / width_fraction, float(h) / height_fraction)

# One panel per language model; axes are shared so panels are comparable.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
axes = axes.reshape(-1)

for model_handle, model_name, ax in zip(
    ['gerulata/slovakbert', 'bert-base-multilingual-cased', 'xlm-roberta-base', 'xlm-roberta-large'],
    ['SlovakBERT', 'mBERT', 'XLM-RoBERTa Base', 'XLM-RoBERTa Large'],
    axes
):
    model, tokenizer = model_init(model_handle)

    # (score, stereotype id) for every usable candidate pair; the score is the
    # difference between the two directions of sentence_logprob.
    scores = []
    for c1, c2, s in tqdm(list(candidates)):
        # Only pairs that tokenize to the same number of tokens are scored.
        if len(tokenize(c1, tokenizer)) != len(tokenize(c2, tokenizer)):
            continue
        a = sentence_logprob(c1, c2, tokenizer, model)
        b = sentence_logprob(c2, c1, tokenizer, model)
        scores.append((a-b, s))

    for i in range(1, 17):
        group = [score for score, stereotype_id in scores if stereotype_id == i]
        if not group:
            # Nothing was scored for this stereotype; bootstrapping an empty
            # sample would divide by zero, so leave the row empty.
            continue
        # bootstrap_ci returns (mean, lower, upper); the bar spans lower..upper.
        _, lower, upper = bootstrap_ci(group)
        ax.plot([lower, upper], [17-i, 17-i], c=('pink' if i < 8 else 'lightblue'))

    # Stereotype ids below 8 are drawn in pink, ids 8 and above in light blue;
    # the dotted vertical lines mark the mean score of each of the two groups.
    male_rate = np.mean([score for score, stereotype_id in scores if stereotype_id >= 8])
    female_rate = np.mean([score for score, stereotype_id in scores if stereotype_id < 8])
    print(male_rate, female_rate)
    ax.vlines(male_rate, 1, 16, linestyle=':', color='lightblue')
    ax.vlines(female_rate, 1, 16, linestyle=':', color='pink')

    # Stereotype i sits at y = 17 - i, so the labels are applied in reverse.
    ax.set_yticks(range(1, 17), stereo_names[::-1])
    ax.grid(visible=True, which='major', axis='y', c='lightgrey', linewidth=0.2)
    ax.set_title(model_name)

set_size(5, 6)
fig.subplots_adjust(left=0.2)
plt.tight_layout()
plt.savefig('lms.png', dpi=300)
plt.show()
    
