# Classify same-gender sentences
Let's try to classify same-gender vs. different-gender sentences and then identify matching sentences in the same corpus data.

In [None]:
# extract sentences that mention relationships
import bz2
import re
import pandas as pd
from tqdm import tqdm
from data_helpers import load_relationship_occupation_template_data
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
PUNCT_MATCHERS = [
    [re.compile('&apos;'), "'"],
]
def clean_web_text(text):
    """
    Clean one raw line of web/wiki text.

    The text is stripped of surrounding whitespace, every
    (pattern, replacement) pair in PUNCT_MATCHERS is applied in order,
    and finally any HTML markup is removed.

    :param text: raw text line
    :returns: the plain text that remains after markup removal
    """
    clean_text = text.strip()
    for matcher, replacement in PUNCT_MATCHERS:
        clean_text = matcher.sub(replacement, clean_text)
    # remove HTML junk; the parser is named explicitly so the result does not
    # depend on which optional parser happens to be installed, and so that
    # BeautifulSoup does not warn about guessing one on every line
    text_soup = BeautifulSoup(clean_text, 'html.parser')
    return text_soup.text
# Scan each language's Wikipedia dump and keep every sentence that mentions a
# relationship word, together with the word that matched.
occupation_words, relationship_word_data, relationship_sents, langs, lang_art_PRON_lookup, lang_POSS_PRON_lookup = load_relationship_occupation_template_data()
# restrict to Italian for now; this overrides the language list loaded above
langs = ['it']
subject_genders = ['male', 'female']
# from here on `relationship_sents` holds the per-language match tables
relationship_sents = []
for lang_i in langs:
    relationship_phrases_i = []
    for gender_j in subject_genders:
        # disabled alternative: possessive pronoun + noun e.g. "sua moglie"
        #     pron_j = lang_POSS_PRON_lookup[lang_i][gender_j]
        #     # remove "il"/"la" for IT pron
        #     if(lang_i == 'it'):
        #         pron_j = pron_j.split(' ')[-1]
        #     relationship_phrases_j = relationship_word_data.loc[:, f'{lang_i}_{gender_j}'].apply(lambda x: f'{pron_j} {x}')
        # normal relationship words e.g. "la moglie" => looking for possessors "la moglie del generale"
        relationship_phrases_j = relationship_word_data.loc[:, f'{lang_i}_{gender_j}']
        relationship_phrases_i.extend(relationship_phrases_j.values.tolist())
    # one alternation over all relationship words; search() below finds the
    # first occurrence anywhere in the sentence (no word-boundary anchoring)
    relationship_phrase_matcher_i = re.compile('|'.join(relationship_phrases_i))
    file_name_i = f'data/wiki/{lang_i}wiki-20181001-corpus.xml.bz2'
    matching_sents_i = []
    # context manager so the compressed file handle is closed even if the
    # scan is interrupted part-way through the dump
    with bz2.open(file_name_i, 'rt') as corpus_file_i:
        for line_i in tqdm(corpus_file_i):
            line_i = clean_web_text(line_i)
            for sent_j in sent_tokenize(line_i):
                # match on the lower-cased sentence but store the original text
                relationship_phrase_search_j = relationship_phrase_matcher_i.search(sent_j.lower())
                if(relationship_phrase_search_j is not None):
                    matching_sents_i.append([relationship_phrase_search_j.group(0), sent_j])
    matching_sents_i = pd.DataFrame(matching_sents_i, columns=['relationship_word', 'sent'])
    matching_sents_i = matching_sents_i.assign(**{'lang' : lang_i})
    relationship_sents.append(matching_sents_i)
relationship_sents = pd.concat(relationship_sents, axis=0)

In [None]:
display(relationship_sents.loc[:, 'relationship_word'].value_counts())

In [None]:
## save to file!!
# relationship_sents.to_csv('data/wiki/relationship_sent_data.gz', sep='\t', compression='gzip', index=False) # possessive PRONOUN + phrase
relationship_sents.to_csv('data/wiki/relationship_words_sent_data.gz', sep='\t', compression='gzip', index=False) # single word

In [1]:
## reload
import pandas as pd
# relationship_sents = pd.read_csv('data/wiki/relationship_sent_data.gz', sep='\t')  # possessive PRONOUN + phrase
relationship_sents = pd.read_csv('data/wiki/relationship_words_sent_data.gz', sep='\t')  # single word
display(relationship_sents.head())

In [2]:
## example sentences
pd.set_option('display.max_colwidth', 1000)
for lang_i, data_i in relationship_sents.groupby('lang'):
    for word_j, data_j in data_i.groupby('relationship_word'):
        print(f'word = {word_j}')
        display(data_j.loc[:, ['sent']].head(5))

### Get subject nouns for relationship target

What if we filter to sentences that have the relationship target as dependent on the subject?

In [3]:
## reload
import pandas as pd
# relationship_sent_output_data = pd.read_csv('data/wiki/relationship_sent_data.gz', sep='\t')
relationship_sent_output_data = pd.read_csv('data/wiki/relationship_words_sent_data.gz', sep='\t')

In [4]:
import spacy
# nlp_pipeline = spacy.load('it_core_news_sm') # small parser sucks!!
nlp_pipeline = spacy.load('it_core_news_lg') # use bigger parser when possible!!

In [33]:
def find_phrase_head(sent, phrase):
    """
    Find the token that a relationship phrase is conjoined with.

    The sentence is scanned for a run of tokens whose texts equal the
    space-separated words of ``phrase``. For each such run, the first token
    tagged ``NOUN`` is taken; if its dependency label is ``conj`` its
    syntactic head is returned. Runs that do not satisfy this are skipped
    and the scan continues.

    :param sent: parsed sentence (sequence of tokens exposing ``text``,
        ``pos_``, ``dep_`` and ``head``; supports slicing)
    :param phrase: phrase to locate, words separated by single spaces
    :returns: the head token of the conjoined noun, or None if not found
    """
    ## TODO: restrict to extremely simple sentences like
    # "il generale e sua moglie mangiano"
    ## i.e. head noun should be (1) close to target and (2) be either subject or object of sentence
    phrase_tokens = phrase.split(' ')
    phrase_len = len(phrase_tokens)
    # "+ 1" so that a phrase ending on the very last token is also checked
    for start in range(len(sent) - phrase_len + 1):
        window = sent[start:(start + phrase_len)]
        if any(token.text != word for token, word in zip(window, phrase_tokens)):
            continue
        # get head noun
        phrase_nouns = [token for token in window if token.pos_ == 'NOUN']
        # get other noun in conjunction
        if phrase_nouns and phrase_nouns[0].dep_ == 'conj':
            return phrase_nouns[0].head
    return None
import re
# Per-language regex over the preposition forms used to recognise a
# possessor phrase ("the X of Y").
# NOTE(review): is_token_connected_to_possessor applies this with `.match`,
# which anchors only at the start of the token text, so any token that merely
# begins with one of these alternatives also matches - confirm that this
# prefix behaviour is intended.
possessor_word_matcher_lookup = {
    'it' : re.compile('|'.join(['de', 'di', 'del', 'da', 'della']))
}
def is_token_connected_to_possessor(token, possessor_word_matcher):
    """
    Count the direct children of ``token`` that look like possessor words.

    A child counts when ``possessor_word_matcher.match`` succeeds on its
    lower-cased text (i.e. the pattern matches at the start of the text).

    :param token: token exposing an iterable ``children`` attribute
    :param possessor_word_matcher: compiled regex of possessor words
    :returns: number of matching children (0 when there are none), so the
        result can be used directly as a truth value
    """
    return sum(
        1
        for child in token.children
        if possessor_word_matcher.match(child.text.lower()) is not None
    )
def find_phrase_possessor(sent, phrase_word, lang):
    """
    Find the possessor of a relationship word, e.g. "abogada" in
    "la moglie de la abogada".

    Only the first token whose lower-cased text equals ``phrase_word`` is
    inspected. Its first child that has dependency label ``nmod`` and is
    itself connected to a possessor word for ``lang`` is returned.

    :param sent: parsed sentence (iterable of tokens)
    :param phrase_word: relationship word to look for (compared lower-case)
    :param lang: language code used to pick the possessor-word regex
    :returns: the possessor token, or None when no candidate exists
    """
    ## TODO: make it even more strict => "the X of Y" where "Y" is parent of "of"
    phrase_token = next(
        (token for token in sent if token.text.lower() == phrase_word),
        None,
    )
    if phrase_token is None:
        return None
    # look for source noun with NMOD dep and possessor child
    possessor_word_matcher = possessor_word_matcher_lookup[lang]
    for child in phrase_token.children:
        if child.dep_ == 'nmod' and is_token_connected_to_possessor(child, possessor_word_matcher):
            return child
    return None
## phrase head test
# phrase = 'sua moglie'
# sent = 'la donna e sua moglie sono andate al negozio'
# sent = """weihe nacque il 30 gennaio 1779 a mennighüffen, il secondo dei dodici figli di karl justus weihe (1752-1829), un pastore, e di sua moglie anna (nata rebeker)."""
# should connect "moglie" to "pastore"
# sent = nlp_pipeline(sent)
# print(sent)
# print(find_phrase_head(sent, phrase))
## possessor test
phrase_word = 'moglie'
sent = 'era la moglie de la abogada'
lang = 'it'
# sent = 'terza moglie Isabelita Perón'
sent = nlp_pipeline(sent)
print(find_phrase_possessor(sent, phrase_word, lang))

In [34]:
from tqdm import tqdm
tqdm.pandas()
# Parse every sentence once. The possessor search below reads the
# `sent_parse` column, so it has to exist on a fresh kernel; skip the
# (slow) parse when an earlier run of this cell already added it.
if('sent_parse' not in relationship_sent_output_data.columns):
    relationship_sent_output_data = relationship_sent_output_data.assign(**{
        'sent_parse' : relationship_sent_output_data.loc[:, 'sent'].progress_apply(nlp_pipeline)
    })
# NOTE: an earlier attempt used find_phrase_head here; it gave poor results,
# so the possessor ("X of Y") is used as the relationship source instead.
# find_phrase_possessor needs the row's language to pick the right
# possessor-word regex.
relationship_sent_output_data = relationship_sent_output_data.assign(**{
    'relationship_word_source' : relationship_sent_output_data.progress_apply(
        lambda x: find_phrase_possessor(x.loc['sent_parse'], x.loc['relationship_word'], x.loc['lang']),
        axis=1,
    )
})
# NOTE: approximating gender from the article ("il"/"la"/...) was tried and
# dropped in favour of the parser's morphology.
def _token_gender(token):
    """Return the first 'Gender' morphology value of a token, or None when
    the token is missing or carries no gender feature."""
    if(token is None):
        return None
    token_genders = token.morph.get('Gender')
    return token_genders[0] if len(token_genders) > 0 else None
# get actual gender from morphology
relationship_sent_output_data = relationship_sent_output_data.assign(**{
    'relationship_word_source_gender' : relationship_sent_output_data.loc[:, 'relationship_word_source'].apply(_token_gender)
})

In [35]:
pd.set_option('display.max_colwidth', 1000)
relationship_with_source_sent_data = relationship_sent_output_data[relationship_sent_output_data.loc[:, 'relationship_word_source'].apply(lambda x: x is not None)]
# fix gender labels
gender_lookup = {
    'Masc' : 'male',
    'Fem' : 'female'
}
relationship_with_source_sent_data = relationship_with_source_sent_data.assign(**{'relationship_word_source_gender' : relationship_with_source_sent_data.loc[:, 'relationship_word_source_gender'].apply(gender_lookup.get)})
display(relationship_with_source_sent_data.loc[:, ['sent', 'relationship_word', 'relationship_word_source', 'relationship_word_source_gender']])

Can we use this approximation to find same-gender and different-gender sentences?

In [38]:
## have to get relationship word gender
from data_helpers import load_relationship_occupation_template_data
occupation_words, relationship_words, relationship_sents, langs, lang_art_PRON_lookup, lang_POSS_PRON_lookup = load_relationship_occupation_template_data()
# lookup gender
langs = ['en', 'es', 'fr', 'it']
genders = ['male', 'female']
# language => {relationship word => gender}; built in one pass with distinct
# names per level (no variable shadowing). If a word is listed under both
# genders the later gender in `genders` wins.
relationship_word_gender_lookup = {
    lang_code : {
        word : gender
        for gender in genders
        for word in relationship_words.loc[:, f'{lang_code}_{gender}'].values
    }
    for lang_code in langs
}
## look up gender, compare w/ head noun gender, etc
# only the last word of the matched phrase is looked up, so multi-word
# phrases such as "sua moglie" resolve through their noun
relationship_with_source_sent_data = relationship_with_source_sent_data.assign(**{
    'relationship_word_gender' : relationship_with_source_sent_data.apply(lambda x: relationship_word_gender_lookup[x.loc['lang']][x.loc['relationship_word'].split(' ')[-1]], axis=1)
})
word_gender_vars = ['relationship_word_gender', 'relationship_word_source_gender']
print(relationship_with_source_sent_data.loc[:, word_gender_vars].value_counts())
# normalize by total: each row (relationship word gender) sums to 1.
# crosstab gives the same table as grouping + value_counts + pivot, but its
# shape does not depend on the order in which counts come back per group.
display(pd.crosstab(
    relationship_with_source_sent_data.loc[:, 'relationship_word_gender'],
    relationship_with_source_sent_data.loc[:, 'relationship_word_source_gender'],
    normalize='index',
).sort_index())

The skew is actually less bad than I thought. Let's look at some example phrases.

In [47]:
for relationship_word_i, data_i in relationship_with_source_sent_data.groupby('relationship_word'):
    print(f'*** relationship word = {relationship_word_i} ***')
    relationship_word_gender_i = data_i.loc[:, 'relationship_word_gender'].iloc[0]
    for gender_j, data_j in data_i.groupby('relationship_word_source_gender'):
        if(gender_j == relationship_word_gender_i):
            print(f'** gender = same-gender **')
        else:
            print(f'** gender = diff-gender **')
        display(data_j.loc[:, ['sent', 'relationship_word_source']].head())

Some of these relationship source words are not people-related, e.g. `il fidanzato di bronzo` where `bronzo` is not a person.

Let's filter these relationship word source words to only have person-related words, using [multilingual Wordnet](https://www.nltk.org/howto/wordnet.html).

In [203]:
## import nltk
## nltk.download('omw')
from nltk.corpus import wordnet
import numpy as np
# synset name that marks a hypernym path as "person"
person_category_matcher = re.compile('person.n.01')
# sense number inside a lemma repr, e.g. the "01" in "Lemma('woman.n.01.donna')"
lemma_num_matcher = re.compile(r'(?<=\.n\.)(\d+)(?=\.)')
# notebook language code => WordNet (OMW) language code
wordnet_lang_lookup = {
    'es' : 'spa',
    'fr' : 'fra',
    'it' : 'ita',
}
def is_word_a_person(word, lang):
    """
    Decide whether a word refers to a person.

    Title-cased words are assumed to be names and therefore persons.
    Otherwise the word's WordNet lemmas are fetched, the lemma(s) with the
    lowest noun sense number are treated as the main sense, and the word is
    a person if the first hypernym path of that sense contains
    ``person.n.01``.

    :param word: surface form of the word
    :param lang: language code; must be a key of ``wordnet_lang_lookup``
    :returns: True if the word is judged to be a person, else False
    :raises KeyError: if ``lang`` is not in ``wordnet_lang_lookup``
    """
    wordnet_lang = wordnet_lang_lookup[lang]
    # assume capital letter => name => person
    if(word.istitle()):
        return True
    word_lemmas = wordnet.lemmas(word, lang=wordnet_lang)
    if(len(word_lemmas) == 0):
        return False
    # find main word sense
    # sort lemmas by number: lower number => more "core" meaning;
    # lemmas without a noun sense number sort last
    word_lemma_nums = []
    for lemma in word_lemmas:
        lemma_num_match = lemma_num_matcher.search(str(lemma))
        word_lemma_nums.append(int(lemma_num_match.group(0)) if lemma_num_match is not None else float('inf'))
    min_word_lemma_num = min(word_lemma_nums)
    main_lemmas = [x for x, y in zip(word_lemmas, word_lemma_nums) if y == min_word_lemma_num]
    # best case: match lemma name w/ weird format e.g. "donna.n.8.donna";
    # the word is escaped so regex metacharacters in it are taken literally
    word_lemma_matcher = re.compile(r"Lemma\('(" + re.escape(word) + r")\.n.+")
    word_match_main_lemma = [x for x in main_lemmas if word_lemma_matcher.match(str(x))]
    # fall back to the first main lemma; this also covers the case of a
    # single main lemma, which previously skipped the hypernym check
    # entirely and was therefore never classified as a person
    main_lemma = word_match_main_lemma[0] if len(word_match_main_lemma) > 0 else main_lemmas[0]
    # get hypernyms for main lemma
    main_lemma_main_path = main_lemma.synset().hypernym_paths()[0]
    return any(person_category_matcher.match(x.name()) is not None for x in main_lemma_main_path)

lang = 'it'
word = 'donna'
assert is_word_a_person(word, lang)
word = 'cane'
assert not is_word_a_person(word, lang)

In [204]:
relationship_with_source_sent_data = relationship_with_source_sent_data.assign(**{
    'relationship_word_source_is_person' : relationship_with_source_sent_data.apply(lambda x: is_word_a_person(x.loc['relationship_word_source'].text, lang=x.loc['lang']), axis=1)
})

In [207]:
relationship_with_person_source_sent_data = relationship_with_source_sent_data[relationship_with_source_sent_data.loc[:, 'relationship_word_source_is_person']]
display(relationship_with_person_source_sent_data.loc[:, ['sent', 'relationship_word', 'relationship_word_source']].head(5))

Same test as before: test split in male/female relationships.

In [220]:
# Contingency table: rows = gender of the relationship word,
# columns = gender of its (person) source word. crosstab fills missing
# combinations with 0 and its shape does not depend on the order in which
# counts come back per group, unlike groupby + value_counts + pivot.
relationship_word_gender_counts = pd.crosstab(
    relationship_with_person_source_sent_data.loc[:, 'relationship_word_gender'],
    relationship_with_person_source_sent_data.loc[:, 'relationship_word_source_gender'],
).sort_index()
# row-normalised view: share of each source gender per relationship word gender
display(relationship_word_gender_counts.div(relationship_word_gender_counts.sum(axis=1), axis=0))
## test significance
from scipy.stats import chi2_contingency
test_stat, p_val, dof, expected = chi2_contingency(relationship_word_gender_counts)
display(relationship_word_gender_counts)
print(test_stat, p_val, dof, relationship_word_gender_counts.sum().sum())

OK! This is definitely more of the split that I expected. Let's look at some examples of different relationship words.

In [211]:
for relationship_word_i, data_i in relationship_with_person_source_sent_data.groupby('relationship_word'):
    print(f'*** relationship word = {relationship_word_i} ***')
    relationship_word_gender_i = data_i.loc[:, 'relationship_word_gender'].iloc[0]
    for gender_j, data_j in data_i.groupby('relationship_word_source_gender'):
        if(gender_j == relationship_word_gender_i):
            print(f'** gender = same-gender **')
        else:
            print(f'** gender = diff-gender **')
        display(data_j.loc[:, ['sent', 'relationship_word_source', 'relationship_word_source_gender']].head())

## Compare relationship differences by language
Now that we've collected data from all languages, let's get the same stats for each language.

In [1]:
import pandas as pd
import os
langs = ['es', 'fr', 'it']
relationship_sent_data_dir = 'data/wiki/'
def _load_lang_relationship_sent_data(lang_i):
    """Read one language's relationship-sentence table and tag every row
    with its language code."""
    data_file_i = os.path.join(relationship_sent_data_dir, f'lang={lang_i}_relationship_words_with_source_sent_data.gz')
    return pd.read_csv(data_file_i, sep='\t').assign(**{'lang' : lang_i})
# one table per language, stacked row-wise
relationship_sent_data = pd.concat(
    [_load_lang_relationship_sent_data(lang_i) for lang_i in langs],
    axis=0,
)

In [14]:
## test diffs w/ chi-2
pd.set_option('display.float_format', '{:.2f}'.format)
from scipy.stats import chi2_contingency
for lang_i, data_i in relationship_sent_data.groupby('lang'):
    print(f'lang={lang_i}')
    # row = relationship word gender, col = source (target) gender.
    # crosstab fills missing combinations with 0 and has a stable shape,
    # unlike groupby + value_counts + pivot.
    gender_counts_i = pd.crosstab(
        data_i.loc[:, 'relationship_word_gender'],
        data_i.loc[:, 'relationship_word_source_gender'],
    ).sort_index()
    # N_i counts every row for the language, including rows without a
    # source gender, so the percentages below are shares of all sentences
    N_i = data_i.shape[0]
    # norm total (not per-row): each cell as a percentage of N_i
    gender_pct_i = gender_counts_i / N_i * 100
    display(gender_pct_i)
    ## test significance
    test_stat, p_val, dof, expected = chi2_contingency(gender_counts_i)
    print(f'X2={test_stat} (p={p_val}, dof={dof}, N={gender_counts_i.sum().sum()})')

## Old code

### Train classifier
Training code that we can't run on the LIT server because of memory problems

In [1]:
import os
import torch
device_id = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)

In [2]:
## load sentence data
from data_helpers import load_clean_relationship_sent_data
langs = ['es', 'fr', 'it']
relationship_sent_data = load_clean_relationship_sent_data(langs=langs)
display(relationship_sent_data.head())
it_relationship_sent_data = relationship_sent_data[relationship_sent_data.loc[:, 'lang']=='it']

In [3]:
## organize data
from transformers import MBartTokenizer, MBartForSequenceClassification
from datasets.arrow_dataset import Dataset
# model_name = 'facebook/mbart-large' # too big??
# model_name = 'sshleifer/tiny-mbart' # too small, doesn't learn anything?
model_name = 'facebook/mbart-large-cc25'
# tokenizer = MBart50Tokenizer.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(model_name)
# debug: try w/ only Italian data
# max_length = it_relationship_sent_data.loc[:, 'sent'].apply()
max_length = 48
input_data = tokenizer.batch_encode_plus(it_relationship_sent_data.loc[:, 'sent'], max_length=max_length, truncation=True)
# add labels
input_data['labels'] = (it_relationship_sent_data.loc[:, 'relationship_type']=='same_gender').astype(int)
input_data = Dataset.from_dict(input_data)
input_data.set_format(columns=['input_ids', 'attention_mask', 'labels'], type='torch')
# split train/test
split_input_data = input_data.train_test_split(train_size=0.9, seed=123)
train_data = split_input_data['train']
test_data = split_input_data['test']

In [5]:
## train!!
output_dir = 'relationship_type_classifier/'
model = MBartForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(torch.cuda.current_device())

In [6]:
## train etc.
# Fine-tune the sequence classifier on the train split and evaluate on the
# held-out split using the Hugging Face Trainer.
from datasets import load_metric
from transformers import TrainingArguments, Trainer
# NOTE(review): this differs from `output_dir` ('relationship_type_classifier/')
# set in the previous cell, and from the checkpoint path loaded later in the
# notebook - confirm which directory is intended.
out_dir = 'data/sentence_relationship_gender_classifier/'
# batch size of 1 for both training and evaluation
batch_size = 1
num_train_epochs = 3
training_args = TrainingArguments(
    out_dir,
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=num_train_epochs
)
# NOTE(review): the F1 metric is loaded here but never handed to the Trainer
# below (no `compute_metrics` argument), so it is not used in this cell.
compute_metric = load_metric('f1')
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
## test lol
# Run the trained model over the held-out split one example at a time and
# report ROC-AUC and macro F1.
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
model.eval()
with torch.no_grad():
    test_data_output = [model(**{k : v.unsqueeze(0).to(torch.cuda.current_device()) for k,v in x.items()}) for x  in tqdm(test_data)]
test_data_output_logits = torch.vstack([x['logits'] for x in test_data_output])
# hard labels: index of the larger logit (same as the more probable class)
test_data_output_logit_labels = test_data_output_logits.argmax(axis=1).cpu()
# ROC-AUC is a ranking metric and needs a continuous score, so use the
# softmax probability of the positive class (label 1) rather than the hard
# labels, which would collapse the curve to a single operating point
test_data_output_pos_probs = torch.softmax(test_data_output_logits, dim=1)[:, 1].cpu().numpy()
test_data_labels = test_data['labels'].to(int)
auc_overall = roc_auc_score(test_data_labels,
                            test_data_output_pos_probs)
# scikit-learn convention: y_true first, y_pred second
f1_overall = f1_score(test_data_labels,
                      test_data_output_logit_labels,
                      average='macro')
print(auc_overall, f1_overall)

### Classify

Let's try to classify these sentences by relationship type using the classifier trained on the same data as before.

In [3]:
## set GPU
import os
device_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)

In [4]:
from data_helpers import load_multilingual_tokenizer
from transformers import MBartForSequenceClassification
import torch
lang = 'it'
tokenizer = load_multilingual_tokenizer(tgt_lang_token=lang)
trained_model_file = 'relationship_type_classifier/checkpoint-2000/'
model = MBartForSequenceClassification.from_pretrained(trained_model_file)
model = model.to(torch.cuda.current_device())

In [5]:
## process data
from datasets.arrow_dataset import Dataset
max_length = 1024
relationship_input_data = tokenizer.batch_encode_plus(relationship_sents.loc[:, 'sent'].values, max_length=max_length, truncation=True)
relationship_input_data = Dataset.from_dict(relationship_input_data)
relationship_input_data.set_format(columns=['input_ids', 'attention_mask'], type='torch')

In [6]:
## predict
import torch
from tqdm import tqdm
relationship_output_data = []
with torch.no_grad():
    for x in tqdm(relationship_input_data):
        x_output = model(**{k : v.unsqueeze(0).to(torch.cuda.current_device()) for k,v in x.items()})
        relationship_output_data.append(x_output.logits.view(-1).cpu().numpy())

In [19]:
## combine
import numpy as np
from scipy.special import logsumexp
# one row of raw logits per sentence; the input list is left untouched so
# this cell can be re-run
relationship_output_logits = np.vstack(relationship_output_data)
# normalize probabilities: softmax over the classes of each row. The
# probabilities (not the raw logits) are what `class_label_prob` must
# report; previously they were computed but never used.
relationship_output_probs = np.exp(
    relationship_output_logits - logsumexp(relationship_output_logits, axis=1, keepdims=True)
)
# fix class names
class_label_name_lookup = {
    0 : 'diff_gender',
    1 : 'same_gender',
}
relationship_output_df = pd.DataFrame({
    'class_label' : [class_label_name_lookup.get(x) for x in relationship_output_probs.argmax(axis=1)],
    'class_label_prob' : relationship_output_probs.max(axis=1),
})
print(relationship_output_df.loc[:, 'class_label'].value_counts())
## recombine w/ sents
# predictions were made in row order, so align by position rather than by
# whatever index the sentence table happens to carry
relationship_sent_output_data = pd.concat([
    relationship_output_df,
    relationship_sents.reset_index(drop=True),
], axis=1)
display(relationship_sent_output_data.head())

Are the predictions accurate?

In [22]:
# Show the 20 most confident sentences for each predicted label.
for label_i, data_i in relationship_sent_output_data.groupby('class_label'):
    print(f'example sents for label={label_i}')
    # sort into a new frame: sorting the groupby slice in place mutates a
    # view of the parent table and triggers SettingWithCopyWarning
    top_data_i = data_i.sort_values('class_label_prob', ascending=False)
    display(top_data_i.loc[:, ['relationship_word', 'sent', 'class_label_prob']].head(20))

OK! Not great. The model seems to have learned about the co-occurrence of nouns with the same/different genders, but not about actual relationships.

- `diff_gender`
    - (IT) `**la signora** nance precedette **suo marito** nella morte.` (EN) `Mrs. Nance preceded her husband in death.`
- `same_gender`
    - (IT) `**la coppia** conosce anche peppino e **la sua fidanzata** katrine che successivamente si fidanza con mirko.` (EN) `the couple also meets peppino and his girlfriend katrine who later gets engaged to mirko.`

In [23]:
## save for posterity
relationship_sent_output_data.to_csv('data/wiki/relationship_sent_pred_output_data.gz', sep='\t', compression='gzip', index=False)