In [2]:
import pandas as pd

# Load our predictions

This is from a model trained on all of our langauges (English, Greek, Turkish, Arabic, Danish) and Amharic and then used to predict a test set of the Amharic.

In [2]:
adf = pd.read_csv("am_pred_data.csv")

correct_off = adf.loc[(adf["PRED"] == 1) & (adf["LABEL"] == 1)]
correct_non = adf.loc[(adf["PRED"] == 0) & (adf["LABEL"] == 0)]

false_non = adf.loc[(adf["PRED"] == 0) & (adf["LABEL"] == 1)]
false_off = adf.loc[(adf["PRED"] == 1) & (adf["LABEL"] == 0)]
print(" false negatives count {}", false_non.count())
print(" false positives count {}", false_off.count())

 false negatives count {} Unnamed: 0    1213
PRED_NON      1213
PRED_OFF      1213
PRED          1213
LABEL         1213
TEXT          1213
dtype: int64
 false positives count {} Unnamed: 0    1414
PRED_NON      1414
PRED_OFF      1414
PRED          1414
LABEL         1414
TEXT          1414
dtype: int64


Let's get a tokenizer to parse our FB comments out and see what we might be able to find

In [3]:
MODEL_TYPE = 'xlm-roberta-base'

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# xlm-roberta-large
print('Loading XLMRoberta tokenizer...')
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)
print('Tokenizer loaded')

ModuleNotFoundError: No module named 'transformers'

In [None]:
def tokenize_dataset( training_set ):
    
    _ids = []
    
    # look through all records
    for index, row in training_set.iterrows():
        encoded_dict = tokenizer.encode_plus(
                    row['TEXT'], # Sentence to encode.
                    add_special_tokens = False,      # Add '[CLS]' and '[SEP]'
                    max_length = 256,           # Pad or truncate.
                    pad_to_max_length = True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
                   )

        # Add this example to our lists.
        _ids.append(encoded_dict['input_ids'])
    
    return _ids

In [None]:
false_non_id = tokenize_dataset(false_non)
false_off_id = tokenize_dataset(false_off)

correct_off_id = tokenize_dataset(correct_off)
correct_non_id = tokenize_dataset(correct_non)

Now we'll get counts of all the words in these:

In [None]:
from collections import Counter
false_non_count = Counter()
for xs in false_non_id:
    for x in set(xs[0].tolist()):
        false_non_count[x] += 1

false_off_count = Counter()
for xs in false_off_id:
    for x in set(xs[0].tolist()):
        false_off_count[x] += 1

        
correct_off_count = Counter()
for xs in correct_off_id:
    for x in set(xs[0].tolist()):
        correct_off_count[x] += 1

        
correct_non_count = Counter()
for xs in correct_non_id:
    for x in set(xs[0].tolist()):
        correct_non_count[x] += 1


In [None]:
unique_to_false_non = false_non_count - false_off_count - correct_non_count - correct_off_count
unique_to_false_non.most_common()

In [None]:
unique_to_false_off = false_off_count - false_non_count - correct_non_count - correct_off_count
unique_to_false_off.most_common()

In [None]:
unique_to_off = correct_off_count - false_off_count - false_non_count - correct_non_count
unique_to_off.most_common()

In [None]:
def detokenize_dataset( ids ):

    _texts = []

    # look through all records
    for row in ids:
        _texts.append(tokenizer.decode(row))

    return _texts

So what's most common in tweets identified as 'offensive' but which are not?

In [None]:
false_off_words = detokenize_dataset([k for k,v in unique_to_false_off.most_common()])

የሩሳሌም : Of Jerusalem


ይመልከቱ : look at, behold, browse

A little hard to see what might be offensive without context.

So what's most common in tweets identified as 'inoffensive' but which are offensive?

In [None]:
false_non_words = detokenize_dataset([k for k,v in unique_to_false_non.most_common()])

እንቅስቃሴ : life, lives, movement

ኢየሱስ : jesus

ተከታታይ : successive, following (one another)


So this is pretty hard to parse.

Let's pick one of these: የወያኔ

Google has it as "Oh my gosh"

[This](https://dictionary.abyssinica.com/%E1%8B%A8%E1%8B%88%E1%8B%AB%E1%8A%94) Amharic dictionary on the other hand, has it as "tribal movement in Tigray, northern Ethiopia", the site of a civil war.

## Maybe we could think of ways to reweight some of these words or phrases around these words?

Maybe an n-gram model of creating synthetic tweets that use phrases we know *should* be inoffensive or offensive?

# After speaking with Zach

We need a dataset that's isolated from our 'key term' identification to test against. Make that by grabbing 1000 examples, and training everything without those examples in the pipeline, and then doing the detection again.

In [4]:
from sklearn import model_selection

am_split = pd.read_csv("data/Amharic/amharic.csv")

am_train, am_test_reserve = model_selection.train_test_split(am_split, test_size=500)

In [5]:
am_test_reserve.to_csv("data/Amharic/amharic_test_reserve_500.csv", index=False)
am_train.to_csv("data/Amharic/amharic_train_29.5k.csv", index=False)