In [3]:
import pandas as pd

In [4]:
adf = pd.read_csv("am_pred_data.csv")

correct_off = adf.loc[(adf["PRED"] == 1) & (adf["LABEL"] == 1)]
correct_non = adf.loc[(adf["PRED"] == 0) & (adf["LABEL"] == 0)]

false_non = adf.loc[(adf["PRED"] == 0) & (adf["LABEL"] == 1)]
false_off = adf.loc[(adf["PRED"] == 1) & (adf["LABEL"] == 0)]
print(" false negatives count {}", false_non.count())
print(" false positives count {}", false_off.count())

 false negatives count {} Unnamed: 0    1213
PRED_NON      1213
PRED_OFF      1213
PRED          1213
LABEL         1213
TEXT          1213
dtype: int64
 false positives count {} Unnamed: 0    1414
PRED_NON      1414
PRED_OFF      1414
PRED          1414
LABEL         1414
TEXT          1414
dtype: int64


In [5]:
MODEL_TYPE = 'xlm-roberta-base'

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# xlm-roberta-large
print('Loading XLMRoberta tokenizer...')
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)
print('Tokenizer loaded')

Loading XLMRoberta tokenizer...
Tokenizer loaded


In [6]:
def tokenize_dataset( training_set ):
    
    _ids = []
    
    # look through all records
    for index, row in training_set.iterrows():
        encoded_dict = tokenizer.encode_plus(
                    row['TEXT'], # Sentence to encode.
                    add_special_tokens = False,      # Add '[CLS]' and '[SEP]'
                    max_length = 256,           # Pad or truncate.
                    pad_to_max_length = True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
                   )

        # Add this example to our lists.
        _ids.append(encoded_dict['input_ids'])
    
    return _ids

In [7]:
false_non_id = tokenize_dataset(false_non)
false_off_id = tokenize_dataset(false_off)

correct_off_id = tokenize_dataset(correct_off)
correct_non_id = tokenize_dataset(correct_non)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
from collections import Counter
false_non_count = Counter()
for xs in false_non_id:
    for x in set(xs[0].tolist()):
        false_non_count[x] += 1

false_off_count = Counter()
for xs in false_off_id:
    for x in set(xs[0].tolist()):
        false_off_count[x] += 1

        
correct_off_count = Counter()
for xs in correct_off_id:
    for x in set(xs[0].tolist()):
        correct_off_count[x] += 1

        
correct_non_count = Counter()
for xs in correct_non_id:
    for x in set(xs[0].tolist()):
        correct_non_count[x] += 1


In [9]:
unique_to_false_non = false_non_count - false_off_count - correct_non_count - correct_off_count
unique_to_false_non.most_common()

[(77945, 3),
 (74684, 3),
 (140340, 2),
 (175777, 2),
 (222768, 2),
 (162872, 2),
 (242454, 2),
 (235538, 2),
 (243857, 2),
 (238626, 2),
 (83899, 1),
 (242463, 1),
 (217511, 1),
 (137249, 1),
 (247417, 1),
 (241915, 1),
 (229567, 1),
 (235388, 1),
 (215889, 1),
 (144924, 1),
 (223128, 1),
 (181020, 1),
 (221404, 1),
 (189512, 1),
 (216661, 1),
 (103114, 1),
 (221428, 1),
 (224819, 1),
 (217961, 1),
 (38018, 1)]

In [11]:
unique_to_false_off = false_off_count - false_non_count - correct_non_count - correct_off_count
unique_to_false_off.most_common()

[(217042, 2),
 (242046, 2),
 (223589, 2),
 (173203, 1),
 (217512, 1),
 (247088, 1),
 (230717, 1),
 (224896, 1),
 (221652, 1),
 (196936, 1),
 (245908, 1),
 (1146, 1),
 (28677, 1),
 (246316, 1),
 (238201, 1)]

In [12]:
unique_to_off = correct_off_count - false_off_count - false_non_count - correct_non_count
unique_to_off.most_common()

[(97081, 113),
 (78906, 34),
 (211576, 32),
 (128103, 23),
 (153196, 23),
 (70337, 22),
 (176415, 21),
 (134567, 19),
 (151666, 19),
 (112944, 19),
 (101616, 18),
 (49648, 18),
 (216110, 17),
 (225551, 15),
 (199683, 15),
 (59581, 14),
 (94914, 13),
 (160215, 12),
 (73845, 11),
 (120083, 10),
 (73585, 10),
 (121015, 9),
 (190886, 9),
 (191878, 9),
 (168538, 8),
 (156828, 8),
 (177838, 8),
 (244345, 7),
 (229217, 7),
 (242478, 7),
 (189821, 7),
 (178283, 7),
 (167723, 7),
 (103047, 6),
 (166870, 6),
 (183630, 6),
 (121702, 6),
 (220753, 6),
 (209005, 6),
 (212735, 6),
 (233755, 6),
 (204634, 6),
 (244633, 6),
 (124449, 6),
 (215108, 6),
 (99177, 6),
 (222748, 5),
 (168112, 5),
 (113922, 5),
 (86838, 5),
 (162354, 5),
 (124956, 5),
 (59876, 5),
 (182035, 5),
 (239082, 5),
 (211445, 4),
 (63853, 4),
 (199345, 4),
 (160815, 4),
 (123602, 4),
 (155165, 4),
 (201760, 4),
 (242410, 4),
 (182168, 4),
 (45572, 3),
 (112346, 3),
 (161605, 3),
 (177970, 3),
 (184111, 3),
 (239086, 3),
 (136779, 3

In [16]:
def detokenize_dataset( ids ):

    _texts = []

    # look through all records
    for row in ids:
        _texts.append(tokenizer.decode(row))

    return _texts

So what's most common in tweets identified as 'offensive' but which are not?

In [19]:
print(detokenize_dataset([k for k,v in unique_to_false_off.most_common()]))

['የሩሳሌም', 'ይመልከቱ', '•••', 'ዲስ', 'እስረኞች', 'ዯ', 'አውሮፕላን', 'እጥረት', 'ፓርቲው', 'ዴሞክራሲያዊ', 'ሇ', '!!', '¤', 'ቑ', '፭']


የሩሳሌም : Of Jerusalem


ይመልከቱ : look at, behold, browse

A little hard to see what might be offensive without context.

So what's most common in tweets identified as 'inoffensive' but which are offensive?

In [17]:
print(detokenize_dataset([k for k,v in unique_to_false_non.most_common()]))

['እንቅስቃሴ', 'ኢየሱስ', 'ጋዜጣ', 'ምክትል', 'ኪዳን', 'እስራኤል', 'መዋቅር', 'ተሰኘው', 'ኰ', 'አስተያየቶች', 'በርካታ', 'ርቀት', 'አስመልክቶ', 'ጥቃት', 'ቌ', 'ዋሽንግተን', 'ተከታታይ', 'ጋብቻ', 'የምግብ', '፪', 'ተወላጆች', 'የገንዘብ', 'ጠቅላላ', 'ክሲ', 'ቅዳሜ', 'ፕሮግራም', 'ይናገራሉ', 'ቕ', 'ጀርመን', '¿']


እንቅስቃሴ : life, lives, movement

ኢየሱስ : jesus

ተከታታይ : successive, following (one another)


So this is pretty hard to parse.

In [18]:
print(detokenize_dataset([k for k,v in unique_to_off.most_common()]))

['አንተ', 'ለህ', 'ደደ', 'ነህ', 'ወራ', 'አት', 'ታም', 'እሱ', 'ረኛ', 'አፍ', 'ወያኔ', 'ልጅ', 'እንዴ', 'ወሬ', 'ኦነግ', 'የምት', 'ጉ', 'ክራ', 'መጥ', 'ንቅ', 'በላ', 'ፍር', 'ለሽ', 'ትግሬ', 'ዣ', 'የወያኔ', 'ርካ', 'ዠ', 'አሸባሪ', 'ጥላቻ', 'በቀለ', 'ዳም', 'ነገሩ', 'ደል', 'ጉድ', 'መስል', 'ድብ', 'ናገረው', 'ርጉ', 'አሰፋ', 'ሰይጣን', 'በሉ', 'ጬ', 'ማሪ', 'ግፍ', 'ዲ', 'ቃወም', 'ወያኔ', 'እያለ', 'ሽ', 'የሞ', 'ካን', 'ፈው', 'ጥሩ', 'ወለድ', 'የሚባለው', 'ቆ', 'አሸ', 'ለቅ', 'ሸ', 'ፏ', 'ይባላል', 'ስብስብ', 'የመጀመሪያው', 'ሰው', 'ከራ', 'ሰባት', 'ዷ', 'ላቸውን', 'ቀርቶ', 'ተራ', 'እንደነበር', 'እዩ', 'አጋ', 'የር', 'አቅ', 'ግማሽ', 'ሌሊት', 'ቁም', 'ራችሁ', 'ወንጀል', 'ርነት', 'ባሻገር', 'ዱት', 'ወንዝ', 'ሻሻ', 'ዘጠኝ', 'ስለተ', 'ለጥ', 'ቃቸው', 'የተባለ', 'ኢሳያስ', 'እዚያ', 'አበ', 'ሰኞ', 'ተስፋዬ', 'ሰበር', 'ሮቹ', 'ወሰን', 'ዎችና', 'ስርጭት', 'ወርቅ', 'ምንጮች', 'ፌስቡክ', 'ምንድነው', 'ራፊ', 'ታማኝ', 'ስማ', 'መኪና', 'መጥፎ', 'ለዚህም', 'ብሪ', 'ጋ', 'እንደ', 'ውጪ', 'ኴ', 'ቀዳሚ', 'ከዚያም', 'የግለሰቦች', 'ማክሰኞ', 'አስገራሚ', 'መለከቱ', 'ይወ', 'ዘገባ', 'ታስ', 'ሰራዊት', 'መንፈስ', 'እርሳቸው', 'ለወጥ', '∞', 'በቅርብ', 'ኄ', 'ወስድ', 'ሁለተኛው', 'ሿ', 'የእርስዎ', 'አርበኞች', 'እንግሊዝ', 'ኟ', 'ደራሲ', 'የኢትዮጵያ', 'መጋቢት', 'ጒ', 'ሽብር', '_____', 'ግንቦት', 'ኲ', 'ፈረ

Let's pick one of these: የወያኔ

Google has it as "Oh my gosh"

[This](https://dictionary.abyssinica.com/%E1%8B%A8%E1%8B%88%E1%8B%AB%E1%8A%94) Amharic dictionary on the other hand, has it as "tribal movement in Tigray, northern Ethiopia", the site of a civil war.

## Maybe we could think of ways to reweight some of these words or phrases around these words?