In [1]:
import pandas as pd
import os
from collections import Counter
import re

In [2]:
# Count rows per language, split into hateful (Label == 1) and non-hateful (Label == 0)
def count_labels(base_dir="Finetuned_Models"):
    """Tally the rows of every ``data.csv`` found below ``base_dir``.

    The language of a file is the first directory component below
    ``base_dir`` (``<base_dir>/Arabic/.../data.csv`` -> ``"Arabic"``).
    Directories whose path contains ``IGNORE_FOR_NOW`` are skipped.

    Args:
        base_dir: Directory that holds one sub-directory per language.

    Returns:
        A ``(total, hateful, non_hateful)`` tuple of ``Counter`` objects keyed
        by language.

    Raises:
        ValueError: If a ``data.csv`` sits directly in ``base_dir``, where no
            language can be derived from its path.
    """
    total, hateful, non_hateful = Counter(), Counter(), Counter()
    for root, _dirs, files in os.walk(base_dir, topdown=False):
        if 'IGNORE_FOR_NOW' in root or 'data.csv' not in files:
            continue

        path = os.path.join(root, 'data.csv')
        # Use os.path rather than splitting on a literal backslash, which only
        # yields the language for Windows-style paths.
        language = os.path.relpath(root, base_dir).split(os.sep)[0]
        if language == os.curdir:
            raise ValueError(f"{path} is not inside a language directory")

        labels = pd.read_csv(path)['Label']
        total[language] += len(labels)
        # int() keeps plain Python integers in the counters
        hateful[language] += int((labels == 1).sum())
        non_hateful[language] += int((labels == 0).sum())
    return total, hateful, non_hateful


c, hate_c, non_hate_c = count_labels()

for key in c:
    print(f"{key:8s} {hate_c[key]}/{c[key]} are hateful ({(hate_c[key]/c[key])*100:0.2f}%)")

Arabic   4166/9199 are hateful (45.29%)
English  24403/33783 are hateful (72.23%)
French   3193/4014 are hateful (79.55%)
Hindi    4138/9384 are hateful (44.10%)
Italian  622/1263 are hateful (49.25%)
Portugese 1788/5670 are hateful (31.53%)
Spanish  3424/10499 are hateful (32.61%)


### Concatenate Data

In [3]:
# Concatenate every per-language data.csv into one frame with a Language column
def load_all_data(base_dir="Finetuned_Models"):
    """Read every ``data.csv`` below ``base_dir`` into a single DataFrame.

    Only the ``Text`` and ``Label`` columns are kept; a ``Language`` column is
    added from the first directory component below ``base_dir``. Directories
    whose path contains ``IGNORE_FOR_NOW`` are skipped.

    Args:
        base_dir: Directory that holds one sub-directory per language.

    Returns:
        A DataFrame with columns ``Text``, ``Label`` and ``Language`` and a
        fresh 0..n-1 index. It is empty (but has those columns) when no file
        is found.

    Raises:
        ValueError: If a ``data.csv`` sits directly in ``base_dir``, where no
            language can be derived from its path.
    """
    columns = ['Text', 'Label', 'Language']
    frames = []
    for root, _dirs, files in os.walk(base_dir, topdown=False):
        if 'IGNORE_FOR_NOW' in root or 'data.csv' not in files:
            continue

        path = os.path.join(root, 'data.csv')
        # Use os.path rather than splitting on a literal backslash, which only
        # yields the language for Windows-style paths.
        language = os.path.relpath(root, base_dir).split(os.sep)[0]
        if language == os.curdir:
            raise ValueError(f"{path} is not inside a language directory")

        frames.append(pd.read_csv(path)[['Text', 'Label']].assign(Language=language))

    if not frames:
        return pd.DataFrame(columns=columns)

    # Most recently read file first: the seeded sampling further down depends
    # on row order, so keep that order. One concat instead of one per file
    # also preserves the column dtypes.
    return pd.concat(frames[::-1], ignore_index=True)


all_data = load_all_data()

assert len(all_data) == sum(c.values())

In [4]:
# Persist the merged, not yet cleaned corpus (row index omitted)
all_data.to_csv(path_or_buf='all_data.csv', index=False)

### Clean Data

In [5]:
def clean_text(text):
    """Remove hashtags, @-mentions and URLs from a piece of text.

    Args:
        text: The raw text. Non-string values (for example ``NaN`` from a
            missing cell) are returned unchanged.

    Returns:
        ``text`` with every ``#word``, ``@word`` and ``http...`` token removed.
        Whitespace around the removed tokens is left in place.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"@\w+", "", text)
    # Raw string: "\S" inside a normal literal is an invalid escape sequence
    text = re.sub(r"http\S+", "", text)
    return text

In [6]:
# Drop incomplete rows first so that clean_text only ever receives real strings
cleaned = all_data.dropna()
cleaned = cleaned.assign(Text=[clean_text(t) for t in cleaned['Text']])

# Compare the stripped text so rows left with nothing but whitespace are removed too;
# the stored text itself is not stripped.
all_data = cleaned[cleaned['Text'].astype(str).str.strip() != '']

In [7]:
# Persist the cleaned corpus (row index omitted)
all_data.to_csv(path_or_buf='all_data_clean.csv', index=False)

### Sample Data

In [8]:
# Sampling configuration: rows drawn per (language, label) pair, and the RNG seed
SAMPLE_SIZE, RANDOM_STATE = 600, 1

In [9]:
# Draw SAMPLE_SIZE rows for every (language, label) pair so both classes are balanced
samples = []
for language in c:
    in_language = all_data[all_data['Language'] == language]
    for label in [0, 1]:
        group = in_language[in_language['Label'] == label]
        samples.append(group.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE))

# Latest group first (last language first, label 1 before label 0). Concatenating once,
# rather than growing an initially empty frame inside the loop, keeps the column dtypes.
if samples:
    sampled_data = pd.concat(samples[::-1], ignore_index=True)
else:
    sampled_data = pd.DataFrame(columns=['Text', 'Label', 'Language'])

assert len(sampled_data[sampled_data['Label'] == 0]) / len(sampled_data) == 0.5

In [10]:
# Bare expression: the notebook renders the sampled frame as the cell output
sampled_data

Unnamed: 0,Text,Label,Language
0,"Han sido los moros, que hijos de puta los Don...",1,Spanish
1,¿Lamban haber si tienes vergüenza y demuestras...,1,Spanish
2,¿Qué hacen tantos subsaharianos de pie en la P...,1,Spanish
3,Tú eres perra,1,Spanish
4,Qué peste a culé esta pillando la afición madr...,1,Spanish
...,...,...,...
8395,Ø§Ù„Ø±Ù‘Ø¦ÙŠØ³ Ø§Ù„ØªÙ‘ÙˆÙ†Ø³ÙŠÙ‘ ÙŠØ®Ø³Ø± Ø¯Ø...,0,Arabic
8396,Ø£Ø®ÙŠ Ø§Ù„Ù…Ø±ÙŠØ¶ Ù‚Ù… Ø¨Ø²ÙŠØ§Ø±Ø© Ù…Ø³ØªØ´...,0,Arabic
8397,Ù„Ø­ÙƒÙŠ Ù…ÙˆØ¬Ù‡ Ù„Ø¬Ø­Ø§Ø´ Ø§Ø±Ø¨Ø¹Ø·Ø¹Ø´ Ø§...,0,Arabic
8398,وفي ختام الندوة قدم الحقوقي والمدير التنفيذي ل...,0,Arabic


In [11]:
# Persist the balanced multi-language sample (row index omitted)
sampled_data.to_csv(path_or_buf='sampled_clean_data.csv', index=False)

### Get English Test Data

In [13]:
# Sampling configuration for the English test draw: rows per (language, label) pair, and the RNG seed
SAMPLE_SIZE, RANDOM_STATE = 600, 2

In [14]:
# Draw SAMPLE_SIZE rows for every (language, label) pair so both classes are balanced.
# NOTE(review): rows are drawn from the full `all_data` pool; nothing here excludes rows
# that a draw with a different seed has already picked - confirm that overlap is
# acceptable if this sample is meant to be a held-out test set.
samples = []
for language in c:
    in_language = all_data[all_data['Language'] == language]
    for label in [0, 1]:
        group = in_language[in_language['Label'] == label]
        samples.append(group.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE))

# Latest group first (last language first, label 1 before label 0). Concatenating once,
# rather than growing an initially empty frame inside the loop, keeps the column dtypes.
if samples:
    sampled_data = pd.concat(samples[::-1], ignore_index=True)
else:
    sampled_data = pd.DataFrame(columns=['Text', 'Label', 'Language'])

assert len(sampled_data[sampled_data['Label'] == 0]) / len(sampled_data) == 0.5

In [18]:
# Keep only the English rows and renumber them from zero
is_english = sampled_data['Language'] == "English"
sampled_data = sampled_data.loc[is_english].reset_index(drop=True)

In [19]:
# Bare expression: the notebook renders the English-only sample as the cell output
sampled_data

Unnamed: 0,Text,Label,Language
0,"lmaoo one nigga had that 2nd pic as ""Looove t...",1,English
1,fuck outta here faggot that wouldn't even b...,1,English
2,Okay don't reply then I didn't want you to any...,1,English
3,"""Don't make me make you fall in live with a ni...",1,English
4,My Ralph Lauren jacket comes tomorrow. Im givi...,1,English
...,...,...,...
1195,Afghan refugee Waheed Adrian who moved to to ...,0,English
1196,Bitch please. You're nothing more than Putin...,0,English
1197,"""There is a special time and place for decaf ...",0,English
1198,Muslim refugee: In Greece they don't have enou...,0,English


In [20]:
# Persist the English test sample. index=False matches the other exports in this notebook,
# so the file holds only Text, Label and Language and no extra unnamed index column.
sampled_data.to_csv("sampled_clean_data_eng_test.csv", index=False)