## Load the Waseem dataset (saved under the tag `waasem`)

In [1]:
import numpy as np
import os

from sklearn.model_selection import train_test_split
from data.preprocess import concat_unshared_task_datasets, preprocess_tweet

In [2]:
# Load the concatenated "unshared task" tweets; `data` is indexed by label name
# ("none", "racism", "sexism") when it is split further down.
data = concat_unshared_task_datasets()

Unshared task dataset concat done.
Label Count: Sexism-3940, Racism-2062, None-12762


In [4]:
def split(data, labels, random_state=None):
    """Split each label's samples into train/valid/test (roughly 80/10/10).

    Every label is shuffled and split on its own, so the three partitions
    keep approximately the same per-label proportions as the input.

    Parameters
    ----------
    data : mapping
        Maps a label name to the sequence of samples for that label.
    labels : iterable
        Label names (keys of ``data``) to split.
    random_state : int, RandomState instance or None, optional
        Forwarded to both ``train_test_split`` calls. Pass an int to make
        the shuffle reproducible across runs; the default ``None`` leaves
        the shuffle unseeded.

    Returns
    -------
    dict
        ``{"train": {...}, "valid": {...}, "test": {...}}`` where each inner
        dict maps a label name to that partition's samples.
    """
    _data = {"train": {}, "valid": {}, "test": {}}
    for key in labels:
        # train_test_split accepts a single array, so no placeholder label
        # array is needed: it returns just the two halves of data[key].
        _data["train"][key], held_out = train_test_split(
            data[key], test_size=0.2, random_state=random_state)
        # Halve the held-out 20% into the validation and test partitions.
        _data["valid"][key], _data["test"][key] = train_test_split(
            held_out, test_size=0.5, random_state=random_state)

        train_length = len(_data["train"][key])
        valid_length = len(_data["valid"][key])
        test_length = len(_data["test"][key])
        print("splitted %s: %s/%s/%s" % (key, train_length, valid_length, test_length))
    return _data

In [5]:
# Split each of the three labels independently into train/valid/test.
splitted = split(data, ["none", "racism", "sexism"])

splitted none: 10209/1276/1277
splitted racism: 1649/206/207
splitted sexism: 3152/394/394


In [6]:
def save_file(data_name, data, out_dir="./data/preprocessed/"):
    """Write every split/label list of tweets to its own UTF-8 text file.

    One file named ``<split>_<label>_<data_name>.txt`` is created per
    (split, label) pair, containing one tweet per line.

    Parameters
    ----------
    data_name : str
        Dataset tag used as the last component of each file name.
    data : mapping
        ``{split_name: {label: iterable_of_tweets}}``.
    out_dir : str, optional
        Directory the files are written to; it is created if missing.
    """
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for split_name, tweets_by_label in data.items():
        for label, tweets in tweets_by_label.items():
            file_name = "%s_%s_%s.txt" % (split_name, label, data_name)
            # Explicit UTF-8: a platform default such as cp1252 cannot encode
            # emoji or non-Latin text and would raise UnicodeEncodeError,
            # causing those tweets to be skipped.
            with open(os.path.join(out_dir, file_name), "w", encoding="utf-8") as f:
                for tweet in tweets:
                    try:
                        f.write("%s\n" % tweet)
                    except UnicodeEncodeError:
                        # Still possible for text that is not valid Unicode
                        # (e.g. lone surrogates); keep going with the rest.
                        print("unicode encode error. skipping line")
                print("Wrote on %s" % file_name)

In [7]:
# Persist every split/label list as <split>_<label>_waasem.txt under ./data/preprocessed/.
save_file("waasem", splitted)

Wrote on valid_racism_waasem.txt
Wrote on valid_none_waasem.txt
Wrote on valid_sexism_waasem.txt
Wrote on train_racism_waasem.txt
Wrote on train_none_waasem.txt
Wrote on train_sexism_waasem.txt
Wrote on test_racism_waasem.txt
Wrote on test_none_waasem.txt
Wrote on test_sexism_waasem.txt


## Load the Davidson dataset

Class labels: 0 - hate speech, 1 - offensive language, 2 - neither

In [8]:
import pandas as pd

In [9]:
# Load the Davidson tweets. `on_bad_lines="warn"` (pandas >= 1.3) reports and
# skips malformed rows; it replaces `error_bad_lines=False`, which was removed
# in pandas 2.0.
df = pd.read_csv("./data/crawled/davidson.csv", sep=",", on_bad_lines="warn")

In [10]:
# include="all" summarises the non-numeric tweet column alongside the numeric ones.
df.describe(include="all")

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0,24783
unique,,,,,,,24783
top,,,,,,,RT @mckayllaa: I wish I had pretty colored eyes .
freq,,,,,,,1
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277,
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089,
min,0.0,3.0,0.0,0.0,0.0,0.0,
25%,6372.5,3.0,0.0,2.0,0.0,1.0,
50%,12703.0,3.0,0.0,3.0,0.0,1.0,
75%,18995.5,3.0,0.0,3.0,0.0,1.0,


In [11]:
# Row count per value of the `class` column.
df["class"].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [12]:
# Preprocess the tweets of each class separately
# (class codes: 1 = offensive language, 0 = hate speech, 2 = neither).
abusive_text = [preprocess_tweet(tweet) for tweet in df.loc[df["class"] == 1, "tweet"].tolist()]
hate_text = [preprocess_tweet(tweet) for tweet in df.loc[df["class"] == 0, "tweet"].tolist()]
none_text = [preprocess_tweet(tweet) for tweet in df.loc[df["class"] == 2, "tweet"].tolist()]

In [13]:
# Fold the hate-speech tweets into the abusive list (in place), then show the combined size.
# NOTE(review): not idempotent - re-running this cell appends hate_text again.
abusive_text.extend(hate_text)
len(abusive_text)

20620

In [14]:
# Number of preprocessed class-2 ("neither") tweets.
len(none_text)

4163

In [15]:
# Two-way split: "abusive" (offensive + hate-speech tweets merged above) versus "none".
splitted_davidson = split({"abusive": abusive_text, "none": none_text}, ["none", "abusive"])

splitted none: 3330/416/417
splitted abusive: 16496/2062/2062


In [16]:
# Persist every split/label list as <split>_<label>_davidson.txt under ./data/preprocessed/.
save_file("davidson", splitted_davidson)

Wrote on valid_abusive_davidson.txt
Wrote on valid_none_davidson.txt
Wrote on train_abusive_davidson.txt
Wrote on train_none_davidson.txt
Wrote on test_abusive_davidson.txt
Wrote on test_none_davidson.txt
