In [1]:
from data.preprocess import load_from_file
from tqdm import tqdm_notebook as tqdm
from collections import Counter

In [2]:
# Load the preprocessed tweets for both corpora; the second argument lists
# the label names to load for that corpus.
data_d = load_from_file(
    "davidson",
    ["none", "abusive"],
)
data_w = load_from_file(
    "waasem",
    ["none", "sexism", "racism"],
)

loaded preprocessed tweets for none:3330
loaded preprocessed tweets for abusive:16546
loaded preprocessed tweets for none:458
loaded preprocessed tweets for abusive:2083
loaded preprocessed tweets for none:417
loaded preprocessed tweets for abusive:2062
loaded preprocessed tweets for none:10209
loaded preprocessed tweets for sexism:3152
loaded preprocessed tweets for racism:1649
loaded preprocessed tweets for none:1276
loaded preprocessed tweets for sexism:394
loaded preprocessed tweets for racism:206
loaded preprocessed tweets for none:1277
loaded preprocessed tweets for sexism:394
loaded preprocessed tweets for racism:207


## Word preprocess

In [19]:
# Token frequency table; it is filled from the training splits in the next cell.
vocab = Counter()

In [20]:
# Count token frequencies over the training split of both corpora and track
# the length of the longest tokenized row.
# NOTE: `vocab` is created in a separate cell, so re-running only this cell
# adds the same counts a second time.
max_len = 0
for data in (data_d, data_w):
    for label, rows in data["train"].items():
        for row in rows:
            vocab.update(row)
            max_len = max(max_len, len(row))

In [23]:
# Vocabulary size = number of distinct tokens seen at least `min_freq` times.
min_freq = 2
count = sum(1 for freq in vocab.values() if freq >= min_freq)
vocab_size = count

In [24]:
# Display how many tokens met the `min_freq` threshold.
vocab_size

13857

In [26]:
# Index 0 is reserved for padding ("PAD") and index 1 for out-of-vocabulary
# tokens ("UNK"); the remaining entries are the `vocab_size` most frequent
# training tokens, most frequent first.
special_tokens = ["PAD", "UNK"]
# Skip corpus tokens that spell a reserved name. A duplicate entry would make
# the later word -> id mapping overwrite the reserved index with a larger one,
# so "PAD" would stop being id 0 (and "UNK" id 1).
_vocab = special_tokens + [
    word
    for word, _count in vocab.most_common(vocab_size)
    if word not in special_tokens
]

In [27]:
# Two-way lookup between vocabulary entries and their position in `_vocab`.
id2word = dict(enumerate(_vocab))
word2id = {word: idx for idx, word in id2word.items()}

In [28]:
max_len = 40


def tokens2id(tokens, length=None, vocab=None):
    """Convert a token sequence into a fixed-length list of vocabulary ids.

    Tokens beyond ``length`` are dropped; shorter inputs are right-padded
    with the id of ``"PAD"``. Tokens missing from ``vocab`` are mapped to
    the id of ``"UNK"``.

    Args:
        tokens: sliceable sequence of tokens (e.g. a list of str).
        length: number of ids to return. Defaults to the module-level
            ``max_len``, read at call time.
        vocab: token -> id mapping that contains ``"PAD"`` and ``"UNK"``.
            Defaults to the module-level ``word2id``, read at call time.

    Returns:
        A list of exactly ``length`` ids.
    """
    # Resolve defaults lazily so existing single-argument calls keep using
    # whatever `max_len` / `word2id` are currently defined in the notebook.
    if length is None:
        length = max_len
    if vocab is None:
        vocab = word2id

    idx = [vocab[t] if t in vocab else vocab["UNK"] for t in tokens[:length]]

    missing = length - len(idx)
    if missing > 0:
        idx.extend([vocab["PAD"]] * missing)

    assert len(idx) == length
    return idx
            

In [29]:
# Sanity check: show one tokenized training tweet next to its id encoding.
sample_tokens = data_d["train"]["none"][0]
print(sample_tokens)
print(tokens2id(sample_tokens))

["y'all", 'gots', 'to', 'take', 'da', 'colored', 'vacation', '.', 'kentucky', 'fried', 'chicken', 'but', 'only', 'da', 'white', 'meat']
[197, 3951, 8, 171, 259, 456, 6754, 2, 7540, 1851, 867, 30, 112, 259, 170, 1617, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [32]:
def idx_dataset(data):
    """Encode every tokenized row of a dataset with ``tokens2id``.

    Args:
        data: dict with "train", "valid" and "test" entries, each mapping a
            label to a list of token rows.

    Returns:
        A dict with the same split -> label layout whose rows are the lists
        returned by ``tokens2id``. A progress bar is shown over the labels
        of each split.
    """
    return {
        split: {
            label: [tokens2id(tokens) for tokens in data[split][label]]
            for label in tqdm(data[split].keys())
        }
        for split in ("train", "valid", "test")
    }

In [33]:
# Encode all three splits of the Davidson data as fixed-length word-id rows.
idx_d = idx_dataset(data_d)






In [34]:
# Encode all three splits of the Waasem data as fixed-length word-id rows.
idx_w = idx_dataset(data_w)






In [3]:
import pickle
import numpy as np

In [48]:
def save_npy(data, name, out_dir="./data/word_outputs"):
    """Save every (split, label) entry of ``data`` as its own ``.npy`` file.

    Files are written to ``<out_dir>/<split>_<label>_<name>.npy`` and one
    confirmation line with the array shape is printed per file.

    Args:
        data: dict mapping split -> label -> rows convertible by ``np.array``.
        name: dataset name used as the last component of each file name.
        out_dir: target directory; it is created if it does not exist.
            Defaults to the word-level output directory.

    Returns:
        None.
    """
    import os  # local import: `os` is not imported in the visible import cells

    # np.save does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    # "/" is kept literal so the printed paths match the default layout.
    file_format = "%s/%s_%s_%s.npy"
    for key, by_label in data.items():
        for label, rows in by_label.items():
            file_name = file_format % (out_dir, key, label, name)
            array = np.array(rows)
            np.save(file_name, array)
            print("Saved in %s. %s" % (file_name, str(array.shape)))

In [49]:
# Write the word-id arrays for Waasem, one file per split and label.
save_npy(idx_w, "waasem")

Saved in ./data/word_outputs/train_racism_waasem.npy. (1649, 40)
Saved in ./data/word_outputs/train_sexism_waasem.npy. (3152, 40)
Saved in ./data/word_outputs/train_none_waasem.npy. (10209, 40)
Saved in ./data/word_outputs/test_racism_waasem.npy. (207, 40)
Saved in ./data/word_outputs/test_sexism_waasem.npy. (394, 40)
Saved in ./data/word_outputs/test_none_waasem.npy. (1277, 40)
Saved in ./data/word_outputs/valid_racism_waasem.npy. (206, 40)
Saved in ./data/word_outputs/valid_sexism_waasem.npy. (394, 40)
Saved in ./data/word_outputs/valid_none_waasem.npy. (1276, 40)


In [50]:
# Write the word-id arrays for Davidson, one file per split and label.
save_npy(idx_d, "davidson")

Saved in ./data/word_outputs/train_abusive_davidson.npy. (16546, 40)
Saved in ./data/word_outputs/train_none_davidson.npy. (3330, 40)
Saved in ./data/word_outputs/test_abusive_davidson.npy. (2062, 40)
Saved in ./data/word_outputs/test_none_davidson.npy. (417, 40)
Saved in ./data/word_outputs/valid_abusive_davidson.npy. (2083, 40)
Saved in ./data/word_outputs/valid_none_davidson.npy. (458, 40)


In [52]:
# Persist both lookup tables next to the word-level arrays.
vocab_tables = {"word2id": word2id, "id2word": id2word}
with open("./data/word_outputs/vocab.pkl", "wb") as f:
    pickle.dump(vocab_tables, f)

## Char preprocess

In [4]:
from data.char import text_to_1hot_matrix

In [5]:
def char_idx_dataset(data):
    """Encode every row of a dataset with ``text_to_1hot_matrix``.

    Each token row is joined back into one space-separated string before it
    is passed to ``text_to_1hot_matrix``.

    Args:
        data: dict with "train", "valid" and "test" entries, each mapping a
            label to a list of token rows.

    Returns:
        A dict with the same split -> label layout holding the encoded rows.
        A progress bar is shown over the labels of each split.
    """
    encoded = {}
    for split in ("train", "valid", "test"):
        per_label = {}
        encoded[split] = per_label
        for label in tqdm(data[split].keys()):
            texts = (" ".join(tokens) for tokens in data[split][label])
            per_label[label] = [text_to_1hot_matrix(text) for text in texts]
    return encoded

In [6]:
# Character-level encoding of all three Davidson splits
# (the arrays saved below have shape (N, 140, 70)).
char_idx_d = char_idx_dataset(data_d)






In [7]:
# Character-level encoding of all three Waasem splits
# (the arrays saved below have shape (N, 140, 70)).
char_idx_w = char_idx_dataset(data_w)






In [8]:
def save_npy(data, name, out_dir="./data/char_outputs"):
    """Save every (split, label) entry of ``data`` as its own ``.npy`` file.

    This redefinition replaces the earlier word-level ``save_npy``; the only
    difference is the default output directory. Files are written to
    ``<out_dir>/<split>_<label>_<name>.npy`` and one confirmation line with
    the array shape is printed per file.

    Args:
        data: dict mapping split -> label -> rows convertible by ``np.array``.
        name: dataset name used as the last component of each file name.
        out_dir: target directory; it is created if it does not exist.
            Defaults to the character-level output directory.

    Returns:
        None.
    """
    import os  # local import: `os` is not imported in the visible import cells

    # np.save does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    # "/" is kept literal so the printed paths match the default layout.
    file_format = "%s/%s_%s_%s.npy"
    for key, by_label in data.items():
        for label, rows in by_label.items():
            file_name = file_format % (out_dir, key, label, name)
            array = np.array(rows)
            np.save(file_name, array)
            print("Saved in %s. %s" % (file_name, str(array.shape)))

In [9]:
# Write the character-level arrays for Davidson, one file per split and label.
save_npy(char_idx_d, "davidson")

Saved in ./data/char_outputs/train_abusive_davidson.npy. (16546, 140, 70)
Saved in ./data/char_outputs/train_none_davidson.npy. (3330, 140, 70)
Saved in ./data/char_outputs/valid_abusive_davidson.npy. (2083, 140, 70)
Saved in ./data/char_outputs/valid_none_davidson.npy. (458, 140, 70)
Saved in ./data/char_outputs/test_abusive_davidson.npy. (2062, 140, 70)
Saved in ./data/char_outputs/test_none_davidson.npy. (417, 140, 70)


In [10]:
# Write the character-level arrays for Waasem, one file per split and label.
save_npy(char_idx_w, "waasem")

Saved in ./data/char_outputs/train_sexism_waasem.npy. (3152, 140, 70)
Saved in ./data/char_outputs/train_none_waasem.npy. (10209, 140, 70)
Saved in ./data/char_outputs/train_racism_waasem.npy. (1649, 140, 70)
Saved in ./data/char_outputs/valid_sexism_waasem.npy. (394, 140, 70)
Saved in ./data/char_outputs/valid_none_waasem.npy. (1276, 140, 70)
Saved in ./data/char_outputs/valid_racism_waasem.npy. (206, 140, 70)
Saved in ./data/char_outputs/test_sexism_waasem.npy. (394, 140, 70)
Saved in ./data/char_outputs/test_none_waasem.npy. (1277, 140, 70)
Saved in ./data/char_outputs/test_racism_waasem.npy. (207, 140, 70)
