In [1]:
from data.preprocess import load_from_file
from tqdm import tqdm_notebook as tqdm
from collections import Counter

In [2]:
# Load the three pre-processed corpora. The second argument is the list of
# labels requested from each corpus. Later cells index the result as
# data[split][label] and treat each row as a list of tokens.
# NOTE(review): the loader output below reports "cannot open split valid/test",
# presumably for "relabel" (only a train split is saved for it later) - confirm.
data_d = load_from_file("davidson", ["none","abusive"])
data_w = load_from_file("waasem", ["none","sexism", "racism"])
data_relabel = load_from_file("relabel", ["none"])

loaded preprocessed tweets for none:3372
loaded preprocessed tweets for abusive:16553
loaded preprocessed tweets for none:416
loaded preprocessed tweets for abusive:2062
loaded preprocessed tweets for none:417
loaded preprocessed tweets for abusive:2062
loaded preprocessed tweets for none:10376
loaded preprocessed tweets for sexism:3152
loaded preprocessed tweets for racism:1649
loaded preprocessed tweets for none:1297
loaded preprocessed tweets for sexism:394
loaded preprocessed tweets for racism:206
loaded preprocessed tweets for none:1297
loaded preprocessed tweets for sexism:395
loaded preprocessed tweets for racism:207
loaded preprocessed tweets for none:488
cannot open split valid
cannot open split test


## Word preprocess

In [3]:
# Token-frequency counter; filled from the training splits in the next cell.
vocab = Counter()

In [4]:
# Count token frequencies over the training split of every corpus and record
# the length of the longest tokenised row.
#
# Counter.update() ADDS to existing counts, and `vocab` is created in a
# separate cell. Without the reset below, re-running this cell would double
# every frequency and let once-only tokens pass the min_freq cut-off applied
# later. Clearing first makes the cell idempotent; on a fresh run it is a no-op.
vocab.clear()

# Informational only: the tokens2id cell later overwrites max_len with 40.
max_len = 0
for data in [data_d, data_w, data_relabel]:
    for rows in data["train"].values():
        for row in rows:
            vocab.update(row)
            max_len = max(max_len, len(row))

In [5]:
# Vocabulary size = number of distinct tokens seen at least `min_freq` times.
min_freq = 2
count = sum(1 for freq in vocab.values() if freq >= min_freq)
vocab_size = count

In [6]:
# Display how many tokens reach the min_freq threshold.
vocab_size

13479

In [7]:
# Ordered vocabulary list: the two reserved tokens come first ("PAD" at
# position 0, "UNK" at position 1), followed by the `vocab_size` most
# frequent corpus tokens in descending frequency order.
_vocab = ["PAD", "UNK"]
_vocab.extend(token for token, _freq in vocab.most_common(vocab_size))

In [8]:
# Two-way lookup tables between vocabulary tokens and integer ids; a token's
# id is its position in `_vocab`.
id2word = dict(enumerate(_vocab))
word2id = {token: position for position, token in enumerate(_vocab)}

In [9]:
# Fixed sequence length used for every encoded row (replaces the value
# measured from the data in an earlier cell).
max_len = 40


def tokens2id(tokens):
    """Encode a token list as exactly ``max_len`` vocabulary ids.

    Only the first ``max_len`` tokens are kept. Tokens missing from
    ``word2id`` are mapped to the "UNK" id, and sequences shorter than
    ``max_len`` are right-padded with the "PAD" id.
    """
    idx = [
        word2id[token] if token in word2id else word2id["UNK"]
        for token in tokens[:max_len]
    ]
    # Right-pad until the target length is reached.
    while len(idx) < max_len:
        idx.append(word2id["PAD"])
    assert len(idx) == max_len
    return idx
            

In [10]:
# Sanity check: show one raw token list next to its padded id encoding.
print(data_d["train"]["none"][0])
print(tokens2id(data_d["train"]["none"][0]))

['<user>', 'i', 'have', 'never', 'once', 'in', 'my', 'life', 'been', 'to', 'a', "'", 'coffee', 'shop', "'", 'like', 'starbucks', '.', 'i', 'am', 'a', 'hick', 'though', '.']
[2, 8, 47, 130, 497, 22, 24, 191, 142, 10, 5, 90, 1360, 2246, 90, 34, 3479, 3, 8, 167, 5, 2722, 371, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
def idx_dataset(data):
    """Encode every row of a dataset with ``tokens2id``.

    ``data`` is indexed as ``data[split][label]`` for the splits "train",
    "valid" and "test". The result has the same split/label nesting, with
    each row replaced by its list of vocabulary ids. A tqdm progress bar is
    shown over the labels of each split.
    """
    encoded = {}
    for split in ["train", "valid", "test"]:
        encoded[split] = {
            label: [tokens2id(tokens) for tokens in data[split][label]]
            for label in tqdm(data[split].keys())
        }
    return encoded

In [12]:
# Encode every split/label of the davidson corpus as padded id sequences.
idx_d = idx_dataset(data_d)






In [13]:
# Encode every split/label of the waasem corpus as padded id sequences.
idx_w = idx_dataset(data_w)






In [14]:
# Encode every split/label of the relabel corpus as padded id sequences.
idx_relabel = idx_dataset(data_relabel)






In [15]:
import pickle
import numpy as np

In [16]:
def save_npy(data, name, out_dir="./data/word_outputs"):
    """Save every split/label of an encoded dataset as its own ``.npy`` file.

    Each array is written to ``<out_dir>/<split>_<label>_<name>.npy`` and the
    path together with the array shape is printed.

    NOTE: a later cell redefines ``save_npy`` for the char-level outputs, so
    which definition runs depends on execution order. Pass ``out_dir``
    explicitly when the destination matters.

    Args:
        data: nested mapping ``data[split][label]`` -> sequence of rows that
            ``np.array`` can stack.
        name: dataset tag used as the file-name suffix (e.g. "waasem").
        out_dir: destination directory, created if it does not exist.
            Defaults to the word-level output directory.
    """
    import os  # local import: the notebook's import cells do not provide os

    # np.save does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    for key, by_label in data.items():
        for label, rows in by_label.items():
            file_name = "%s/%s_%s_%s.npy" % (out_dir, key, label, name)
            array = np.array(rows)
            np.save(file_name, array)
            print("Saved in %s. %s" % (file_name, str(array.shape)))

In [17]:
# Write the id-encoded waasem arrays to disk, one .npy file per split/label.
# Relies on the word-output save_npy defined in the cell directly above.
save_npy(idx_w, "waasem")

Saved in ./data/word_outputs/train_racism_waasem.npy. (1649, 40)
Saved in ./data/word_outputs/train_none_waasem.npy. (10376, 40)
Saved in ./data/word_outputs/train_sexism_waasem.npy. (3152, 40)
Saved in ./data/word_outputs/test_racism_waasem.npy. (207, 40)
Saved in ./data/word_outputs/test_none_waasem.npy. (1297, 40)
Saved in ./data/word_outputs/test_sexism_waasem.npy. (395, 40)
Saved in ./data/word_outputs/valid_racism_waasem.npy. (206, 40)
Saved in ./data/word_outputs/valid_none_waasem.npy. (1297, 40)
Saved in ./data/word_outputs/valid_sexism_waasem.npy. (394, 40)


In [18]:
# Write the id-encoded davidson arrays to disk, one .npy file per split/label.
save_npy(idx_d, "davidson")

Saved in ./data/word_outputs/train_abusive_davidson.npy. (16553, 40)
Saved in ./data/word_outputs/train_none_davidson.npy. (3372, 40)
Saved in ./data/word_outputs/test_abusive_davidson.npy. (2062, 40)
Saved in ./data/word_outputs/test_none_davidson.npy. (417, 40)
Saved in ./data/word_outputs/valid_abusive_davidson.npy. (2062, 40)
Saved in ./data/word_outputs/valid_none_davidson.npy. (416, 40)


In [19]:
# Write the id-encoded relabel arrays to disk, one .npy file per split/label.
save_npy(idx_relabel, "relabel")

Saved in ./data/word_outputs/train_none_relabel.npy. (488, 40)


In [20]:
# Persist both vocabulary lookup tables together in a single pickle file.
with open("./data/word_outputs/vocab.pkl", "wb") as vocab_file:
    lookup_tables = {"word2id": word2id, "id2word": id2word}
    pickle.dump(lookup_tables, vocab_file)

## Char preprocess

In [21]:
from data.char import text_to_1hot_matrix

In [22]:
def char_idx_dataset(data):
    """Encode every row of a dataset at character level.

    ``data`` is indexed as ``data[split][label]`` for the splits "train",
    "valid" and "test". Each row's tokens are joined with single spaces and
    the resulting string is passed to ``text_to_1hot_matrix``. The result
    keeps the same split/label nesting. A tqdm progress bar is shown over
    the labels of each split.
    """
    encoded = {}
    for split in ["train", "valid", "test"]:
        per_label = encoded[split] = {}
        for label in tqdm(data[split].keys()):
            matrices = []
            for tokens in data[split][label]:
                matrices.append(text_to_1hot_matrix(" ".join(tokens)))
            per_label[label] = matrices
    return encoded

In [23]:
# Character-level encoding of every split/label of the davidson corpus.
char_idx_d = char_idx_dataset(data_d)






In [24]:
# Character-level encoding of every split/label of the waasem corpus.
char_idx_w = char_idx_dataset(data_w)






In [25]:
# Character-level encoding of every split/label of the relabel corpus.
char_idx_relabel = char_idx_dataset(data_relabel)






In [26]:
def save_npy(data, name, out_dir="./data/char_outputs"):
    """Save every split/label of an encoded dataset as its own ``.npy`` file.

    Each array is written to ``<out_dir>/<split>_<label>_<name>.npy`` and the
    path together with the array shape is printed.

    NOTE: this shadows the ``save_npy`` defined earlier in the notebook for
    the word-level outputs. Once this cell has run, re-running an earlier
    save cell writes into the char directory unless ``out_dir`` is passed.

    Args:
        data: nested mapping ``data[split][label]`` -> sequence of rows that
            ``np.array`` can stack.
        name: dataset tag used as the file-name suffix (e.g. "davidson").
        out_dir: destination directory, created if it does not exist.
            Defaults to the char-level output directory.
    """
    import os  # local import: the notebook's import cells do not provide os

    # np.save does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    for key, by_label in data.items():
        for label, rows in by_label.items():
            file_name = "%s/%s_%s_%s.npy" % (out_dir, key, label, name)
            array = np.array(rows)
            np.save(file_name, array)
            print("Saved in %s. %s" % (file_name, str(array.shape)))

In [27]:
# Write the char-encoded davidson arrays to disk, one .npy file per split/label.
# Relies on the char-output save_npy defined in the cell directly above.
save_npy(char_idx_d, "davidson")

Saved in ./data/char_outputs/train_abusive_davidson.npy. (16553, 140, 70)
Saved in ./data/char_outputs/train_none_davidson.npy. (3372, 140, 70)
Saved in ./data/char_outputs/test_abusive_davidson.npy. (2062, 140, 70)
Saved in ./data/char_outputs/test_none_davidson.npy. (417, 140, 70)
Saved in ./data/char_outputs/valid_abusive_davidson.npy. (2062, 140, 70)
Saved in ./data/char_outputs/valid_none_davidson.npy. (416, 140, 70)


In [28]:
# Write the char-encoded waasem arrays to disk, one .npy file per split/label.
save_npy(char_idx_w, "waasem")

Saved in ./data/char_outputs/train_racism_waasem.npy. (1649, 140, 70)
Saved in ./data/char_outputs/train_none_waasem.npy. (10376, 140, 70)
Saved in ./data/char_outputs/train_sexism_waasem.npy. (3152, 140, 70)
Saved in ./data/char_outputs/test_racism_waasem.npy. (207, 140, 70)
Saved in ./data/char_outputs/test_none_waasem.npy. (1297, 140, 70)
Saved in ./data/char_outputs/test_sexism_waasem.npy. (395, 140, 70)
Saved in ./data/char_outputs/valid_racism_waasem.npy. (206, 140, 70)
Saved in ./data/char_outputs/valid_none_waasem.npy. (1297, 140, 70)
Saved in ./data/char_outputs/valid_sexism_waasem.npy. (394, 140, 70)


In [29]:
# Write the char-encoded relabel arrays to disk, one .npy file per split/label.
save_npy(char_idx_relabel, "relabel")

Saved in ./data/char_outputs/train_none_relabel.npy. (488, 140, 70)
