In [1]:
import numpy as np
import random
import os

from data.utils import split_dataset_binary
from data.preprocess import concat_unshared_task_datasets, save_preprocessed_data, preprocess_tweet

In [2]:
from sklearn.model_selection import StratifiedKFold


In [3]:
# Load the combined dataset from the project helper. Later cells index the
# result by the keys "none", "racism" and "sexism".
data = concat_unshared_task_datasets()

Unshared task dataset concat done.
Label Count: Sexism-3864, Racism-2059, None-12428


In [4]:
# Seed the RNG (arbitrary fixed value) so the per-class shuffles are identical
# on every Restart & Run All; without it each run produces a different ordering.
random.seed(42)

# Shuffle each class in place, in the same order as before: none, racism, sexism.
for label_name in ("none", "racism", "sexism"):
    random.shuffle(data[label_name])

In [5]:
# Stack the three classes in a fixed order: none first, then racism, then sexism.
instances = np.array([*data["none"], *data["racism"], *data["sexism"]])

In [6]:
# Integer class ids, laid out in the same order as `instances`:
# 0 = none, 1 = racism, 2 = sexism.
# `np.int` was only an alias of the builtin `int` and has been removed from
# NumPy (deprecated in 1.20, removed in 1.24), so use `int` directly.
labels = np.concatenate([
    np.zeros_like(data["none"], dtype=int),
    np.ones_like(data["racism"], dtype=int),
    np.full_like(data["sexism"], 2, dtype=int),
])

In [7]:
# Sanity check: there must be exactly one label per instance.
assert instances.shape[0] == labels.shape[0]

In [8]:
def save_file(data_name, x_train, y_train, x_test, y_test, path="./data/preprocessed/"):
    """Write one train/test split to a pair of tab-separated text files.

    Two files are created under ``path``: ``train_<data_name>.txt`` and
    ``test_<data_name>.txt``. Each line holds one instance, a tab character,
    its label, and a trailing newline.

    Args:
        data_name: Suffix used in both output file names.
        x_train: Sequence of training instances.
        y_train: Sequence of training labels, same length as ``x_train``.
        x_test: Sequence of test instances.
        y_test: Sequence of test labels, same length as ``x_test``.
        path: Output directory; created if it does not exist.

    Raises:
        AssertionError: If an instance sequence and its label sequence
            differ in length.
    """
    assert len(x_train) == len(y_train)
    assert len(x_test) == len(y_test)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)

    splits = (("train", x_train, y_train), ("test", x_test, y_test))
    for prefix, xs, ys in splits:
        file_name = "%s_%s.txt" % (prefix, data_name)
        # Explicit UTF-8: with the locale-default encoding, non-ASCII text could
        # fail to encode and the line would be silently dropped from the dataset.
        with open(os.path.join(path, file_name), "w", encoding="utf-8") as f:
            for x, y in zip(xs, ys):
                try:
                    f.write("%s\t%s\n" % (x, y))
                except UnicodeEncodeError:
                    # Still possible for unencodable text such as lone
                    # surrogates; keep the original best-effort skip.
                    print("unicode encode error. skipping line")
        print("Wrote on %s" % file_name)

In [9]:
# Split into 10 stratified folds and write each fold's train/test pair to disk
# under the name "fcv-<fold index>".
skf = StratifiedKFold(n_splits=10)
for fold, (train_rows, test_rows) in enumerate(skf.split(instances, labels)):
    fold_name = "fcv-%s" % fold
    x_train, y_train = instances[train_rows], labels[train_rows]
    x_test, y_test = instances[test_rows], labels[test_rows]
    save_file(fold_name, x_train, y_train, x_test, y_test)

Wrote on train_fcv-0.txt
Wrote on test_fcv-0.txt
Wrote on train_fcv-1.txt
Wrote on test_fcv-1.txt
Wrote on train_fcv-2.txt
Wrote on test_fcv-2.txt
Wrote on train_fcv-3.txt
Wrote on test_fcv-3.txt
Wrote on train_fcv-4.txt
Wrote on test_fcv-4.txt
Wrote on train_fcv-5.txt
Wrote on test_fcv-5.txt
Wrote on train_fcv-6.txt
Wrote on test_fcv-6.txt
Wrote on train_fcv-7.txt
Wrote on test_fcv-7.txt
Wrote on train_fcv-8.txt
Wrote on test_fcv-8.txt
Wrote on train_fcv-9.txt
Wrote on test_fcv-9.txt
