In [1]:
import pandas as pd 
import numpy as np
from pathlib import Path

In [2]:
# Directory holding every CSV this notebook reads and writes.
DATA_ROOT = Path("../data", "jigsaw")

In [3]:
# Load the labelled training comments and the test file from DATA_ROOT.
# NOTE(review): "test_proced.csv" is presumably a preprocessed test set -- confirm.
train, test = (
    pd.read_csv(DATA_ROOT / csv_name)
    for csv_name in ("train.csv", "test_proced.csv")
)

In [None]:
# Keep only the rows whose six label columns sum to more than zero,
# then show how many such rows there are.
toxic_trn = train.loc[
    train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
    .sum(axis=1)
    .gt(0)
]
toxic_trn.shape

In [None]:
# Preview the first ten labelled rows selected above.
toxic_trn.head(10)

In [None]:
import re
# Sentence delimiters: newline, full stop, question mark, exclamation mark.
# The capturing group makes `split` return the delimiters as well as the text.
# A raw string is used so that `\.` and `\?` reach the regex engine untouched
# instead of being (invalid) Python string escapes.
ptrn = re.compile(r"(\n|\.|\?|!)")


def split_sent(s: str):
    """Yield the pieces of `s` split at sentence delimiters.

    Each yielded piece is the text up to and including one delimiter
    (newline, ".", "?" or "!"). Any non-empty text left after the last
    delimiter is yielded last, without a delimiter. Whitespace is not
    stripped, and consecutive delimiters produce pieces that consist of a
    delimiter only.

    Args:
        s: The text to split.

    Yields:
        str: Consecutive pieces of `s`; joined together they reproduce `s`.
    """
    splits = ptrn.split(s)
    # With a single capturing group, `splits` alternates
    # text, delimiter, text, ..., text and always ends with a text piece,
    # so even positions hold text and odd positions hold delimiters.
    for text, delimiter in zip(splits[0::2], splits[1::2]):
        yield text + delimiter
    # Trailing text that is not terminated by a delimiter.
    if splits[-1]:
        yield splits[-1]

# Pattern 0: Use individual sentences as training data as well

In [None]:
# Pair the sentences of every toxic comment with the row they came from, so
# that each sentence can later inherit that row's id and labels.
# `split_sent` returns a one-shot generator; the sentences are materialised
# into a list so that the cell consuming `extra_sents` can be re-run without
# silently producing an empty result.
extra_sents = [
    (list(split_sent(row["comment_text"])), row)
    for _, row in toxic_trn.iterrows()
]

In [None]:
# Word-count threshold for keeping a sentence. The filter below uses a strict
# `>` comparison, so a sentence needs MORE than this many whitespace-separated
# words to be kept.
min_words_in_sentence: int = 3

In [None]:
# One record per sentence that has strictly more than `min_words_in_sentence`
# whitespace-separated words. Every record carries the id and the six labels
# of the comment the sentence was taken from.
train_extra = [
    {
        "id": row["id"],
        "comment_text": sentence,
        **{
            label: row[label]
            for label in (
                "toxic",
                "severe_toxic",
                "obscene",
                "threat",
                "insult",
                "identity_hate",
            )
        },
    }
    for sentences, row in extra_sents
    for sentence in sentences
    if len(sentence.split()) > min_words_in_sentence
]

In [None]:
# Turn the list of per-sentence records into a DataFrame (one row per record).
train_extra = pd.DataFrame(data=train_extra)

In [None]:
# Save the sentence-level rows using the same columns, in the same order, as
# the original training frame; the DataFrame index is not written.
train_extra.loc[:, train.columns].to_csv(
    DATA_ROOT.joinpath("train_extra.csv"),
    index=False,
)

# Pattern 1: Only interpolate within toxic class

In [None]:
# Load the sentence-level data from train_extra.csv.
train_extra = pd.read_csv(DATA_ROOT.joinpath("train_extra.csv"))

In [None]:
# Character length of every sentence, stored as an extra "lens" column.
train_extra["lens"] = train_extra["comment_text"].map(len)

In [None]:
# Build interpolated examples: pair the rows of `train_extra` at random,
# join the two texts with a space and average the two rows' labels.
np.random.seed(100)  # fixed seed so the pairing is reproducible
# Two independent permutations of the row positions; because they are
# independent, a row can end up paired with itself.
p1, p2 = np.random.permutation(len(train_extra)), np.random.permutation(len(train_extra))
new_data = []
for i1, i2 in zip(p1, p2):
    r1, r2 = train_extra.iloc[i1], train_extra.iloc[i2]
    new_data.append({
        "id": r1["id"] + "_" + r2["id"],
        "comment_text": r1["comment_text"] + " " + r2["comment_text"],
        # Each label is the mean of the two source rows' labels. The label
        # names are listed once so that no key can be duplicated by accident.
        **{
            label: (r1[label] + r2[label]) / 2
            for label in (
                "toxic",
                "severe_toxic",
                "obscene",
                "threat",
                "insult",
                "identity_hate",
            )
        },
    })

In [None]:
# Turn the list of interpolated records into a DataFrame (one row per pair).
new_data = pd.DataFrame(data=new_data)

In [None]:
# Save the interpolated rows using the same columns, in the same order, as
# the original training frame; the DataFrame index is not written.
new_data.loc[:, train.columns].to_csv(
    DATA_ROOT.joinpath("train_extra_interpolated.csv"),
    index=False,
)

# Use augmented data provided by authors

In [12]:
# Load the externally provided augmented training set.
# NOTE(review): "bt" in the file name presumably means back-translation -- confirm.
aug_train = pd.read_csv(DATA_ROOT.joinpath("train_aug_bt.csv"))

In [13]:
# The six label columns. Only the disabled filter below refers to `cols`.
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Disabled variant: keep only augmented rows with at least one positive label.
# aug_train_pos = aug_train[aug_train[cols].sum(1) > 0][train.columns]
# Active variant: keep ALL augmented rows, restricted to the training columns.
# NOTE(review): despite the "_pos" suffix, no row filtering happens here.
aug_train_pos = aug_train[train.columns]

In [14]:
# Stack the original and the augmented rows and save them as one training file.
# `index=False` matches the other CSV writes in this notebook: without it the
# file would gain an unnamed index column whose values repeat, because each
# input frame keeps its own index through the concatenation.
pd.concat([train, aug_train_pos], axis=0).to_csv(
    DATA_ROOT / "train_with_bt.csv",
    index=False,
)