In [2]:
import pandas as pd 
import numpy as np
from pathlib import Path

In [3]:
# Folder that every CSV in this notebook is read from or written to.
DATA_ROOT = Path("../data", "jigsaw")

In [4]:
# Read train.csv and test_proced.csv from DATA_ROOT into DataFrames.
train_csv = DATA_ROOT / "train.csv"
test_csv = DATA_ROOT / "test_proced.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [6]:
# Keep the training rows whose six label columns sum to more than zero
# (i.e. at least one label is set, assuming 0/1 labels), then show the shape.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
has_any_label = train[label_columns].sum(axis=1) > 0
toxic_trn = train[has_any_label]
toxic_trn.shape

(16225, 8)

In [7]:
# Preview the first ten rows of the toxic subset built from train.
toxic_trn.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
44,001956c382006abd,I'm Sorry \n\nI'm sorry I screwed around with ...,1,0,0,0,0,0
51,001dc38a83d420cf,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
56,0020fd96ed3b8c8b,=Tony Sidaway is obviously a fistfuckee. He lo...,1,0,1,0,1,0
58,0021fe88bc4da3e6,My Band Page's deletion. You thought I was gon...,1,0,1,0,0,0


In [6]:
# Column-wise mean over the numeric columns only; assuming 0/1 labels, each
# value is the fraction of training rows carrying that label.
train.mean(axis=0, numeric_only=True)

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

In [7]:
# Pairwise correlation between the numeric (label) columns of train.
# The string columns are excluded explicitly: on pandas >= 2.0 a bare
# train.corr() no longer drops them silently and raises instead.
train.select_dtypes(include="number").corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308619,0.676515,0.157058,0.647518,0.266009
severe_toxic,0.308619,1.0,0.403014,0.123601,0.375807,0.2016
obscene,0.676515,0.403014,1.0,0.141179,0.741272,0.286867
threat,0.157058,0.123601,0.141179,1.0,0.150022,0.115128
insult,0.647518,0.375807,0.741272,0.150022,1.0,0.337736
identity_hate,0.266009,0.2016,0.286867,0.115128,0.337736,1.0


In [11]:
# Contingency table of the toxic label (rows) against severe_toxic (columns) in test.
pd.crosstab(index=test["toxic"], columns=test["severe_toxic"])

severe_toxic,0,1
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,57888,0
1,5723,367


In [None]:
import re
# Sentence delimiters: newline, period, question mark or exclamation mark.
# The capturing group makes ptrn.split() keep every delimiter in its result.
# A raw string is used so that "\." and "\?" are not flagged as invalid
# string escape sequences by the Python compiler.
ptrn = re.compile(r"(\n|\.|\?|!)")


def split_sent(s: str):
    """Lazily split ``s`` into sentence-like pieces, keeping the delimiters.

    The text is cut at every newline, ``.``, ``?`` or ``!``; each delimiter
    stays attached to the end of the piece that precedes it. Consecutive
    delimiters therefore produce pieces consisting of a delimiter only
    (``"Hey..."`` -> ``"Hey."``, ``"."``, ``"."``).

    Args:
        s: The text to split.

    Yields:
        Each piece in order. Trailing text without a closing delimiter is
        yielded last if it is non-empty; an empty input yields nothing.
    """
    splits = ptrn.split(s)
    # With a capturing group the result alternates text, delimiter, text, ...
    # and always ends on a text element, so even slots pair with odd slots.
    for text, delimiter in zip(splits[0::2], splits[1::2]):
        yield text + delimiter
    # Whatever follows the final delimiter (or the whole string if none matched).
    if splits[-1]:
        yield splits[-1]

# Pattern 0: Use individual sentences as training data as well

In [None]:
# Pair the sentence pieces of every toxic comment with the row they came from.
# split_sent() returns a one-shot generator, so the pieces are materialised
# into a list here; otherwise a second pass over extra_sents (e.g. re-running
# the cell that consumes it) would silently see no sentences at all.
extra_sents = [
    (list(split_sent(row["comment_text"])), row)
    for _, row in toxic_trn.iterrows()
]

In [None]:
# Word-count threshold for sentence pieces (words = whitespace-separated tokens).
# The filter that uses it compares with a strict ">", so only pieces with MORE
# than this many words are kept.
min_words_in_sentence = 3

In [None]:
# Build one record per sentence piece that passes the word-count filter.
# Every record keeps the id and the six labels of the comment it was cut from.
label_names = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
train_extra = [
    {
        "id": row["id"],
        "comment_text": sentence,
        **{label: row[label] for label in label_names},
    }
    for sentences, row in extra_sents
    for sentence in sentences
    if len(sentence.split()) > min_words_in_sentence
]

In [None]:
# Turn the list of record dicts into a DataFrame.
# NOTE(review): this rebinds train_extra from a list to a DataFrame.
train_extra = pd.DataFrame(train_extra)

In [None]:
# Save the sentence-level rows, with columns selected and ordered as in train
# and without the row index.
train_extra_csv = DATA_ROOT / "train_extra.csv"
sentence_rows = train_extra[train.columns]
sentence_rows.to_csv(train_extra_csv, index=False)

# Pattern 1: Only interpolate within toxic class

In [None]:
# Reload the sentence-level rows from train_extra.csv under DATA_ROOT.
train_extra = pd.read_csv(DATA_ROOT / "train_extra.csv")

In [None]:
# Add a "lens" column holding len() of each comment_text value.
# NOTE(review): no later cell visible here reads this column — confirm it is still needed.
train_extra["lens"] = train_extra["comment_text"].apply(len)

In [None]:
# Interpolate between random pairs of rows from train_extra: the two texts are
# joined with a space and every label becomes the mean of the two source labels.
np.random.seed(100)  # fixed seed so the two permutations are reproducible
p1, p2 = np.random.permutation(len(train_extra)), np.random.permutation(len(train_extra))
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
new_data = []
for i1, i2 in zip(p1, p2):
    r1, r2 = train_extra.iloc[i1], train_extra.iloc[i2]
    mixed = {
        "id": r1["id"] + "_" + r2["id"],
        "comment_text": r1["comment_text"] + " " + r2["comment_text"],
    }
    # Generate the label entries from one list so each key appears exactly
    # once (the hand-written dict literal repeated the "toxic" key).
    mixed.update({name: (r1[name] + r2[name]) / 2 for name in label_names})
    new_data.append(mixed)

In [None]:
# Turn the list of interpolated record dicts into a DataFrame.
# NOTE(review): this rebinds new_data from a list to a DataFrame.
new_data = pd.DataFrame(new_data)

In [None]:
# Save the interpolated rows, with columns selected and ordered as in train
# and without the row index.
interpolated_csv = DATA_ROOT / "train_extra_interpolated.csv"
interpolated_rows = new_data[train.columns]
interpolated_rows.to_csv(interpolated_csv, index=False)

# Use augmented data provided by the authors

In [12]:
# Load the augmented training rows from train_aug_bt.csv under DATA_ROOT.
# NOTE(review): "bt" presumably stands for back-translation — confirm.
aug_train = pd.read_csv(DATA_ROOT / "train_aug_bt.csv")

In [13]:
# The six label columns. In the cells visible here they are referenced only by
# the disabled filter below.
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Disabled alternative: keep only augmented rows whose labels sum to more than zero.
# aug_train_pos = aug_train[aug_train[cols].sum(1) > 0][train.columns]
# Active behaviour: keep EVERY augmented row, restricted to train's columns, so
# the "_pos" suffix no longer describes the contents.
aug_train_pos = aug_train[train.columns]

In [14]:
# Stack the original training rows and the augmented rows, then save them.
# index=False keeps the concatenated (and therefore duplicated) row index out
# of the file, consistent with the other to_csv calls in this notebook; without
# it the CSV gains an extra unnamed leading column.
pd.concat([train, aug_train_pos], axis=0).to_csv(DATA_ROOT / "train_with_bt.csv", index=False)