This notebook is for preprocessing data. I converted data from different sources into tsv with a uniform format, and removed texts with fewer than or equal to five words.

In [1]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Download Toxic Data

In [None]:
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification
!yes | unzip jigsaw-unintended-bias-in-toxicity-classification.zip
!rm jigsaw-unintended-bias-in-toxicity-classification.zip

# Set Up

In [5]:
import numpy as np, pandas as pd

In [4]:
class PATHS:
    save = '/content/drive/MyDrive'
    train = 'train.csv'

# Preprocess Toxic Data

In [None]:
threshold = 0.9
df = pd.read_csv(PATHS.train)
df = df[df['target'] >= threshold]
df = df[['id', 'comment_text']]
df = df.rename(columns={'comment_text': 'text'})
df['text'] = df['text'].str.replace('\n', ' ')
df.to_csv(f'{PATHS.save}/jigsaw_toxic_2019_threshold_{threshold}.tsv', sep='\t', index=False)

# Preprocess Amazon and Reddit Data

Convert json to tsv and change 'timestamp' column to 'id'.

In [None]:
for name in ['amazon_review_All_Beauty_threshold_0.4']:
    path = f'{PATHS.save}/{name}.json'
    df = pd.read_json(path, convert_dates=False, lines=True)
    df = df.rename(columns={'timestamp': 'id'})
    df = df[['id', 'text']]
    df.to_csv(f'{PATHS.save}/{name}.tsv', sep='\t', index=False)

Remove rows whose text is fewer than or equal to five words because it lacks
context. They are not helpful in training.

In [6]:
for name in ['subreddit_SuicideWatch_900_v2', 'subreddit_abusiverelationships_900_v2', 'subreddit_abusiverelationships_600_v2_comments', 'amazon_review_All_Beauty_threshold_0.4']:
    path = f'{PATHS.save}/{name}.tsv'
    df = pd.read_csv(path, sep='\t')
    df = df.drop_duplicates(subset=['id'])
    mask = (df['text'].str.count(' ') < 5)
    print(f'Trim {sum(mask)} rows from {name}.tsv')
    df = df[~mask]
    df.to_csv(f'{PATHS.save}/{name}_trimmed.tsv', sep='\t', index=False)

Trim 647 rows from subreddit_SuicideWatch_900_v2.tsv
Trim 655 rows from subreddit_abusiverelationships_900_v2.tsv
Trim 974 rows from subreddit_abusiverelationships_600_v2_comments.tsv
Trim 913 rows from amazon_review_All_Beauty_threshold_0.4.tsv
