In [1]:
import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Seed value reused by the seeded operations in this notebook.
global_seed = 2022

# List every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test_public_expanded.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test_private_expanded.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/toxicity_individual_annotations.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/identity_individual_annotations.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv


In [2]:
!pip install datasets

import datasets

Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.8/346.8 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.2.2 responses-0.18.0 xxhash-3.0.0
[0m

# Config

In [3]:
from dataclasses import dataclass


@dataclass
class Config:
    """Settings shared by the preprocessing and tokenization cells.

    Every attribute is annotated so that `@dataclass` registers it as a field.
    Defaults remain readable on the class itself (`Config.model_name`), and an
    instance can override any of them (`Config(max_seq_length=64)`).
    """

    # Hugging Face checkpoint name passed to the tokenizer loader.
    model_name: str = 'bert-base-uncased'
    dataset_name: str = 'jigsaw-unintended-bias-in-toxicity-classification'
    # Column holding the raw comment text.
    text_column: str = 'comment_text'
    # if id column is string, replace that with an integer index during preprocessing
    id_column: str = 'id'

    # target in raw dataset. However, it will be renamed to `labels` here to facilitate training setup
    raw_target_column: str = 'toxicity'
    target_column: str = 'labels'

    # NOTE(review): these two flags are not read by any visible cell — confirm intended use.
    undersample: bool = True
    need_to_split: bool = True

    # Fraction of the training split held out for validation.
    test_size: float = 0.2
    # Length every tokenized example is padded/truncated to.
    max_seq_length: int = 128
    seed: int = 2022

# Dataset

In [4]:
%%time
JIGSAW_PATH = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
df = pd.read_csv(os.path.join(JIGSAW_PATH,'all_data.csv'))
df.head()

CPU times: user 18.8 s, sys: 6.08 s, total: 24.8 s
Wall time: 33.7 s


Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till a...,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental...,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by ...,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\...",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [5]:
text_column, target_column = Config.text_column, Config.target_column

# Coerce every comment to `str` so the text column has a uniform type.
df[text_column] = df[text_column].astype(str)

# Binarize the raw score: values at or above 0.5 become 1, everything else 0.
is_toxic = df[Config.raw_target_column] >= 0.5
df[target_column] = is_toxic.astype(int)

In [6]:
# Class balance of the binarized target across the whole dataset.
df[target_column].value_counts()

0    1839734
1     159782
Name: labels, dtype: int64

## Drop unnecessary columns

In [7]:
id_column = Config.id_column
# Identity columns carried through to the CSV dumps.
identities = ['male', 'female', 'white', 'black']
selected_columns = [id_column, text_column, target_column, 'split', *identities]

# Keep only the id, text, label, split marker and identity columns.
df = df.loc[:, selected_columns]

## Train-test split

In [8]:
from sklearn.model_selection import train_test_split

# Separate rows by the `split` marker; the marker column is dropped afterwards
# and each partition gets a fresh 0..n-1 index.
train = df.loc[df['split'] == 'train'].drop(columns='split').reset_index(drop=True)
test = df.loc[df['split'] == 'test'].drop(columns='split').reset_index(drop=True)

In [9]:
# Hold out a stratified validation set so both parts keep the label ratio.
split_options = {
    'test_size': Config.test_size,
    'random_state': global_seed,
    'stratify': train[target_column],
}
x_train, x_val, y_train, y_val = train_test_split(
    train.drop(columns=target_column),
    train[target_column],
    **split_options,
)

# Undersample

In [10]:
from imblearn.under_sampling import RandomUnderSampler

# Resample only the training split; validation and test are left as they are.
sampler = RandomUnderSampler(random_state=global_seed)
train_undersampled, y_undersampled = sampler.fit_resample(x_train, y_train)

# Put the resampled labels back next to their features.
train_undersampled[target_column] = y_undersampled

# Shuffle the rows. The shuffle is seeded so the row order (and therefore the
# dumped CSV and tokenized dataset) is the same on every run.
train_undersampled = (
    train_undersampled
    .sample(frac=1, random_state=global_seed)
    .reset_index(drop=True)
)

## Value counts

In [11]:
def value_count(df, value=target_column):
    """Print absolute and percentage frequencies of one column.

    Args:
        df: DataFrame holding the column to summarize.
        value: Name of the column to count. Defaults to the value that
            `target_column` had when this function was defined.

    Returns:
        None. A table with columns `Value`, `Count` and `Count(%)` is printed.
    """
    counts = df[value].value_counts().reset_index()
    counts.columns = ['Value', 'Count']
    total = counts['Count'].sum()
    counts['Count(%)'] = counts['Count'].mul(100).div(total)
    print(counts, '\n')

In [12]:
# Write the labels back onto the feature frames (in place).
for frame, targets in ((x_train, y_train), (x_val, y_val)):
    frame[target_column] = targets

# Expose the two parts under their final names with a fresh 0..n-1 index.
train, validation = (frame.reset_index(drop=True) for frame in (x_train, x_val))

In [13]:
# Print the label distribution of the full data and of each partition.
report_frames = (
    ('Total dataset', df),
    ('Train dataset', train),
    ('Validation dataset', validation),
    ('Test dataset', test),
)
for title, frame in report_frames:
    print(title)
    value_count(frame)

Total dataset
   Value    Count   Count(%)
0      0  1839734  92.008966
1      1   159782   7.991034 

Train dataset
   Value    Count   Count(%)
0      0  1328433  92.003117
1      1   115467   7.996883 

Validation dataset
   Value   Count   Count(%)
0      0  332108  92.003047
1      1   28867   7.996953 

Test dataset
   Value   Count   Count(%)
0      0  179193  92.063337
1      1   15448   7.936663 



## Dump Dataframe

In [14]:
# Write each frame to a CSV in the working directory, without the index column.
csv_exports = {
    'train_undersampled.csv': train_undersampled,
    'test.csv': test,
    'validation.csv': validation,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)

## Drop columns not needed by the tokenizer

In [15]:
# Only the id, the text and the label are kept from here on.
finally_selected_columns = [id_column, text_column, target_column]
train_undersampled, test, validation = (
    frame[finally_selected_columns]
    for frame in (train_undersampled, test, validation)
)

## Convert to dataset

In [16]:
# Wrap each pandas frame in a `datasets.Dataset`.
to_dataset = datasets.Dataset.from_pandas
train_dataset = to_dataset(train_undersampled)
val_dataset = to_dataset(validation)
test_dataset = to_dataset(test)

# Tokenize

Tokenizer API reference: https://huggingface.co/docs/transformers/main_classes/tokenizer

In [17]:
from transformers import AutoTokenizer

# Load the tokenizer for the checkpoint named in Config, with lower-casing enabled.
tokenizer = AutoTokenizer.from_pretrained(Config.model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [18]:
def tokenize_function(examples):
    """Tokenize the text column of a batch.

    Args:
        examples: Mapping that holds the comment texts under `text_column`.

    Returns:
        The result of calling `tokenizer` on the texts, padded to and
        truncated at `Config.max_seq_length`.
    """
    texts = list(examples[text_column])
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=Config.max_seq_length,
    )

In [19]:
# Tokenize every split in batches.
map_options = {'batched': True}
train_tokenized = train_dataset.map(tokenize_function, **map_options)
val_tokenized = val_dataset.map(tokenize_function, **map_options)
test_tokenized = test_dataset.map(tokenize_function, **map_options)

  0%|          | 0/231 [00:00<?, ?ba/s]

  0%|          | 0/361 [00:00<?, ?ba/s]

  0%|          | 0/195 [00:00<?, ?ba/s]

In [20]:
# Display the columns of each tokenized split side by side.
train_tokenized.column_names, val_tokenized.column_names, test_tokenized.column_names

(['id',
  'comment_text',
  'labels',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 ['id',
  'comment_text',
  'labels',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 ['id',
  'comment_text',
  'labels',
  'input_ids',
  'token_type_ids',
  'attention_mask'])

In [21]:
# https://huggingface.co/docs/datasets/access
# drop string columns because they cause error during training phase

train_tokenized = train_tokenized.remove_columns([text_column])
train_tokenized.set_format("torch")

val_tokenized = val_tokenized.remove_columns([text_column])
val_tokenized.set_format("torch")

test_tokenized = test_tokenized.remove_columns([text_column])
test_tokenized.set_format("torch")

## Dump tokenized data

In [22]:
import pickle

# Serialize each tokenized split to its own pickle file. The `with` block
# closes the file on exit, so no explicit close() call is needed.
tokenized_splits = {
    'train.pkl': train_tokenized,
    'validation.pkl': val_tokenized,
    'test.pkl': test_tokenized,
}
for pickle_name, tokenized in tokenized_splits.items():
    with open(pickle_name, 'wb') as output:
        pickle.dump(tokenized, output, pickle.HIGHEST_PROTOCOL)