In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Seed shared by the stochastic steps in this notebook.
global_seed = 2022

import os

# List every file available under the Kaggle input directory.
INPUT_ROOT = '/kaggle/input'
for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test_public_expanded.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test_private_expanded.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/toxicity_individual_annotations.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/identity_individual_annotations.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv


# Dataset

In [2]:
%%time
JIGSAW_PATH = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
dataset = pd.read_csv(os.path.join(JIGSAW_PATH,'all_data.csv'))
dataset.head()

CPU times: user 18 s, sys: 3.23 s, total: 21.2 s
Wall time: 29.2 s


Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till a...,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental...,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by ...,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\...",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [3]:
# Column names used throughout the notebook.
text = 'comment_text'
target = 'labels'

# Make sure all comment_text values are strings.
# Missing comments are filled with '' first: a plain astype(str) would turn
# NaN into the literal text "nan", which would then be tokenized as a word.
dataset[text] = dataset[text].fillna('').astype(str)

# Binary label: 1 when the toxicity score is at least 0.5, otherwise 0.
dataset[target] = np.where(dataset['toxicity'] >= 0.5, 1, 0)

## Drop unnecessary columns

In [4]:
# Keep only the identifier, the comment text, the split marker and the label.
KEPT_COLUMNS = ['id', text, 'split', target]
dataset = dataset[KEPT_COLUMNS]

## Train-test split

In [5]:
# TODO: add train-validation split
# Rows are assigned to train/test by the value of the 'split' column; that
# column is dropped afterwards and each frame gets a fresh 0..n-1 index.
is_train = dataset['split'] == 'train'
is_test = dataset['split'] == 'test'

train_df = dataset[is_train].drop(columns='split').reset_index(drop=True)
test_df = dataset[is_test].drop(columns='split').reset_index(drop=True)

# Undersample

In [6]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training frame with respect to the label column.
sampler = RandomUnderSampler(random_state=global_seed)
x_undersampled, y_undersampled = sampler.fit_resample(
    train_df.drop(columns=target),
    train_df[target],
)

# Re-attach the labels, then shuffle the rows. The shuffle is seeded with
# global_seed so that the row order is reproducible across runs.
x_undersampled[target] = y_undersampled
train_df_undersampled = (
    x_undersampled
    .sample(frac=1, random_state=global_seed)
    .reset_index(drop=True)
)

## Value counts

In [7]:
def value_count(df, value):
    """Print how often each distinct entry of column ``value`` occurs in ``df``.

    The printed table has one row per distinct entry with the columns
    'Value', 'Count' and 'Count(%)' (share of the summed counts, in percent).
    Nothing is returned.
    """
    tally = df[value].value_counts()
    counts = tally.reset_index()
    counts.columns = ['Value', 'Count']

    total = counts['Count'].sum()
    counts['Count(%)'] = counts['Count'] * 100 / total

    print(counts, '\n')

In [8]:
# Print the label distribution of the full data and of each split.
for heading, frame in (
    ('Total dataset', dataset),
    ('Train dataset', train_df),
    ('Test dataset', test_df),
):
    print(heading)
    value_count(frame, target)

Total dataset
   Value    Count   Count(%)
0      0  1839734  92.008966
1      1   159782   7.991034 

Train dataset
   Value    Count   Count(%)
0      0  1660541  92.003103
1      1   144334   7.996897 

Test dataset
   Value   Count   Count(%)
0      0  179193  92.063337
1      1   15448   7.936663 



## Convert to Hugging Face `Dataset` objects

In [9]:
!pip install datasets

import datasets

train_dataset = datasets.Dataset.from_pandas(train_df)
test_dataset = datasets.Dataset.from_pandas(test_df)
train_dataset_undersampled = datasets.Dataset.from_pandas(train_df_undersampled)

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 KB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.1.0 responses-0.18.0 xxhash-3.0.0
[0m

# Tokenize

Reference: [Hugging Face tokenizer documentation](https://huggingface.co/docs/transformers/main_classes/tokenizer)

In [10]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded (alternative: "bert-base-uncased").
model_name = 'prajjwal1/bert-small'

# do_lower_case=True is forwarded to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [11]:
# Every comment is padded or truncated to exactly this many tokens.
MAX_SEQ_LENGTH = 128

def tokenize_function(examples):
    """Tokenize the comment-text entries of a batch of examples.

    ``examples`` is indexed by the module-level ``text`` column name; the
    values found there are passed to the module-level ``tokenizer`` as a list,
    padded and truncated to MAX_SEQ_LENGTH. The tokenizer's result is returned
    unchanged.
    """
    comments = list(examples[text])
    encoded = tokenizer(
        comments,
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
    )
    return encoded

In [12]:
# Tokenize the three splits in batches, in this order: full train,
# undersampled train, test.
train_tokenized, train_undersampled_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, train_dataset_undersampled, test_dataset)
)

  0%|          | 0/1805 [00:00<?, ?ba/s]

  0%|          | 0/289 [00:00<?, ?ba/s]

  0%|          | 0/195 [00:00<?, ?ba/s]

In [13]:
# Show which columns the tokenized training split contains.
train_tokenized.column_names

['id',
 'comment_text',
 'labels',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [14]:
# https://huggingface.co/docs/datasets/access
# The raw comment strings cause errors during the training phase, so that
# column is removed from every split before switching the format to "torch".
train_tokenized = train_tokenized.remove_columns([text])
train_undersampled_tokenized = train_undersampled_tokenized.remove_columns([text])
test_tokenized = test_tokenized.remove_columns([text])

for tokenized_split in (train_tokenized, train_undersampled_tokenized, test_tokenized):
    tokenized_split.set_format("torch")

## Dump tokenized data

In [15]:
import pickle

# Write each tokenized split to its own pickle file in the working directory.
# The `with` block closes the file on exit, so no explicit close() is needed.
for pkl_name, tokenized_split in (
    ('train.pkl', train_tokenized),
    ('test.pkl', test_tokenized),
    ('train_undersampled.pkl', train_undersampled_tokenized),
):
    with open(pkl_name, 'wb') as output:
        pickle.dump(tokenized_split, output, pickle.HIGHEST_PROTOCOL)