In [79]:
from transformers import pipeline
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

from transformers import pipeline
import pandas as pd
import boto3
import numpy as np
from collections import Counter
import os
import random

import re, string
from typing import Dict

import torch, torchtext
import torchvision.models as models
from torchtext.data.utils import get_tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torchtext.vocab import GloVe
from torch import nn, optim
from torch.nn import Module, Embedding, LSTM, RNN, GRU, Linear, Sequential, Dropout
from torch.nn.functional import sigmoid, relu, elu, tanh
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.nn.utils.rnn import PackedSequence

from transformers import AutoTokenizer

from tqdm import tqdm

SEED = 1234
N_SAMPLES = 10_000

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score

import time

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [80]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [81]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Today I'm going to use {device.type}")

Today I'm going to use cuda


In [82]:
chunks = pd.read_csv("../data/toxic_data.csv", chunksize=100000)
df = pd.concat(chunks)
df.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental hospitals. Boorah,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [83]:
df['comment_text'] = df['comment_text'].fillna("")
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
for col in identity_columns + ['toxicity']:
    df.loc[:, col] = np.where(df[col] >= 0.5, True, False)

In [84]:
train_df = df[df['split'] == 'train']
test_df = df[df['split'] != 'train']

In [14]:
def preprocess(df):
    cols_to_remove = [col for col in df.columns if col not in ['comment_text', 'toxicity']]
    dfs = Dataset.from_pandas(df)
    dfs = dfs.remove_columns(cols_to_remove)
    dfs = dfs.rename_column("toxicity", "labels")
    dfs = dfs.remove_columns('__index_level_0__')
    
    return dfs

In [15]:
# sample = train_df.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train_text, val_text  = train_test_split(df, test_size=0.2, random_state=SEED)
# train_text, train_label = sample['comment_text'], sample['toxicity']
test_text = test_df
train_text.shape, val_text.shape, test_text.shape

((1599612, 46), (399904, 46), (194641, 46))

In [16]:
train_text = preprocess(train_text)
val_text = preprocess(val_text)
test_text = preprocess(test_text)

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased");

In [22]:
def preprocess_function(examples):
    return tokenizer(examples["comment_text"], truncation=True)

In [23]:
tokenized_train = train_text.map(preprocess_function, batched=True)
tokenized_val = val_text.map(preprocess_function, batched=True)
tokenized_test = test_text.map(preprocess_function, batched=True)

  0%|          | 0/1600 [00:00<?, ?ba/s]

  0%|          | 0/400 [00:00<?, ?ba/s]

  0%|          | 0/195 [00:00<?, ?ba/s]

In [26]:
small_train_dataset = tokenized_train.shuffle(seed=SEED).select(range(10_000))
small_eval_dataset = tokenized_val.shuffle(seed=SEED).select(range(10_000))

In [24]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

In [27]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3125


Step,Training Loss
500,0.2213
1000,0.1395
1500,0.1002
2000,0.0607
2500,0.0316
3000,0.0147


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=3125, training_loss=0.09161824722290039, metrics={'train_runtime': 895.0433, 'train_samples_per_second': 55.863, 'train_steps_per_second': 3.491, 'total_flos': 2583979451489472.0, 'train_loss': 0.09161824722290039, 'epoch': 5.0})

In [31]:
predictions = trainer.predict( test_dataset=tokenized_test)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 194641
  Batch size = 16


In [37]:
predictions

PredictionOutput(predictions=array([[-3.6360977 ,  2.7444556 ],
       [-3.93156   ,  2.9900427 ],
       [-3.8799765 ,  2.9813883 ],
       ...,
       [ 3.2454646 , -3.581788  ],
       [-0.7947256 ,  0.43042183],
       [-3.0760767 ,  2.2627935 ]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.3166797161102295, 'test_runtime': 1165.3545, 'test_samples_per_second': 167.023, 'test_steps_per_second': 10.44})

In [85]:
oof_name = 'predicted_target'
test_df[oof_name] = predictions.label_ids

In [86]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, oof_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[oof_name])

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,male,4386,1.0,1.0,1.0
1,female,5155,1.0,1.0,1.0
2,homosexual_gay_or_lesbian,1065,1.0,1.0,1.0
3,christian,4226,1.0,1.0,1.0
4,jewish,835,1.0,1.0,1.0
5,muslim,2040,1.0,1.0,1.0
6,black,1519,1.0,1.0,1.0
7,white,2452,1.0,1.0,1.0
8,psychiatric_or_mental_illness,511,1.0,1.0,1.0


In [88]:
def calculate_overall_auc(df, oof_name):
    true_labels = df['toxicity']
    predicted_labels = df[oof_name]
    return roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 1.0


In [95]:
test_df[['comment_text', 'toxicity', 'predicted_target']]

Unnamed: 0,comment_text,toxicity,predicted_target
3,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",True,1
10,"NO ! There are no alternative facts. Go check for yourself. It is people like you, who have no idea what you are talking about that has gotten this State and Country into the mess it is in. People who think the Goverment, be it State or Federal, can spend the peoples money better than they can, is stupid and nonsensical. Politicians use taxes as Personal slush accounts to continue their carrers, buying votes from the lame and the lazy.",True,1
11,the more you whine sore loser Artster\n\nthe more we enjoy your agony,True,1
38,"There's rarely opportunity to agree with Bennet on much, but in this case he's right. Trump is POTUS mostly because the electorate has grown so sick and tired of the status quo in Washington DC. And electing Trump was their backlash. \n\nAnd for the final paragraph, he'd be more accurate if he'd replaced the word ""price"" with the word ""cost"". Prices are high because costs are high. Any reform needs to be focused on COST containment.",False,0
42,The Law has every freedom to be an asss!,True,1
...,...,...,...
1999453,"Herod's ""slaughter of the innocents"" in Matthew's nativity narrative is a literary device echoing Pharaoh's slaughter of newborn Hebrew boys in Exodus, consistent with Matthew's conception of Jesus as the Second Moses.\n\nThough Augustus is famously said to have quipped, “It is better to be Herod's pig than his son.”",False,0
1999456,"You don't know that he would kill them if he could, and given he's mentally-ill, is it a surprise he shot at them given what they showed up with and that they lobbed gas canisters and stun grenades at him just because he wouldn't go outside?",False,0
1999492,My son shouldn't have to be afraid during THOSE Muslim prayers at school either.,False,0
1999495,Homosexuals are NOT pedophiles. The pedophiles in the catholic church are just that: pedophiles. Their sexual orientation doesn't indicate their likelihood to be a pedophile no more than your sexuality does.,False,0


In [99]:
predictions.predictions

array([[-3.6360977 ,  2.7444556 ],
       [-3.93156   ,  2.9900427 ],
       [-3.8799765 ,  2.9813883 ],
       ...,
       [ 3.2454646 , -3.581788  ],
       [-0.7947256 ,  0.43042183],
       [-3.0760767 ,  2.2627935 ]], dtype=float32)

In [100]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [102]:
probas = sigmoid(predictions.predictions)

array([0.9395995 , 0.9521222 , 0.9517262 , ..., 0.02707258, 0.60597444,
       0.9057484 ], dtype=float32)

In [116]:
oof_name = 'predicted_probas'
test_df[oof_name] = probas[:, -1]

In [117]:
test_df[['comment_text', 'toxicity', 'predicted_target', 'predicted_probas']]

Unnamed: 0,comment_text,toxicity,predicted_target,predicted_probas
3,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",True,1,0.939600
10,"NO ! There are no alternative facts. Go check for yourself. It is people like you, who have no idea what you are talking about that has gotten this State and Country into the mess it is in. People who think the Goverment, be it State or Federal, can spend the peoples money better than they can, is stupid and nonsensical. Politicians use taxes as Personal slush accounts to continue their carrers, buying votes from the lame and the lazy.",True,1,0.952122
11,the more you whine sore loser Artster\n\nthe more we enjoy your agony,True,1,0.951726
38,"There's rarely opportunity to agree with Bennet on much, but in this case he's right. Trump is POTUS mostly because the electorate has grown so sick and tired of the status quo in Washington DC. And electing Trump was their backlash. \n\nAnd for the final paragraph, he'd be more accurate if he'd replaced the word ""price"" with the word ""cost"". Prices are high because costs are high. Any reform needs to be focused on COST containment.",False,0,0.014447
42,The Law has every freedom to be an asss!,True,1,0.943584
...,...,...,...,...
1999453,"Herod's ""slaughter of the innocents"" in Matthew's nativity narrative is a literary device echoing Pharaoh's slaughter of newborn Hebrew boys in Exodus, consistent with Matthew's conception of Jesus as the Second Moses.\n\nThough Augustus is famously said to have quipped, “It is better to be Herod's pig than his son.”",False,0,0.940627
1999456,"You don't know that he would kill them if he could, and given he's mentally-ill, is it a surprise he shot at them given what they showed up with and that they lobbed gas canisters and stun grenades at him just because he wouldn't go outside?",False,0,0.916144
1999492,My son shouldn't have to be afraid during THOSE Muslim prayers at school either.,False,0,0.027073
1999495,Homosexuals are NOT pedophiles. The pedophiles in the catholic church are just that: pedophiles. Their sexual orientation doesn't indicate their likelihood to be a pedophile no more than your sexuality does.,False,0,0.605974
