In [1]:
from datasets import Dataset
from transformers import AutoTokenizer
import os
import random
from sklearn.model_selection import train_test_split
import gc
import torch
from datetime import datetime
import awswrangler as wr
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Today I'm going to use {device.type}")

Today I'm going to use cuda


In [3]:
SEED = 1234
N_SAMPLES = 20000
TODAY = datetime.today().strftime("%Y%m%d")
BUCKET_NAME = 'sagemaker-godeltech'
TRAIN_PATH = f"s3://{BUCKET_NAME}/data/train/train.csv"
VAL_PATH = f"s3://{BUCKET_NAME}/data/validate/validate.csv"
TEST_PATH = f"s3://{BUCKET_NAME}/data/test/test.csv"
VOCAB_PATH = "lstm/vocab"
MODEL_PATH = "lstm/models"

In [4]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()
gc.collect()
torch.cuda.empty_cache()
# del model
# del Trainer

In [5]:
train = wr.s3.read_csv([TRAIN_PATH])
val = wr.s3.read_csv([VAL_PATH])
test = wr.s3.read_csv([TEST_PATH])

In [6]:
train_sample = train.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
val_sample = val.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train.shape, val.shape, test.shape

((1443900, 2), (360975, 2), (194641, 12))

In [7]:
train_sample['toxicity'] = train_sample['toxicity'].astype('int')
val_sample['toxicity'] = val_sample['toxicity'].astype('int')

In [8]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir = '../tmp/AutoTokenizer');

# create tokenization function
def tokenize(batch):
    return tokenizer(batch["comment_text"], padding="max_length", truncation=True)

# tokenize train and test datasets
train_dataset = Dataset.from_pandas(train_sample).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val_sample).map(tokenize, batched=True)

# set dataset format for PyTorch
train_dataset =  train_dataset.rename_column("toxicity", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset = val_dataset.rename_column("toxicity", "labels")
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, cache_dir = '../tmp/AutoModel')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

In [10]:
training_args = TrainingArguments(
    output_dir="../tmp/results",
    logging_dir="../tmp/results/logs",
    evaluation_strategy = "steps",
    save_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    seed=SEED,
    load_best_model_at_end=True,
    eval_steps=500
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


Step,Training Loss,Validation Loss
500,0.1895,0.15189
1000,0.1247,0.144131
1500,0.0924,0.170952


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 32
Saving model checkpoint to ../tmp/results/checkpoint-500
Configuration saved in ../tmp/results/checkpoint-500/config.json
Model weights saved in ../tmp/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../tmp/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../tmp/results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this messag

TrainOutput(global_step=1875, training_loss=0.12296106567382813, metrics={'train_runtime': 4373.2602, 'train_samples_per_second': 13.72, 'train_steps_per_second': 0.429, 'total_flos': 7948043919360000.0, 'train_loss': 0.12296106567382813, 'epoch': 3.0})

In [53]:
test_text = test[['comment_text', 'toxicity']][:100000]
test_text['toxicity'] = test_text['toxicity'].astype('int')

In [54]:
# tokenize train and test datasets
test_dataset = Dataset.from_pandas(test_text).map(tokenize, batched=True)

# set dataset format for PyTorch
test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

  0%|          | 0/100 [00:00<?, ?ba/s]

In [55]:
outputs = trainer.predict(test_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text, toxicity. If comment_text, toxicity are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100000
  Batch size = 32


In [56]:
y_pred = outputs.predictions.argmax(1)

In [57]:
test_true = test[:100000]

In [58]:
from quality_calculator import compute_bias_metrics_for_model, calculate_overall_auc, get_final_metric


oof_name = 'predicted_target'
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
test_true[oof_name] = y_pred
#evaluation
bias_metrics_df = compute_bias_metrics_for_model(test_true, identity_columns, oof_name, 'toxicity')
display(bias_metrics_df)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_true, oof_name))
print(f"FINAL SCORE FOR LSTM IS {FINAL_SCORE}")   

  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_s

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,122,0.752858,0.80222,0.84875
6,black,227,0.764829,0.834398,0.829584
7,white,382,0.765809,0.772912,0.89083
4,jewish,103,0.77283,0.857634,0.813311
8,psychiatric_or_mental_illness,67,0.779487,0.837977,0.839451
5,muslim,207,0.793257,0.879766,0.812535
3,christian,313,0.819222,0.884357,0.833481
1,female,520,0.820065,0.859077,0.859938
0,male,471,0.840821,0.860521,0.878589


FINAL SCORE FOR LSTM IS 0.841515555294606
