In [8]:
from datasets import Dataset
from transformers import AutoTokenizer
import os
import random
from sklearn.model_selection import train_test_split
import gc
import torch
from datetime import datetime
import awswrangler as wr
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Today I'm going to use {device.type}")

Today I'm going to use cuda


In [10]:
SEED = 1234
N_SAMPLES = 66000
TODAY = datetime.today().strftime("%Y%m%d")
BUCKET_NAME = 'sagemaker-godeltech'
TRAIN_PATH = f"s3://{BUCKET_NAME}/data/train/train.csv"
VAL_PATH = f"s3://{BUCKET_NAME}/data/validate/validate.csv"
TEST_PATH = f"s3://{BUCKET_NAME}/data/test/test.csv"
MODEL_PATH = "local_transformers/models"

In [11]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()
gc.collect()
torch.cuda.empty_cache()
# del model
# del Trainer
# del tokenizer

In [12]:
train = wr.s3.read_csv([TRAIN_PATH])
val = wr.s3.read_csv([VAL_PATH])
test = wr.s3.read_csv([TEST_PATH])

In [13]:
train_sample = train.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
val_sample = val.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train.shape, val.shape, test.shape

((1443900, 2), (360975, 2), (194641, 12))

In [14]:
train_sample['toxicity'] = train_sample['toxicity'].astype('int')
val_sample['toxicity'] = val_sample['toxicity'].astype('int')

In [15]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir = '../tmp/AutoTokenizer');

# create tokenization function
def tokenize(batch):
    return tokenizer(batch["comment_text"], padding="max_length", truncation=True)

# tokenize train and test datasets
train_dataset = Dataset.from_pandas(train_sample).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val_sample).map(tokenize, batched=True)

# set dataset format for PyTorch
train_dataset =  train_dataset.rename_column("toxicity", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset = val_dataset.rename_column("toxicity", "labels")
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/66 [00:00<?, ?ba/s]

  0%|          | 0/66 [00:00<?, ?ba/s]

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, cache_dir = '../tmp/AutoModel')

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [18]:
training_args = TrainingArguments(
    output_dir="../tmp/results",
    logging_dir="../tmp/results/logs",
    evaluation_strategy = "steps",
    save_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    seed=SEED,
    load_best_model_at_end=True,
    overwrite_output_dir=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 66000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2063


Step,Training Loss,Validation Loss
500,0.1847,0.145882
1000,0.1463,0.13408
1500,0.1396,0.134594
2000,0.135,0.126562


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66000
  Batch size = 32
Saving model checkpoint to ../tmp/results/checkpoint-500
Configuration saved in ../tmp/results/checkpoint-500/config.json
Model weights saved in ../tmp/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../tmp/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../tmp/results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this messag

TrainOutput(global_step=2063, training_loss=0.1512481760643549, metrics={'train_runtime': 7808.2012, 'train_samples_per_second': 8.453, 'train_steps_per_second': 0.264, 'total_flos': 8742848311296000.0, 'train_loss': 0.1512481760643549, 'epoch': 1.0})

In [20]:
test_text = test[['comment_text', 'toxicity']][:10000]
test_text['toxicity'] = test_text['toxicity'].astype('int')

In [21]:
# tokenize train and test datasets
test_dataset = Dataset.from_pandas(test_text).map(tokenize, batched=True)

# set dataset format for PyTorch
test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

  0%|          | 0/10 [00:00<?, ?ba/s]

In [22]:
outputs = trainer.predict(test_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: toxicity, comment_text. If toxicity, comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


In [32]:
from utils import save_to_s3

y_pred = outputs.predictions.argmax(1)
np.savetxt(f"../tmp/transformers_predictions{TODAY}.csv", y_pred, delimiter=",")
save_to_s3(BUCKET_NAME, f"../tmp/transformers_predictions{TODAY}.csv", f"{MODEL_PATH}/transformers_predictions{TODAY}.csv")

In [29]:
test_true = test[:10000]

In [31]:
from quality_calculator import compute_bias_metrics_for_model, calculate_overall_auc, get_final_metric


oof_name = 'predicted_target'
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
test_true[oof_name] = y_pred
#evaluation
bias_metrics_df = compute_bias_metrics_for_model(test_true, identity_columns, oof_name, 'toxicity')
display(bias_metrics_df)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_true, oof_name))
print(f"FINAL SCORE FOR CUSTOM TRANSFORMERS IS {FINAL_SCORE}")   

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
8,psychiatric_or_mental_illness,14,0.458333,0.657833,0.69881
2,homosexual_gay_or_lesbian,13,0.568182,0.656883,0.808658
0,male,71,0.627941,0.683114,0.843868
6,black,36,0.633333,0.656699,0.873923
7,white,68,0.666667,0.701538,0.863362
1,female,74,0.722034,0.809693,0.812613
3,christian,24,0.722689,0.836092,0.784588
4,jewish,11,0.928571,0.906124,0.918987
5,muslim,26,0.931818,0.905846,0.922233


FINAL SCORE FOR CUSTOM TRANSFORMERS IS 0.7618243654295905
