In [1]:
# %%capture
# import IPython
# %conda install -c conda-forge ipywidgets -y
# IPython.Application.instance().kernel.do_shutdown(True)

In [32]:
from datasets import Dataset
from transformers import AutoTokenizer
# from datasets.filesystems import S3FileSystem
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface.model import HuggingFaceModel


from sklearn.model_selection import train_test_split
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

SEED = 1234

In [43]:
import os
import sagemaker
import pandas as pd
import awswrangler as wr
import numpy as np
import botocore
import string
import re
from sklearn.metrics import roc_auc_score

# sess = sagemaker.Session()
sagemaker_session_bucket = 'godelsagemaker'
# if sagemaker_session_bucket is None and sess is not None:
#     sagemaker_session_bucket = sess.default_bucket()

# try:
#     role = sagemaker.get_execution_role()
# except ValueError:
#     role = os.getenv('SAGEMAKER_ROLE')


# sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# print(f"sagemaker role arn: {role}")
# print(f"sagemaker bucket: {sess.default_bucket()}")
# print(f"sagemaker session region: {sess.boto_region_name}")

In [33]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

device(type='cuda')

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Today I'm going to use {device.type}")

In [4]:
##HERE WILL BE VARIABLES
SEED = 1234
N_SAMPLES = 100000

PATH_DATA = f"s3://{sagemaker_session_bucket}/data/toxic_data.csv"
S3_PREFIX = 'HuggingFaceExperiment'
PATH_SCRIPT = f"s3://{sagemaker_session_bucket}/HuggingFaceExperiment/scripts/"

In [5]:
# chunks= wr.s3.read_csv(path=PATH_DATA, chunksize=10000)
chunks= pd.read_csv("../data/toxic_data.csv", chunksize=10000)
df = pd.concat(chunks)

In [6]:
df['comment_text'] = df['comment_text'].fillna("")
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
for col in identity_columns + ['toxicity']:
    df.loc[:, col] = np.where(df[col] >= 0.5, 1, 0)

In [7]:
col_to_drop = [col for col in df.columns if col not in identity_columns + ['toxicity', 'comment_text', 'split']]
df = df.drop(col_to_drop, axis=1)

In [8]:
train_df = df[df['split'] == 'train']
test_df = df[df['split'] != 'train']
train_df.shape, test_df.shape

((1804875, 12), (194641, 12))

In [9]:
sample = train_df.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train_text, val_text = train_test_split(sample, test_size=0.2, random_state=SEED)
# train_text, train_label = sample['comment_text'], sample['toxicity']
test_text = test_df.reset_index(drop=True)

In [10]:
train_text = train_text[['comment_text', 'toxicity']].reset_index(drop=True)
val_text = val_text[['comment_text', 'toxicity']].reset_index(drop=True)

In [11]:
misspell_dict = {"aren't": "are not", "can't": "cannot", "couldn't": "could not",
                 "didn't": "did not", "doesn't": "does not", "don't": "do not",
                 "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would", "he'll": "he will", "he's": "he is",
                 "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
                 "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
                 "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
                 "she'd": "she would", "she'll": "she will", "she's": "she is",
                 "shouldn't": "should not", "that's": "that is", "there's": "there is",
                 "they'd": "they would", "they'll": "they will", "they're": "they are",
                 "they've": "they have", "we'd": "we would", "we're": "we are",
                 "weren't": "were not", "we've": "we have", "what'll": "what will",
                 "what're": "what are", "what's": "what is", "what've": "what have",
                 "where's": "where is", "who'd": "who would", "who'll": "who will",
                 "who're": "who are", "who's": "who is", "who've": "who have",
                 "won't": "will not", "wouldn't": "would not", "you'd": "you would",
                 "you'll": "you will", "you're": "you are", "you've": "you have",
                 "'re": " are", "wasn't": "was not", "we'll": " will", "tryin'": "trying"}


def _get_misspell(misspell_dict):
    misspell_re = re.compile('(%s)' % '|'.join(misspell_dict.keys()))
    return misspell_dict, misspell_re


def replace_typical_misspell(text):
    misspellings, misspellings_re = _get_misspell(misspell_dict)

    def replace(match):
        return misspellings[match.group(0)]

    return misspellings_re.sub(replace, text)
    

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '\n']


def clean_text(x):
    x = str(x)
    for punct in puncts + list(string.punctuation):
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    return re.sub('\d+', ' ', x)

In [12]:
# clean misspellings
train_text['comment_text'] = train_text['comment_text'].apply(replace_typical_misspell)
val_text['comment_text'] = val_text['comment_text'].apply(replace_typical_misspell)

# clean the text
train_text['comment_text'] = train_text['comment_text'].apply(clean_text)
val_text['comment_text'] = val_text['comment_text'].apply(clean_text)

# clean numbers
train_text['comment_text'] = train_text['comment_text'].apply(clean_numbers)
val_text['comment_text'] = val_text['comment_text'].apply(clean_numbers)


In [13]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# create tokenization function
def tokenize(batch):
    return tokenizer(batch["comment_text"], padding="max_length", truncation=True)

# tokenize train and test datasets
train_dataset = Dataset.from_pandas(train_text).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val_text).map(tokenize, batched=True)

# set dataset format for PyTorch
train_dataset =  train_dataset.rename_column("toxicity", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset = val_dataset.rename_column("toxicity", "labels")
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [14]:
# del model
# del trainer

In [34]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/studio-lab-user/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.3",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/studio-lab-user/.cache/huggin

In [35]:
training_args = TrainingArguments(
    output_dir="../data/results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 80000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5000


Step,Training Loss
500,0.2282
1000,0.1712
1500,0.1656
2000,0.1507
2500,0.1486
3000,0.1468
3500,0.1459
4000,0.1465
4500,0.1336
5000,0.1418


Saving model checkpoint to ../data/results/checkpoint-500
Configuration saved in ../data/results/checkpoint-500/config.json
Model weights saved in ../data/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../data/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../data/results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../data/results/checkpoint-1000
Configuration saved in ../data/results/checkpoint-1000/config.json
Model weights saved in ../data/results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../data/results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../data/results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ../data/results/checkpoint-1500
Configuration saved in ../data/results/checkpoint-1500/config.json
Model weights saved in ../data/results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ../data/results/checkpoint-1500/token

TrainOutput(global_step=5000, training_loss=0.1579070999145508, metrics={'train_runtime': 4019.9674, 'train_samples_per_second': 19.901, 'train_steps_per_second': 1.244, 'total_flos': 1.059739189248e+16, 'train_loss': 0.1579070999145508, 'epoch': 1.0})

In [36]:
# clean misspellings
test_text['comment_text'] = test_text['comment_text'].apply(replace_typical_misspell)
# clean the text
test_text['comment_text'] = test_text['comment_text'].apply(clean_text)
# clean numbers
test_text['comment_text'] = test_text['comment_text'].apply(clean_numbers)
# tokenize train and test datasets
test_dataset = Dataset.from_pandas(test_text).map(tokenize, batched=True)

# set dataset format for PyTorch
test_dataset =  test_dataset.rename_column("toxicity", "labels")
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

  0%|          | 0/195 [00:00<?, ?ba/s]

In [None]:
predictions = trainer.predict(test_dataset=test_dataset)
predictions[:10]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: black, male, female, homosexual_gay_or_lesbian, jewish, muslim, split, comment_text, white, christian, psychiatric_or_mental_illness. If black, male, female, homosexual_gay_or_lesbian, jewish, muslim, split, comment_text, white, christian, psychiatric_or_mental_illness are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 194641
  Batch size = 16


(array([[-0.6710094 ,  0.96837324],
        [-1.8586667 ,  1.8269457 ],
        [-1.6604707 ,  1.6546588 ],
        ...,
        [ 2.3753507 , -2.8038104 ],
        [ 0.09641217,  0.20841846],
        [-0.6406097 ,  0.8889576 ]], dtype=float32),
 array([1, 1, 1, ..., 0, 0, 0]),
 {'test_loss': 0.13494817912578583,
  'test_runtime': 3592.1353,
  'test_samples_per_second': 54.185,
  'test_steps_per_second': 3.387})

In [None]:
oof_name = 'predicted_target'
test_df[oof_name] = predictions.label_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[oof_name] = predictions.label_ids


In [44]:
for col in identity_columns + ['toxicity']:
    test_df.loc[:, col] = np.where(test_df[col] >= 0.5, True, False)


SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, oof_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[oof_name])

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, col] = np.where(test_df[col] >= 0.5, True, False)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
  examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
  examples = subgroup_negative_example

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,male,4386,1.0,1.0,1.0
1,female,5155,1.0,1.0,1.0
2,homosexual_gay_or_lesbian,1065,1.0,1.0,1.0
3,christian,4226,1.0,1.0,1.0
4,jewish,835,1.0,1.0,1.0
5,muslim,2040,1.0,1.0,1.0
6,black,1519,1.0,1.0,1.0
7,white,2452,1.0,1.0,1.0
8,psychiatric_or_mental_illness,511,1.0,1.0,1.0


In [45]:
def calculate_overall_auc(df, oof_name):
    true_labels = df['toxicity']
    predicted_labels = df[oof_name]
    return roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 1.0


In [51]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: comment_text. If comment_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 16


{'eval_loss': 0.14001785218715668,
 'eval_runtime': 352.7187,
 'eval_samples_per_second': 56.702,
 'eval_steps_per_second': 3.544,
 'epoch': 1.0}

In [53]:
trainer.save_model()

Saving model checkpoint to ../data/results
Configuration saved in ../data/results/config.json
Model weights saved in ../data/results/pytorch_model.bin
tokenizer config file saved in ../data/results/tokenizer_config.json
Special tokens file saved in ../data/results/special_tokens_map.json


In [55]:
test_string = "Hello, how are you?"

In [58]:
# tokenize train and test datasets
test_string = Dataset.from_dict({"comment_text": test_string}).map(tokenize, batched=True)

# set dataset format for PyTorch
test_string.set_format("torch", columns=["input_ids", "attention_mask", ])

  0%|          | 0/1 [00:00<?, ?ba/s]

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).