In [26]:
from sagemaker.huggingface.model import HuggingFaceModel
from datasets import Dataset
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import awswrangler as wr
from quality_calculator import compute_bias_metrics_for_model, calculate_overall_auc, get_final_metric
from tqdm import tqdm
tqdm.pandas()
import json

SEED = 1234

import os
import random
import gc
import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Today I'm going to use {device.type}")

model_name = 'unitary/toxic-bert'

model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir = '../tmp/AutoModel')
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = '../tmp/AutoTokenizer')    

Today I'm going to use cpu


In [3]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()
gc.collect()
torch.cuda.empty_cache()
# del model
# del Trainer
# del tokenizer

In [4]:
with open("../data/godel.txt") as f:
    lines = f.readlines()
godel_test_comments = pd.DataFrame(lines, columns = ['comment_text'])
godel_test_comments['comment_text'] = godel_test_comments['comment_text'].apply(lambda x: x.strip())

In [5]:
sigmoid = torch.nn.Sigmoid()

In [15]:
def predict_toxicity(text):
    inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        model.to(device)
        outputs = model(**inputs).logits
    probas = np.around(sigmoid(outputs).cpu().detach().numpy(), 4).astype('str')
    return [{
        'toxicity': i[0],
        'severe_toxic': i[1],
        'obscene': i[2],
        'threat': i[3],
        'insult': i[4],
        'identity_hate': i[5]}
     for i in probas]

In [16]:
%%time
godel_test_comments['results'] = godel_test_comments['comment_text'].apply(predict_toxicity)

CPU times: user 35.8 s, sys: 70.6 ms, total: 35.8 s
Wall time: 18 s


In [40]:
json.dumps(godel_test_comments['results'][5][0])

'{"toxicity": "0.0012", "severe_toxic": "1e-04", "obscene": "0.0002", "threat": "1e-04", "insult": "0.0002", "identity_hate": "1e-04"}'

In [17]:
BUCKET_NAME = 'sagemaker-godeltech'
TEST_PATH = f"s3://{BUCKET_NAME}/data/test/test.csv"
test = wr.s3.read_csv([TEST_PATH])

In [18]:
%%time
results = test['comment_text'].progress_apply(predict_toxicity)
# results = np.vectorize(predict_toxicity)(test['comment_text'])

100%|██████████| 194641/194641 [2:02:56<00:00, 26.39it/s]  

CPU times: user 1h 28min 4s, sys: 35min 18s, total: 2h 3min 23s
Wall time: 2h 2min 56s





In [19]:
predictions = np.where(pd.json_normalize(results)['toxicity'] >= 0.5, 1, 0)

In [20]:
oof_name = 'predicted_target'
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
test[oof_name] = predictions
#evaluation
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, oof_name, 'toxicity')
display(bias_metrics_df)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test, oof_name))
print(f"FINAL SCORE FOR TOXIC-BERT IS {FINAL_SCORE}")  

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
5,muslim,2040,0.61284,0.706424,0.612986
7,white,2452,0.620279,0.696593,0.631096
6,black,1519,0.62693,0.695862,0.637012
3,christian,4226,0.627449,0.709378,0.623848
2,homosexual_gay_or_lesbian,1065,0.639722,0.682649,0.661615
4,jewish,835,0.642204,0.696075,0.650283
0,male,4386,0.67781,0.698222,0.684248
1,female,5155,0.679279,0.696321,0.687569
8,psychiatric_or_mental_illness,511,0.70891,0.690462,0.721993


FINAL SCORE FOR TOXIC-BERT IS 0.6741386142708594


In [33]:
from utils import save_to_s3, get_from_s3
from datetime import datetime

TODAY = datetime.today().strftime("%Y%m%d")
BUCKET_NAME = 'sagemaker-godeltech'
MODEL_PATH = "transformers/results"


np.savetxt(f"../tmp/pretrained_predictions_{TODAY}.csv", predictions, delimiter=",")
save_to_s3(BUCKET_NAME, f"../tmp/pretrained_predictions_{TODAY}.csv", f"{MODEL_PATH}/pretrained_predictions_{TODAY}.csv")