In [1]:
import os
import re
import string
import boto3
from botocore.exceptions import ClientError
import awswrangler as wr
import pandas as pd
import numpy as np

SEED = 1234
N_SAMPLES = 100_000
np.random.seed(SEED)

import gensim.downloader as api
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import models
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense, corpus2csc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

from tqdm import tqdm

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
chunks = pd.read_csv('../data/toxic_data.csv', chunksize=100000)
df = pd.concat(chunks)
df.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental hospitals. Boorah,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [3]:
df['comment_text'] = df['comment_text'].fillna("")

In [4]:
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
for col in identity_columns + ['toxicity']:
    df.loc[:, col] = np.where(df[col] >= 0.5, True, False)

In [5]:
train_df = df[df['split'] == 'train']
test_df = df[df['split'] != 'train']

In [6]:
train_df.shape, test_df.shape

((1804875, 46), (194641, 46))

In [9]:
# sample = df
sample = train_df.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train_text, val_text, train_label, val_label = train_test_split(sample['comment_text'], sample['toxicity'], test_size=0.2, random_state=SEED)
# train_text, train_label = sample['comment_text'], sample['toxicity']
test_text, test_label = test_df['comment_text'], test_df['toxicity']

# train_label = train_label.astype('int').to_numpy()
# val_label = val_label.astype('int').to_numpy()
# test_label = test_label.astype('int').to_numpy()

In [12]:
val_df = df.iloc[val_text.index]
val_df.shape

(20000, 46)

In [13]:
train_text.shape, val_text.shape, test_text.shape

((80000,), (20000,), (194641,))

In [14]:
misspell_dict = {"aren't": "are not", "can't": "cannot", "couldn't": "could not",
                 "didn't": "did not", "doesn't": "does not", "don't": "do not",
                 "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would", "he'll": "he will", "he's": "he is",
                 "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
                 "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
                 "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
                 "she'd": "she would", "she'll": "she will", "she's": "she is",
                 "shouldn't": "should not", "that's": "that is", "there's": "there is",
                 "they'd": "they would", "they'll": "they will", "they're": "they are",
                 "they've": "they have", "we'd": "we would", "we're": "we are",
                 "weren't": "were not", "we've": "we have", "what'll": "what will",
                 "what're": "what are", "what's": "what is", "what've": "what have",
                 "where's": "where is", "who'd": "who would", "who'll": "who will",
                 "who're": "who are", "who's": "who is", "who've": "who have",
                 "won't": "will not", "wouldn't": "would not", "you'd": "you would",
                 "you'll": "you will", "you're": "you are", "you've": "you have",
                 "'re": " are", "wasn't": "was not", "we'll": " will", "tryin'": "trying"}


def _get_misspell(misspell_dict):
    misspell_re = re.compile('(%s)' % '|'.join(misspell_dict.keys()))
    return misspell_dict, misspell_re


def replace_typical_misspell(text):
    misspellings, misspellings_re = _get_misspell(misspell_dict)

    def replace(match):
        return misspellings[match.group(0)]

    return misspellings_re.sub(replace, text)
    

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']


def clean_text(x):
    x = str(x)
    for punct in puncts + list(string.punctuation):
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    return re.sub('\d+', ' ', x)

In [15]:
# clean misspellings
train_text = train_text.apply(replace_typical_misspell)
val_text = val_text.apply(replace_typical_misspell)
test_text = test_text.apply(replace_typical_misspell)

# clean the text
train_text = train_text.apply(clean_text)
val_text = val_text.apply(replace_typical_misspell)
test_text = test_text.apply(clean_text)

# clean numbers
train_text = train_text.apply(clean_numbers)
val_text = val_text.apply(replace_typical_misspell)
test_text = test_text.apply(clean_numbers)

In [16]:
def simple_preproc(text):
    for line in text:
        yield simple_preprocess(line)

In [17]:
# train_lists = list(map(lambda x: simple_preprocess(x), train_text))
# test_lists = list(map(lambda x: simple_preprocess(x), test_text))  

In [18]:
# train_lists = [simple_preprocess(doc) for doc in tqdm(train_text)]
# # val_lists = [simple_preprocess(doc) for doc in val_text]
# test_lists = [simple_preprocess(doc) for doc in tqdm(test_text)]

In [19]:
# Create the Dictionary and Corpus
dictionary = corpora.Dictionary()

#allow_update=True - add new words to dictionary
bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in tqdm(simple_preproc(train_text))]
bow_val = [dictionary.doc2bow(doc, allow_update=False) for doc in tqdm(simple_preproc(val_text))]
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in tqdm(simple_preproc(test_text))] 

dictionary.save("my_dictionary_full")
loaded_dict = corpora.Dictionary.load("my_dictionary_full")

80000it [00:10, 7959.95it/s]
20000it [00:01, 10410.08it/s]
194641it [00:17, 11085.91it/s]


In [20]:
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 80000, there are 66301 words in dictionary


In [21]:
tfidf = models.TfidfModel(bow_train, dictionary=loaded_dict)
train_tfidf = tfidf[bow_train]
val_tfidf = tfidf[bow_val]
test_tfidf = tfidf[bow_test]

In [22]:
train_tfidf_sparse = corpus2csc(train_tfidf, num_terms=num_terms, num_docs=num_docs).T
val_tfidf_sparse = corpus2csc(val_tfidf, num_terms=num_terms).T
test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T

In [23]:
# Getting all memory using os.popen()
total_memory, used_memory, free_memory = map(
    int, os.popen('free -t -m').readlines()[-1].split()[1:])
  
# Memory usage
print("RAM memory % used:", round((used_memory/total_memory) * 100, 2))

RAM memory % used: 65.12


In [24]:
%%time
logreg = LogisticRegression(max_iter=1000)
logreg.fit(train_tfidf_sparse, train_label)
oof_name = 'predicted_target'
test_df[oof_name] = logreg.predict_proba(test_tfidf_sparse)[:, 1]

CPU times: user 8.87 s, sys: 6.22 s, total: 15.1 s
Wall time: 4.55 s


In [25]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, oof_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[oof_name])

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)
oof_name = 'predicted_target'
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.741849,0.721643,0.938344
6,black,1519,0.757816,0.684025,0.955764
7,white,2452,0.787267,0.668299,0.968171
5,muslim,2040,0.790704,0.740633,0.945972
4,jewish,835,0.823446,0.833454,0.917689
8,psychiatric_or_mental_illness,511,0.854962,0.770391,0.957664
0,male,4386,0.857507,0.809902,0.948857
1,female,5155,0.863382,0.844982,0.935141
3,christian,4226,0.878163,0.903279,0.898506


In [26]:
def calculate_overall_auc(df, oof_name):
    true_labels = df['toxicity']
    predicted_labels = df[oof_name]
    return roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
    
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.8554141464717673


In [27]:
import pickle
# # Save to file in the current working directory
pkl_filename = "logreg_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(logreg, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    upload_pickle_model = pickle.load(file)


In [28]:
evaluation = [(train_tfidf_sparse, train_label), (val_tfidf_sparse, val_label)]

In [29]:
%%time
# fit model no training data
xgb_model = XGBClassifier(
    max_depth=6,
    eta=0.2,
    gamma=2.5,
    min_child_weight=4,
    subsample=0.5,
    reg_alpha=103.0, reg_lambda=0.913,
    verbosity=1,
    colsample_bytree=0.82,
    early_stopping_rounds=10, 
    objective='binary:logistic',
    )
xgb_model.fit(train_tfidf_sparse, train_label, eval_set=evaluation, verbose=False)

CPU times: user 1min 11s, sys: 83.7 ms, total: 1min 11s
Wall time: 31.1 s


In [30]:
xgb_model

In [31]:
xgb_model.save_model("model_xgb.json")

In [33]:
xgb_model2 = xgb.XGBClassifier()
xgb_model2.load_model("model_xgb.json")

In [34]:
oof_name = 'predicted_target_xgb'
test_df[oof_name] = xgb_model.predict_proba(test_tfidf_sparse)[:, 1]

In [35]:
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
5,muslim,2040,0.639236,0.739544,0.660617
2,homosexual_gay_or_lesbian,1065,0.642977,0.753479,0.646954
6,black,1519,0.665596,0.64634,0.777883
7,white,2452,0.671381,0.383721,0.946613
1,female,5155,0.709359,0.752724,0.713018
3,christian,4226,0.715402,0.7852,0.678134
0,male,4386,0.718436,0.709568,0.764281
4,jewish,835,0.723026,0.770162,0.70664
8,psychiatric_or_mental_illness,511,0.723725,0.750371,0.728261


In [36]:
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.6793432662425507


In [39]:
test_strings = ["you've fucked up Fd asshole $5", "55 go fuck yourself, fuckin' fuck. Die!@", "I am lesbian and I proud of it",
               "BLM", "BLM sucks", "I think people like you should die", "I love you friends", "BLM $uck$",
               "I hate software development", "I hate software development and you", 
               "Go and kill this motherfucke"]

for text in test_strings:
    line = replace_typical_misspell(text)
    line = clean_text(line)
    line = clean_numbers(line)
    bow = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc([line])]
    test_tfidf = tfidf[bow]
    test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T
    preds_lr = upload_pickle_model.predict_proba(test_tfidf_sparse)[:, 1]
    preds_xgb = xgb_model2.predict_proba(test_tfidf_sparse)[:, 1]
    print(f"Text: {text}, Logreg = {preds_lr[0]}, XGB = {preds_xgb[0]}")

Text: you've fucked up Fd asshole $5, Logreg = 0.21266679867372354, XGB = 0.073288694024086
Text: 55 go fuck yourself, fuckin' fuck. Die!@, Logreg = 0.6255653885904803, XGB = 0.0632777065038681
Text: I am lesbian and I proud of it, Logreg = 0.07872407913449166, XGB = 0.06692002713680267
Text: BLM, Logreg = 0.04569240274833987, XGB = 0.0632777065038681
Text: BLM sucks, Logreg = 0.38191035258790695, XGB = 0.0632777065038681
Text: I think people like you should die, Logreg = 0.1713093092088512, XGB = 0.07706321030855179
Text: I love you friends, Logreg = 0.050822355079904485, XGB = 0.07682149857282639
Text: BLM $uck$, Logreg = 0.07854102989800502, XGB = 0.0632777065038681
Text: I hate software development, Logreg = 0.0655529238879135, XGB = 0.0632777065038681
Text: I hate software development and you, Logreg = 0.0899044900881707, XGB = 0.09542418271303177
Text: Go and kill this motherfucke, Logreg = 0.8279900674098539, XGB = 0.09661971032619476


In [45]:
import optuna

In [50]:
def objective(trial):
    dtrain = xgb.DMatrix(train_tfidf_sparse, train_label)
    dvalid = xgb.DMatrix(val_tfidf_sparse, val_label)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    oof_name = 'predicted_target_xgb'
    val_df[oof_name] = pred_labels
    bias_metrics_df = compute_bias_metrics_for_model(val_df, identity_columns, oof_name, 'toxicity')
    final_score = get_final_metric(bias_metrics_df, calculate_overall_auc(val_df, oof_name))

    
    return final_score

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-09-05 13:01:29,433][0m A new study created in memory with name: no-name-ea92a188-a175-4283-8d74-1a8477ef3155[0m
[32m[I 2022-09-05 13:01:33,275][0m Trial 0 finished with value: 0.4975723746757953 and parameters: {'booster': 'dart', 'lambda': 0.010589527861494485, 'alpha': 3.599207773037466e-08, 'subsample': 0.9717306822464233, 'colsample_bytree': 0.6878529510648617, 'max_depth': 3, 'min_child_weight': 5, 'eta': 2.0927823984492195e-07, 'gamma': 1.0179939970887877e-05, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 1.3545813581179205e-08, 'skip_drop': 8.96242465217752e-07}. Best is trial 0 with value: 0.4975723746757953.[0m
[32m[I 2022-09-05 13:01:33,752][0m Trial 1 finished with value: 0.5 and parameters: {'booster': 'gblinear', 'lambda': 0.007168862991758686, 'alpha': 1.3632155923293704e-05, 'subsample': 0.9292950312169705, 'colsample_bytree': 0.8020031606250246}. Best is trial 1 with value: 0.5.[0m
[32m[I 2022-09-05 13