In [5]:
import os
import time
import awswrangler as wr
import pandas as pd
import numpy as np
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
from gensim.matutils import corpus2dense, corpus2csc
from tqdm import tqdm
from utils import setup_applevel_logger, get_logger, replace_typical_misspell, clean_text, clean_numbers
from utils import save_to_s3, get_from_s3
from datetime import datetime
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from quality_calculator import compute_bias_metrics_for_model, calculate_overall_auc, get_final_metric
import optuna
import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
TODAY = datetime.today().strftime("%Y%m%d")
BUCKET_NAME = 'sagemaker-godeltech'
TRAIN_PATH = f"s3://{BUCKET_NAME}/data/train/train.csv"
VAL_PATH = f"s3://{BUCKET_NAME}/data/validate/validate.csv"
TEST_PATH = f"s3://{BUCKET_NAME}/data/test/test.csv"
DICTIONARY_PATH = "xgboost/dictionary"
MODEL_PATH = "xgboost/models"
SEED = 1234
N_SAMPLES = 100000

In [7]:
train = wr.s3.read_csv([TRAIN_PATH])
val = wr.s3.read_csv([VAL_PATH])
test = wr.s3.read_csv([TEST_PATH])

In [11]:
train_sample = train.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
val_sample = val.sample(N_SAMPLES, random_state=SEED, ignore_index=True)

In [12]:
train.shape, val.shape, test.shape

((1443900, 2), (360975, 2), (194641, 12))

In [13]:
train_text = train_sample['comment_text']
val_text = val_sample['comment_text']
train_label = train_sample['toxicity']
val_label = val_sample['toxicity']
test_text = test['comment_text']

In [14]:
dictionary = corpora.Dictionary()

In [15]:
%%time

def simple_preproc(text):
    """
    It is a generator to preprocess texts.
    This lowercases, tokenizes, de-accents (optional) 
    the output are final tokens = unicode strings, that won’t be processed any further.
    """
    for line in text:
        yield simple_preprocess(line)



bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in simple_preproc(train_text)]
bow_val = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(val_text)]
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(test_text)]

CPU times: user 33.3 s, sys: 1.24 s, total: 34.6 s
Wall time: 34.6 s


In [16]:
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 100000, there are 72759 words in dictionary


In [17]:
%%time

tfidf = models.TfidfModel(bow_train, dictionary=dictionary)
train_tfidf = tfidf[bow_train]
val_tfidf = tfidf[bow_val]
test_tfidf = tfidf[bow_test]
train_tfidf_sparse = corpus2csc(train_tfidf, num_terms=num_terms, num_docs=num_docs).T
val_tfidf_sparse = corpus2csc(val_tfidf, num_terms=num_terms).T
test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T

CPU times: user 24.6 s, sys: 1.1 s, total: 25.7 s
Wall time: 25.8 s


In [18]:
## tip: if run on GPU than XGBRegressor(tree_method='gpu_hist', gpu_id=0)

BEST_SCORE = 0.0

def objective(trial):
    dtrain = xgb.DMatrix(train_tfidf_sparse, train_label)
    dvalid = xgb.DMatrix(val_tfidf_sparse, val_label)
    
    global BEST_SCORE
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }
    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    final_score = roc_auc_score(val_label, pred_labels)
    
    return final_score

optuna.logging.set_verbosity(50)

In [19]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))
print("Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

Number of finished trials:  50
Best trial:
Value: 0.7301026807114539
Params: 
  booster: gblinear
  lambda: 5.790750836239364e-08
  alpha: 2.1646078362874597e-06
  subsample: 0.2812164532284933
  colsample_bytree: 0.31135975873656624


In [20]:
xgb_params = trial.params

In [21]:
train_text = train['comment_text']
val_text = val['comment_text']
train_label = train['toxicity']
val_label = val['toxicity']
test_text = test['comment_text']

In [22]:
dictionary = corpora.Dictionary()
bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in simple_preproc(train_text)]
bow_val = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(val_text)]
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(test_text)]
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 1443900, there are 248642 words in dictionary


In [23]:
tfidf = models.TfidfModel(bow_train, dictionary=dictionary)
train_tfidf = tfidf[bow_train]
val_tfidf = tfidf[bow_val]
test_tfidf = tfidf[bow_test]
train_tfidf_sparse = corpus2csc(train_tfidf, num_terms=num_terms, num_docs=num_docs).T
val_tfidf_sparse = corpus2csc(val_tfidf, num_terms=num_terms).T
test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T

In [28]:
%%time

evaluation = [(train_tfidf_sparse, train_label), (val_tfidf_sparse, val_label)]

# fit model no training data
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(train_tfidf_sparse, train_label, eval_set=evaluation, verbose=False)
xgb_model.save_model(f"../tmp/xgb_model_{TODAY}.json")
save_to_s3(BUCKET_NAME, f"../tmp/xgb_model_{TODAY}.json", f"{MODEL_PATH}/xgb_model_{TODAY}.json")



CPU times: user 2min 31s, sys: 1.01 s, total: 2min 32s
Wall time: 1min 49s


In [29]:
predictions = xgb_model.predict_proba(test_tfidf_sparse)[:, 1]
np.savetxt(f"../tmp/xgboost_predictions_{TODAY}.csv", predictions, delimiter=",")
save_to_s3(BUCKET_NAME, f"../tmp/xgboost_predictions_{TODAY}.csv", f"{MODEL_PATH}/xgboost_predictions_{TODAY}.csv")
oof_name = 'predicted_target'
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
test[oof_name] = predictions
#evaluation
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, oof_name, 'toxicity')
display(bias_metrics_df)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test, oof_name))
print(f"FINAL SCORE FOR XGBOOST IS {FINAL_SCORE}")    

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.778182,0.792753,0.953066
6,black,1519,0.778235,0.753908,0.968209
7,white,2452,0.813015,0.775915,0.970367
5,muslim,2040,0.814133,0.815249,0.958486
4,jewish,835,0.856975,0.856525,0.958015
8,psychiatric_or_mental_illness,511,0.882332,0.845194,0.96473
0,male,4386,0.8927,0.882911,0.958298
1,female,5155,0.90083,0.896151,0.955902
3,christian,4226,0.905837,0.925548,0.938251


FINAL SCORE FOR XGBOOST IS 0.8934281875538059
