In [1]:
import os
import time
import awswrangler as wr
import pandas as pd
import numpy as np
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
from gensim.matutils import corpus2dense, corpus2csc
from tqdm import tqdm
from utils import setup_applevel_logger, get_logger, replace_typical_misspell, clean_text, clean_numbers
from utils import save_to_s3, get_from_s3
from datetime import datetime
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from quality_calculator import compute_bias_metrics_for_model, calculate_overall_auc, get_final_metric
import optuna
import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
TODAY = datetime.today().strftime("%Y%m%d")
BUCKET_NAME = 'sagemaker-godeltech'
TRAIN_PATH = f"s3://{BUCKET_NAME}/data/train/train.csv"
VAL_PATH = f"s3://{BUCKET_NAME}/data/validate/validate.csv"
TEST_PATH = f"s3://{BUCKET_NAME}/data/test/test.csv"
DICTIONARY_PATH = "xgboost/dictionary"
MODEL_PATH = "xgboost/models"
SEED = 1234
N_SAMPLES = 100000

In [3]:
train = wr.s3.read_csv([TRAIN_PATH])
val = wr.s3.read_csv([VAL_PATH])
test = wr.s3.read_csv([TEST_PATH])

In [4]:
train_sample = train.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
val_sample = val.sample(N_SAMPLES, random_state=SEED, ignore_index=True)

In [5]:
train.shape, val.shape, test.shape

((1443900, 2), (360975, 2), (194641, 12))

In [6]:
train_text = train_sample['comment_text']
val_text = val_sample['comment_text']
train_label = train_sample['toxicity']
val_label = val_sample['toxicity']
test_text = test['comment_text']

In [7]:
dictionary = corpora.Dictionary()

In [8]:
%%time

def simple_preproc(text):
    """
    It is a generator to preprocess texts.
    This lowercases, tokenizes, de-accents (optional) 
    the output are final tokens = unicode strings, that won’t be processed any further.
    """
    for line in text:
        yield simple_preprocess(line)



bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in simple_preproc(train_text)]
bow_val = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(val_text)]
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(test_text)]

CPU times: user 1min 5s, sys: 578 ms, total: 1min 6s
Wall time: 1min 6s


In [9]:
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 100000, there are 72759 words in dictionary


In [10]:
%%time

tfidf = models.TfidfModel(bow_train, dictionary=dictionary)
train_tfidf = tfidf[bow_train]
val_tfidf = tfidf[bow_val]
test_tfidf = tfidf[bow_test]
train_tfidf_sparse = corpus2csc(train_tfidf, num_terms=num_terms, num_docs=num_docs).T
val_tfidf_sparse = corpus2csc(val_tfidf, num_terms=num_terms).T
test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T

CPU times: user 39.8 s, sys: 570 ms, total: 40.3 s
Wall time: 41.1 s


In [11]:
BEST_SCORE = 0.0

def objective(trial):
    dtrain = xgb.DMatrix(train_tfidf_sparse, train_label)
    dvalid = xgb.DMatrix(val_tfidf_sparse, val_label)
    
    global BEST_SCORE
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    final_score = roc_auc_score(val_label, pred_labels)
    
    return final_score

optuna.logging.set_verbosity(50)

In [12]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))
print("Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

Number of finished trials:  50
Best trial:
Value: 0.7299360631561411
Params: 
  booster: gblinear
  lambda: 5.5004023750171485e-08
  alpha: 1.7494182917759452e-06
  subsample: 0.20047584650510672
  colsample_bytree: 0.6435510224914521


In [13]:
xgb_params = trial.params

In [15]:
train_text = train['comment_text']
val_text = val['comment_text']
train_label = train['toxicity']
val_label = val['toxicity']
test_text = test['comment_text']

In [16]:
dictionary = corpora.Dictionary()
bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in simple_preproc(train_text)]
bow_val = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(val_text)]
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(test_text)]
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 1443900, there are 248642 words in dictionary


In [17]:
tfidf = models.TfidfModel(bow_train, dictionary=dictionary)
train_tfidf = tfidf[bow_train]
val_tfidf = tfidf[bow_val]
test_tfidf = tfidf[bow_test]
train_tfidf_sparse = corpus2csc(train_tfidf, num_terms=num_terms, num_docs=num_docs).T
val_tfidf_sparse = corpus2csc(val_tfidf, num_terms=num_terms).T
test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T

In [18]:
%%time

evaluation = [(train_tfidf_sparse, train_label), (val_tfidf_sparse, val_label)]

# fit model no training data
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(train_tfidf_sparse, train_label, eval_set=evaluation, verbose=False)
xgb_model.save_model("../tmp/xgb_model_{TODAY}.json")
save_to_s3(BUCKET_NAME, f"../tmp/xgb_model_{TODAY}.json", f"{MODEL_PATH}/xgb_model_{TODAY}.json")

CPU times: user 2min 59s, sys: 616 ms, total: 3min
Wall time: 1min 55s


In [19]:
predictions = xgb_model.predict_proba(test_tfidf_sparse)[:, 1]
np.savetxt(f"../tmp/xgboost_predictions_{TODAY}.csv", predictions, delimiter=",")
save_to_s3(BUCKET_NAME, f"../tmp/xgboost_predictions_{TODAY}.csv", f"{MODEL_PATH}/xgboost_predictions_{TODAY}.csv")
oof_name = 'predicted_target'
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
test[oof_name] = predictions
#evaluation
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, oof_name, 'toxicity')
display(bias_metrics_df)
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test, oof_name))
print(f"FINAL SCORE FOR XGBOOST IS {FINAL_SCORE}")    

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.778925,0.794768,0.952943
6,black,1519,0.779328,0.75657,0.968102
7,white,2452,0.814111,0.779363,0.970143
5,muslim,2040,0.814363,0.817747,0.958238
4,jewish,835,0.856258,0.857751,0.957543
8,psychiatric_or_mental_illness,511,0.884055,0.848257,0.964934
0,male,4386,0.893863,0.884663,0.958468
1,female,5155,0.90165,0.897523,0.956176
3,christian,4226,0.907966,0.926231,0.940139


FINAL SCORE FOR XGBOOST IS 0.894442968762847


In [35]:
with open("../data/godel.txt") as f:
    lines = f.readlines()

In [37]:
godel_test_comments = pd.DataFrame(lines, columns = ['comment_text'])
godel_test_text = godel_test_comments['comment_text'].apply(lambda x: x.strip())

In [43]:
godel_bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in simple_preproc(godel_test_text)]
godel_test_tfidf = tfidf[godel_bow_test]
godel_test_tfidf_sparse = corpus2csc(godel_test_tfidf, num_terms=num_terms).T

In [45]:
godel_predictions = xgb_model.predict_proba(godel_test_tfidf_sparse)[:, 1]

In [47]:
godel_test_comments['toxicity'] = godel_predictions

In [49]:
godel_test_comments.sort_values(by='toxicity', ascending=False)

Unnamed: 0,comment_text,toxicity
20,"Official response in in your contract: 15 days term from invoice issue. If you issued your invoice the last day of month, then you get your money on 15th as latest.\n",0.018163
25,"You work, you get paid on about 15th, that's it. Why even bother asking, pushing and dicsussing\n",0.016985
17,"Correct, nothing offensive. Just improper use of tag. It's more distractive. You can tag anyone with such matter, we all read the contract\n",0.016284
23,"Artyom Levchenia ""If you issued your invoice the last day of month, then you get your money on 15th as latest."" - It seems like you put yourself in a role of an attorney (which in the contenxt of your loalty to the emploter is kind of admirable ), but this is simply not true at the moment, we've experienced delays couple of times. ""On 4th you get a note: it's incorrect. You send another one on 4th. You get paid by 19th. Simple."" - I'm not sure about this one ... I think you can still get pai...",0.016124
3,"Hi folks, any news maybe on this vital topic: 12 or 16?\n",0.015504
28,"And that mail carries Andrew and Victoria inside, so no, not finito \n",0.015285
5,"Olesia Buldakova I'm no Viktoryia, but you are wrong Who says you get paid on 16th? Nobody said a word about 16th. And why random person illegitimate post makes you threaten with breach of contract, be nervous etc. I say, I you see a post by Godel administration stating this and that, you can, but this is not the case. Especially tagging busy person responsible for our payments, and she, instead of doing her job, has to now read these void compaints and delay the payments Tag post author in...",0.015254
1,"Or on Tuesday, 16th \n",0.013894
8,"And even if those are, there's no guarantee that PKO and others will process those the same day.So Embrace yourselves\n",0.013851
21,"Btw, this is B2B chat, not employees. Employees have a right to know when they get paid, and their employer have a right to delay payments (reasonable). What we only get to know as entrepreneurs - if our invoice was received. And as per contract - if no response in 5 days - it was. If you get response that the invoice was incorrect, you prolong your payment period from the date of such notice. Say you send it on 31st. On 4th you get a note: it's incorrect. You send another one on 4th. You ge...",0.013394
