In [1]:
import os
import boto3
from botocore.exceptions import ClientError
import awswrangler as wr
import pandas as pd
import numpy as np

SEED = 1234
N_SAMPLES = 10_000
np.random.seed(SEED)

import gensim.downloader as api
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import models
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense, corpus2csc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
chunks = pd.read_csv('../data/toxic_data.csv', chunksize=100000)
df = pd.concat(chunks)
df.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental hospitals. Boorah,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [3]:
df['comment_text'] = df['comment_text'].fillna("")

In [4]:
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
for col in identity_columns + ['toxicity']:
    df.loc[:, col] = np.where(df[col] >= 0.5, True, False)

In [5]:
train_df = df[df['split'] == 'train']
test_df = df[df['split'] != 'train']

In [6]:
train_df.shape, test_df.shape

((1804875, 46), (194641, 46))

In [7]:
sample = train_df.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
# train_text, val_text, train_label, val_label = train_test_split(sample['comment_text'], sample['toxicity'], test_size=0.2, random_state=SEED)
train_text, train_label = sample['comment_text'], sample['toxicity']
test_text, test_label = test_df['comment_text'], test_df['toxicity']

In [22]:
train_text.shape, test_text.shape

((10000,), (194641,))

In [8]:
train_lists = [simple_preprocess(doc) for doc in tqdm(train_text)]
# val_lists = [simple_preprocess(doc) for doc in val_text]
test_lists = [simple_preprocess(doc) for doc in tqdm(test_text)]

100%|██████████| 10000/10000 [00:00<00:00, 15614.27it/s]
100%|██████████| 194641/194641 [00:10<00:00, 18163.57it/s]


In [9]:
# Create the Dictionary and Corpus
dictionary = corpora.Dictionary()

#allow_update=True - add new words to dictionary
bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in tqdm(train_lists)]
# bow_val = [dictionary.doc2bow(doc, allow_update=False) for doc in tqdm(val_lists)] 
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in tqdm(test_lists)] 

100%|██████████| 10000/10000 [00:01<00:00, 9997.47it/s]
100%|██████████| 194641/194641 [00:07<00:00, 25808.02it/s]


In [10]:
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 10000, there are 26602 words in dictionary


In [11]:
tfidf = models.TfidfModel(bow_train, dictionary=dictionary)
train_tfidf = tfidf[bow_train]
# val_tfidf = tfidf[bow_val]
test_tfidf = tfidf[bow_test]

In [12]:
train_tfidf_sparse = corpus2csc(train_tfidf, num_terms=num_terms, num_docs=num_docs).T
# val_tfidf_sparse = corpus2csc(val_tfidf, num_terms=num_terms).T
test_tfidf_sparse = corpus2csc(test_tfidf, num_terms=num_terms).T

In [13]:
# Getting all memory using os.popen()
total_memory, used_memory, free_memory = map(
    int, os.popen('free -t -m').readlines()[-1].split()[1:])
  
# Memory usage
print("RAM memory % used:", round((used_memory/total_memory) * 100, 2))

RAM memory % used: 35.41


In [14]:
%%time
logreg = LogisticRegression(max_iter=1000)
logreg.fit(train_tfidf_sparse, train_label)
oof_name = 'predicted_target'
test_df[oof_name] = logreg.predict_proba(test_tfidf_sparse)[:, 1]

CPU times: user 889 ms, sys: 725 ms, total: 1.61 s
Wall time: 522 ms


In [15]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, oof_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[oof_name])

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)
oof_name = 'predicted_target'
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.680772,0.662993,0.873788
6,black,1519,0.703869,0.590587,0.920969
5,muslim,2040,0.719848,0.734705,0.85081
7,white,2452,0.722973,0.523664,0.952675
4,jewish,835,0.765059,0.773723,0.856239
8,psychiatric_or_mental_illness,511,0.776671,0.832971,0.800919
0,male,4386,0.796861,0.69436,0.919761
1,female,5155,0.797418,0.76392,0.879388
3,christian,4226,0.819493,0.837312,0.837789


In [16]:
def calculate_overall_auc(df, oof_name):
    true_labels = df['toxicity']
    predicted_labels = df[oof_name]
    return roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
    
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.782651652886569


In [17]:
%%time

parameters = {'C': np.logspace(-3, 1, 5)}
clf = GridSearchCV(logreg, parameters, n_jobs=-1, scoring='roc_auc', cv=5, verbose=5)
clf.fit(train_tfidf_sparse, train_label)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=10000), n_jobs=-1,
             param_grid=parameters,
             scoring='roc_auc')

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ...........................C=0.001;, score=0.785 total time=   0.2s
[CV 2/5] END ...........................C=0.001;, score=0.797 total time=   0.2s
[CV 3/5] END ...........................C=0.001;, score=0.772 total time=   0.2s
[CV 4/5] END ...........................C=0.001;, score=0.814 total time=   0.2s
[CV 5/5] END ...........................C=0.001;, score=0.769 total time=   0.1s
[CV 1/5] END ............................C=0.01;, score=0.785 total time=   0.1s
[CV 2/5] END ............................C=0.01;, score=0.797 total time=   0.1s
[CV 4/5] END ............................C=0.01;, score=0.815 total time=   0.1s
[CV 3/5] END ............................C=0.01;, score=0.773 total time=   0.2s
[CV 5/5] END ............................C=0.01;, score=0.769 total time=   0.2s
[CV 1/5] END .............................C=0.1;, score=0.789 total time=   0.2s
[CV 3/5] END .............................C=0.1;,

In [18]:
clf.best_score_, clf.best_params_ 

(0.8155288373598232, {'C': 10.0})

In [19]:
oof_name = 'best_predicted_target'
test_df[oof_name] = clf.best_estimator_.predict_proba(test_tfidf_sparse)[:, 1]
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.7835322787872586


In [20]:
test_df

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count,predicted_target,best_predicted_target
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,True,0.065789,0.552632,0.592105,0.000000,0.684211,0.105263,False,False,,,,False,,,False,False,False,,,,,False,False,,,,,,False,,0,76,0.140330,0.281908
10,7141509,"NO ! There are no alternative facts. Go check for yourself. It is people like you, who have no idea what you are talking about that has gotten this State and Country into the mess it is in. People who think the Goverment, be it State or Federal, can spend the peoples money better than they can, is stupid and nonsensical. Politicians use taxes as Personal slush accounts to continue their carrers, buying votes from the lame and the lazy.",test,2017-01-30 02:53:48.012277+00,21,919529.0,164687,approved,1,0,0,0,0,True,0.013889,0.097222,0.000000,0.097222,0.583333,0.000000,False,False,,,,False,,,False,False,False,,,,,False,False,,,,,,False,,0,72,0.232100,0.576079
11,7077814,the more you whine sore loser Artster\n\nthe more we enjoy your agony,test,2016-12-03 00:17:42.300700+00,54,649753.0,154126,approved,0,0,0,0,0,True,0.000000,0.050000,0.125000,0.000000,0.600000,0.112500,False,False,,,,False,,,False,False,False,,,,,False,False,,,,,,False,,0,80,0.190204,0.541990
38,7147990,"There's rarely opportunity to agree with Bennet on much, but in this case he's right. Trump is POTUS mostly because the electorate has grown so sick and tired of the status quo in Washington DC. And electing Trump was their backlash. \n\nAnd for the final paragraph, he'd be more accurate if he'd replaced the word ""price"" with the word ""cost"". Prices are high because costs are high. Any reform needs to be focused on COST containment.",test,2017-09-13 16:37:16.990602+00,102,,377304,approved,1,0,0,1,2,False,0.000000,0.000000,0.000000,0.000000,0.111111,0.000000,False,False,,,,False,,,False,False,False,,,,,False,False,,,,,,False,,0,9,0.090298,0.053549
42,7008066,The Law has every freedom to be an asss!,test,2017-07-09 07:03:44.153492+00,54,5556167.0,353158,approved,0,0,0,0,0,True,0.100000,0.300000,0.300000,0.100000,0.500000,0.000000,False,False,,,,False,,,False,False,False,,,,,False,False,,,,,,False,,0,10,0.067636,0.052852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999453,7046916,"Herod's ""slaughter of the innocents"" in Matthew's nativity narrative is a literary device echoing Pharaoh's slaughter of newborn Hebrew boys in Exodus, consistent with Matthew's conception of Jesus as the Second Moses.\n\nThough Augustus is famously said to have quipped, “It is better to be Herod's pig than his son.”",test,2016-11-21 14:57:40.452084+00,53,616479.0,152437,approved,0,0,0,12,0,False,0.000000,0.100000,0.000000,0.100000,0.300000,0.100000,False,False,0.0,0.0,0.0,False,0.0,0.0,True,False,False,0.0,0.0,0.0,0.0,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,10,10,0.062893,0.024223
1999456,7088523,"You don't know that he would kill them if he could, and given he's mentally-ill, is it a surprise he shot at them given what they showed up with and that they lobbed gas canisters and stun grenades at him just because he wouldn't go outside?",test,2016-09-23 11:17:37.339930+00,21,487899.0,146632,approved,0,0,0,1,0,False,0.100000,0.000000,0.000000,0.000000,0.200000,0.200000,False,False,0.0,0.0,0.0,False,0.0,0.0,False,False,False,0.0,0.0,0.0,0.0,False,False,0.0,0.0,0.0,0.0,0.0,True,0.0,10,10,0.223321,0.333816
1999492,7101851,My son shouldn't have to be afraid during THOSE Muslim prayers at school either.,test,2017-04-29 10:08:58.773502+00,54,,329940,rejected,0,0,0,0,0,False,0.000000,0.000000,0.000000,0.400000,0.100000,0.000000,False,False,0.0,0.0,0.0,False,0.0,0.0,False,False,True,0.0,0.0,0.0,0.0,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,10,10,0.044657,0.010022
1999495,7084928,Homosexuals are NOT pedophiles. The pedophiles in the catholic church are just that: pedophiles. Their sexual orientation doesn't indicate their likelihood to be a pedophile no more than your sexuality does.,test,2017-09-28 02:01:22.244719+00,13,6036612.0,382691,approved,0,0,0,5,1,False,0.100000,0.100000,0.200000,0.300000,0.300000,0.000000,False,False,0.0,0.0,0.0,True,0.0,0.0,True,False,False,0.0,0.0,0.0,0.1,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,10,10,0.081771,0.065228
