In [1]:
import os
import boto3
from botocore.exceptions import ClientError
import awswrangler as wr
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tqdm import tqdm 
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Notebook-wide configuration.
# SEED drives both numpy's global RNG and the training-row sampling below.
SEED = 1234
# Number of training rows drawn from the train split for model fitting.
N_SAMPLES = 10_000
np.random.seed(SEED)

import gensim.downloader as api
import logging 
# Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
# Show every column and up to 500 characters of each cell when displaying frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
# NOTE(review): this silences pandas' SettingWithCopyWarning for the whole notebook;
# later cells assign new columns to a filtered frame (test_df).
pd.options.mode.chained_assignment = None
# Hide FutureWarning messages (e.g. deprecation notices) from all libraries.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the CSV lazily in 100,000-row chunks, then concatenate the chunks into one frame.
chunks = pd.read_csv('../data/toxic_data.csv', chunksize=100000)
df = pd.concat(chunks)
# Preview the first rows.
df.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental hospitals. Boorah,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [3]:
# Replace missing comments with empty strings so every row holds a str.
df['comment_text'] = df['comment_text'].fillna("")
# Identity subgroups used for the bias metrics further down.
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# Binarise the identity scores and the toxicity target at 0.5.
# NaN compares as False, so unannotated rows become False.
# The column is *replaced* (df[col] = ...) rather than written through
# df.loc[:, col]: the latter may set values in place and keep a float/object
# dtype depending on the pandas version, which breaks the `~df[label]`
# boolean masks used by the metric helpers.
for col in identity_columns + ['toxicity']:
    df[col] = df[col] >= 0.5

In [4]:
# Split on the dataset's own 'split' column: 'train' rows vs. everything else.
train_df = df[df['split'] == 'train']
# Explicit copy: later cells add prediction columns to test_df, so it must be
# an independent frame rather than a filtered view-like result whose
# SettingWithCopyWarning happens to be silenced globally.
test_df = df[df['split'] != 'train'].copy()

In [5]:
# Sanity check: (rows, columns) of the train and test splits.
train_df.shape, test_df.shape

((1804875, 46), (194641, 46))

In [6]:
# Fit on a reproducible random subsample of N_SAMPLES training rows
# (fresh 0..N-1 index via ignore_index=True); evaluate on the full test split.
sample = train_df.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train_text, train_label = sample['comment_text'], sample['toxicity']
test_text, test_label = test_df['comment_text'], test_df['toxicity']

In [7]:
# Sanity check: number of training and test comments.
train_text.shape, test_text.shape

((10000,), (194641,))

In [8]:
# Tokenise every comment with gensim's simple_preprocess; each document becomes
# a list of tokens, which is the input format both vectorizers below expect.
train_lists = [simple_preprocess(doc) for doc in tqdm(train_text)]
test_lists = [simple_preprocess(doc) for doc in tqdm(test_text)]

100%|██████████| 10000/10000 [00:00<00:00, 18181.69it/s]
100%|██████████| 194641/194641 [00:10<00:00, 17987.64it/s]


## Custom Word2Vec

In [9]:
#thanks to https://ethen8181.github.io/machine-learning/keras/text_classification/word2vec_text_classification.html

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that trains a gensim Word2Vec model and embeds documents.

    ``fit`` trains a fresh ``Word2Vec`` model on the tokenised documents it receives.
    ``transform`` maps each document (an iterable of tokens) to the mean of the
    vectors of its in-vocabulary tokens. A document with no in-vocabulary token
    maps to the zero vector, so out-of-vocabulary words never raise
    (gensim's own gensim.sklearn_api.W2VTransformer doesn't support them,
    hence we roll out our own).

    All constructor parameters are stored as-is and forwarded to
    gensim.models.Word2Vec in ``fit``.
    https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    """

    def __init__(self, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        # Only store the hyper-parameters here; the model itself is built in fit().
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epochs = epochs
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def _get_embedding(self, words):
        """Return the float32 mean vector of the in-vocabulary tokens of ``words``.

        Returns a float32 zero vector of length ``vector_size`` when none of the
        tokens is in the trained vocabulary (including an empty document).
        """
        wv = self.model_.wv
        valid_words = [word for word in words if word in wv.key_to_index]
        if not valid_words:
            # Same dtype as the real embeddings so the stacked output dtype
            # does not depend on whether some document happened to be all-OOV.
            return np.zeros(self.vector_size, dtype=np.float32)
        embedding = np.array([wv[word] for word in valid_words], dtype=np.float32)
        return embedding.mean(axis=0)

    def fit(self, X, y=None):
        """Train a Word2Vec model on ``X`` (an iterable of token lists); ``y`` is ignored.

        The trained model is stored in ``self.model_``. Returns ``self``.
        """
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Embed every document in ``X``; returns a float32 array of shape (n_docs, vector_size)."""
        X_embeddings = np.array([self._get_embedding(words) for words in X], dtype=np.float32)
        # reshape is a no-op for non-empty input and yields (0, vector_size)
        # instead of a 1-D empty array when X contains no documents.
        return X_embeddings.reshape(-1, self.vector_size)

In [10]:
# Skip-gram (sg=1) Word2Vec trained from scratch on the sampled comments;
# min_count=1 keeps every token seen in training in the vocabulary.
# NOTE(review): `seed` is left at the vectorizer default (1), not the notebook SEED.
gensim_word2vec_tr = GensimWord2VecVectorizer(vector_size=100, min_count=1, sg=1, alpha=0.01, epochs=5)
# Alternative classifiers that were tried are kept commented out for reference.
# xgb = XGBClassifier(learning_rate=0.01, n_estimators=100, n_jobs=-1)
lr = LogisticRegression(max_iter=10_000)
# rf = RandomForestClassifier(n_estimators=300)

In [11]:
%%time

# Pipeline 1: custom-trained Word2Vec document embeddings -> logistic regression.
pipeline1 = Pipeline([
        ('w2v', gensim_word2vec_tr), 
        ('LogReg', lr)
    ])
pipeline1.fit(train_lists, train_label)
# Name of the test_df column that holds this model's predicted scores;
# the metric cells below read it through `oof_name`.
oof_name = 'predicted_target'
# Column 1 of predict_proba is the score for the second class
# (True, i.e. toxic, given the boolean labels).
test_df[oof_name] = pipeline1.predict_proba(test_lists)[:, 1]

INFO - 10:14:22: collecting all words and their counts
INFO - 10:14:22: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 10:14:22: collected 26602 word types from a corpus of 497428 raw words and 10000 sentences
INFO - 10:14:22: Creating a fresh vocabulary
INFO - 10:14:22: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 26602 unique words (100.00% of original 26602, drops 0)', 'datetime': '2022-08-31T10:14:22.208514', 'gensim': '4.2.0', 'python': '3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) \n[GCC 10.3.0]', 'platform': 'Linux-4.14.287-215.504.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO - 10:14:22: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 497428 word corpus (100.00% of original 497428, drops 0)', 'datetime': '2022-08-31T10:14:22.209502', 'gensim': '4.2.0', 'python': '3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) \n[GCC 10.3.0]', 'platform': 'Linux-4.14.287-215.50

CPU times: user 44.8 s, sys: 446 ms, total: 45.3 s
Wall time: 30.2 s


In [12]:
# Column names of the three per-subgroup bias metrics produced below.
SUBGROUP_AUC = 'subgroup_auc'  # AUC restricted to rows flagged for the subgroup
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """ROC AUC of ``y_pred`` against ``y_true``, or NaN when it cannot be computed.

    A ``ValueError`` raised by ``roc_auc_score`` (presumably when ``y_true``
    holds a single class) is turned into ``np.nan`` instead of propagating.
    """
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, oof_name):
    """AUC computed only on the rows of ``df`` whose ``subgroup`` column is True.

    ``label`` names the ground-truth column and ``oof_name`` the prediction column.
    """
    in_subgroup = df[subgroup]
    members = df[in_subgroup]
    y_true = members[label]
    y_score = members[oof_name]
    return compute_auc(y_true, y_score)

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    ``subgroup`` and ``label`` must be boolean columns of ``df``; ``oof_name``
    names the prediction column. Returns whatever ``compute_auc`` returns
    (NaN when the AUC is undefined for the selected rows).
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    ``subgroup`` and ``label`` must be boolean columns of ``df``; ``oof_name``
    names the prediction column. Returns whatever ``compute_auc`` returns
    (NaN when the AUC is undefined for the selected rows).
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Parameters
    ----------
    dataset : DataFrame holding the boolean subgroup columns, the boolean
        label column and the prediction column.
    subgroups : iterable of subgroup column names.
    model : name of the column in ``dataset`` that holds the model's scores.
    label_col : name of the ground-truth column.
    include_asegs : unused; accepted only for interface compatibility.

    Returns
    -------
    DataFrame with one row per subgroup (columns ``subgroup``,
    ``subgroup_size`` and the three AUC metrics), sorted by subgroup AUC
    ascending so the worst-served subgroup comes first. An empty ``subgroups``
    yields an empty frame with those columns.
    """
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    # Explicit columns keep sort_values from raising KeyError when there are no
    # records; sorting by the constant keeps the key in sync with the column name.
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)
# Score the custom-Word2Vec model: `oof_name` is the prediction column written
# by the pipeline cell, 'toxicity' the boolean ground-truth column.
oof_name = 'predicted_target'
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.57408,0.592646,0.610731
5,muslim,2040,0.574092,0.6449,0.556778
7,white,2452,0.602472,0.556341,0.663755
3,christian,4226,0.610531,0.65146,0.581148
0,male,4386,0.616764,0.580403,0.654694
6,black,1519,0.623669,0.622352,0.621023
4,jewish,835,0.638669,0.635322,0.620466
1,female,5155,0.64382,0.628806,0.630476
8,psychiatric_or_mental_illness,511,0.665188,0.609278,0.671313


In [15]:
def calculate_overall_auc(df, oof_name, label_col='toxicity'):
    """ROC AUC of the prediction column over every row of ``df``.

    Parameters
    ----------
    df : DataFrame holding both the label and the prediction column.
    oof_name : name of the prediction column.
    label_col : name of the ground-truth column; defaults to ``'toxicity'``,
        which is the label used throughout this notebook.

    Unlike ``compute_auc`` this does not catch ``ValueError`` from
    ``roc_auc_score``; an undefined overall AUC propagates as an exception.
    """
    true_labels = df[label_col]
    predicted_labels = df[oof_name]
    return roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    """Generalised (power) mean of ``series`` with exponent ``p``.

    Computes ``(mean(x_i ** p)) ** (1 / p)``. For ``p == 0`` the limiting value,
    the geometric mean, is returned instead of dividing by zero.

    Parameters
    ----------
    series : array-like of numbers (list, ndarray, pandas Series).
    p : exponent; negative values weight the smallest entries most heavily.

    Returns
    -------
    numpy float64. NaN if ``series`` is empty or contains NaN.
    """
    # Cast to float first: numpy refuses integer ** negative-integer, so an
    # integer-valued input would otherwise raise for the default p=-5.
    values = np.asarray(series, dtype=np.float64)
    if values.size == 0:
        return np.float64(np.nan)
    if p == 0:
        return np.exp(np.mean(np.log(values)))
    return np.power(np.mean(np.power(values, p)), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the power-mean-aggregated bias AUCs.

    Each of the three bias-metric columns of ``bias_df`` is collapsed with
    ``power_mean(..., POWER)``; the three results are averaged into a bias
    score. The return value is
    ``OVERALL_MODEL_WEIGHT * overall_auc + (1 - OVERALL_MODEL_WEIGHT) * bias_score``.
    """
    per_metric_means = [
        power_mean(bias_df[metric], POWER)
        for metric in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part
# Final score for the custom-Word2Vec model; uses `bias_metrics_df` and
# `oof_name` as left by the bias-metrics cell above.
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.6148400353391639


## Pretrained Model

In [16]:
# Load the pretrained "glove-wiki-gigaword-50" vectors through gensim's downloader.
# Despite the variable name these are GloVe vectors, not a Word2Vec model;
# the log below reports a (400000, 50) float32 matrix.
w2v_model = api.load("glove-wiki-gigaword-50") 

INFO - 10:15:50: loading projection weights from /home/studio-lab-user/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz
INFO - 10:16:04: KeyedVectors lifecycle event {'msg': 'loaded (400000, 50) matrix of type float32 from /home/studio-lab-user/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-08-31T10:16:04.842871', 'gensim': '4.2.0', 'python': '3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) \n[GCC 10.3.0]', 'platform': 'Linux-4.14.287-215.504.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'load_word2vec_format'}


In [17]:
class GlovoWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that embeds documents with pretrained word vectors.

    Each document (an iterable of tokens) is mapped to the mean of the vectors
    of its tokens that exist in the pretrained vocabulary. A document with no
    known token maps to the zero vector, so out-of-vocabulary words never raise.

    Parameters
    ----------
    model : pretrained vector store. It must expose ``vector_size``,
        ``key_to_index`` (membership test) and ``model[word]`` lookup, as the
        object loaded via ``gensim.downloader`` in this notebook does.
    """

    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        """No-op: the vectors are pretrained. ``X`` and ``y`` are ignored.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` work
        without labels, as scikit-learn expects of a transformer.
        """
        return self

    def transform(self, X):
        """Embed every document in ``X``; returns a float32 array of shape (n_docs, vector_size)."""
        X_embeddings = np.array([self._get_embedding(words) for words in tqdm(X)], dtype=np.float32)
        # reshape is a no-op for non-empty input and yields (0, vector_size)
        # instead of a 1-D empty array when X contains no documents.
        return X_embeddings.reshape(-1, self.vector_size)

    def _get_embedding(self, words):
        """Return the float32 mean vector of the known tokens of ``words``.

        Returns a float32 zero vector of length ``vector_size`` when no token
        is in the pretrained vocabulary (including an empty document).
        """
        valid_words = [word for word in words if word in self.model.key_to_index]
        if not valid_words:
            # Same dtype as the real embeddings so the stacked output dtype
            # does not depend on whether some document happened to be all-OOV.
            return np.zeros(self.vector_size, dtype=np.float32)
        embedding = np.array([self.model[word] for word in valid_words], dtype=np.float32)
        return embedding.mean(axis=0)

In [18]:
# Vectorizer backed by the pretrained vectors loaded above, plus a fresh
# logistic regression so the two pipelines do not share a classifier instance.
glovo_word2vec_tr = GlovoWord2VecVectorizer(w2v_model)
# xgb = XGBClassifier(learning_rate=0.01, n_estimators=100, n_jobs=-1)
lg = LogisticRegression(max_iter=10_000)

In [19]:
%%time

#1_000_000

# Pipeline 2: pretrained-vector document embeddings -> logistic regression,
# trained and evaluated on the same token lists as pipeline 1.
pipeline2 = Pipeline([
    ('glovo', glovo_word2vec_tr), 
    ('LogReg', lg)
])
pipeline2.fit(train_lists, train_label)
# Rebinds `oof_name`: from here on the metric helpers read this model's column.
oof_name = 'predicted_target_glovo'
# Column 1 of predict_proba is the score for the second class
# (True, i.e. toxic, given the boolean labels).
test_df[oof_name] = pipeline2.predict_proba(test_lists)[:, 1]
# Overwrites the bias-metrics frame computed for pipeline 1.
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
bias_metrics_df

100%|██████████| 10000/10000 [00:01<00:00, 6147.18it/s]
100%|██████████| 194641/194641 [00:22<00:00, 8817.64it/s] 


CPU times: user 24.4 s, sys: 700 ms, total: 25.1 s
Wall time: 25.7 s


Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.668843,0.655776,0.786936
6,black,1519,0.711937,0.627373,0.830535
7,white,2452,0.714141,0.618394,0.83811
5,muslim,2040,0.722663,0.658737,0.817044
0,male,4386,0.725412,0.607731,0.855115
8,psychiatric_or_mental_illness,511,0.72638,0.746955,0.751674
1,female,5155,0.735267,0.672384,0.81836
4,jewish,835,0.742281,0.663796,0.824916
3,christian,4226,0.781263,0.749477,0.792213


In [20]:
# Final score for the pretrained-vector model; relies on `bias_metrics_df` and
# `oof_name` as rebound by the pipeline-2 cell directly above.
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.739308681899916
