In [1]:
import os
import boto3
from botocore.exceptions import ClientError
import awswrangler as wr
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tqdm import tqdm 
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

import gensim.downloader as api
import logging 
# Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Enumerate the names of all S3 buckets returned for the active AWS credentials.
s3 = boto3.resource('s3')
buckets = [s3_bucket.name for s3_bucket in s3.buckets.all()]
print(buckets)

INFO - 13:58:54: Found credentials in shared credentials file: ~/.aws/credentials


['godel-tf-state', 'godelsagemaker', 'sagemaker-eu-west-1-798631296162']


In [3]:
def get_list_of_objects_s3(operation_parameters):
    """Lazily yield the key of every object returned by a paginated S3 listing.

    The listing goes through the ``list_objects_v2`` paginator, so results
    that span several pages are walked page by page.

    Note: no age filter is applied. An earlier ``LastModified`` check
    (keep only files newer than 7 days, matching a weekly Glue job) is
    currently disabled, so every listed key is yielded.

    Args:
        operation_parameters (dict): keyword arguments forwarded to
            ``paginator.paginate`` (for example ``Bucket`` and ``Prefix``).

    Yields:
        str: the ``Key`` of each listed object.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(**operation_parameters):
        # A page has no 'Contents' entry when nothing matches the request;
        # fall back to an empty list instead of iterating over None.
        for content in page.get('Contents') or []:
            yield content.get('Key')
                
# Print every object key stored under the data/ prefix of the project bucket.
bucket = 'godelsagemaker'
params = dict(Bucket=bucket, Prefix="data/")
for object_key in get_list_of_objects_s3(params):
    print(object_key)

data/toxic_data.csv


In [4]:
# Column names of the three per-subgroup bias metrics stored in the bias-metrics frame.
SUBGROUP_AUC = 'subgroup_auc'  # AUC computed only on the rows that belong to the subgroup
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of `y_pred` against `y_true`, or NaN if it cannot be computed.

    Any ``ValueError`` raised by ``roc_auc_score`` is converted into
    ``np.nan`` so that a single problematic slice does not abort a whole
    metrics run (presumably slices containing only one class).
    """
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, oof_name):
    """Computes the AUC restricted to the rows that belong to `subgroup`.

    Args:
        df: frame holding the boolean `subgroup` column, the `label`
            column and the prediction column `oof_name`.
        subgroup: name of the boolean membership column.
        label: name of the ground-truth column.
        oof_name: name of the column with predicted scores.
    """
    in_subgroup = df[subgroup]
    members = df[in_subgroup]
    y_true, y_score = members[label], members[oof_name]
    return compute_auc(y_true, y_score)

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: frame with boolean `subgroup` and `label` columns plus the
            prediction column `oof_name`.
        subgroup: name of the boolean membership column.
        label: name of the boolean ground-truth column.
        oof_name: name of the column with predicted scores.

    Returns:
        The value returned by ``compute_auc`` on the combined rows.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # selections in the same order.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: frame with boolean `subgroup` and `label` columns plus the
            prediction column `oof_name`.
        subgroup: name of the boolean membership column.
        label: name of the boolean ground-truth column.
        oof_name: name of the column with predicted scores.

    Returns:
        The value returned by ``compute_auc`` on the combined rows.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # selections in the same order.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Args:
        dataset: frame with one boolean column per subgroup, the label
            column and the prediction column.
        subgroups: iterable of subgroup column names.
        model: name of the column holding the model's predicted scores.
        label_col: name of the ground-truth column.
        include_asegs: currently unused; kept so existing callers keep working.

    Returns:
        pd.DataFrame with one row per subgroup and the columns
        ``subgroup``, ``subgroup_size``, subgroup AUC, BPSN AUC and
        BNSP AUC, sorted by subgroup AUC ascending. An empty `subgroups`
        yields an empty frame that still has those columns.
    """
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    # Passing the columns explicitly keeps the sort key present even when
    # there are no records, so sort_values does not raise KeyError.
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)

def calculate_overall_auc(df, oof_name, label_col='toxicity'):
    """Return the ROC AUC of the predictions over the whole frame.

    Args:
        df: frame holding the label column and the prediction column.
        oof_name: name of the column with predicted scores.
        label_col: name of the ground-truth column. Defaults to
            ``'toxicity'``, the column this function previously hard-coded.

    Returns:
        The value returned by ``roc_auc_score``. Unlike ``compute_auc``,
        errors raised by ``roc_auc_score`` are not caught here.
    """
    return roc_auc_score(df[label_col], df[oof_name])

def power_mean(series, p):
    """Return the generalized (power) mean of `series` with exponent `p`.

    Computes ``(sum(x_i ** p) / n) ** (1 / p)``.

    Args:
        series: sequence of numbers (list, numpy array or pandas Series).
        p: non-zero exponent; negative values weight small entries more.

    Returns:
        The power mean as a float. NaN entries propagate to the result.

    Raises:
        ZeroDivisionError: if `series` is empty or `p` is 0.
    """
    # Work on floats: numpy refuses to raise integer arrays to negative
    # integer powers ("Integers to negative integer powers are not allowed").
    values = np.asarray(series, dtype=float)
    total = float(np.sum(np.power(values, p)))
    return np.power(total / len(values), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged per-subgroup bias metrics.

    Each of the three bias-metric columns of `bias_df` is collapsed with
    ``power_mean`` (exponent `POWER`); the three results are averaged into
    a bias score, which is then mixed with `overall_auc` using
    `OVERALL_MODEL_WEIGHT` for the overall part and the remainder for the
    bias part.
    """
    metric_columns = (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    per_metric_means = [power_mean(bias_df[column], POWER) for column in metric_columns]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part

In [5]:
# Subsampling knobs: fixing the seed makes the train/test rows (and every
# metric below) reproducible across kernel restarts.
SAMPLE_SIZE = 1_000_000
RANDOM_SEED = 42

# Read the CSV from S3 in chunks and stitch the chunks into one frame.
chunks = wr.s3.read_csv(path=f's3://{bucket}/data/toxic_data.csv', chunksize=100000)
df = pd.concat(chunks)
df['comment_text'] = df['comment_text'].fillna("")
# DataFrame.sample raises when asked for more rows than exist (without
# replacement), so cap the request at the frame length.
sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
# Take explicit copies: these frames get new/overwritten columns in later
# cells and must not be views of `sample`.
train_df = sample[sample['split'] == 'train'].copy()
test_df = sample[sample['split'] != 'train'].copy()

In [6]:
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# Binarise the identity and toxicity columns at 0.5 (missing values
# compare as False). The comparison result is assigned as the column
# itself so the dtype is a real bool: writing bools through
# `.loc[:, col]` into the existing column may keep or object-upcast the
# old dtype depending on the pandas version, and the bias metrics rely on
# boolean masks and `~`.
for col in identity_columns + ['toxicity']:
    train_df[col] = train_df[col] >= 0.5
    test_df[col] = test_df[col] >= 0.5
y_train = train_df['toxicity']
y_test = test_df['toxicity']

In [7]:
# Tokenise every comment with gensim's simple_preprocess; tqdm shows progress.
train_lists = list(map(simple_preprocess, tqdm(train_df['comment_text'])))
test_lists = list(map(simple_preprocess, tqdm(test_df['comment_text'])))

100%|██████████| 902855/902855 [00:54<00:00, 16488.33it/s]
100%|██████████| 97145/97145 [00:05<00:00, 17215.93it/s]


## Custom Word2Vec (trained on the training comments)

In [8]:
# Adapted from https://ethen8181.github.io/machine-learning/keras/text_classification/word2vec_text_classification.html

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that trains a gensim Word2Vec model in
    ``fit`` and turns each tokenised document into the average of its word
    vectors in ``transform``.

    gensim's own gensim.sklearn_api.W2VTransformer doesn't support out of vocabulary words,
    hence we roll out our own: words unknown to the trained model are
    skipped, and a document without any known word maps to a zero vector.

    All the parameters are gensim.models.Word2Vec's parameters.
    https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    """

    def __init__(self, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        # Only store the hyper-parameters; the model itself is built in fit().
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epochs = epochs
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def _get_embedding(self, words):
        """Return the float32 mean vector of the in-vocabulary `words`.

        Documents with no in-vocabulary word map to a zero vector.
        """
        vocabulary = self.model_.wv.key_to_index
        valid_words = [word for word in words if word in vocabulary]
        if not valid_words:
            # Same dtype as the word vectors, so one OOV-only document does
            # not upcast the whole feature matrix to float64.
            return np.zeros(self.vector_size, dtype=np.float32)
        # Indexing KeyedVectors with a list returns the stacked vectors.
        vectors = np.asarray(self.model_.wv[valid_words], dtype=np.float32)
        return np.mean(vectors, axis=0)

    def fit(self, X, y=None):
        """Train a Word2Vec model on `X`, an iterable of token lists.

        `y` is ignored; it is accepted for scikit-learn pipeline compatibility.
        Returns `self`.
        """
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return an array of shape (n_documents, vector_size) of document vectors.

        `X` is an iterable of token lists; requires a prior call to ``fit``.
        """
        embeddings = [self._get_embedding(words) for words in X]
        if not embeddings:
            # Keep the output 2-D even when there are no documents.
            return np.zeros((0, self.vector_size), dtype=np.float32)
        return np.array(embeddings)

In [9]:
# Skip-gram (sg=1) Word2Vec vectorizer; min_count=1 keeps every token seen in training.
gensim_word2vec_tr = GensimWord2VecVectorizer(
    vector_size=100,
    min_count=1,
    sg=1,
    alpha=0.01,
    epochs=5,
)
# Alternative classifiers, left disabled:
# xgb = XGBClassifier(learning_rate=0.01, n_estimators=100, n_jobs=-1)
# rf = RandomForestClassifier(n_estimators=300)
lr = LogisticRegression(max_iter=10_000)

In [10]:
%%time

# Timed on the 1_000_000-row sample (train + test)
# Corpus-trained Word2Vec embeddings followed by logistic regression.
oof_name = 'predicted_target'
pipeline1 = Pipeline(steps=[('w2v', gensim_word2vec_tr), ('LogReg', lr)])
pipeline1.fit(train_lists, y_train)
# Keep the second predict_proba column as the prediction score.
test_df[oof_name] = pipeline1.predict_proba(test_lists)[:, 1]
bias_metrics_df = compute_bias_metrics_for_model(
    dataset=test_df,
    subgroups=identity_columns,
    model=oof_name,
    label_col='toxicity',
)
bias_metrics_df

INFO - 14:03:31: collecting all words and their counts
INFO - 14:03:31: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:03:31: PROGRESS: at sentence #10000, processed 498311 words, keeping 26688 word types
INFO - 14:03:31: PROGRESS: at sentence #20000, processed 995634 words, keeping 36525 word types
INFO - 14:03:31: PROGRESS: at sentence #30000, processed 1493604 words, keeping 43613 word types
INFO - 14:03:31: PROGRESS: at sentence #40000, processed 1993446 words, keeping 49425 word types
INFO - 14:03:31: PROGRESS: at sentence #50000, processed 2490094 words, keeping 54320 word types
INFO - 14:03:31: PROGRESS: at sentence #60000, processed 2989277 words, keeping 58830 word types
INFO - 14:03:31: PROGRESS: at sentence #70000, processed 3483569 words, keeping 62805 word types
INFO - 14:03:31: PROGRESS: at sentence #80000, processed 3978620 words, keeping 66541 word types
INFO - 14:03:31: PROGRESS: at sentence #90000, processed 4481202 words, keeping 70070 wo

CPU times: user 39min 37s, sys: 15.2 s, total: 39min 52s
Wall time: 15min 22s


Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,537,0.648219,0.760493,0.752656
6,black,765,0.705002,0.65103,0.859183
7,white,1230,0.722561,0.614534,0.893683
8,psychiatric_or_mental_illness,259,0.734109,0.763102,0.797917
5,muslim,1011,0.738836,0.732673,0.832328
4,jewish,410,0.744916,0.781567,0.799293
0,male,2174,0.757363,0.686715,0.873649
1,female,2513,0.775088,0.733113,0.856349
3,christian,2076,0.784471,0.828803,0.781694


In [11]:
# Combine the overall AUC with the per-subgroup bias metrics into one score.
overall_auc = calculate_overall_auc(test_df, oof_name)
FINAL_SCORE = get_final_metric(bias_metrics_df, overall_auc)
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.7700041118068204


## Pretrained Model (GloVe `glove-wiki-gigaword-50` vectors)

In [12]:
# Pretrained 50-dimensional GloVe vectors, loaded through gensim's downloader API.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-50"
w2v_model = api.load(GLOVE_MODEL_NAME)

INFO - 14:18:53: loading projection weights from /home/studio-lab-user/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz
INFO - 14:19:06: KeyedVectors lifecycle event {'msg': 'loaded (400000, 50) matrix of type float32 from /home/studio-lab-user/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-08-24T14:19:06.592284', 'gensim': '4.2.0', 'python': '3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) \n[GCC 10.3.0]', 'platform': 'Linux-4.14.287-215.504.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'load_word2vec_format'}


In [17]:
class GlovoWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that averages pretrained word vectors
    to create the document-level vectors/features.

    Nothing is trained here: `model` is an already loaded set of keyed
    vectors (it must expose ``key_to_index``, ``vector_size`` and lookup by
    word). Words missing from the model are skipped, and a document
    without any known word maps to a zero vector.
    """

    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        """No-op fit, present for scikit-learn pipeline compatibility.

        `X` (the tokenised documents) and `y` (the labels) are both
        ignored because the vectors are pretrained. `y` defaults to None
        so that ``fit(X)`` and ``fit_transform(X)`` work without labels.
        """
        return self

    def transform(self, X):
        """Return an array of shape (n_documents, vector_size) of document vectors.

        `X` is an iterable of token lists; progress is shown with tqdm.
        """
        embeddings = [self._get_embedding(words) for words in tqdm(X)]
        if not embeddings:
            # Keep the output 2-D even when there are no documents.
            return np.zeros((0, self.vector_size), dtype=np.float32)
        return np.array(embeddings)

    def _get_embedding(self, words):
        """Return the float32 mean vector of the in-vocabulary `words`.

        Documents with no in-vocabulary word map to a zero vector.
        """
        vocabulary = self.model.key_to_index
        valid_words = [word for word in words if word in vocabulary]
        if not valid_words:
            # Same dtype as the word vectors, so one OOV-only document does
            # not upcast the whole feature matrix to float64.
            return np.zeros(self.vector_size, dtype=np.float32)
        # Indexing KeyedVectors with a list returns the stacked vectors.
        vectors = np.asarray(self.model[valid_words], dtype=np.float32)
        return np.mean(vectors, axis=0)

In [18]:
# Vectorizer backed by the pretrained vectors, plus a fresh logistic regression.
glovo_word2vec_tr = GlovoWord2VecVectorizer(model=w2v_model)
# Alternative classifier, left disabled:
# xgb = XGBClassifier(learning_rate=0.01, n_estimators=100, n_jobs=-1)
lg = LogisticRegression(max_iter=10_000)

In [19]:
%%time

# Timed on the 1_000_000-row sample (train + test)

# Pretrained-vector embeddings followed by logistic regression.
# Note: this overwrites the prediction column and bias_metrics_df from the previous model.
oof_name = 'predicted_target'
pipeline2 = Pipeline(steps=[('glovo', glovo_word2vec_tr), ('LogReg', lg)])
pipeline2.fit(train_lists, y_train)
# Keep the second predict_proba column as the prediction score.
test_df[oof_name] = pipeline2.predict_proba(test_lists)[:, 1]
bias_metrics_df = compute_bias_metrics_for_model(
    dataset=test_df,
    subgroups=identity_columns,
    model=oof_name,
    label_col='toxicity',
)
bias_metrics_df

100%|██████████| 902855/902855 [01:51<00:00, 8115.02it/s]
100%|██████████| 97145/97145 [00:11<00:00, 8332.89it/s]


CPU times: user 3min 30s, sys: 31.4 s, total: 4min 1s
Wall time: 2min 37s


Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,537,0.626855,0.65367,0.763716
6,black,765,0.694175,0.597202,0.84658
8,psychiatric_or_mental_illness,259,0.706906,0.773062,0.715572
4,jewish,410,0.71063,0.643902,0.824889
7,white,1230,0.710982,0.587918,0.85467
0,male,2174,0.713457,0.597702,0.858928
1,female,2513,0.738226,0.66129,0.831337
5,muslim,1011,0.742216,0.648065,0.842073
3,christian,2076,0.774936,0.766574,0.779131


In [20]:
# Combine the overall AUC with the per-subgroup bias metrics into one score.
overall_auc = calculate_overall_auc(test_df, oof_name)
FINAL_SCORE = get_final_metric(bias_metrics_df, overall_auc)
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.7318606377659689
