In [67]:
import os
import boto3
from botocore.exceptions import ClientError
import awswrangler as wr
import pandas as pd
import numpy as np

SEED = 1234

import gensim.downloader as api
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import models
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense, corpus2csc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import warnings
# Show every column and up to 500 characters per cell when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
# NOTE(review): this turns off pandas' chained-assignment (SettingWithCopy) check
# globally; later cells assign columns on the filtered train_df / test_df frames,
# presumably the reason it is disabled - confirm before re-enabling.
pd.options.mode.chained_assignment = None
# Do not display FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Collect and print the name of every bucket returned by the S3 resource API.
s3 = boto3.resource('s3')
buckets = list(map(lambda entry: entry.name, s3.buckets.all()))
print(buckets)

['godel-tf-state', 'godelsagemaker']


In [5]:
def get_list_of_objects_s3(operation_parameters):
    """Yield the key of every object returned by a paginated S3 listing.

    No age filter is applied: every key matching ``operation_parameters``
    is yielded. (An earlier draft only kept objects modified within the
    last 7 days, to match a Glue job that runs once per week; that check
    is not active.)

    Args:
        operation_parameters (dict): keyword arguments forwarded to the
            ``list_objects_v2`` paginator, e.g. ``Bucket`` and ``Prefix``.
    Yields:
        str: object key of one listed S3 object.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(**operation_parameters):
        # A page carries no 'Contents' entry when nothing matches the request;
        # default to an empty list instead of iterating over None.
        for content in page.get('Contents', []):
            yield content.get('Key')

In [6]:
# Bucket and key prefix to list.
bucket = 'godelsagemaker'
params = dict(Bucket=bucket, Prefix="data/")

# Print each object key found under the prefix, one per line.
for i in get_list_of_objects_s3(params):
    print(i)

data/toxic_data.csv


In [8]:
# Read the CSV from S3 in chunks of 100,000 rows, then combine the chunks into one frame.
chunks = wr.s3.read_csv(
    path=f's3://{bucket}/data/toxic_data.csv',
    chunksize=100000,
)
df = pd.concat(chunks)
df.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental hospitals. Boorah,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [52]:
# Replace missing comments with an empty string so every row holds text to tokenise.
df['comment_text'] = df['comment_text'].fillna(value="")

In [77]:
# Work on a 1M-row random sample. SEED pins the draw so a fresh run selects the
# same rows and therefore reproduces the same vocabulary, model and scores.
sample = df.sample(1_000_000, random_state=SEED)

# Rows marked 'train' in the split column form the training set; all others are test.
is_train = sample['split'] == 'train'
train_df = sample[is_train]
test_df = sample[~is_train]

# To use the full dataset instead, build the two frames from `df` rather than `sample`.

In [78]:
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Turn the identity scores and the toxicity score into boolean flags at a 0.5
# threshold. Missing values compare as False, so NaN becomes False.
# Plain column assignment replaces the column outright, which guarantees a bool
# dtype; `.loc[:, col] = ...` writes in place on newer pandas and can keep a
# non-bool dtype, which breaks the `~df[label]` masks used by the bias metrics.
for col in identity_columns + ['toxicity']:
    train_df[col] = train_df[col] >= 0.5
    test_df[col] = test_df[col] >= 0.5

In [79]:
# Target vectors: the toxicity column of each split.
y_train, y_test = train_df['toxicity'], test_df['toxicity']

In [80]:
# Tokenise every comment with gensim's simple_preprocess; one token list per document.
train_lists = list(map(simple_preprocess, train_df['comment_text']))
test_lists = list(map(simple_preprocess, test_df['comment_text']))

In [81]:
# Create the Dictionary and Corpus
dictionary = corpora.Dictionary()

#allow_update=True - add new words to dictionary
bow_train = [dictionary.doc2bow(doc, allow_update=True) for doc in train_lists]
bow_test = [dictionary.doc2bow(doc, allow_update=False) for doc in test_lists] 

In [82]:
# Document count recorded by the dictionary and the number of ids it holds.
num_docs, num_terms = dictionary.num_docs, len(dictionary.keys())
print(f"Number of docs is {num_docs}, there are {num_terms} words in dictionary")

Number of docs is 902741, there are 198987 words in dictionary


In [83]:
# Build the TF-IDF model from the training corpus and dictionary, then apply the
# same model to both splits.
tfidf = models.TfidfModel(bow_train, dictionary=dictionary)
train_tfidf, test_tfidf = tfidf[bow_train], tfidf[bow_test]

In [84]:
# Convert each TF-IDF corpus to a sparse matrix and transpose the result.
train_tfidf_sparse = corpus2csc(
    train_tfidf,
    num_terms=num_terms,
    num_docs=num_docs,
).transpose()
test_tfidf_sparse = corpus2csc(
    test_tfidf,
    num_terms=num_terms,
).transpose()

In [85]:
# Read memory figures from the last line of `free -t -m` using os.popen().
# The pipe is opened in a `with` block so it is closed as soon as the output has
# been read, instead of being left open for the lifetime of the kernel.
# NOTE(review): relies on the `free` command being available on the host.
with os.popen('free -t -m') as free_output:
    totals_line = free_output.readlines()[-1]
total_memory, used_memory, free_memory = map(int, totals_line.split()[1:])

# Memory usage
print("RAM memory % used:", round((used_memory/total_memory) * 100, 2))

RAM memory % used: 71.31


In [89]:
%%time
# Baseline model: logistic regression fitted on the training TF-IDF matrix.
oof_name = 'predicted_target'
logreg = LogisticRegression(max_iter=1000).fit(train_tfidf_sparse, y_train)
# Store column 1 of predict_proba as the predicted score for each test row.
test_df[oof_name] = logreg.predict_proba(test_tfidf_sparse)[:, 1]

CPU times: user 3min 3s, sys: 2min 2s, total: 5min 5s
Wall time: 1min 30s


In [90]:
# Column names of the three per-subgroup bias metrics.
SUBGROUP_AUC = 'subgroup_auc'
# BPSN: background positive, subgroup negative.
BPSN_AUC = 'bpsn_auc'
# BNSP: background negative, subgroup positive.
BNSP_AUC = 'bnsp_auc'

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    If ``roc_auc_score`` raises ``ValueError``, NaN is returned instead of
    propagating the error.
    """
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, oof_name):
    """Computes the AUC using only the rows where the ``subgroup`` column is True."""
    in_subgroup = df[subgroup]
    members = df[in_subgroup]
    y_true, y_score = members[label], members[oof_name]
    return compute_auc(y_true, y_score)

def compute_bpsn_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: frame with the ``subgroup``, ``label`` and ``oof_name`` columns; the
            first two are used as boolean masks.
        subgroup: name of the subgroup membership column.
        label: name of the true-label column.
        oof_name: name of the predicted-score column.
    Returns:
        The value of ``compute_auc`` over the combined rows.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the same
    # rows in the same order.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[oof_name])

def compute_bnsp_auc(df, subgroup, label, oof_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: frame with the ``subgroup``, ``label`` and ``oof_name`` columns; the
            first two are used as boolean masks.
        subgroup: name of the subgroup membership column.
        label: name of the true-label column.
        oof_name: name of the predicted-score column.
    Returns:
        The value of ``compute_auc`` over the combined rows.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the same
    # rows in the same order.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[oof_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Args:
        dataset: frame holding one boolean column per subgroup, the label column
            and the prediction column.
        subgroups: iterable of subgroup column names; may be empty.
        model: name of the column in ``dataset`` holding the model's predictions.
        label_col: name of the true-label column.
        include_asegs: accepted for backward compatibility; not used.
    Returns:
        DataFrame with one row per subgroup (name, size and the three AUC
        metrics), sorted by ascending subgroup AUC. With no subgroups an empty
        frame with the same columns is returned.
    """
    # Fixing the column list up front keeps the sort below valid even when
    # `subgroups` is empty (a column-less frame would raise KeyError).
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            # Counting True values gives the subgroup size without copying the frame.
            'subgroup_size': int(dataset[subgroup].sum()),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)
# Per-subgroup bias metrics for the predictions stored in the `oof_name` column.
oof_name = 'predicted_target'
bias_metrics_df = compute_bias_metrics_for_model(
    dataset=test_df,
    subgroups=identity_columns,
    model=oof_name,
    label_col='toxicity',
)
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,528,0.793987,0.806168,0.946596
6,black,752,0.805402,0.755164,0.972437
5,muslim,980,0.821616,0.820266,0.95759
7,white,1196,0.840706,0.779426,0.973069
4,jewish,425,0.889512,0.87016,0.962205
1,female,2545,0.896562,0.894176,0.953135
8,psychiatric_or_mental_illness,244,0.902831,0.862616,0.969046
0,male,2232,0.910564,0.880615,0.965613
3,christian,2110,0.910934,0.927559,0.939343


In [91]:
def calculate_overall_auc(df, oof_name, label_col='toxicity'):
    """Return the ROC AUC of the predictions over every row of ``df``.

    Args:
        df: frame holding the label and prediction columns.
        oof_name: name of the predicted-score column.
        label_col: name of the true-label column. Defaults to ``'toxicity'``,
            the column this helper always used before.
    Returns:
        The value returned by ``roc_auc_score`` for the two columns.
    """
    true_labels = df[label_col]
    predicted_labels = df[oof_name]
    return roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    """Return the power mean of ``series`` with exponent ``p``.

    Each value is raised to ``p``, the results are averaged, and the average
    is raised to ``1 / p``.
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged power means of the bias metrics.

    The power mean (exponent ``POWER``) of each of the three bias-metric
    columns of ``bias_df`` is taken and the three results are averaged.
    ``overall_auc`` is weighted by ``OVERALL_MODEL_WEIGHT`` and that average
    by the remaining weight.
    """
    per_metric_means = [
        power_mean(bias_df[metric], POWER)
        for metric in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part
    
# Combine the overall test AUC with the bias metrics into one score.
overall_auc = calculate_overall_auc(test_df, oof_name)
FINAL_SCORE = get_final_metric(bias_metrics_df, overall_auc)
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.8994849346055059


In [97]:
%%time

# 5-fold grid search over five log-spaced values of C (1e-3 to 1e1), scored by ROC AUC.
parameters = {'C': np.logspace(-3, 1, 5)}
clf = GridSearchCV(
    estimator=logreg,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=5,
)
clf.fit(train_tfidf_sparse, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=10000), n_jobs=-1,
             param_grid=parameters,
             scoring='roc_auc')

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 2/5] END ...........................C=0.001;, score=0.854 total time=   5.6s
[CV 4/5] END ...........................C=0.001;, score=0.852 total time=   5.5s
[CV 3/5] END ...........................C=0.001;, score=0.851 total time=   5.6s
[CV 1/5] END ...........................C=0.001;, score=0.850 total time=   5.8s
[CV 5/5] END ...........................C=0.001;, score=0.851 total time=   5.5s
[CV 3/5] END ............................C=0.01;, score=0.884 total time=   8.9s
[CV 1/5] END ............................C=0.01;, score=0.884 total time=   9.6s
[CV 2/5] END ............................C=0.01;, score=0.887 total time=  10.0s
[CV 4/5] END ............................C=0.01;, score=0.885 total time=   8.6s
[CV 5/5] END ............................C=0.01;, score=0.885 total time=   9.1s
[CV 1/5] END .............................C=0.1;, score=0.929 total time=  24.0s
[CV 2/5] END .............................C=0.1;,

In [98]:
# Best cross-validated score found by the grid search and the parameters that produced it.
(clf.best_score_, clf.best_params_)

(0.9443671779100228, {'C': 1.0})

In [99]:
# Score the test set with the best estimator found by the grid search.
oof_name = 'predicted_target'
test_df[oof_name] = clf.best_estimator_.predict_proba(test_tfidf_sparse)[:, 1]
# The per-subgroup AUCs depend on the predictions, so rebuild them for the tuned
# model; reusing the earlier bias_metrics_df would mix the baseline's bias terms
# into the tuned model's final score.
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, oof_name, 'toxicity')
FINAL_SCORE = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, oof_name))
print(f"FINAL SCORE IS {FINAL_SCORE}")

FINAL SCORE IS 0.8994849346055059
