In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive; the CSV and embedding paths used in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the raw train and test CSVs from the mounted Drive folder.
# NOTE(review): `test_df` is not referenced again in this notebook; test.csv is
# read a second time as `test` in the prediction section.
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

In [None]:
# Preview the first 10 rows of the raw training data.
train_df.head(10)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47
5,59859,0.666667,ur a sh*tty comment.,0.047619,0.638095,0.0,0.333333,0.0,,,...,2006,rejected,0,0,0,0,0,0.009524,0,105
6,59861,0.457627,hahahahahahahahhha suck it.,0.050847,0.305085,0.0,0.254237,0.0,,,...,2006,rejected,0,0,0,0,0,0.220339,0,59
7,59863,0.0,FFFFUUUUUUUUUUUUUUU,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
8,239575,0.0,The ranchers seem motivated by mostly by greed...,0.0,0.0,0.0,0.0,0.0,,,...,26662,approved,0,0,0,0,0,0.0,0,4
9,239576,0.0,It was a great show. Not a combo I'd of expect...,0.0,0.0,0.0,0.0,0.0,,,...,26650,approved,0,0,0,1,0,0.0,0,4


In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import os
import pandas as pd
import numpy as np
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats

from sklearn import metrics
from sklearn import model_selection

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.models import Model
from keras.models import load_model

Load and pre-process the data set¶


In [None]:
# `train` is another name for the same DataFrame object as `train_df` (no copy
# is made), so the astype(str) assignment below also changes `train_df`.
train = train_df
print('loaded %d records' % len(train))

# Make sure all comment_text values are strings
train['comment_text'] = train['comment_text'].astype(str)

# Identity columns that get thresholded to booleans below and are later used
# as the subgroups for the bias metrics.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Threshold the target and identity columns into booleans
def convert_to_bool(df, col_name):
    """Overwrite df[col_name] in place with True where the value is >= 0.5, else False.

    Missing values (NaN) fail the comparison and therefore become False.
    Returns None; the DataFrame passed in is modified.
    """
    at_or_above_half = df[col_name].ge(0.5)
    df[col_name] = at_or_above_half.to_numpy()

def convert_dataframe_to_bool(df):
    """Return a copy of df whose 'target' and identity columns are booleans.

    The input frame is left untouched; thresholding is applied to the copy via
    convert_to_bool for 'target' followed by every name in identity_columns.
    """
    thresholded = df.copy()
    columns_to_threshold = ['target'] + identity_columns
    for column in columns_to_threshold:
        convert_to_bool(thresholded, column)
    return thresholded

# Rebind `train` to a thresholded copy; convert_dataframe_to_bool copies its
# input first, so `train_df` keeps the original float scores.
train = convert_dataframe_to_bool(train)

loaded 1804874 records


Split the data into 80% train and 20% validate sets¶


In [None]:
# A fixed random_state makes the 80/20 split, and every metric later computed
# on validate_df, reproducible across kernel restarts.
# NOTE: this rebinds `train_df`, which until here referred to the raw CSV frame.
train_df, validate_df = model_selection.train_test_split(
    train, test_size=0.2, random_state=42)
print('%d train comments, %d validate comments' % (len(train_df), len(validate_df)))


1443899 train comments, 360975 validate comments


In [None]:
MAX_NUM_WORDS = 10000  # passed to Tokenizer(num_words=...) below
TOXICITY_COLUMN = 'target'  # name of the label column
TEXT_COLUMN = 'comment_text'  # name of the comment text column

# Create a text tokenizer and fit its vocabulary on the training split only
# (train_df here is the 80% split, not the full data set).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df[TEXT_COLUMN])

# Every comment is truncated or padded to this many tokens.
MAX_SEQUENCE_LENGTH = 250
def pad_text(texts, tokenizer):
    """Turn texts into integer sequences with `tokenizer`, padded/truncated to MAX_SEQUENCE_LENGTH."""
    token_id_sequences = tokenizer.texts_to_sequences(texts)
    fixed_length = pad_sequences(token_id_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return fixed_length

Define and train a Convolutional Neural Net for classifying toxic comments

In [None]:
# Paths and hyperparameters read by train_model.
EMBEDDINGS_PATH = '/content/drive/MyDrive/glove.6B.100d.txt'  # text file: one word followed by its vector per line
EMBEDDINGS_DIMENSION = 100  # length of each word vector; also the Embedding layer's output size
DROPOUT_RATE = 0.3  # rate of the Dropout layer placed before the dense layers
LEARNING_RATE = 0.00005  # passed to the RMSprop optimizer
NUM_EPOCHS = 10
BATCH_SIZE = 128

def train_model(train_df, validate_df, tokenizer):
    """Train a convolutional toxicity classifier on top of frozen pre-trained embeddings.

    Args:
        train_df: DataFrame holding TEXT_COLUMN and TOXICITY_COLUMN; used to
            fit the model.
        validate_df: DataFrame with the same two columns; passed to Keras as
            validation data only.
        tokenizer: Fitted Keras Tokenizer. It converts the text to padded
            integer sequences (via pad_text) and its word_index decides which
            row of the embedding matrix each word vector is written to.

    Returns:
        The fitted Keras Model. It emits two softmax probabilities per comment;
        labels are one-hot encoded with to_categorical, so column 1 is the
        class of a truthy TOXICITY_COLUMN value.
    """
    # Prepare data
    # num_classes is pinned so the label arrays always have two columns,
    # matching the 2-unit output layer even if a split holds a single class.
    train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)
    train_labels = to_categorical(train_df[TOXICITY_COLUMN], num_classes=2)
    validate_text = pad_text(validate_df[TEXT_COLUMN], tokenizer)
    validate_labels = to_categorical(validate_df[TOXICITY_COLUMN], num_classes=2)

    # texts_to_sequences never emits an index >= tokenizer.num_words, so the
    # embedding table only needs that many rows rather than one row per
    # distinct word seen while fitting the tokenizer.
    vocab_size = len(tokenizer.word_index) + 1
    if tokenizer.num_words:
        vocab_size = min(vocab_size, tokenizer.num_words)

    # Load embeddings
    print('loading embeddings')
    # Rows for words that are absent from the embeddings file stay all-zeros.
    embedding_matrix = np.zeros((vocab_size, EMBEDDINGS_DIMENSION))
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(EMBEDDINGS_PATH, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            row = tokenizer.word_index.get(values[0])
            # Only keep vectors for words the tokenizer can actually emit.
            if row is not None and row < vocab_size:
                embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    # Create model layers.
    def get_convolutional_neural_net_layers():
        """Returns (input_layer, output_layer)"""
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        # trainable=False keeps the pre-trained vectors fixed during training.
        embedding_layer = Embedding(vocab_size,
                                    EMBEDDINGS_DIMENSION,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
        x = embedding_layer(sequence_input)
        x = Conv1D(128, 2, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(40, padding='same')(x)
        x = Flatten()(x)
        x = Dropout(DROPOUT_RATE)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)
        return sequence_input, preds

    # Compile model
    print('compiling model')
    input_layer, output_layer = get_convolutional_neural_net_layers()
    model = Model(input_layer, output_layer)
    # `learning_rate` is the supported argument name; `lr` is deprecated.
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=LEARNING_RATE),
                  metrics=['acc'])

    # Train model.
    print('training model')
    model.fit(train_text,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              validation_data=(validate_text, validate_labels),
              verbose=2)

    return model

# Fit on the 80% split; the 20% split is only passed through as validation data.
model = train_model(train_df, validate_df, tokenizer)

loading embeddings
compiling model




training model
Epoch 1/10
11281/11281 - 1768s - loss: 0.1626 - acc: 0.9423 - val_loss: 0.1543 - val_acc: 0.9436 - 1768s/epoch - 157ms/step
Epoch 2/10
11281/11281 - 1775s - loss: 0.1504 - acc: 0.9460 - val_loss: 0.1541 - val_acc: 0.9463 - 1775s/epoch - 157ms/step
Epoch 3/10
11281/11281 - 1773s - loss: 0.1479 - acc: 0.9468 - val_loss: 0.1818 - val_acc: 0.9466 - 1773s/epoch - 157ms/step
Epoch 4/10
11281/11281 - 1795s - loss: 0.1473 - acc: 0.9471 - val_loss: 0.1504 - val_acc: 0.9426 - 1795s/epoch - 159ms/step
Epoch 5/10
11281/11281 - 1771s - loss: 0.1468 - acc: 0.9473 - val_loss: 0.1551 - val_acc: 0.9466 - 1771s/epoch - 157ms/step
Epoch 6/10
11281/11281 - 1762s - loss: 0.1468 - acc: 0.9473 - val_loss: 0.1496 - val_acc: 0.9430 - 1762s/epoch - 156ms/step
Epoch 7/10
11281/11281 - 1775s - loss: 0.1466 - acc: 0.9475 - val_loss: 0.1478 - val_acc: 0.9470 - 1775s/epoch - 157ms/step
Epoch 8/10
11281/11281 - 1784s - loss: 0.1462 - acc: 0.9477 - val_loss: 0.1612 - val_acc: 0.9380 - 1784s/epoch - 158m

Generate model predictions on the validation set¶


In [None]:
# Name of the column on validate_df that stores this model's scores.
MODEL_NAME = 'CNN_model'
# Keep only column 1 of the prediction array as the score.
# NOTE(review): column 1 presumably corresponds to target == True (labels were
# one-hot encoded with to_categorical during training) — confirm.
validate_df[MODEL_NAME] = model.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]



In [None]:
# Preview the validation rows together with the new model score column.
validate_df.head()


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,CNN_model
1283523,5682959,False,This is SOLELY based on socioeconomic status a...,0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,2,2,0.0,0,4,0.113713
575472,946390,False,The Secretary of Disinformation strikes again!...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,approved,0,0,0,3,0,0.0,5,4,0.066737
77751,337727,False,With govs. Ige and Waihee as well dozens of st...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,approved,0,0,0,1,0,0.0,4,4,0.23122
450188,795468,False,"JRemington, you are a slave to your prejudices...",0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,...,approved,0,0,2,4,3,0.0,4,6,0.148898
617144,997552,False,Perhaps as the rounds of balloting go and the ...,0.0,0.0,0.0,0.0,0.0,,,...,approved,0,0,0,6,3,0.0,0,4,0.047907


Define bias metrics, then evaluate our new model for bias using the validation set predictions¶


In [None]:
# Record keys / column names of the three per-subgroup bias metrics.
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of y_pred against y_true, or NaN if scoring raises ValueError."""
    try:
        auc = metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """Computes the AUC restricted to the rows where the boolean `subgroup` column is True."""
    in_subgroup = df[subgroup]
    members = df[in_subgroup]
    return compute_auc(members[label], members[model_name])

# def compute_bpsn_auc(df, subgroup, label, model_name):
#     """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
#     subgroup_negative_examples = df[df[subgroup] & ~df[label]]
#     non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
#     examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
#     return compute_auc(examples[label], examples[model_name])
def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC over subgroup rows with a negative label plus non-subgroup rows with a positive label."""
    in_subgroup = df[subgroup]
    is_positive = df[label]
    subgroup_negatives = df[in_subgroup & ~is_positive]
    background_positives = df[~in_subgroup & is_positive]
    # Stack the two slices (subgroup negatives first) and score them together.
    combined = pd.concat([subgroup_negatives, background_positives])
    return compute_auc(combined[label], combined[model_name])

# def compute_bnsp_auc(df, subgroup, label, model_name):
#     """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
#     subgroup_positive_examples = df[df[subgroup] & df[label]]
#     non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
#     examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
#     return compute_auc(examples[label], examples[model_name])
def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC over subgroup rows with a positive label plus non-subgroup rows with a negative label."""
    in_subgroup = df[subgroup]
    is_positive = df[label]
    subgroup_positives = df[in_subgroup & is_positive]
    background_negatives = df[~in_subgroup & ~is_positive]
    # Stack the two slices (subgroup positives first) and score them together.
    combined = pd.concat([subgroup_positives, background_negatives])
    return compute_auc(combined[label], combined[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Args:
        dataset: DataFrame with one boolean column per subgroup, the boolean
            label column and the model's score column.
        subgroups: Iterable of subgroup column names.
        model: Name of the column holding the model's scores.
        label_col: Name of the boolean label column.
        include_asegs: Kept for backward compatibility; not used.

    Returns:
        DataFrame with one row per subgroup and the columns 'subgroup',
        'subgroup_size', SUBGROUP_AUC, BPSN_AUC and BNSP_AUC, sorted ascending
        by SUBGROUP_AUC. When `subgroups` is empty the result is an empty frame
        that still carries those columns.
    """
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            # Count the True flags directly instead of building a filtered frame.
            'subgroup_size': int(dataset[subgroup].sum()),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    # Explicit columns keep the sort key present even when there are no records.
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)

# Score the CNN's validation predictions for every identity subgroup; the bare
# name on the last line displays the resulting table.
bias_metrics_df = compute_bias_metrics_for_model(validate_df, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_df


In [None]:
# @title subgroup_auc

from matplotlib import pyplot as plt
# Histogram of the per-subgroup AUC values with the top/right frame lines hidden.
ax = bias_metrics_df['subgroup_auc'].plot(kind='hist', bins=20, title='subgroup_auc')
ax.spines[['top', 'right']].set_visible(False)

Calculate the final score¶


In [None]:
def calculate_overall_auc(df, model_name):
    """Return the ROC AUC of the df[model_name] scores against df[TOXICITY_COLUMN] over all rows."""
    return metrics.roc_auc_score(df[TOXICITY_COLUMN], df[model_name])

def power_mean(series, p):
    """Return the generalized (power) mean of `series` with exponent `p`.

    Each value is raised to `p`, the results are averaged over len(series),
    and the average is raised to 1 / p.
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged power means of the three bias metrics.

    Args:
        bias_df: DataFrame with the SUBGROUP_AUC, BPSN_AUC and BNSP_AUC columns.
        overall_auc: AUC over the whole evaluation set.
        POWER: Exponent handed to power_mean for each bias column.
        OVERALL_MODEL_WEIGHT: Weight of overall_auc; the bias score receives
            the remaining 1 - OVERALL_MODEL_WEIGHT.

    Returns:
        The weighted sum of overall_auc and the bias score.
    """
    per_metric_means = [
        power_mean(bias_df[metric_name], POWER)
        for metric_name in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    bias_weight = 1 - OVERALL_MODEL_WEIGHT
    return (OVERALL_MODEL_WEIGHT * overall_auc) + (bias_weight * bias_score)

# Final score on the validation split, using the default POWER and weight.
get_final_metric(bias_metrics_df, calculate_overall_auc(validate_df, MODEL_NAME))

0.8845078597268023

Prediction on Test data

In [None]:
# Load the test comments and the sample submission (indexed by 'id'), which
# serves as the template for the output file.
# NOTE(review): test.csv was already read into `test_df` near the top of the
# notebook; this is a second read of the same file.
test = pd.read_csv('/content/drive/MyDrive/test.csv')
submission = pd.read_csv('/content/drive/MyDrive/sample_submission.csv', index_col='id')

In [None]:
# Coerce to str first, mirroring the training preprocessing, so a missing
# comment (NaN) cannot reach the tokenizer as a float.
# The scores are assigned positionally, which assumes test.csv and
# sample_submission.csv list the same ids in the same order — TODO confirm.
test_text = test[TEXT_COLUMN].astype(str)
submission['prediction'] = model.predict(pad_text(test_text, tokenizer))[:, 1]
submission.to_csv('submission.csv')

