In [1]:
%matplotlib inline
from IPython.display import HTML
import matplotlib.pyplot as plt
from joblib import load
import sqlite3 as sql
import pandas as pd
import numpy as np
import re

seed = 101

Load the entire dataset.

In [2]:
with sql.connect('../data/toxic.db') as conn:
    df = pd.read_sql_query('''select * from toxic''', conn)
df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,2232.0,This:\n:One can make an analogy in mathematica...,2002,1,article,random,train,-1.0,1.0,0.4,0
1,4216.0,"""\n\n:Clarification for you (and Zundark's ri...",2002,1,user,random,train,0.0,2.0,0.5,0
2,8953.0,Elected or Electoral? JHK,2002,0,article,random,test,0.0,1.0,0.1,0
3,26547.0,"""This is such a fun entry. Devotchka\n\nI on...",2002,1,article,random,train,0.0,2.0,0.6,0
4,28959.0,Please relate the ozone hole to increases in c...,2002,1,article,random,test,-1.0,1.0,0.2,0


Split into two seperate dataframes: df_train and df_test.

In [3]:
df_train = df[df['split'] == 'train'].copy().reset_index(drop=True)
df_train.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,2232.0,This:\n:One can make an analogy in mathematica...,2002,1,article,random,train,-1.0,1.0,0.4,0
1,4216.0,"""\n\n:Clarification for you (and Zundark's ri...",2002,1,user,random,train,0.0,2.0,0.5,0
2,26547.0,"""This is such a fun entry. Devotchka\n\nI on...",2002,1,article,random,train,0.0,2.0,0.6,0
3,37330.0,"""\n\n\nI fixed the link; I also removed """"home...",2002,1,article,random,train,-1.0,1.0,0.1,0
4,37346.0,"""If they are """"indisputable"""" then why does th...",2002,1,article,random,train,-1.0,1.0,0.2,0


In [4]:
df_test = df[df['split'] == 'test'].copy().reset_index(drop=True)
df_test.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,8953.0,Elected or Electoral? JHK,2002,0,article,random,test,0.0,1.0,0.1,0
1,28959.0,Please relate the ozone hole to increases in c...,2002,1,article,random,test,-1.0,1.0,0.2,0
2,138074.0,"""\n\n\n\nI'm not sure if it's properly called ...",2002,1,article,random,test,0.0,1.0,0.5,0
3,200664.0,\n\n\n \nThanks on the info on how to move a p...,2002,1,user,random,test,0.0,1.0,0.4,0
4,213105.0,"""\n\n: I should do that too, I agree, but I've...",2002,1,user,random,test,0.0,1.0,0.3,0


Let's grab our "best" model from earlier.

In [5]:
def tokenizer(text):
    return re.findall(r'[a-z0-9]+', text.lower())

gs = load('../results/gs_cv_sgd.joblib')
pipe = gs.best_estimator_

Just to make things easier, let's compute probabilities for the entire test set.

In [6]:
df_test['y_prob'] = pipe.predict_proba(df_test['comment'])[:,1]
df_test.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y,y_prob
0,8953.0,Elected or Electoral? JHK,2002,0,article,random,test,0.0,1.0,0.1,0,0.244198
1,28959.0,Please relate the ozone hole to increases in c...,2002,1,article,random,test,-1.0,1.0,0.2,0,0.17284
2,138074.0,"""\n\n\n\nI'm not sure if it's properly called ...",2002,1,article,random,test,0.0,1.0,0.5,0,0.0037
3,200664.0,\n\n\n \nThanks on the info on how to move a p...,2002,1,user,random,test,0.0,1.0,0.4,0,0.008976
4,213105.0,"""\n\n: I should do that too, I agree, but I've...",2002,1,user,random,test,0.0,1.0,0.3,0,0.007818


# Global Model Interpretation

Given that this is a linear model, let's start by looking at the tokens that are most important for predicting each class.

In [7]:
tokens = pipe.named_steps['vect'].get_feature_names() # Note, NOT the same as vocabulary_
weights = pipe.named_steps['clf'].coef_[0]
df_model = pd.DataFrame({'token':tokens, 'weight':weights})
df_model.head()

Unnamed: 0,token,weight
0,0,-0.48531
1,0 0,0.039417
2,0 00,-8.8e-05
3,0 005,-0.007528
4,0 01,-0.002821


In [8]:
df_model.sort_values('weight', ascending=True).head(10)

Unnamed: 0,token,weight
463216,thanks,-1.575386
127997,cool you,-1.079754
463188,thank you,-1.015618
220356,hey hey,-0.978156
58988,are cool,-0.958368
463160,thank,-0.863174
189578,for your,-0.836592
130100,could you,-0.775153
393418,regards,-0.771566
228178,http en,-0.759324


In [9]:
df_model.sort_values('weight', ascending=False).head(10)

Unnamed: 0,token,weight
88865,block block,12.789651
315629,nipple nipple,11.052719
315628,nipple,10.774228
459260,teabag,9.681863
97787,buttsecks,7.755658
540505,wikipedia hi,5.545768
220792,hi wikipedia,5.39969
195137,fuck,4.016078
109999,chester,3.47463
500501,tommy2010,3.472154


# Historical Counterfactual Examples

Let's pick a positive example from the test set and find the closest* example from the training set that had the _opposite_ outcome.

In [14]:
idx_test_pos = (df_test['y'] == 1) & (df_test['y_prob'] > 0.5)
df_test[idx_test_pos].sample(5)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y,y_prob
31381,684746726.0,\n\n== You fascit Nazi! You socialist bastard!...,2015,1,user,blocked,test,-2.0,-1.0,-1.6,1,1.0
13718,227050603.0,"""\n::Block me. I really don't give a shit! If ...",2008,1,user,blocked,test,-2.0,0.0,-0.9,1,0.978047
12584,207227099.0,""", a Democrat cyber thug, banns me due to diff...",2008,0,user,blocked,test,-1.0,1.0,-0.2,1,0.899124
13759,227668942.0,\n\n\nA.K.D.A.H.N why do u keep blockin me for...,2008,0,user,blocked,test,-1.0,1.0,-0.1,1,0.597773
20486,372491007.0,"But what do I know, I'm a complete fucking mor...",2010,1,user,blocked,test,-2.0,0.0,-1.1,1,0.999958


In [17]:
comment = df_test.loc[13718]['comment']
print(comment)

"
::Block me. I really don't give a shit! If a source doesn't work for someone, it gets removed. K?    "


Now let's vectorize this comment, vectorize all the training comments, and determine which one is most similar and _not toxic_.

In [18]:
idx_train_neg = df_train['y'] == 0
X_train_neg = pipe.named_steps['vect'].transform(df_train.loc[idx_train_neg, 'comment'])

In [19]:
X = pipe.named_steps['vect'].transform([comment])

In [53]:
from sklearn.metrics import pairwise_distances

cosine_dist = pairwise_distances(X_train_neg, X, metric='cosine').flatten()
print('AVG: %0.2f' % df_train.loc[idx_train_neg, ['avg']].values[cosine_dist.argmin()][0])
print('Cosine Distance: %0.2f' % cosine_dist.min())
print('<COMMENT>\n' + df_train.loc[idx_train_neg, 'comment'].values[cosine_dist.argmin()] + '\n<COMMENT>')

AVG: 0.60
Cosine Distance: 0.66
<COMMENT>


::::::::::: Okay, found a source, don't know if it's good enough, don't care. It was worth a shot. I apologize for the personal attacks to you, but I'd appreciate it if you wouldn't make sarcastic comments or making fun of what I say. 
<COMMENT>


In [54]:
euc_dist = pairwise_distances(X_train_neg, X, metric='euclidean').flatten()
print('AVG: %0.2f' % df_train.loc[idx_train_neg, ['avg']].values[euc_dist.argmin()][0])
print('Euclidean Distance: %0.2f' % df_train.loc[idx_train_neg, ['avg']].values[euc_dist.argmin()][0])
print('<COMMENT>\n' + df_train.loc[idx_train_neg, 'comment'].values[euc_dist.argmin()] + '\n<COMMENT>')

AVG: 0.10
Euclidean Distance: 0.10
<COMMENT>

::::::Rollback still doesn't work. —  / 
<COMMENT>


Let's look at the three closest examples with the same label (positive).

In [55]:
idx_train_pos = df_train['y'] == 1
X_train_pos = pipe.named_steps['vect'].transform(df_train.loc[idx_train_pos, 'comment'])

In [64]:
cosine_dist = pairwise_distances(X_train_pos, X, metric='cosine').flatten()
for i in cosine_dist.flatten().argsort()[:3]:
    print('\nAVG: %0.2f' % df_train.loc[idx_train_pos, ['avg']].values[cosine_dist.argmin()][0])
    print('Cosine Distance: %0.2f' % cosine_dist[i])
#     print(df_train.loc[idx_train_pos, ['avg']].values[i])
    print('<COMMENT>\n' + df_train.loc[idx_train_pos, 'comment'].values[i] + '\n<COMMENT>')


AVG: -1.40
Cosine Distance: 0.57
<COMMENT>


I don't give a flying fuck. block me I Don't care. kiss my goddamn ass. DUSTIN Motherfucking D
<COMMENT>

AVG: -1.40
Cosine Distance: 0.58
<COMMENT>
And I don't give a damn.  


<COMMENT>

AVG: -1.40
Cosine Distance: 0.59
<COMMENT>


<COMMENT>


In [65]:
euc_dist = pairwise_distances(X_train_pos, X, metric='euclidean').flatten()
for i in euc_dist.flatten().argsort()[:3]:
    print('\nAVG: %0.2f' % df_train.loc[idx_train_pos, ['avg']].values[euc_dist.argmin()][0])
    print('Euclidean Distance: %0.2f' % euc_dist[i])
#     print(df_train.loc[idx_train_pos, ['avg']].values[i])
    print('<COMMENT>\n' + df_train.loc[idx_train_pos, 'comment'].values[i] + '\n<COMMENT>')


AVG: -0.50
Euclidean Distance: 6.08
<COMMENT>
And I don't give a damn.  


<COMMENT>

AVG: -0.50
Euclidean Distance: 6.40
<COMMENT>
"
::::It doesn't matter. "
<COMMENT>

AVG: -0.50
Euclidean Distance: 6.48
<COMMENT>


I don't care. Asshole! 
<COMMENT>


# Greedy Counterfactual Example

Now, instead of mining our training data for counterfactual examples, let's attempt to _create_ one by dropping word occurances from the original text until the score changes.

In [225]:
idx_nonzero = (X.toarray() > 0).astype(int)
variants = [comment]
print(comment)



== Know your place ==

If you vandalise any pages again, you will be blocked   


In [230]:
idx_nonzero = np.nonzero(X.toarray().flatten())[0]
variants = np.repeat(X.toarray(), len(idx_nonzero) + 1, axis=0) # add one base instance for the original
for i, j in enumerate(idx_nonzero):
    variants[i+1,j] = 0
#     print(np.nonzero(variants[i+1])[0].shape)
variants.shape

(24, 560571)

In [239]:
y_prob_var = pipe.named_steps['clf'].predict_proba(variants)[:,1]
k = y_prob_var.argmin()
print('''Removing token "%s" changes score from %0.2f to %0.2f''' % (tokens[idx_nonzero[k]], y_prob_var[0], y_prob_var[k]))

NameError: name 'tokens' is not defined

# Local Surrogate Model

Interesting. Now let's see if we can use some simple methods to expose decision points. We'll develop a process to tokenize a document and drop out unique tokens. By removing tokens and recomputing the score, we should be able to see how any one token affects the prediction.

In [10]:
comment = 'Wikipedia is total crap!'
tokens = tokenizer(comment)
tokens

['wikipedia', 'is', 'total', 'crap']

Now let's drop one token at a time and create variations on the original comment. We'll keep the original comment as the first item for comparison.

In [11]:
variants = [comment]
for token in sorted(set(tokens)): # The set is very important!
    variants.append(re.sub(token, '', comment))
variants

['Wikipedia is total crap!',
 'Wikipedia is total !',
 'Wikipedia  total crap!',
 'Wikipedia is  crap!',
 'Wikipedia is total crap!']

In [12]:
sorted(set(tokens))

['crap', 'is', 'total', 'wikipedia']

Generate new scores. Let's only look at the probability related to the positive case.

In [13]:
y_probs = pipe.predict_proba(variants)[:,1]
y_probs

array([0.96419754, 0.59595006, 0.95051515, 0.93706812, 0.96419754])

Compute the difference from the base score.

In [14]:
y_diffs = y_probs[0] - y_probs[1:]
y_diffs

array([0.36824748, 0.01368239, 0.02712943, 0.        ])

Now let's assign these score differences as background spans in the original text.

In [15]:
weight_dict = {k:v for k,v in zip(list(sorted(set(tokens))), y_diffs)}
weight_dict

{'crap': 0.36824748498106574,
 'is': 0.013682391040973019,
 'total': 0.02712942587769296,
 'wikipedia': 0.0}

In [16]:
output = '<pre>' + comment + '</pre>'
for k,v in weight_dict.items():
    span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)
    output = re.sub(k, span_string, output)
HTML(output)

Cool, now let's try and functionalize that.

In [17]:
def highlight_toxic(text, pipe, norm=False):
    # Get tokens
    tokens = tokenizer(text)
    # Create variants
    variants = [text]
    for token in sorted(set(tokens)):
        variants.append(re.sub(token, '', text))
    # Score variants
    y_probs = pipe.predict_proba(variants)[:,1]
    # Compute differences from base score 
    y_diffs = y_probs[0] - y_probs[1:]
    # Normalize
    if norm:
        y_diffs /= np.linalg.norm(y_diffs, ord=1)
    # Assign weights to tokens
    weight_dict = {k:v for k,v in zip(list(sorted(set(tokens))), y_diffs)}
    # Generate output
    output = '<pre>' + text + '</pre>'
    for k,v in weight_dict.items():
        span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)
        output = re.sub(r'\b%s\b' % k, span_string, output)
    return output

In [18]:
output = highlight_toxic('Thank you for the suggestion', pipe)
print(pipe.predict_proba(['Thank you for the suggestion']))
HTML(output)

[[0.93358762 0.06641238]]


In [19]:
output = highlight_toxic('I hope you have a bad day', pipe)
print(pipe.predict_proba(['I hope you have a bad day']))
HTML(output)

[[0.77770784 0.22229216]]


In [20]:
output = highlight_toxic('This is the worst edit ever. Go jump off a bridge.', pipe)
print(pipe.predict_proba(['This is the worst edit ever. Go jump off a bridge.']))
HTML(output)

[[0.06290301 0.93709699]]


Let's try it out on some random samples.

In [21]:
normal_comments = df[df['y']==0].sample(10, random_state=seed)['comment']
y_prob_normal = pipe.predict_proba(normal_comments)
normal_examples = [highlight_toxic(comment, pipe) for comment in normal_comments]

In [22]:
i = 1
print(y_prob_normal[i])
HTML(normal_examples[i])

[0.96518533 0.03481467]


In [23]:
toxic_comments = df[df['y']==1].sample(10, random_state=seed)['comment']
y_prob_toxic = pipe.predict_proba(toxic_comments)
toxic_examples = [highlight_toxic(comment, pipe) for comment in toxic_comments]

In [24]:
i = 0
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.40919978 0.59080022]


In [25]:
i = 1
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.37620651 0.62379349]


In [26]:
i = 7
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.24845623 0.75154377]


In [27]:
i = 8
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[3.18633803e-06 9.99996814e-01]


So...it seems like dropping out one token at a time isn't going to cut it. Even though the model is scoring this comment as toxic, it is having some issues highlighting specific segments. Let's dive into the model and drive highlighting by the exact contribution to each prediction.

In [28]:
comment = toxic_comments.values[8]
features = pipe.named_steps['vect'].get_feature_names() # Don't cast this to a numpy array...
counts = pipe.named_steps['vect'].transform([comment]).toarray().flatten()
w = pipe.named_steps['clf'].coef_.flatten()

In [29]:
idx = np.where(counts > 0)[0]
weight_dict = {}
for i in idx:
    if len(features[i].split(' ')) == 1: # Only keep unigrams
        weight_dict[features[i]] = w[i]*counts[i]
weight_dict

{'a': 0.9506251995001473,
 'about': -0.2573779969605216,
 'an': 0.24775747406102963,
 'and': -0.26113798744752736,
 'any': -0.04195150960840179,
 'anyone': -0.17851617027838612,
 'article': -0.917904310846447,
 'as': -0.1915447073542239,
 'at': -0.1695905146795088,
 'banking': -0.21517154754479062,
 'bias': 0.14695020051432892,
 'big': 0.4201936147568579,
 'by': -0.007789988269459465,
 'calling': -0.004525334154367213,
 'check': -0.32613314230385276,
 'considers': 0.03558991071102402,
 'conspiracy': 0.08777763038538694,
 'criticizes': -0.022762161378118775,
 'day': -0.07177890620233918,
 'did': -0.23221968775288868,
 'discussion': -0.15508429059649287,
 'don': -0.028806586242888867,
 'done': -0.09142575896360813,
 'douchebag': 3.0288359785357937,
 'economists': -0.02508670957939663,
 'editor': -0.10446221268238955,
 'experts': 0.04890261771595807,
 'fact': -0.03430937874263487,
 'fat': 1.1855633862722827,
 'flag': 0.16822040909077168,
 'for': 0.02230599771168577,
 'fractional': -0.0467

In [30]:
output = '<pre>' + comment + '</pre>'
for k,v in weight_dict.items():
    if v < 0:
        pass
    else:
        span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)        
        output = re.sub(r'\b%s\b' % k, span_string, output, re.IGNORECASE)

In [31]:
HTML(output)

Now the functional form of the above process.

In [32]:
def highlight_toxic(comment, pipe, features, w):
    counts = pipe.named_steps['vect'].transform([comment]).toarray().flatten()
    # Build weight dictionary (unigram only)
    idx = np.where(counts > 0)[0]
    weight_dict = {}
    for i in idx:
        if len(features[i].split(' ')) == 1: # Only keep unigrams
            weight_dict[features[i]] = w[i]*counts[i]
    # Scale to max value
    
    # Insert spans
    output = '<pre>' + comment + '</pre>'
    for k,v in weight_dict.items():
        if v < 0:
            pass
        else:
            # Some regex magic for keeping the original case
            span_string = r'<span style="background-color: rgba(255, 0, 0, %0.2f)">\1</span>' % v
            output = re.sub(r'(?i)(\b%s\b)' % k, span_string, output)
    return output

In [33]:
features = pipe.named_steps['vect'].get_feature_names()
w = pipe.named_steps['clf'].coef_.flatten()
toxic_examples = [highlight_toxic(comment, pipe, features, w) for comment in toxic_comments]

In [34]:
HTML(toxic_examples[3])

In [35]:
HTML(toxic_examples[7])