In [1]:
%matplotlib inline
from sklearn.externals.joblib import load
from IPython.display import HTML
import matplotlib.pyplot as plt
import sqlite3 as sql
import pandas as pd
import numpy as np
import re

seed = 101

  return f(*args, **kwds)


Load the dev set from the database.

In [2]:
with sql.connect('../data/toxic.db') as conn:
    df = pd.read_sql_query('''select * from toxic where split="dev"''', conn)
df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,35367.0,""":In an interpreted language your source code ...",2002,1,article,random,dev,-1.0,1.0,0.2,0
1,37675.0,"""-\nThis is not """"creative"""". Those are the d...",2002,0,article,random,dev,-1.0,2.0,0.3,0
2,52473.0,"""\n\nCan anyone provide any justification for ...",2002,1,article,random,dev,0.0,1.0,0.6,0
3,109914.0,"Done. This entry is lond, I'll see about chopp...",2002,1,article,random,dev,-1.0,1.0,0.3,0
4,113226.0,"Note to Eclecticology: Hum, you just brought t...",2002,1,article,random,dev,0.0,1.0,0.4,0


Let's grab our best model from earlier.

In [4]:
def tokenizer(text):
    return re.findall(r'[a-z0-9]+', text.lower())

gs = load('../results/gs_cv_sgd.joblib')
pipe = gs.best_estimator_

Let's start by looking at the tokens that are most important for predicting each class.

In [22]:
tokens = pipe.named_steps['vect'].get_feature_names() # Note, NOT the same as vocabulary_
weights = pipe.named_steps['clf'].coef_[0]
df_model = pd.DataFrame({'token':tokens, 'weight':weights})
df_model.head()

Unnamed: 0,token,weight
0,0,-0.520761
1,0 0,0.014939
2,0 00,-7.9e-05
3,0 00000001,-0.000357
4,0 000001,0.071203


In [25]:
df_model.sort_values('weight', ascending=True).head(10)

Unnamed: 0,token,weight
435563,concernthanks for,-1.657369
2002668,your concernthanks,-1.657369
435562,concernthanks,-1.657369
1700835,thanks,-1.627872
457184,cool you,-1.084424
1700742,thank you,-1.026411
200583,are cool,-0.984181
808895,hey hey,-0.920216
1700675,thank,-0.910976
837492,http en,-0.884435


In [24]:
df_model.sort_values('weight', ascending=False).head(10)

Unnamed: 0,token,weight
305709,block block,12.796378
1169398,nipple nipple,11.051707
1169393,nipple,10.793317
1685046,teabag,9.681793
1685049,teabag teabag,9.666201
705752,freezer freezer,9.287123
705750,freezer,9.234097
341715,buttsecks,7.75566
341716,buttsecks buttsecks,7.740031
1947366,wikipedia hi,5.563389


Wonderful. Can we leverage weights to highlight positive or negative segments within a comment? Let's do a few experiments.

Now let's try a more universal method.

In [218]:
comment = 'Wikipedia is total crap!'
n_variants = 1000
masks = np.random.choice([0,1], size=(n_variants, len(comment)), p=[0.1,0.9])
masks

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 0, 1],
       ...,
       [1, 1, 0, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 0, 1, ..., 1, 1, 1]])

In [219]:
comment_variants = [comment]
for row in masks:
    masked_comment = ''.join([char for char, flag in zip(comment,row) if flag==1])
    comment_variants.append(masked_comment)
comment_variants[:5]

['Wikipedia is total crap!',
 'Wikipeda i totl crap!',
 'Wiipedia i total crap!',
 'Wikipei is tot cra!',
 'Wikiedia is total crap!']

In [247]:
y_probs = pipe.predict_proba(comment_variants)[:,1]
y_probs[:5]

array([0.96057224, 0.87051494, 0.88874274, 0.2699313 , 0.91898656])

In [248]:
y_diffs = y_probs[0] - y_probs[1:]
y_diffs[:5]

array([0.0900573 , 0.0718295 , 0.69064094, 0.04158568, 0.51645136])

In [249]:
weights = np.mean([mask*y_prob for mask, y_prob in zip(masks, y_probs)], axis=0)
# weights = np.mean([mask*y_diff for mask, y_diff in zip(masks, y_diffs)], axis=0)
weights

array([0.62503889, 0.62366656, 0.615581  , 0.62976806, 0.6147776 ,
       0.63087165, 0.60861985, 0.61944684, 0.62742715, 0.61936319,
       0.61653758, 0.63321181, 0.62895351, 0.62031334, 0.61838722,
       0.61602109, 0.61042806, 0.62390479, 0.62525183, 0.62895968,
       0.62811294, 0.62287756, 0.62008034, 0.6282586 ])

In [250]:
pos_weights = np.zeros_like(weights)
for i, weight in enumerate(weights):
    if weight > 0:
        pos_weights[i] = weight
    else:
        pos_weights[i] = 0
pos_weights

array([0.62503889, 0.62366656, 0.615581  , 0.62976806, 0.6147776 ,
       0.63087165, 0.60861985, 0.61944684, 0.62742715, 0.61936319,
       0.61653758, 0.63321181, 0.62895351, 0.62031334, 0.61838722,
       0.61602109, 0.61042806, 0.62390479, 0.62525183, 0.62895968,
       0.62811294, 0.62287756, 0.62008034, 0.6282586 ])

In [251]:
output = '<pre>'
for char, w in zip(comment, pos_weights):
    output += '<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>' % (w,char)
output += '</pre>'
    
HTML(output) 

In [210]:
def explain_comment(comment, pipe, n_variants=1000, smooth=True, scale=None):
    # Create masks
    masks = np.random.choice([0,1], size=(n_variants, len(comment)), p=[0.05,0.95])
    # Generate variants
    comment_variants = [comment]
    for row in masks:
        masked_comment = ''.join([char for char, flag in zip(comment,row) if flag==1])
        comment_variants.append(masked_comment)
    # Generate predictions and compute differences
    y_probs = pipe.predict_proba(comment_variants)[:,1]
#     y_diffs = y_probs[0] - y_probs[1:]
    # Compute weights
#     weights = np.mean([mask*y_diff for mask, y_diff in zip(masks, y_diffs)], axis=0)
    weights = np.mean([mask*y_prob for mask, y_prob in zip(masks, y_probs)], axis=0)
    # Focus on positive weights
    pos_weights = np.zeros_like(weights)
    for i, weight in enumerate(weights):
        if weight > 0:
            pos_weights[i] = weight
        else:
            pos_weights[i] = 0
    # Smooth
    if smooth:
        pos_weights = np.convolve(pos_weights, [0.2, 0.6, 0.2], mode='same')
    # Scale factor
    if scale:
        pos_weights *= scale
    # Insert spans
    output = '<pre>'
    for char, w in zip(comment, pos_weights):
        output += '<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>' % (w,char)
    output += '</pre>'
    return output

In [211]:
output = explain_comment('Thank you for the suggestion', pipe)
HTML(output)

In [212]:
output = explain_comment('I hope you have a bad day', pipe)
HTML(output)

In [213]:
output = explain_comment('This is the worst edit ever. Go jump off a bridge.', pipe)
HTML(output)

Let's try it out on some random samples.

In [198]:
normal_comments = df[df['y']==0].sample(10, random_state=seed)['comment']
examples = [explain_comment(comment, pipe, scale=0.5) for comment in normal_comments]

In [199]:
HTML(examples[1])

In [172]:
HTML(examples[2])

In [207]:
toxic_comments = df[df['y']==1].sample(10, random_state=seed)['comment']
examples = [explain_comment(comment, pipe) for comment in toxic_comments]

In [208]:
HTML(examples[1])

In [209]:
HTML(examples[4])