In [1]:
%matplotlib inline
import html
import re
import sqlite3 as sql

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; newer
# environments need `from joblib import load` instead.
from sklearn.externals.joblib import load

# Fixed random state, passed to DataFrame.sample below so the example comments are reproducible.
seed = 101

Load the dev set from the database.

In [2]:
# sqlite3's connection context manager only commits / rolls back the current
# transaction; it does NOT close the connection. Close it explicitly once the
# frame has been read so the database handle is not leaked in the kernel.
conn = sql.connect('../data/toxic.db')
try:
    # Bind the split name as a parameter: a double-quoted "dev" is an identifier
    # in SQL and is only treated as a string by a SQLite compatibility fallback.
    df = pd.read_sql_query('select * from toxic where split = ?', conn, params=('dev',))
finally:
    conn.close()
df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,35367.0,""":In an interpreted language your source code ...",2002,1,article,random,dev,-1.0,1.0,0.2,0
1,37675.0,"""-\nThis is not """"creative"""". Those are the d...",2002,0,article,random,dev,-1.0,2.0,0.3,0
2,52473.0,"""\n\nCan anyone provide any justification for ...",2002,1,article,random,dev,0.0,1.0,0.6,0
3,109914.0,"Done. This entry is lond, I'll see about chopp...",2002,1,article,random,dev,-1.0,1.0,0.3,0
4,113226.0,"Note to Eclecticology: Hum, you just brought t...",2002,1,article,random,dev,0.0,1.0,0.4,0


Let's grab our best model from earlier.

In [3]:
def tokenizer(text):
    """Split ``text`` into lowercase alphanumeric tokens.

    The text is lowercased first; every maximal run of ``a-z`` / ``0-9`` is a
    token and every other character acts as a separator.

    Returns a list of tokens in order of appearance (duplicates kept).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'[a-z0-9]+', lowered)]

# SECURITY: joblib's load unpickles the file, which can execute arbitrary code;
# only load artifacts that you produced or otherwise trust.
# NOTE(review): `tokenizer` above presumably has to exist under that exact name
# for the pickled pipeline to load -- confirm against the training notebook.
gs = load('../results/gs_cv_sgd.joblib')
# Keep the best estimator found by the search (presumably a fitted scikit-learn search object).
pipe = gs.best_estimator_

Let's start by looking at the tokens that are most important for predicting each class.

In [4]:
# Build one row per vectorizer feature (the output shows both unigrams and bigrams),
# paired with the classifier coefficient at the same position.
# NOTE: get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn releases.
tokens = pipe.named_steps['vect'].get_feature_names() # Note, NOT the same as vocabulary_
# First row of the coefficient matrix of the 'clf' step.
weights = pipe.named_steps['clf'].coef_[0]
df_model = pd.DataFrame({'token':tokens, 'weight':weights})
df_model.head()

Unnamed: 0,token,weight
0,0,-0.520761
1,0 0,0.014939
2,0 00,-7.9e-05
3,0 00000001,-0.000357
4,0 000001,0.071203


In [5]:
# Ten features with the most negative weights (ascending order is the default).
df_model.sort_values(by='weight').iloc[:10]

Unnamed: 0,token,weight
435563,concernthanks for,-1.657369
2002668,your concernthanks,-1.657369
435562,concernthanks,-1.657369
1700835,thanks,-1.627872
457184,cool you,-1.084424
1700742,thank you,-1.026411
200583,are cool,-0.984181
808895,hey hey,-0.920216
1700675,thank,-0.910976
837492,http en,-0.884435


In [6]:
# Ten features with the most positive weights.
df_model.sort_values(by='weight', ascending=False).iloc[:10]

Unnamed: 0,token,weight
305709,block block,12.796378
1169398,nipple nipple,11.051707
1169393,nipple,10.793317
1685046,teabag,9.681793
1685049,teabag teabag,9.666201
705752,freezer freezer,9.287123
705750,freezer,9.234097
341715,buttsecks,7.75566
341716,buttsecks buttsecks,7.740031
1947366,wikipedia hi,5.563389


Interesting. Now let's see if we can use some simple methods to expose decision points. We'll develop a process to tokenize a document and drop out unique tokens. By removing tokens and recomputing the score, we should be able to see how any one token affects the prediction.

In [7]:
# Small hand-written example to develop the drop-one-token procedure on.
comment = 'Wikipedia is total crap!'
# NOTE: this rebinds `tokens`, which previously held the vectorizer's feature names.
tokens = tokenizer(comment)
tokens

['wikipedia', 'is', 'total', 'crap']

Now let's drop one token at a time and create variations on the original comment. We'll keep the original comment as the first item for comparison.

In [8]:
# First entry is the untouched comment; each later entry has one unique token removed.
variants = [comment]
for token in sorted(set(tokens)): # The set is very important!
    # CAVEAT: re.sub is case-sensitive and the tokens are lowercased, so
    # 'wikipedia' does not match 'Wikipedia' and the last variant equals the original.
    # The pattern also has no word boundaries, so it would match inside longer words.
    variants.append(re.sub(token, '', comment))
variants

['Wikipedia is total crap!',
 'Wikipedia is total !',
 'Wikipedia  total crap!',
 'Wikipedia is  crap!',
 'Wikipedia is total crap!']

In [9]:
# The order in which tokens were dropped above: variants[1:] line up with this list.
sorted(set(tokens))

['crap', 'is', 'total', 'wikipedia']

Generate new scores for every variant. We only look at the predicted probability of the positive (toxic) class.

In [10]:
# Column 1 of predict_proba is kept as the positive-class probability; one score per variant.
y_probs = pipe.predict_proba(variants)[:,1]
y_probs

array([0.96057224, 0.58360886, 0.9460504 , 0.93914789, 0.96057224])

Compute the difference from the base score.

In [11]:
# Base score minus each variant's score: a positive value means that removing
# the token lowered the positive-class probability.
y_diffs = y_probs[0] - y_probs[1:]
y_diffs

array([0.37696339, 0.01452184, 0.02142435, 0.        ])

Now let's assign these score differences as background spans in the original text.

In [12]:
# Pair each dropped token (same sorted order used to build the variants) with its score difference.
weight_dict = dict(zip(sorted(set(tokens)), y_diffs))
weight_dict

{'crap': 0.3769633854896386,
 'is': 0.014521843133698198,
 'total': 0.02142435427322198,
 'wikipedia': 0.0}

In [13]:
# Wrap each token in a red background span whose alpha is the token's score difference.
output = '<pre>' + comment + '</pre>'
for k,v in weight_dict.items():
    span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)
    # CAVEAT: case-sensitive match without word boundaries, applied to text that
    # already contains the markup inserted by earlier iterations.
    output = re.sub(k, span_string, output)
HTML(output)

Cool, now let's try and functionalize that.

In [14]:
def highlight_toxic(text, pipe, norm=False):
    """Render ``text`` as HTML, shading each token by its drop-one impact.

    For every unique token, a variant of ``text`` with all whole-word,
    case-insensitive occurrences of that token removed is scored with
    ``pipe.predict_proba``. The drop in positive-class probability (column 1)
    relative to the untouched text becomes the alpha of a red background span.

    Parameters
    ----------
    text : str
        Raw comment. It is HTML-escaped before rendering, so markup inside the
        comment is displayed literally instead of being interpreted.
    pipe : object
        Anything exposing ``predict_proba(list_of_str)`` that returns an
        ``(n, 2)`` array-like.
    norm : bool, default False
        If True, divide the differences by their L1 norm. The division is
        skipped when the norm is zero, so no NaN values are produced.

    Returns
    -------
    str
        ``<pre>...</pre>`` markup. Tokens whose removal does not lower the
        score are left unshaded; alpha values are clamped to ``[0, 1]``.
    """
    # Same character classes as `tokenizer`, but applied to the original text
    # so that matches keep their position and capitalisation.
    word_re = re.compile(r'[A-Za-z0-9]+')

    # Get tokens (unique, in a stable order so they line up with the scores)
    unique_tokens = sorted(set(tokenizer(text)))

    def drop(token):
        # Remove only whole tokens equal to `token`, ignoring case; a plain
        # re.sub(token, '', text) misses 'Wikipedia' and mangles 'This' for 'is'.
        return word_re.sub(
            lambda m: '' if m.group(0).lower() == token else m.group(0), text)

    weight_dict = {}
    if unique_tokens:
        # Create variants (index 0 is the untouched text)
        variants = [text] + [drop(token) for token in unique_tokens]
        # Score variants
        y_probs = np.asarray(pipe.predict_proba(variants))[:, 1]
        # Compute differences from base score
        y_diffs = y_probs[0] - y_probs[1:]
        # Normalize (guard against an all-zero difference vector)
        if norm:
            total = np.linalg.norm(y_diffs, ord=1)
            if total > 0:
                y_diffs = y_diffs / total
        # Assign weights to tokens
        weight_dict = dict(zip(unique_tokens, y_diffs))

    # Generate output in a single pass over the original text. Substituting
    # token by token over the growing markup would also rewrite the inserted
    # tags themselves (tokens such as 'pre', 'span', 'style', '0' or '255').
    pieces = ['<pre>']
    cursor = 0
    for match in word_re.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        word = match.group(0)
        alpha = min(max(float(weight_dict.get(word.lower(), 0.0)), 0.0), 1.0)
        if alpha > 0:
            pieces.append(
                '<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>'
                % (alpha, word))
        else:
            pieces.append(word)
        cursor = match.end()
    pieces.append(html.escape(text[cursor:], quote=False))
    pieces.append('</pre>')
    return ''.join(pieces)

In [15]:
# Polite example: print the class probabilities for the text, then display the highlighted version.
output = highlight_toxic('Thank you for the suggestion', pipe)
print(pipe.predict_proba(['Thank you for the suggestion']))
HTML(output)

[[0.95068748 0.04931252]]


In [16]:
# Mildly negative example: class probabilities, then the highlighted text.
output = highlight_toxic('I hope you have a bad day', pipe)
print(pipe.predict_proba(['I hope you have a bad day']))
HTML(output)

[[0.79067608 0.20932392]]


In [17]:
# Clearly hostile example: class probabilities, then the highlighted text.
output = highlight_toxic('This is the worst edit ever. Go jump off a bridge.', pipe)
print(pipe.predict_proba(['This is the worst edit ever. Go jump off a bridge.']))
HTML(output)

[[0.06163979 0.93836021]]


Let's try it out on some random samples.

In [18]:
# Draw 10 dev-set comments with label y == 0 (reproducible via `seed`), score them,
# and pre-render the highlighted HTML for each one.
normal_comments = df[df['y']==0].sample(10, random_state=seed)['comment']
y_prob_normal = pipe.predict_proba(normal_comments)
# The comprehension variable `comment` is local to the comprehension in Python 3.
normal_examples = [highlight_toxic(comment, pipe) for comment in normal_comments]

In [19]:
# Inspect one sampled y == 0 comment: its class probabilities and highlighted text.
i = 1
print(y_prob_normal[i])
HTML(normal_examples[i])

[0.93636541 0.06363459]


In [20]:
# Same as for the y == 0 sample, but for 10 dev-set comments with label y == 1.
toxic_comments = df[df['y']==1].sample(10, random_state=seed)['comment']
y_prob_toxic = pipe.predict_proba(toxic_comments)
toxic_examples = [highlight_toxic(comment, pipe) for comment in toxic_comments]

In [21]:
# Inspect sampled y == 1 comment 0: class probabilities and highlighted text.
i = 0
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.4498492 0.5501508]


In [22]:
# Inspect sampled y == 1 comment 1: class probabilities and highlighted text.
i = 1
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.32386961 0.67613039]


In [23]:
# Inspect sampled y == 1 comment 7: class probabilities and highlighted text.
i = 7
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.1222434 0.8777566]


In [24]:
# Inspect sampled y == 1 comment 8 (scored as almost certainly positive): probabilities and highlighted text.
i = 8
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[5.71562827e-06 9.99994284e-01]


So... it seems like dropping out one token at a time isn't going to cut it. Even though the model scores this comment as toxic, the drop-one approach has trouble highlighting specific segments. Let's dive into the model instead and drive the highlighting by each token's exact contribution to the prediction.

In [25]:
# Work on the same comment inspected above (position 8 of the y == 1 sample).
comment = toxic_comments.values[8]
features = pipe.named_steps['vect'].get_feature_names() # Don't cast this to a numpy array...
# Vectorize the single comment and flatten to a dense 1-D vector with one entry per feature.
counts = pipe.named_steps['vect'].transform([comment]).toarray().flatten()
# Flattened classifier coefficients, indexed like `features`.
w = pipe.named_steps['clf'].coef_.flatten()

In [26]:
# Positions of the features that actually occur in this comment.
idx = np.flatnonzero(counts > 0)
weight_dict = {}
for i in idx:
    # A feature without a space is a unigram; bigrams are skipped.
    if ' ' not in features[i]:
        # Contribution of the feature to the linear score: coefficient times its value.
        weight_dict[features[i]] = w[i]*counts[i]
weight_dict

{'a': 1.2088480945726323,
 'about': -0.2607603013466355,
 'an': 0.2589215365669668,
 'and': -0.2670906313375103,
 'any': -0.01776723571584512,
 'anyone': -0.15207051850937595,
 'article': -0.9811387856680129,
 'as': -0.16735390192665595,
 'at': -0.16784033999245604,
 'banking': -0.15026369731312011,
 'bias': 0.15444237518778658,
 'big': 0.4241447863419349,
 'by': -0.05913132542296167,
 'calling': -0.034323493004453004,
 'check': -0.358014700930834,
 'considers': 0.023000681317583403,
 'conspiracy': 0.10950413355480988,
 'criticizes': -0.03014604981136815,
 'day': -0.1126251987953881,
 'did': -0.27519608091791237,
 'discussion': -0.1855866443477259,
 'don': -0.04834424964650836,
 'done': -0.08994546895596615,
 'douchebag': 2.940380037365341,
 'douchemaster': 8.323732967922443e-07,
 'economists': 0.0018935977790262074,
 'editor': -0.1384600955434151,
 'experts': 0.06117111631735238,
 'fact': -0.035057845505748884,
 'fat': 1.2038253328081299,
 'flag': 0.13262530786773408,
 'for': 0.027827

In [27]:
# Shade every unigram that pushes the score towards the positive class.
output = '<pre>' + comment + '</pre>'
for k,v in weight_dict.items():
    if v < 0:
        # Negative contributions are left unhighlighted.
        continue
    # \1 re-inserts the text that was matched, so the comment keeps its original
    # capitalisation even though the feature names are lowercase.
    span_string = r'<span style="background-color: rgba(255, 0, 0, %0.2f)">\1</span>' % v
    # flags MUST be passed by keyword: the fourth positional argument of re.sub
    # is `count`, so a positional re.IGNORECASE meant "replace at most 2
    # occurrences, case-sensitively".
    # CAVEAT: substitutions run sequentially over the growing markup, so a token
    # that also occurs in the inserted tags (e.g. 'pre', 'span', '0') would corrupt them.
    output = re.sub(r'(\b%s\b)' % re.escape(k), span_string, output, flags=re.IGNORECASE)

In [28]:
# Render the markup built in the previous cell.
HTML(output)

Now the functional form of the above process.

In [29]:
def highlight_toxic(comment, pipe, features, w, scale=False):
    """Render ``comment`` as HTML, shading unigrams by their model contribution.

    NOTE: this redefines the earlier drop-one-token ``highlight_toxic`` with a
    different signature; cells written for the earlier version must not be
    re-run after this one.

    Parameters
    ----------
    comment : str
        Raw comment. It is HTML-escaped before rendering.
    pipe : object
        Pipeline whose ``named_steps['vect'].transform([comment])`` returns a
        one-row 2-D matrix (sparse or dense) of feature values.
    features : sequence of str
        Feature names indexed like the columns of that matrix. Names containing
        a space are treated as n-grams and ignored.
    w : sequence of float
        Model coefficients indexed like ``features``.
    scale : bool, default False
        If True, divide every contribution by the largest positive one, so the
        strongest token gets alpha 1.0.

    Returns
    -------
    str
        ``<pre>...</pre>`` markup in which each token with a positive
        contribution ``w[i] * value`` is wrapped in a red background span.
        Alpha is clamped to ``[0, 1]``; other tokens are left unshaded.
    """
    # Stay on the (typically sparse) matrix instead of densifying a vector
    # that is as wide as the whole vocabulary.
    row = pipe.named_steps['vect'].transform([comment])

    # Build weight dictionary (unigram only)
    weight_dict = {}
    for i in row.nonzero()[1]:
        value = row[0, i]
        if value > 0 and ' ' not in features[i]:
            weight_dict[features[i]] = float(w[i] * value)

    # Scale to max value (optional)
    peak = max(weight_dict.values(), default=0.0)
    divisor = peak if (scale and peak > 0) else 1.0

    # Insert spans in a single pass over the original text. Substituting token
    # by token over the growing markup would also rewrite the '<pre>' wrapper
    # and previously inserted spans (tokens such as 'pre', 'span', '0', '255').
    # Assumes unigram features are lowercase runs of [a-z0-9], which is how the
    # notebook's `tokenizer` splits text.
    pieces = ['<pre>']
    cursor = 0
    for match in re.finditer(r'[A-Za-z0-9]+', comment):
        pieces.append(html.escape(comment[cursor:match.start()], quote=False))
        word = match.group(0)  # original capitalisation is kept in the output
        alpha = min(max(weight_dict.get(word.lower(), 0.0) / divisor, 0.0), 1.0)
        if alpha > 0:
            pieces.append(
                '<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>'
                % (alpha, word))
        else:
            pieces.append(word)
        cursor = match.end()
    pieces.append(html.escape(comment[cursor:], quote=False))
    pieces.append('</pre>')
    return ''.join(pieces)

In [30]:
# Compute the feature names and flattened coefficients once and reuse them for every comment.
features = pipe.named_steps['vect'].get_feature_names()
w = pipe.named_steps['clf'].coef_.flatten()
# Uses the four-argument highlight_toxic defined just above; this overwrites the
# earlier `toxic_examples` produced by the drop-one-token version.
toxic_examples = [highlight_toxic(comment, pipe, features, w) for comment in toxic_comments]

In [31]:
# Render the coefficient-based highlighting for sampled y == 1 comment 3.
HTML(toxic_examples[3])

In [32]:
# Render the coefficient-based highlighting for sampled y == 1 comment 7.
HTML(toxic_examples[7])