In [198]:
%matplotlib inline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.linear_model import Ridge
from IPython.display import HTML
import matplotlib.pyplot as plt
from joblib import load
from tqdm import tqdm
import sqlite3 as sql
import pandas as pd
import numpy as np
import re

seed = 101

Load the entire dataset.

In [2]:
with sql.connect('../data/toxic.db') as conn:
    df = pd.read_sql_query('''select * from toxic''', conn)
df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,2232.0,This:\n:One can make an analogy in mathematica...,2002,1,article,random,train,-1.0,1.0,0.4,0
1,4216.0,"""\n\n:Clarification for you (and Zundark's ri...",2002,1,user,random,train,0.0,2.0,0.5,0
2,8953.0,Elected or Electoral? JHK,2002,0,article,random,test,0.0,1.0,0.1,0
3,26547.0,"""This is such a fun entry. Devotchka\n\nI on...",2002,1,article,random,train,0.0,2.0,0.6,0
4,28959.0,Please relate the ozone hole to increases in c...,2002,1,article,random,test,-1.0,1.0,0.2,0


Split into two seperate dataframes: df_train and df_test.

In [3]:
df_train = df[df['split'] == 'train'].copy().reset_index(drop=True)
df_train.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,2232.0,This:\n:One can make an analogy in mathematica...,2002,1,article,random,train,-1.0,1.0,0.4,0
1,4216.0,"""\n\n:Clarification for you (and Zundark's ri...",2002,1,user,random,train,0.0,2.0,0.5,0
2,26547.0,"""This is such a fun entry. Devotchka\n\nI on...",2002,1,article,random,train,0.0,2.0,0.6,0
3,37330.0,"""\n\n\nI fixed the link; I also removed """"home...",2002,1,article,random,train,-1.0,1.0,0.1,0
4,37346.0,"""If they are """"indisputable"""" then why does th...",2002,1,article,random,train,-1.0,1.0,0.2,0


In [4]:
df_test = df[df['split'] == 'test'].copy().reset_index(drop=True)
df_test.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y
0,8953.0,Elected or Electoral? JHK,2002,0,article,random,test,0.0,1.0,0.1,0
1,28959.0,Please relate the ozone hole to increases in c...,2002,1,article,random,test,-1.0,1.0,0.2,0
2,138074.0,"""\n\n\n\nI'm not sure if it's properly called ...",2002,1,article,random,test,0.0,1.0,0.5,0
3,200664.0,\n\n\n \nThanks on the info on how to move a p...,2002,1,user,random,test,0.0,1.0,0.4,0
4,213105.0,"""\n\n: I should do that too, I agree, but I've...",2002,1,user,random,test,0.0,1.0,0.3,0


Let's grab our "best" model from earlier.

In [5]:
def tokenizer(text):
    return re.findall(r'[a-z0-9]+', text.lower())

gs = load('../results/gs_cv_sgd.joblib')
pipe = gs.best_estimator_

Just to make things easier, let's compute probabilities for the entire test set.

In [6]:
df_test['y_prob'] = pipe.predict_proba(df_test['comment'])[:,1]
df_test.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y,y_prob
0,8953.0,Elected or Electoral? JHK,2002,0,article,random,test,0.0,1.0,0.1,0,0.244198
1,28959.0,Please relate the ozone hole to increases in c...,2002,1,article,random,test,-1.0,1.0,0.2,0,0.17284
2,138074.0,"""\n\n\n\nI'm not sure if it's properly called ...",2002,1,article,random,test,0.0,1.0,0.5,0,0.0037
3,200664.0,\n\n\n \nThanks on the info on how to move a p...,2002,1,user,random,test,0.0,1.0,0.4,0,0.008976
4,213105.0,"""\n\n: I should do that too, I agree, but I've...",2002,1,user,random,test,0.0,1.0,0.3,0,0.007818


# Global surrogates

Many NLP models use complicated neural network architecture that don't exactly lend themselves well to interpretation. A global surrogate is an interpretable model (e.g., decision tree, logistic regression, k-nearest neighbors, etc.) that is trained on the output of the _true_ model. In effect, it tries to distill the complex model into a simpler one, which can have benefits for deployment as well. Our "best" model is a linear model, so this is a bit more direct that the process would normally be. Let's start by looking at the tokens that are most important for predicting each class.

In [7]:
tokens = pipe.named_steps['vect'].get_feature_names() # Note, NOT the same as vocabulary_
weights = pipe.named_steps['clf'].coef_[0]
df_model = pd.DataFrame({'token':tokens, 'weight':weights})
df_model.head()

Unnamed: 0,token,weight
0,0,-0.48531
1,0 0,0.039417
2,0 00,-8.8e-05
3,0 005,-0.007528
4,0 01,-0.002821


In [8]:
df_model.sort_values('weight', ascending=True).head(10)

Unnamed: 0,token,weight
463216,thanks,-1.575386
127997,cool you,-1.079754
463188,thank you,-1.015618
220356,hey hey,-0.978156
58988,are cool,-0.958368
463160,thank,-0.863174
189578,for your,-0.836592
130100,could you,-0.775153
393418,regards,-0.771566
228178,http en,-0.759324


In [9]:
df_model.sort_values('weight', ascending=False).head(10)

Unnamed: 0,token,weight
88865,block block,12.789651
315629,nipple nipple,11.052719
315628,nipple,10.774228
459260,teabag,9.681863
97787,buttsecks,7.755658
540505,wikipedia hi,5.545768
220792,hi wikipedia,5.39969
195137,fuck,4.016078
109999,chester,3.47463
500501,tommy2010,3.472154


# Counterfactual examples
A counterfactual explanation of a prediction describes the smallest change to the prediction instance that results in a change to a predefined output. In the context of this problem, the smallest change that induces a change from toxic to non-toxic or vice-versa. Of course, defining what constitutes a _small_ change is particularly difficult. Here are a couple basic strategies for generating those examples.

## Historical comparison 
We'll start by picking a positive (toxic) example from the test set and finding the _closest_ example from the training set that had a negative (non-toxic) outcome.

In [10]:
idx_test_pos = (df_test['y'] == 1) & (df_test['y_prob'] > 0.5)
df_test[idx_test_pos].sample(5)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,min,max,avg,y,y_prob
7820,122936921.0,\n:Are you an idiot? That plot synopsis wasn't...,2007,1,user,blocked,test,-2.0,1.0,-1.0,1,0.91026
16289,278071348.0,You're the one who amditted that your edits w...,2009,0,article,blocked,test,-1.0,1.0,-0.2,1,0.544424
30518,656766319.0,"\n\nYour threats don't work, motherfucker. I d...",2015,0,user,blocked,test,-2.0,0.0,-1.6,1,0.834805
23007,434447019.0,\n\n== Suspected Sockpuppetry ==\n\nI suspect ...,2011,0,user,random,test,-1.0,2.0,-0.1,1,0.974467
9530,154523620.0,\n\nI think mucha lucha is the most homosexual...,2007,1,article,blocked,test,-2.0,-1.0,-1.1,1,0.99762


In [11]:
comment = df_test.loc[13718]['comment']
print(comment)

"
::Block me. I really don't give a shit! If a source doesn't work for someone, it gets removed. K?    "


Now let's vectorize this comment, vectorize all the negative training instances, and determine which one is the closest to the target comment. We'll start by computing [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) (actually, cosine distance, or 1 - cosine similarity) between the target comment and _all_ training comments.

In [12]:
idx_train_neg = df_train['y'] == 0
X_train_neg = pipe.named_steps['vect'].transform(df_train.loc[idx_train_neg, 'comment'])

In [13]:
X = pipe.named_steps['vect'].transform([comment])

In [14]:
cosine_dist = pairwise_distances(X_train_neg, X, metric='cosine').flatten()
print('AVG: %0.2f' % df_train.loc[idx_train_neg, ['avg']].values[cosine_dist.argmin()][0])
print('Cosine Distance: %0.3f' % cosine_dist.min())
print('<COMMENT>\n' + df_train.loc[idx_train_neg, 'comment'].values[cosine_dist.argmin()] + '\n<COMMENT>')

AVG: 0.60
Cosine Distance: 0.656
<COMMENT>


::::::::::: Okay, found a source, don't know if it's good enough, don't care. It was worth a shot. I apologize for the personal attacks to you, but I'd appreciate it if you wouldn't make sarcastic comments or making fun of what I say. 
<COMMENT>


Interesting. Both are clearly talking about sources, but the second is apologetic and less confrontational. Let's look at the closest three comments.

In [15]:
cosine_dist = pairwise_distances(X_train_neg, X, metric='cosine').flatten()
for i in cosine_dist.flatten().argsort()[:3]:
    print('\nAVG: %0.2f' % df_train.loc[idx_train_neg, ['avg']].values[cosine_dist.argmin()][0])
    print('Cosine Distance: %0.3f' % cosine_dist[i])
    print('<COMMENT>\n' + df_train.loc[idx_train_neg, 'comment'].values[i] + '\n<COMMENT>')


AVG: 0.60
Cosine Distance: 0.656
<COMMENT>


::::::::::: Okay, found a source, don't know if it's good enough, don't care. It was worth a shot. I apologize for the personal attacks to you, but I'd appreciate it if you wouldn't make sarcastic comments or making fun of what I say. 
<COMMENT>

AVG: 0.60
Cosine Distance: 0.656
<COMMENT>
"

== looking for a song ==

I am looking for a song that I thought was by a Flock of Seagulls from obviously WAY BACK that goes ""Don't Change for you, Don't Change a thing for me. It was kind of punkish but made a point a it popped into my mind if anyone can help me. Thanks so much"
<COMMENT>

AVG: 0.60
Cosine Distance: 0.658
<COMMENT>
 Why do people make weird exception for the title Rabbi? Don't give a person a title that needs a degree that they don't have.
<COMMENT>


These examples don't look particularly relevant. Let's look at the three closest positive (toxic) examples.

In [16]:
idx_train_pos = df_train['y'] == 1
X_train_pos = pipe.named_steps['vect'].transform(df_train.loc[idx_train_pos, 'comment'])

In [17]:
cosine_dist = pairwise_distances(X_train_pos, X, metric='cosine').flatten()
for i in cosine_dist.flatten().argsort()[:3]:
    print('\nAVG: %0.2f' % df_train.loc[idx_train_pos, ['avg']].values[cosine_dist.argmin()][0])
    print('Cosine Distance: %0.3f' % cosine_dist[i])
#     print(df_train.loc[idx_train_pos, ['avg']].values[i])
    print('<COMMENT>\n' + df_train.loc[idx_train_pos, 'comment'].values[i] + '\n<COMMENT>')


AVG: -1.40
Cosine Distance: 0.568
<COMMENT>


I don't give a flying fuck. block me I Don't care. kiss my goddamn ass. DUSTIN Motherfucking D
<COMMENT>

AVG: -1.40
Cosine Distance: 0.582
<COMMENT>
And I don't give a damn.  


<COMMENT>

AVG: -1.40
Cosine Distance: 0.591
<COMMENT>


<COMMENT>


## Greedy counterfactual example
Now, instead of mining our training data for counterfactual examples, let's attempt to _create_ one by dropping word occurances from the original text until the score changes. Let's review the previous example.

In [18]:
print(comment)

"
::Block me. I really don't give a shit! If a source doesn't work for someone, it gets removed. K?    "


The steps here are relatively straightforward:
1. Using the defined vectorizer, convert the comment to a raw count vector.
2. Create a variation for each unique token in the raw count vector, such that each variant has a single token masked.
3. Generate a confidence score for each variant.
4. Identify the feature that moved the base score the furthest and mask it across all other variants.
5. Repeat until confidence score crosses threshold.

We'll start by demonstrating a single iteration of the above process.

In [23]:
idx_nonzero = np.nonzero(X.toarray().flatten())[0] # identify all nonzero elements of the target vector
variants = np.repeat(X.toarray(), len(idx_nonzero), axis=0)
# for each variant, mask a single feature (token)
for i, j in enumerate(idx_nonzero):
    variants[i,j] = 0
variants.shape

(38, 560571)

Now we'll generate a prediction for each variant and identify which feature was most impactful.

In [24]:
y_prob_var = pipe.named_steps['clf'].predict_proba(variants)[:,1]
k = y_prob_var.argmin()
print('''Removing token "%s" changes toxic score from %0.1f%% to %0.1f%%''' % (tokens[idx_nonzero[k]], 100*y_prob_var[0], 100*y_prob_var[k]))

Removing token "shit" changes toxic score from 96.8% to 67.4%


Fun stuff. Now let's create a function and repeat the process.

In [64]:
def explain_prediction_cf(comment, tokens, pipe, max_tokens=100):
    X = pipe.named_steps['vect'].transform([comment])
    y_prob_base = pipe.named_steps['clf'].predict_proba(X)[:,1][0]
    idx_nonzero = np.nonzero(X.toarray().flatten())[0]
    variants = np.repeat(X.toarray(), len(idx_nonzero), axis=0)
    for i,j in enumerate(idx_nonzero):
        variants[i,j] = 0
    log = [[None, None, y_prob_base]]
    for step in tqdm(range(max_tokens), total=float('inf')):
        y_prob_var = pipe.named_steps['clf'].predict_proba(variants)[:,1]
        k = y_prob_var.argsort()[step]
#         print(k, tokens[idx_nonzero[k]], y_prob_var[k])
        variants[:,idx_nonzero[k]] = 0
        log.append([k, tokens[idx_nonzero[k]], y_prob_var[k]])
        if y_prob_var[k] < 0.5:
            break
    return log, y_prob_base

In [72]:
log, y_prob_base = explain_prediction_cf(comment, tokens, pipe)
log

2it [00:00,  9.97it/s]


[[None, None, 0.9780473453499616],
 [27, 'shit', 0.6740743903054798],
 [22, 'me', 0.5742807959783442],
 [14, 'give a', 0.4841897354143122]]

Let's format this for ease of consumption.

In [103]:
def get_cf(comment, tokens, pipe):
    log, y_prob_base = explain_prediction_cf(comment, tokens, pipe)
    html = '<pre><h2>Explanation</h2>\n'
    html += 'Removing {'
    for row in log[1:]:
        html += '"%s", ' % row[1]
    html = html[:-2]
    html += '} from the text changes the toxicity score from %0.1f%% to %0.1f%%.' % (100*log[0][2], 100*log[-1][2])
    # Now let's add the original comment with highlighted text
    for row in log[1:]:
        token = row[1]
        comment = re.sub(r'\b%s\b' % token, '<span style="background-color: rgba(255, 0, 0, 0.2)">%s</span>' % token, comment, flags=re.IGNORECASE)  
    html += '<h2>Original</h2>\n%s</pre>' % comment
    return html

In [104]:
HTML(get_cf(comment, tokens, pipe))

2it [00:00, 10.17it/s]


In [101]:
comment_2 = df_test.loc[17621, 'comment']
print(comment_2)

, 8 August 2009 (UTC)


I just saw Xeno's edit comment - A Phone Call????? That's a reliable source that verified it for you???? You're a lousy editor, biased, obstructionist and fixed on defending your article. Fact is the claim was unsupported and unverified and should not be in here until such time as a Reliable Source was produced. You've been completely unable and incapable of providing a source yet more than happy to keep your edit by any means possible. Absolute garbage - and I detest your unsupported allegations that I'm a fucking teabagger, Republican or one-subject editor -ESPECIALLY since I provided supporting links to my NPOV editing. This is how you support an edit: You give it a proper name: Suncoast Regional Emmy Award /You give it a year: 2000. /You give it a title: A Grave Injustice /You give it a channel: WDSU, New Orleans - AND YOU PROVIDE A RELIABLE SOURCE: And you do it without bias according to supporting references. Your a biased hack, your attacks, ignorance, in

In [105]:
HTML(get_cf(comment_2, tokens, pipe))

7it [00:04,  1.49it/s]


# Local Surrogate Model

Local surrogate models are interpretable models (e.g., Logistic Regression, Decision Tree, etc.) that are used to explain individual predictions of black box machine learning models. The steps for computing a local surrogate model are as follows:

1. Generate variants by randomly masking (blanking) features found in the base instance.
2. Compute distance between base instance and each variant.
3. Compute scores for each variant.
4. Train an interpretable model using the inverse distance as the sample weight.
5. Interpret the resulting model.

In [107]:
print(comment)

"
::Block me. I really don't give a shit! If a source doesn't work for someone, it gets removed. K?    "


Let's start by vectorizing the comment (base instance) and randomly masking features from it to create variants.

In [154]:
X = pipe.named_steps['vect'].transform([comment]).toarray()
idx_nonzero = np.nonzero(X.flatten())[0]
idx_nonzero

array([ 13683,  18982,  19217,  88832,  88962, 152681, 152682, 153245,
       153256, 186689, 189069, 201036, 201126, 201907, 201909, 229569,
       230727, 232375, 232385, 254303, 255014, 262939, 290807, 291167,
       388853, 389014, 396308, 424082, 424153, 435431, 435600, 437394,
       437507, 454940, 455496, 456398, 547563, 547652])

In [155]:
mask = np.random.choice([0,1], size=(100, idx_nonzero.shape[0]), p=[0.2,0.8])

In [156]:
X_var = np.repeat(X, repeats=100, axis=0)
X_var[:, idx_nonzero] = mask*X_var[:, idx_nonzero]

In [157]:
sim = cosine_similarity(X_var, X).flatten()
sim[:5]

array([0.92932038, 0.94146887, 0.91701095, 0.8660254 , 0.83937206])

In [204]:
y_prob_var = pipe.named_steps['clf'].predict_proba(X_var)[:,1]
y_prob_var

array([0.97019489, 0.95105696, 0.96889938, 0.97989539, 0.92555573,
       0.43630836, 0.96978832, 0.78926859, 0.98386264, 0.64729942,
       0.96535687, 0.94732183, 0.69693903, 0.98377088, 0.97858002,
       0.97050126, 0.97418296, 0.96731702, 0.95671203, 0.55242902,
       0.74329257, 0.9819456 , 0.98746605, 0.97406047, 0.97280599,
       0.98508819, 0.66433111, 0.97492093, 0.97106533, 0.94439904,
       0.95820659, 0.62470695, 0.9477999 , 0.98789114, 0.97636115,
       0.54214197, 0.87357502, 0.98507409, 0.6270072 , 0.97886626,
       0.96839777, 0.97371749, 0.96132969, 0.95947663, 0.67619483,
       0.97377879, 0.30413632, 0.98656777, 0.98299359, 0.9687295 ,
       0.56434967, 0.95452144, 0.97958165, 0.46180748, 0.96399527,
       0.96714973, 0.97945948, 0.93634001, 0.64978137, 0.96627083,
       0.95530905, 0.59228667, 0.97130483, 0.97422419, 0.93618556,
       0.97834238, 0.98371783, 0.96901908, 0.49515367, 0.95564086,
       0.98426733, 0.94911348, 0.98442473, 0.98931216, 0.97072

In [202]:
clf = Ridge(random_state=seed)
clf.fit(X_var, y_prob_var)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=101, solver='auto', tol=0.001)

In [200]:
df_local = pd.DataFrame({'token':tokens, 'coef':clf.coef_.flatten()})
df_local.sort_values('coef', ascending=True).head()

Unnamed: 0,token,coef
547563,work,-0.036465
230727,i really,-0.030787
291167,me i,-0.023506
435431,someone,-0.023292
232385,if a,-0.02296


In [201]:
df_local.sort_values('coef', ascending=False).head()

Unnamed: 0,token,coef
424082,shit,0.33204
88832,block,0.041888
290807,me,0.034108
201909,give a,0.033178
455496,t give,0.022423


In [141]:
mask

(100, 38)

In [140]:
mask
X_variants[:, idx_nonzero]

ValueError: operands could not be broadcast together with shapes (100,38) (2,) 

Now let's drop one token at a time and create variations on the original comment. We'll keep the original comment as the first item for comparison.

In [11]:
variants = [comment]
for token in sorted(set(tokens)): # The set is very important!
    variants.append(re.sub(token, '', comment))
variants

['Wikipedia is total crap!',
 'Wikipedia is total !',
 'Wikipedia  total crap!',
 'Wikipedia is  crap!',
 'Wikipedia is total crap!']

In [12]:
sorted(set(tokens))

['crap', 'is', 'total', 'wikipedia']

Generate new scores. Let's only look at the probability related to the positive case.

In [13]:
y_probs = pipe.predict_proba(variants)[:,1]
y_probs

array([0.96419754, 0.59595006, 0.95051515, 0.93706812, 0.96419754])

Compute the difference from the base score.

In [14]:
y_diffs = y_probs[0] - y_probs[1:]
y_diffs

array([0.36824748, 0.01368239, 0.02712943, 0.        ])

Now let's assign these score differences as background spans in the original text.

In [15]:
weight_dict = {k:v for k,v in zip(list(sorted(set(tokens))), y_diffs)}
weight_dict

{'crap': 0.36824748498106574,
 'is': 0.013682391040973019,
 'total': 0.02712942587769296,
 'wikipedia': 0.0}

In [16]:
output = '<pre>' + comment + '</pre>'
for k,v in weight_dict.items():
    span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)
    output = re.sub(k, span_string, output)
HTML(output)

Cool, now let's try and functionalize that.

In [17]:
def highlight_toxic(text, pipe, norm=False):
    # Get tokens
    tokens = tokenizer(text)
    # Create variants
    variants = [text]
    for token in sorted(set(tokens)):
        variants.append(re.sub(token, '', text))
    # Score variants
    y_probs = pipe.predict_proba(variants)[:,1]
    # Compute differences from base score 
    y_diffs = y_probs[0] - y_probs[1:]
    # Normalize
    if norm:
        y_diffs /= np.linalg.norm(y_diffs, ord=1)
    # Assign weights to tokens
    weight_dict = {k:v for k,v in zip(list(sorted(set(tokens))), y_diffs)}
    # Generate output
    output = '<pre>' + text + '</pre>'
    for k,v in weight_dict.items():
        span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)
        output = re.sub(r'\b%s\b' % k, span_string, output)
    return output

In [18]:
output = highlight_toxic('Thank you for the suggestion', pipe)
print(pipe.predict_proba(['Thank you for the suggestion']))
HTML(output)

[[0.93358762 0.06641238]]


In [19]:
output = highlight_toxic('I hope you have a bad day', pipe)
print(pipe.predict_proba(['I hope you have a bad day']))
HTML(output)

[[0.77770784 0.22229216]]


In [20]:
output = highlight_toxic('This is the worst edit ever. Go jump off a bridge.', pipe)
print(pipe.predict_proba(['This is the worst edit ever. Go jump off a bridge.']))
HTML(output)

[[0.06290301 0.93709699]]


Let's try it out on some random samples.

In [21]:
normal_comments = df[df['y']==0].sample(10, random_state=seed)['comment']
y_prob_normal = pipe.predict_proba(normal_comments)
normal_examples = [highlight_toxic(comment, pipe) for comment in normal_comments]

In [22]:
i = 1
print(y_prob_normal[i])
HTML(normal_examples[i])

[0.96518533 0.03481467]


In [23]:
toxic_comments = df[df['y']==1].sample(10, random_state=seed)['comment']
y_prob_toxic = pipe.predict_proba(toxic_comments)
toxic_examples = [highlight_toxic(comment, pipe) for comment in toxic_comments]

In [24]:
i = 0
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.40919978 0.59080022]


In [25]:
i = 1
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.37620651 0.62379349]


In [26]:
i = 7
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[0.24845623 0.75154377]


In [27]:
i = 8
print(y_prob_toxic[i])
HTML(toxic_examples[i])

[3.18633803e-06 9.99996814e-01]


So...it seems like dropping out one token at a time isn't going to cut it. Even though the model is scoring this comment as toxic, it is having some issues highlighting specific segments. Let's dive into the model and drive highlighting by the exact contribution to each prediction.

In [28]:
comment = toxic_comments.values[8]
features = pipe.named_steps['vect'].get_feature_names() # Don't cast this to a numpy array...
counts = pipe.named_steps['vect'].transform([comment]).toarray().flatten()
w = pipe.named_steps['clf'].coef_.flatten()

In [29]:
idx = np.where(counts > 0)[0]
weight_dict = {}
for i in idx:
    if len(features[i].split(' ')) == 1: # Only keep unigrams
        weight_dict[features[i]] = w[i]*counts[i]
weight_dict

{'a': 0.9506251995001473,
 'about': -0.2573779969605216,
 'an': 0.24775747406102963,
 'and': -0.26113798744752736,
 'any': -0.04195150960840179,
 'anyone': -0.17851617027838612,
 'article': -0.917904310846447,
 'as': -0.1915447073542239,
 'at': -0.1695905146795088,
 'banking': -0.21517154754479062,
 'bias': 0.14695020051432892,
 'big': 0.4201936147568579,
 'by': -0.007789988269459465,
 'calling': -0.004525334154367213,
 'check': -0.32613314230385276,
 'considers': 0.03558991071102402,
 'conspiracy': 0.08777763038538694,
 'criticizes': -0.022762161378118775,
 'day': -0.07177890620233918,
 'did': -0.23221968775288868,
 'discussion': -0.15508429059649287,
 'don': -0.028806586242888867,
 'done': -0.09142575896360813,
 'douchebag': 3.0288359785357937,
 'economists': -0.02508670957939663,
 'editor': -0.10446221268238955,
 'experts': 0.04890261771595807,
 'fact': -0.03430937874263487,
 'fat': 1.1855633862722827,
 'flag': 0.16822040909077168,
 'for': 0.02230599771168577,
 'fractional': -0.0467

In [30]:
output = '<pre>' + comment + '</pre>'
for k,v in weight_dict.items():
    if v < 0:
        pass
    else:
        span_string = '''<span style="background-color: rgba(255, 0, 0, %0.2f)">%s</span>''' % (v,k)        
        output = re.sub(r'\b%s\b' % k, span_string, output, re.IGNORECASE)

In [31]:
HTML(output)

Now the functional form of the above process.

In [32]:
def highlight_toxic(comment, pipe, features, w):
    counts = pipe.named_steps['vect'].transform([comment]).toarray().flatten()
    # Build weight dictionary (unigram only)
    idx = np.where(counts > 0)[0]
    weight_dict = {}
    for i in idx:
        if len(features[i].split(' ')) == 1: # Only keep unigrams
            weight_dict[features[i]] = w[i]*counts[i]
    # Scale to max value
    
    # Insert spans
    output = '<pre>' + comment + '</pre>'
    for k,v in weight_dict.items():
        if v < 0:
            pass
        else:
            # Some regex magic for keeping the original case
            span_string = r'<span style="background-color: rgba(255, 0, 0, %0.2f)">\1</span>' % v
            output = re.sub(r'(?i)(\b%s\b)' % k, span_string, output)
    return output

In [33]:
features = pipe.named_steps['vect'].get_feature_names()
w = pipe.named_steps['clf'].coef_.flatten()
toxic_examples = [highlight_toxic(comment, pipe, features, w) for comment in toxic_comments]

In [34]:
HTML(toxic_examples[3])

In [35]:
HTML(toxic_examples[7])