## TODO

* Table containing accuracy, precision, specificity, recall, f1, roc-auc for best classifier configurations
* Roc-auc graphs for each best classifier configuration
* Table of accuracy, precision, specificity, and percent uncertainty at each threshold
* Graphs of accuracy, precision, specificity, and percent uncertainty at each threshold
* Some kind of graph/table using empath (top categories at each threshold?)


In [99]:
import json
import yaml
import importlib
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, average_precision_score, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score, roc_curve, auc, precision_recall_curve, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from scipy.sparse import load_npz

from empath import Empath
from collections import Counter
from functools import partial

class DummyEstimator(BaseEstimator):
    """Placeholder estimator occupying the ``'clf'`` step of a Pipeline.

    The grid searches displayed in this notebook are built around
    ``Pipeline(steps=[('clf', DummyEstimator())])`` and swap the real
    classifier in through the parameter grid, so this class only needs to
    exist and be a valid estimator.  The methods accept the standard
    scikit-learn ``X`` / ``y`` arguments (all optional, so the previous
    zero-argument calls keep working) instead of raising ``TypeError`` if a
    pipeline ever invokes them.
    """

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``, following the scikit-learn convention."""
        return self

    def score(self, X=None, y=None):
        """Do nothing; always returns ``None``."""
        return None

In [84]:
# Disable column-width truncation so long text cells are shown in full when
# DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [3]:
# Pickled bundle of fitted model-selection results (indexed later in the
# notebook through its 'classifiers' and 'indices' keys).
MODEL_BINARY_PATH = '../models/best_models_w2v_rating_2021-05-29.pickle'

# NOTE: pickle.load can execute arbitrary code from the file -- only load
# model binaries from a trusted source.
# Presumably DummyEstimator must already be defined in this session for the
# unpickling to succeed, since the stored pipelines reference it -- TODO confirm.
with open(MODEL_BINARY_PATH, 'rb') as filestream:
    d = pickle.load(filestream)

In [4]:
# Separately saved bundle holding the SVC results; note that this rebinds
# MODEL_BINARY_PATH, which previously pointed at the first bundle.
MODEL_BINARY_PATH = '../models/best_models_w2v_rating_SVC_2021-06-03.pickle'

# NOTE: pickle.load can execute arbitrary code from the file -- only load
# model binaries from a trusted source.
with open(MODEL_BINARY_PATH, 'rb') as filestream:
    d_svc = pickle.load(filestream)

In [5]:
# Metric name -> scoring callable.  results_to_json calls each entry as
# fn(truth, predictions), except 'roc_auc', which it feeds the positive-class
# probabilities instead of hard predictions.
METRICS = {
    'accuracy':accuracy_score,
    # partial() with no bound arguments: behaves like precision_score itself.
    'precision':partial(precision_score),
    # Precision computed with -1 treated as the positive label.
    'negative predictive value':partial(precision_score,pos_label = -1),
    'recall':recall_score,
    'f1':f1_score,
    # average precision is currently disabled
    #'average_precision':average_precision_score,
    'roc_auc':roc_auc_score
}

def generate_sample(df,n,balance=True):
    """Return rows of ``df`` whose ``aft_net_sign_helpful`` label is non-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``aft_net_sign_helpful`` column.
    n : int or None
        Number of rows to draw.  A falsy value (``None`` or ``0``) returns
        every row with a non-zero label, unsampled.
    balance : bool
        When true (and ``n`` is truthy), draw ``int(n / 2)`` rows with a
        positive label followed by ``int(n / 2)`` rows with a negative label,
        so an odd ``n`` yields ``n - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original index preserved.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when more rows are requested
        than are available.
    """
    label = df['aft_net_sign_helpful']

    if not n:
        return df.loc[label != 0]

    if not balance:
        return df.loc[label != 0].sample(n)

    half = int(n/2)
    pos = df.loc[label > 0].sample(half)
    neg = df.loc[label < 0].sample(half)
    # DataFrame.append was removed in pandas 2.0; concat yields the same
    # result (positives first, then negatives, original index kept).
    return pd.concat([pos, neg])

def proba_to_preds(probability_list,threshold=.5):
    """Convert per-sample class probabilities into hard -1 / 1 predictions.

    Each element of ``probability_list`` is indexed at position 1 (the second
    entry); a sample is labelled ``1`` when that value is strictly greater
    than ``threshold`` and ``-1`` otherwise.  Returns a plain list.
    """
    return [1 if pair[1] > threshold else -1 for pair in probability_list]

def results_to_table(proba,truth):
    """Return the metrics from ``results_to_json`` as a one-row DataFrame.

    The columns are the metric names and the single row holds their values.
    """
    return pd.DataFrame([results_to_json(proba,truth)])

def results_to_json(proba,truth):
    """Score predicted probabilities against ``truth`` with every METRICS entry.

    Hard predictions come from ``proba_to_preds`` at its default threshold.
    Every metric is computed on those hard predictions except ``'roc_auc'``,
    which is given ``proba[:, 1]`` (so ``proba`` must support 2-D slicing).

    Returns a dict mapping metric name to its score.
    """
    hard_preds = proba_to_preds(proba)
    return {
        name: (scorer(truth,proba[:, 1]) if name == 'roc_auc' else scorer(truth,hard_preds))
        for name, scorer in METRICS.items()
    }

In [6]:
# JSON export of the vectorized dataset; it is loaded into a DataFrame and
# read for its 'feature_vector' and 'aft_net_sign_helpful' columns.
feature_path_w2v = '../datasets/vectorized/vectorized_w2v_rating_2021-03-31.json'

def load_dataset(feature_path,sparse_matrix_path=None):
    """Load the labelled dataset and its feature matrix.

    The JSON file at ``feature_path`` is read into a DataFrame, rows with a
    zero ``aft_net_sign_helpful`` label are dropped (``generate_sample`` with
    ``n=None``), and the index is reset (the previous index is kept as an
    ``index`` column).

    Features come from the ``.npz`` file at ``sparse_matrix_path`` when it is
    given, otherwise from the frame's ``feature_vector`` column expanded into
    a dense NumPy array.

    NOTE(review): the sparse matrix is returned as loaded and is not filtered
    alongside the frame -- presumably it was saved already aligned with the
    non-zero-label rows; confirm against whatever produced it.

    Returns
    -------
    (features, labels, df)
        The feature matrix, the label array, and the filtered DataFrame.
    """
    with open(feature_path,'r') as handle:
        records = json.load(handle)

    frame = generate_sample(pd.DataFrame(records), None).reset_index()

    if not sparse_matrix_path:
        vectors = frame['feature_vector'].values.tolist()
        features = pd.DataFrame(vectors).to_numpy()
    else:
        with open(sparse_matrix_path,'rb') as handle:
            features = load_npz(handle)

    return features, frame['aft_net_sign_helpful'].to_numpy(), frame

# No sparse matrix path is passed, so features are built densely from the
# 'feature_vector' column of the loaded frame.
features, labels, df = load_dataset(feature_path_w2v)

In [7]:
# Rebuild the train/test partition from the indices stored in the model
# bundle rather than drawing a new split.
# Presumably these are positional row indices into the filtered dataset
# returned by load_dataset (df_test uses .iloc) -- TODO confirm.
labels_train = labels[d['indices']['train']]
labels_test = labels[d['indices']['test']]
features_train = features[d['indices']['train']]
features_test = features[d['indices']['test']]
df_test = df.iloc[d['indices']['test']]

In [8]:
# Drop the SVC entry of the first bundle, then copy every classifier from the
# separately saved SVC bundle into it (same-named keys are overwritten).
# Re-running this cell raises KeyError unless the copy re-adds an 'SVC' key.
del d['classifiers']['SVC']

for c in d_svc['classifiers']:
    d['classifiers'][c] = d_svc['classifiers'][c]

In [9]:
# Display the merged bundle to inspect the stored grid searches.
d

{'classifiers': {'GradientBoostingClassifier': GridSearchCV(estimator=Pipeline(steps=[('clf', DummyEstimator())]), n_jobs=32,
               param_grid=[{'clf': [GradientBoostingClassifier(learning_rate=0.01,
                                                               max_depth=7,
                                                               max_features='log2',
                                                               min_samples_leaf=13,
                                                               n_estimators=700)],
                            'clf__learning_rate': [0.01, 0.1, 0.5, 1],
                            'clf__max_depth': [1, 3, 5, 7],
                            'clf__max_features': ['log2'],
                            'clf__min_samples_leaf': [1, 3, 5, 7, 13],
                            'clf__n_estimators': [100, 300, 500, 700]}],
               pre_dispatch=64, refit='roc_auc',
               scoring=('roc_auc', 'f1', 'accuracy', 'recall', 'precision')),
  '

In [10]:
# Classifier name -> predict_proba output on the held-out test features.
# Later cells read column 0 as the negative-class and column 1 as the
# positive-class probability.
probas = {}

for model in d['classifiers']:
    probas[model] = d['classifiers'][model].predict_proba(features_test)

In [26]:
# Module-level Empath lexicon; the category functions in this notebook read
# it as a global.
lexicon = Empath()
# Smoke test on a short sentence: returns a dict of category -> score
# (with normalize=True the scores are presumably counts divided by the
# number of tokens -- see the Empath documentation).
lexicon.analyze("he hit the other person", normalize=True)

{'help': 0.0,
 'office': 0.0,
 'dance': 0.0,
 'money': 0.0,
 'wedding': 0.0,
 'domestic_work': 0.0,
 'sleep': 0.0,
 'medical_emergency': 0.0,
 'cold': 0.0,
 'hate': 0.0,
 'cheerfulness': 0.0,
 'aggression': 0.0,
 'occupation': 0.0,
 'envy': 0.0,
 'anticipation': 0.0,
 'family': 0.0,
 'vacation': 0.0,
 'crime': 0.0,
 'attractive': 0.0,
 'masculine': 0.0,
 'prison': 0.0,
 'health': 0.0,
 'pride': 0.0,
 'dispute': 0.0,
 'nervousness': 0.0,
 'government': 0.0,
 'weakness': 0.0,
 'horror': 0.0,
 'swearing_terms': 0.0,
 'leisure': 0.0,
 'suffering': 0.0,
 'royalty': 0.0,
 'wealthy': 0.0,
 'tourism': 0.0,
 'furniture': 0.0,
 'school': 0.0,
 'magic': 0.0,
 'beach': 0.0,
 'journalism': 0.0,
 'morning': 0.0,
 'banking': 0.0,
 'social_media': 0.0,
 'exercise': 0.0,
 'night': 0.0,
 'kill': 0.0,
 'blue_collar_job': 0.0,
 'art': 0.0,
 'ridicule': 0.0,
 'play': 0.0,
 'computer': 0.0,
 'college': 0.0,
 'optimism': 0.0,
 'stealing': 0.0,
 'real_estate': 0.0,
 'home': 0.0,
 'divine': 0.0,
 'sexual': 0.0

In [82]:
def get_empath_categories_diff(proba, labels, text, threshold=.5, step=.1, n_categories=5):
    """Top Empath categories per confusion-matrix cell within one probability band.

    Only samples for which either class probability lies strictly between
    ``threshold`` and ``threshold + step`` are kept (values exactly on a band
    edge are excluded).  The kept samples are split into ``tp``, ``tn``,
    ``fp`` and ``fn`` by comparing the two probabilities against the -1 / 1
    labels; samples whose two probabilities are equal fall into no cell.

    Parameters
    ----------
    proba : array
        Indexed as ``proba.T[0]`` (negative class) and ``proba.T[1]``
        (positive class).
    labels : sequence
        True labels, compared against ``1`` and ``-1``.
    text : sequence of str
        One text per sample.
    threshold, step : float
        Lower edge and width of the probability band.
    n_categories : int
        Number of top-scoring categories to keep per cell.

    Returns
    -------
    dict
        Maps ``'tp'``, ``'tn'``, ``'fp'``, ``'fn'`` to a space-separated
        string of category names ordered by descending score, or ``None``
        when the lexicon returns nothing for that cell's text.
    """
    frame = pd.DataFrame({
        'proba_neg':proba.T[0],
        'proba_pos':proba.T[1],
        'labels':labels,
        'text':text
    })

    # Restrict to the (threshold, threshold + step) band on either class.
    upper = threshold+step
    pos_in_band = (frame['proba_pos'] > threshold) & (frame['proba_pos'] < upper)
    neg_in_band = (frame['proba_neg'] > threshold) & (frame['proba_neg'] < upper)
    frame = frame.loc[pos_in_band | neg_in_band]

    predicted_pos = frame['proba_pos'] > frame['proba_neg']
    predicted_neg = frame['proba_pos'] < frame['proba_neg']
    actual_pos = frame['labels'] == 1
    actual_neg = frame['labels'] == -1
    confusion = {
        'tp': frame.loc[predicted_pos & actual_pos],
        'tn': frame.loc[predicted_neg & actual_neg],
        'fp': frame.loc[predicted_pos & actual_neg],
        'fn': frame.loc[predicted_neg & actual_pos],
    }

    cat_dict = {}
    for cell, rows in confusion.items():
        # Score the concatenated text of the whole cell in one pass.
        scores = lexicon.analyze(' '.join(rows['text'].values), normalize=True)
        if not scores:
            cat_dict[cell] = None
            continue
        # Stable sort: categories with equal scores keep the lexicon's order.
        ranked = sorted(scores, key=scores.get, reverse=True)
        cat_dict[cell] = ' '.join(ranked[:n_categories])

    return cat_dict

## Topic Analysis

In order to better understand classifier performance, we investigate the topical makeup of both correct and incorrect classifications.  Using a dictionary-based approach, we generate the top 5 topics at each probability threshold for all four quadrants of the confusion matrix: true positives, true negatives, false positives, and false negatives.  We use the Python library Empath to generate categories at each threshold level.

By comparing topics across these four quadrants we can determine whether misclassifications systematically affect certain topics, or whether these errors are relatively evenly distributed.  Systematically misclassifying specific types of feedback could introduce more bias, which would ultimately negate the end goal of this project.  Conversely, understanding what types of content are being correctly classified may help us understand where the classifier excels.

Overall, we observe relatively high overlap between topics across thresholds and quadrants of the confusion matrix.  Of the 20 potential cells in the confusion-topic matrix (Table X), 'internet' occurs 14 times, 'communication' occurs 13 times, 'writing' occurs 10 times, and 'reading' occurs 9 times.  Notably, none of these top topics appear for negative predictions when our confidence threshold is set to .9, or for positive predictions when our confidence threshold is set to .8 (recall that our classifier does not make any positive predictions with above .9 confidence).  This likely indicates that these common topics are too general to help our classifier make a confident prediction.

Some topics appear consistently in either positive or negative predictions.  Journalism appears at the .5 through .7 confidence thresholds for both true and false positive predictions, but in none of our negative predictions.  Similarly, negative emotion appears at all thresholds for true negatives, at the .7 and .8 thresholds for false negatives, and in none of the positive predictions.  These common topics may be strong predictors of either positive or negative feedback, but their appearance in both the true and false rows of the confusion matrix indicates the highly contextual nature of the concept of helpfulness when applied to reader feedback.  For instance, while negative emotion appears in most unhelpful feedback, it also appears in some helpful feedback that our model misclassifies as unhelpful.  Similarly, while the topic journalism appears in most helpful feedback, our model tends to misclassify unhelpful feedback that falls under the category journalism.

Notably, the predictions about which our classifier is most confident also have the highest number of unique topics.  This could be because these topics are strong predictors of helpful or unhelpful feedback and therefore result in high-confidence predictions, or it could simply result from a low number of predictions, which produces a somewhat random distribution of topics.  For instance, 'domestic_work' and 'family' appear for both true negatives and false negatives, but only at the .9 confidence threshold.  'Home' and 'shopping' appear only for true negatives at the .9 confidence threshold, and 'sexual', 'swearing_terms', and 'ridicule' appear only for false negatives at the .9 confidence threshold.  'Help', 'office', 'dance', and 'money' all appear only for true and false positives at the .8 confidence level.

In [144]:
def remove_uncertain_predictions(proba, labels, text, threshold=.5):
    """Keep only the samples predicted with confidence above ``threshold``.

    Builds a DataFrame with columns ``proba_neg`` (``proba.T[0]``),
    ``proba_pos`` (``proba.T[1]``), ``labels`` and ``text``, then returns the
    rows where either probability is strictly greater than ``threshold``.
    The original row positions are preserved in the index.
    """
    neg_scores, pos_scores = proba.T[0], proba.T[1]
    frame = pd.DataFrame({
        'proba_neg':neg_scores,
        'proba_pos':pos_scores,
        'labels':labels,
        'text':text
    })
    confident_pos = frame['proba_pos'] > threshold
    confident_neg = frame['proba_neg'] > threshold
    return frame.loc[confident_pos | confident_neg]

def get_empath_categories(proba, labels, text, threshold=.5, step=.1, n_categories=5):
    """Top Empath categories per confusion-matrix cell above a confidence threshold.

    Samples are first filtered with ``remove_uncertain_predictions`` (either
    class probability strictly above ``threshold``), then split into ``tp``,
    ``tn``, ``fp`` and ``fn`` by comparing the two probabilities against the
    -1 / 1 labels; samples whose two probabilities are equal fall into no
    cell.

    Parameters
    ----------
    proba, labels, text, threshold
        Passed through to ``remove_uncertain_predictions``.
    step : float
        Not used by this function; it only mirrors the signature of
        ``get_empath_categories_diff``.
    n_categories : int
        Number of top-scoring categories to keep per cell.

    Returns
    -------
    dict
        Maps ``'tp'``, ``'tn'``, ``'fp'``, ``'fn'`` to a space-separated
        string of category names ordered by descending score, or ``None``
        when the lexicon returns nothing for that cell's text.
    """
    frame = remove_uncertain_predictions(proba, labels, text, threshold)

    predicted_pos = frame['proba_pos'] > frame['proba_neg']
    predicted_neg = frame['proba_pos'] < frame['proba_neg']
    actual_pos = frame['labels'] == 1
    actual_neg = frame['labels'] == -1
    confusion = {
        'tp': frame.loc[predicted_pos & actual_pos],
        'tn': frame.loc[predicted_neg & actual_neg],
        'fp': frame.loc[predicted_pos & actual_neg],
        'fn': frame.loc[predicted_neg & actual_pos],
    }

    cat_dict = {}
    for cell, rows in confusion.items():
        # Score the concatenated text of the whole cell in one pass.
        scores = lexicon.analyze(' '.join(rows['text'].values), normalize=True)
        if not scores:
            cat_dict[cell] = None
            continue
        # Stable sort: categories with equal scores keep the lexicon's order.
        ranked = sorted(scores, key=scores.get, reverse=True)
        cat_dict[cell] = ' '.join(ranked[:n_categories])

    return cat_dict

# Cumulative view: for each confidence threshold, the top Empath categories of
# every confusion-matrix cell among predictions made above that threshold.
# NOTE: this rebinds the notebook-global `df` (previously the loaded dataset).
step = .1
n_categories = 5

threshold_rows = []
for threshold in np.arange(.5,1,step):
    cat_dict = get_empath_categories(
        probas['GradientBoostingClassifier'],
        labels_test,
        df_test['aft_comment'].values,
        threshold,
        step,
        n_categories
    )
    # Round for display: np.arange accumulates floating-point error.
    cat_dict['threshold'] = '{0}'.format(round(threshold,1))
    threshold_rows.append(pd.DataFrame([cat_dict]))

# DataFrame.append was removed in pandas 2.0; a single concat builds the same
# table (every row keeps index 0, as before) without re-copying per iteration.
df = pd.concat(threshold_rows)

# Count how often each category appears, overall ('all') and per cell.
word_counts = {'all':Counter()}
for _, row in df.iterrows():
    for col in df.columns:
        if col == 'threshold':
            continue
        col_counter = word_counts.setdefault(col, Counter())
        # Cells with no predictions hold None and contribute nothing.
        if row[col]:
            words = row[col].split(' ')
            word_counts['all'].update(words)
            col_counter.update(words)

for key, counts in word_counts.items():
    print(key, counts)

df

all Counter({'internet': 14, 'communication': 13, 'writing': 10, 'reading': 9, 'negative_emotion': 7, 'journalism': 6, 'messaging': 3, 'phone': 3, 'speaking': 2, 'computer': 2, 'help': 2, 'office': 2, 'dance': 2, 'money': 2, 'domestic_work': 2, 'family': 2, 'business': 1, 'hate': 1, 'wedding': 1, 'social_media': 1, 'home': 1, 'shopping': 1, 'sexual': 1, 'swearing_terms': 1, 'ridicule': 1})
tp Counter({'internet': 3, 'communication': 3, 'writing': 3, 'journalism': 3, 'reading': 3, 'speaking': 1, 'help': 1, 'office': 1, 'dance': 1, 'money': 1})
tn Counter({'negative_emotion': 5, 'communication': 4, 'internet': 4, 'writing': 2, 'messaging': 2, 'phone': 2, 'reading': 1, 'hate': 1, 'domestic_work': 1, 'family': 1, 'home': 1, 'shopping': 1})
fp Counter({'internet': 3, 'communication': 3, 'writing': 3, 'journalism': 3, 'reading': 3, 'help': 1, 'office': 1, 'dance': 1, 'money': 1, 'wedding': 1})
fn Counter({'internet': 4, 'communication': 3, 'writing': 2, 'reading': 2, 'negative_emotion': 2, '

Unnamed: 0,tp,tn,fp,fn,threshold
0,internet communication writing journalism reading,communication internet writing reading negative_emotion,internet communication writing journalism reading,communication internet writing reading speaking,0.5
0,internet communication writing journalism reading,internet communication negative_emotion writing messaging,internet communication writing journalism reading,internet communication writing reading business,0.6
0,internet communication writing journalism reading,negative_emotion internet communication messaging phone,communication internet writing reading journalism,internet communication negative_emotion computer messaging,0.7
0,speaking help office dance money,negative_emotion internet hate communication phone,help office dance money wedding,internet negative_emotion phone social_media computer,0.8
0,,negative_emotion domestic_work family home shopping,,sexual domestic_work family swearing_terms ridicule,0.9


In [125]:
# Banded view: for each probability band [threshold, threshold + step], the
# top Empath categories of every confusion-matrix cell among predictions whose
# probability falls inside that band.
# NOTE: this rebinds the notebook-global `df`.
step = .1

band_rows = []
for threshold in np.arange(.5,1,step):
    cat_dict = get_empath_categories_diff(
        probas['GradientBoostingClassifier'],
        labels_test,
        df_test['aft_comment'].values,
        threshold,
        step  # pass the band width explicitly so it always matches the label below
    )
    # Round for display: np.arange accumulates floating-point error.
    cat_dict['threshold'] = '{0} - {1}'.format(round(threshold,1),round(threshold + step,1))
    band_rows.append(pd.DataFrame([cat_dict]))

# DataFrame.append was removed in pandas 2.0; a single concat builds the same
# table (every row keeps index 0, as before) without re-copying per iteration.
df = pd.concat(band_rows)

# Count how often each category appears, overall ('all') and per cell.
word_counts = {'all':Counter()}
for _, row in df.iterrows():
    for col in df.columns:
        if col == 'threshold':
            continue
        col_counter = word_counts.setdefault(col, Counter())
        # Cells with no predictions hold None and contribute nothing.
        if row[col]:
            words = row[col].split(' ')
            word_counts['all'].update(words)
            col_counter.update(words)

for key, counts in word_counts.items():
    print(key, counts)

df

all Counter({'internet': 14, 'communication': 13, 'writing': 11, 'reading': 10, 'journalism': 6, 'negative_emotion': 5, 'speaking': 4, 'messaging': 2, 'help': 2, 'office': 2, 'dance': 2, 'money': 2, 'phone': 2, 'domestic_work': 2, 'family': 2, 'business': 1, 'friends': 1, 'hate': 1, 'wedding': 1, 'social_media': 1, 'computer': 1, 'home': 1, 'shopping': 1, 'sexual': 1, 'swearing_terms': 1, 'ridicule': 1})
tp Counter({'internet': 3, 'communication': 3, 'writing': 3, 'journalism': 3, 'reading': 3, 'speaking': 1, 'help': 1, 'office': 1, 'dance': 1, 'money': 1})
tn Counter({'communication': 4, 'internet': 4, 'writing': 3, 'negative_emotion': 3, 'reading': 2, 'speaking': 2, 'messaging': 1, 'phone': 1, 'hate': 1, 'domestic_work': 1, 'family': 1, 'home': 1, 'shopping': 1})
fp Counter({'internet': 3, 'communication': 3, 'writing': 3, 'reading': 3, 'journalism': 3, 'help': 1, 'office': 1, 'dance': 1, 'money': 1, 'wedding': 1})
fn Counter({'internet': 4, 'communication': 3, 'writing': 2, 'reading

Unnamed: 0,tp,tn,fp,fn,threshold
0,internet communication writing journalism reading,communication internet writing reading speaking,internet communication writing reading journalism,communication internet writing reading speaking,0.5 - 0.6
0,internet writing communication journalism reading,communication internet writing reading speaking,internet communication writing journalism reading,internet communication writing reading business,0.6 - 0.7
0,internet communication writing journalism reading,internet communication negative_emotion writing messaging,communication internet writing reading journalism,internet communication friends messaging negative_emotion,0.7 - 0.8
0,speaking help office dance money,negative_emotion internet communication phone hate,help office dance money wedding,internet negative_emotion phone social_media computer,0.8 - 0.9
0,,negative_emotion domestic_work family home shopping,,sexual domestic_work family swearing_terms ridicule,0.9 - 1.0
