In [1]:
import numpy as np
import pandas as pd

from data_processing import format_raw_df

# Load the raw writers CSV and hand a copy of it to the project's
# format_raw_df helper; `df` is the formatted frame used by every later cell.
df = format_raw_df(pd.read_csv('data/writers.csv').copy())

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
from data_processing import get_random_train_test_split, get_vectorized_inputs_and_label

train_df_rand, test_df_rand = get_random_train_test_split(df[df["is_question"]], test_size=0.2, random_state=40)

## Let's try a simple model

In [3]:
# TODO update train_df_rand

X_train, y_train = get_vectorized_inputs_and_label(train_df_rand)

X_test, y_test = get_vectorized_inputs_and_label(test_df_rand)

KeyError: 'vectors'

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fit a class-weighted random forest on the vectorized training split.
# random_state pins the bootstrap and feature sampling so the metrics computed
# from this model are reproducible on Restart & Run All (40 is the same seed
# the train/test split uses).
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    oob_score=True,  # keep out-of-bag estimates for the training-set evaluation
    random_state=40,
)
clf.fit(X_train, y_train)

# Hard labels and per-class probabilities for the held-out split.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

In [None]:
y_train.value_counts()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score a set of predictions against the true labels.

    :param y_test: true labels
    :param y_predicted: predicted labels, aligned with ``y_test``
    :return: tuple ``(accuracy, precision, recall, f1)``; precision, recall
        and f1 are computed with ``average='weighted'``
    """
    # precision: true positives / (true positives + false positives)
    # recall:    true positives / (true positives + false negatives)
    # f1:        harmonic mean of precision and recall
    weighted_kwargs = dict(pos_label=None, average='weighted')
    precision, recall, f1 = [
        scorer(y_test, y_predicted, **weighted_kwargs)
        for scorer in (precision_score, recall_score, f1_score)
    ]

    # accuracy: (true positives + true negatives) / total
    return accuracy_score(y_test, y_predicted), precision, recall, f1



In [None]:
# Training-set performance estimated from out-of-bag predictions (requires the
# forest to have been fitted with oob_score=True).
# Thanks to https://datascience.stackexchange.com/questions/13151/randomforestclassifier-oob-scoring-method
# argmax yields a column position in oob_decision_function_, not a label; map
# it through clf.classes_ so the predictions use the same label values as
# y_train whatever the label encoding is.
y_train_pred = clf.classes_[np.argmax(clf.oob_decision_function_, axis=1)]

accuracy, precision, recall, f1 = get_metrics(y_train, y_train_pred)
print("Training accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

In [None]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
print("Validation accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

In [None]:
from model_evaluation import get_confusion_matrix_plot

get_confusion_matrix_plot(y_predicted, y_test, figsize=(9, 9))
plt.show()

In [None]:
from model_evaluation import get_roc_plot

get_roc_plot(y_predicted_proba[:,1], y_test, figsize=(10,10))
plt.show()


In [None]:
get_roc_plot(y_predicted_proba[:,1], y_test, fpr_bar=.1, figsize=(10,10))
plt.show()

In [None]:
from model_evaluation import get_calibration_plot

get_calibration_plot(y_predicted_proba[:,1], y_test, figsize=(9,9))


In [None]:
from model_evaluation import get_feature_importance

# Names of the hand-crafted feature columns.
feature_names = [
    "action_verb_full",
    "question_mark_full",
    "norm_text_len",
    "language_question",
]

# 300 positional names followed by the hand-crafted feature names.
# NOTE(review): 300 is presumably the word-vector dimensionality, and the order
# is presumably the column order of the model inputs; confirm against
# get_vectorized_inputs_and_label.
w_indices = list(map("word_vector_index_{}".format, range(300))) + feature_names
all_feature_names = np.array(w_indices)



In [None]:

print("Top 5 importances:\n")
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in get_feature_importance(clf, all_feature_names)[:5]]))

print("\nBottom 5 importances:\n")
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in get_feature_importance(clf, all_feature_names)[-5:]]))

## Let's look at most and least successful examples

In [None]:
from model_evaluation import get_top_k
test_analysis_df = test_df_rand.copy()
test_analysis_df["predicted_proba"] = y_predicted_proba[:, 1]
test_analysis_df["true_label"] = y_test

to_display = [
    "predicted_proba",
    "true_label",
    "Title",
    "body_text",
    "text_len",
    "action_verb_full",
    "question_mark_full",
    "language_question",
]
threshold = 0.5


top_pos, top_neg, worst_pos, worst_neg, unsure = get_top_k(test_analysis_df, "predicted_proba", "true_label", k=2)
pd.options.display.max_colwidth = 100

In [None]:
# Most confident correct positive predictions
top_pos[to_display]

In [None]:
# Most confident correct negative predictions
top_neg[to_display]

In [None]:
# Most confident incorrect negative predictions
worst_pos[to_display]

In [None]:
# Most confident incorrect positive predictions
worst_neg[to_display]

In [None]:
# Most unsure questions
unsure[to_display]

In [None]:
from lime.lime_text import LimeTextExplainer

# NOTE(review): `nlp` is not defined anywhere in the visible notebook; it must
# already exist in the kernel. It is used as a callable that takes a string and
# returns an object with a `.vector` attribute.
vector_store = nlp

# Forest trained on the text vectors only, so that it can be queried with raw
# strings by the LIME helpers. random_state keeps the fit, and therefore the
# explanations, reproducible (40 is the same seed the train/test split uses).
clf_text_only = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    oob_score=True,
    random_state=40,
)
X_train_text = np.vstack(train_df_rand["full_text"].apply(lambda x: nlp(x).vector))
X_test_text = np.vstack(test_df_rand["full_text"].apply(lambda x: nlp(x).vector))
clf_text_only.fit(X_train_text, y_train)

def text_pipeline(examples):
    """Vectorize raw strings with `nlp` and score them with `clf_text_only`.

    :param examples: iterable of strings
    :return: result of ``clf_text_only.predict_proba`` on the stacked vectors
    """
    embedded = np.vstack([nlp(text).vector for text in examples])
    return clf_text_only.predict_proba(embedded)


In [None]:
def explain_one_instance(instance, class_names):
    """Return a LIME explanation of `text_pipeline` for a single text.

    :param instance: the text to explain
    :param class_names: class display names handed to LimeTextExplainer
    :return: the explanation object, limited to 6 features
    """
    return LimeTextExplainer(class_names=class_names).explain_instance(
        instance, text_pipeline, num_features=6
    )

def visualize_one_exp(features, labels, index, class_names = ["Unanswered","Answered"]):
    """Explain one example, print its index and true class, and render the result.

    :param features: indexable collection of texts
    :param labels: indexable collection of labels; ``labels[index]`` is used
        as a position into ``class_names``
    :param index: position of the example to explain
    :param class_names: display names, indexed by label
    """
    chosen_text = features[index]
    explanation = explain_one_instance(chosen_text, class_names=class_names)

    print('Index: %d' % index)
    print('True class: %s' % class_names[labels[index]])

    # Renders inline in the notebook, including the highlighted text.
    explanation.show_in_notebook(text=True)

In [None]:
visualize_one_exp(list(test_df_rand["full_text"]), list(y_test), 7)

In [None]:
import random
from collections import defaultdict

random.seed(40)

def get_statistical_explanation(test_set, sample_size, word2vec_pipeline, label_dict):
    """Aggregate LIME word contributions over a random sample of texts.

    For every sampled sentence the predicted class is the argmax of the first
    row returned by ``word2vec_pipeline([sentence])``. LIME is then asked for
    the 6 words contributing most to that class, and the weights are pooled
    per class and per word.

    :param test_set: sequence of texts to sample from
    :param sample_size: number of texts to draw (without replacement)
    :param word2vec_pipeline: callable taking a list of strings; presumably
        returns one row of class probabilities per string
    :param label_dict: mapping from class index to display name
    :return: dict keyed by display name. Each value is a dict with two float
        Series indexed by word: 'detractors' (sorted ascending) and
        'supporters' (sorted descending). Only classes that were predicted for
        at least one sampled text are present.
    :raises ValueError: from ``random.sample`` when ``sample_size`` is larger
        than ``len(test_set)``
    """
    sample_sentences = random.sample(test_set, sample_size)
    explainer = LimeTextExplainer()

    # class index -> word -> list of LIME weights collected for that word
    contributors = defaultdict(lambda: defaultdict(list))

    # First, find contributing words to each class
    for sentence in sample_sentences:
        probabilities = word2vec_pipeline([sentence])
        curr_label = probabilities[0].argmax()
        exp = explainer.explain_instance(sentence, word2vec_pipeline, num_features=6, labels=[curr_label])
        for word, contributing_weight in exp.as_list(label=curr_label):
            contributors[curr_label][word].append(contributing_weight)

    # Average each word's contribution to a class, and sort them by impact
    sorted_contributions = {}
    for curr_label, lexica in contributors.items():
        # The summed weight is divided by the sample size, not by the number
        # of occurrences, so words that appear in more explanations weigh more.
        # Building the Series in one go with an explicit dtype avoids the
        # object-dtype empty Series that pd.Series(index=...) produces.
        average_contributions = pd.Series(
            {word: np.sum(scores) / sample_size for word, scores in lexica.items()},
            dtype=float,
        )
        sorted_contributions[label_dict[curr_label]] = {
            'detractors': average_contributions.sort_values(),
            'supporters': average_contributions.sort_values(ascending=False),
        }
    return sorted_contributions

label_to_text = {
    0: 'Unanswered',
    1: 'Answered',
}
sorted_contributions = get_statistical_explanation(list(test_df_rand["full_text"]), 5, text_pipeline, label_to_text)



In [None]:
def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):
    """Draw two side-by-side horizontal bar charts of word importances.

    The left panel is titled 'Unanswered' and shows the bottom words, the
    right panel is titled 'Answered' and shows the top words. Top words are
    drawn in ascending score order and bottom words in descending score order
    (first bar at the bottom of each panel).

    :param top_scores: scores for ``top_words``
    :param top_words: words for the right-hand panel
    :param bottom_scores: scores for ``bottom_words``
    :param bottom_words: words for the left-hand panel
    :param name: figure title
    """
    top_pairs = sorted(zip(top_words, top_scores), key=lambda pair: pair[1])
    bottom_pairs = sorted(zip(bottom_words, bottom_scores), key=lambda pair: pair[1], reverse=True)

    top_words = [word for word, _ in top_pairs]
    top_scores = [score for _, score in top_pairs]

    bottom_words = [word for word, _ in bottom_pairs]
    bottom_scores = [score for _, score in bottom_pairs]

    fig, (ax_bottom, ax_top) = plt.subplots(1, 2, figsize=(10, 10))
    fig.suptitle(name, fontsize=16)

    panels = (
        (ax_bottom, 'Unanswered', bottom_words, bottom_scores),
        (ax_top, 'Answered', top_words, top_scores),
    )
    for ax, title, words, scores in panels:
        # Bar positions are computed per panel so the two word lists may have
        # different lengths.
        y_pos = np.arange(len(words))
        ax.barh(y_pos, scores, align='center', alpha=0.5)
        ax.set_title(title, fontsize=20)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(words, fontsize=14)
        ax.set_xlabel('Importance', fontsize=20)

    # Extra horizontal space so the right panel's word labels do not overlap
    # the left panel.
    fig.subplots_adjust(wspace=0.8)
    plt.show()

In [None]:
top_words = sorted_contributions['Answered']['supporters'][:10].index.tolist()
top_scores = sorted_contributions['Answered']['supporters'][:10].tolist()
bottom_words = sorted_contributions['Answered']['detractors'][:10].index.tolist()
bottom_scores = sorted_contributions['Answered']['detractors'][:10].tolist()

plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words for relevance")

In [None]:
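## Let's try a simple model
# NOTE(review): this cell repeats, in a single block, the steps already run in
# the cells above (model fit, metrics, plots, LIME helpers); every name defined
# here shadows the earlier definition of the same name.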

# TODO update train_df_rand

X_train, y_train = get_vectorized_inputs_and_label(train_df_rand)

X_test, y_test = get_vectorized_inputs_and_label(test_df_rand)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fit a class-weighted random forest on the vectorized training split.
# random_state pins the bootstrap and feature sampling so the metrics computed
# from this model are reproducible on Restart & Run All (40 is the same seed
# the train/test split uses).
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    oob_score=True,  # keep out-of-bag estimates for the training-set evaluation
    random_state=40,
)
clf.fit(X_train, y_train)

# Hard labels and per-class probabilities for the held-out split.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

y_train.value_counts()

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score a set of predictions against the true labels.

    :param y_test: true labels
    :param y_predicted: predicted labels, aligned with ``y_test``
    :return: tuple ``(accuracy, precision, recall, f1)``; precision, recall
        and f1 are computed with ``average='weighted'``
    """
    # precision: true positives / (true positives + false positives)
    # recall:    true positives / (true positives + false negatives)
    # f1:        harmonic mean of precision and recall
    weighted_kwargs = dict(pos_label=None, average='weighted')
    precision, recall, f1 = [
        scorer(y_test, y_predicted, **weighted_kwargs)
        for scorer in (precision_score, recall_score, f1_score)
    ]

    # accuracy: (true positives + true negatives) / total
    return accuracy_score(y_test, y_predicted), precision, recall, f1



# Training-set performance estimated from out-of-bag predictions (requires the
# forest to have been fitted with oob_score=True).
# Thanks to https://datascience.stackexchange.com/questions/13151/randomforestclassifier-oob-scoring-method
# argmax yields a column position in oob_decision_function_, not a label; map
# it through clf.classes_ so the predictions use the same label values as
# y_train whatever the label encoding is.
y_train_pred = clf.classes_[np.argmax(clf.oob_decision_function_, axis=1)]

accuracy, precision, recall, f1 = get_metrics(y_train, y_train_pred)
print("Training accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
print("Validation accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

from model_evaluation import get_confusion_matrix_plot

get_confusion_matrix_plot(y_predicted, y_test, figsize=(9, 9))
plt.show()

from model_evaluation import get_roc_plot

get_roc_plot(y_predicted_proba[:,1], y_test, figsize=(10,10))
plt.show()


get_roc_plot(y_predicted_proba[:,1], y_test, fpr_bar=.1, figsize=(10,10))
plt.show()

from model_evaluation import get_calibration_plot

get_calibration_plot(y_predicted_proba[:,1], y_test, figsize=(9,9))


from model_evaluation import get_feature_importance

# Names of the hand-crafted feature columns.
feature_names = [
    "action_verb_full",
    "question_mark_full",
    "norm_text_len",
    "language_question",
]

# 300 positional names followed by the hand-crafted feature names.
# NOTE(review): 300 is presumably the word-vector dimensionality, and the order
# is presumably the column order of the model inputs; confirm against
# get_vectorized_inputs_and_label.
w_indices = list(map("word_vector_index_{}".format, range(300))) + feature_names
all_feature_names = np.array(w_indices)




print("Top 5 importances:\n")
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in get_feature_importance(clf, all_feature_names)[:5]]))

print("\nBottom 5 importances:\n")
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in get_feature_importance(clf, all_feature_names)[-5:]]))

## Let's look at most and least successful examples

from model_evaluation import get_top_k
test_analysis_df = test_df_rand.copy()
test_analysis_df["predicted_proba"] = y_predicted_proba[:, 1]
test_analysis_df["true_label"] = y_test

to_display = [
    "predicted_proba",
    "true_label",
    "Title",
    "body_text",
    "text_len",
    "action_verb_full",
    "question_mark_full",
    "language_question",
]
threshold = 0.5


top_pos, top_neg, worst_pos, worst_neg, unsure = get_top_k(test_analysis_df, "predicted_proba", "true_label", k=2)
pd.options.display.max_colwidth = 100

# Most confident correct positive predictions
top_pos[to_display]

# Most confident correct negative predictions
top_neg[to_display]

# Most confident incorrect negative predictions
worst_pos[to_display]

# Most confident incorrect positive predictions
worst_neg[to_display]

# Most unsure questions
unsure[to_display]

from lime.lime_text import LimeTextExplainer

# NOTE(review): `nlp` is not defined anywhere in the visible notebook; it must
# already exist in the kernel. It is used as a callable that takes a string and
# returns an object with a `.vector` attribute.
vector_store = nlp

# Forest trained on the text vectors only, so that it can be queried with raw
# strings by the LIME helpers. random_state keeps the fit, and therefore the
# explanations, reproducible (40 is the same seed the train/test split uses).
clf_text_only = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    oob_score=True,
    random_state=40,
)
X_train_text = np.vstack(train_df_rand["full_text"].apply(lambda x: nlp(x).vector))
X_test_text = np.vstack(test_df_rand["full_text"].apply(lambda x: nlp(x).vector))
clf_text_only.fit(X_train_text, y_train)

def text_pipeline(examples):
    """Vectorize raw strings with `nlp` and score them with `clf_text_only`.

    :param examples: iterable of strings
    :return: result of ``clf_text_only.predict_proba`` on the stacked vectors
    """
    embedded = np.vstack([nlp(text).vector for text in examples])
    return clf_text_only.predict_proba(embedded)


def explain_one_instance(instance, class_names):
    """Return a LIME explanation of `text_pipeline` for a single text.

    :param instance: the text to explain
    :param class_names: class display names handed to LimeTextExplainer
    :return: the explanation object, limited to 6 features
    """
    return LimeTextExplainer(class_names=class_names).explain_instance(
        instance, text_pipeline, num_features=6
    )

def visualize_one_exp(features, labels, index, class_names = ["Unanswered","Answered"]):
    """Explain one example, print its index and true class, and render the result.

    :param features: indexable collection of texts
    :param labels: indexable collection of labels; ``labels[index]`` is used
        as a position into ``class_names``
    :param index: position of the example to explain
    :param class_names: display names, indexed by label
    """
    chosen_text = features[index]
    explanation = explain_one_instance(chosen_text, class_names=class_names)

    print('Index: %d' % index)
    print('True class: %s' % class_names[labels[index]])

    # Renders inline in the notebook, including the highlighted text.
    explanation.show_in_notebook(text=True)

visualize_one_exp(list(test_df_rand["full_text"]), list(y_test), 7)

import random
from collections import defaultdict

random.seed(40)

def get_statistical_explanation(test_set, sample_size, word2vec_pipeline, label_dict):
    """Aggregate LIME word contributions over a random sample of texts.

    For every sampled sentence the predicted class is the argmax of the first
    row returned by ``word2vec_pipeline([sentence])``. LIME is then asked for
    the 6 words contributing most to that class, and the weights are pooled
    per class and per word.

    :param test_set: sequence of texts to sample from
    :param sample_size: number of texts to draw (without replacement)
    :param word2vec_pipeline: callable taking a list of strings; presumably
        returns one row of class probabilities per string
    :param label_dict: mapping from class index to display name
    :return: dict keyed by display name. Each value is a dict with two float
        Series indexed by word: 'detractors' (sorted ascending) and
        'supporters' (sorted descending). Only classes that were predicted for
        at least one sampled text are present.
    :raises ValueError: from ``random.sample`` when ``sample_size`` is larger
        than ``len(test_set)``
    """
    sample_sentences = random.sample(test_set, sample_size)
    explainer = LimeTextExplainer()

    # class index -> word -> list of LIME weights collected for that word
    contributors = defaultdict(lambda: defaultdict(list))

    # First, find contributing words to each class
    for sentence in sample_sentences:
        probabilities = word2vec_pipeline([sentence])
        curr_label = probabilities[0].argmax()
        exp = explainer.explain_instance(sentence, word2vec_pipeline, num_features=6, labels=[curr_label])
        for word, contributing_weight in exp.as_list(label=curr_label):
            contributors[curr_label][word].append(contributing_weight)

    # Average each word's contribution to a class, and sort them by impact
    sorted_contributions = {}
    for curr_label, lexica in contributors.items():
        # The summed weight is divided by the sample size, not by the number
        # of occurrences, so words that appear in more explanations weigh more.
        # Building the Series in one go with an explicit dtype avoids the
        # object-dtype empty Series that pd.Series(index=...) produces.
        average_contributions = pd.Series(
            {word: np.sum(scores) / sample_size for word, scores in lexica.items()},
            dtype=float,
        )
        sorted_contributions[label_dict[curr_label]] = {
            'detractors': average_contributions.sort_values(),
            'supporters': average_contributions.sort_values(ascending=False),
        }
    return sorted_contributions

label_to_text = {
    0: 'Unanswered',
    1: 'Answered',
}
sorted_contributions = get_statistical_explanation(list(test_df_rand["full_text"]), 5, text_pipeline, label_to_text)



def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):
    """Draw two side-by-side horizontal bar charts of word importances.

    The left panel is titled 'Unanswered' and shows the bottom words, the
    right panel is titled 'Answered' and shows the top words. Top words are
    drawn in ascending score order and bottom words in descending score order
    (first bar at the bottom of each panel).

    :param top_scores: scores for ``top_words``
    :param top_words: words for the right-hand panel
    :param bottom_scores: scores for ``bottom_words``
    :param bottom_words: words for the left-hand panel
    :param name: figure title
    """
    top_pairs = sorted(zip(top_words, top_scores), key=lambda pair: pair[1])
    bottom_pairs = sorted(zip(bottom_words, bottom_scores), key=lambda pair: pair[1], reverse=True)

    top_words = [word for word, _ in top_pairs]
    top_scores = [score for _, score in top_pairs]

    bottom_words = [word for word, _ in bottom_pairs]
    bottom_scores = [score for _, score in bottom_pairs]

    fig, (ax_bottom, ax_top) = plt.subplots(1, 2, figsize=(10, 10))
    fig.suptitle(name, fontsize=16)

    panels = (
        (ax_bottom, 'Unanswered', bottom_words, bottom_scores),
        (ax_top, 'Answered', top_words, top_scores),
    )
    for ax, title, words, scores in panels:
        # Bar positions are computed per panel so the two word lists may have
        # different lengths.
        y_pos = np.arange(len(words))
        ax.barh(y_pos, scores, align='center', alpha=0.5)
        ax.set_title(title, fontsize=20)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(words, fontsize=14)
        ax.set_xlabel('Importance', fontsize=20)

    # Extra horizontal space so the right panel's word labels do not overlap
    # the left panel.
    fig.subplots_adjust(wspace=0.8)
    plt.show()

top_words = sorted_contributions['Answered']['supporters'][:10].index.tolist()
top_scores = sorted_contributions['Answered']['supporters'][:10].tolist()
bottom_words = sorted_contributions['Answered']['detractors'][:10].index.tolist()
bottom_scores = sorted_contributions['Answered']['detractors'][:10].tolist()

plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words for relevance")