# The top k approach

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import sys
sys.path.append("..")

from ml_editor.data_processing import format_raw_df

# Location of the raw CSV, relative to the notebook's working directory.
data_path = Path('../data/writers.csv')
# Read the file and pass a copy of the parsed frame to the project's
# format_raw_df helper; `df` is whatever that helper returns.
df = format_raw_df(pd.read_csv(data_path).copy())

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
from ml_editor.data_processing import get_split_by_author, get_vectorized_inputs_and_label, add_features_to_df

# Keep only the rows selected by the "is_question" column (copied so the
# helper does not receive a view of the original frame), then let
# add_features_to_df build its feature columns.
df = add_features_to_df(
    df.loc[df["is_question"]].copy(),
    pretrained_vectors=True,
)
# Fixed random_state so the split is reproducible. Judging by the helper's
# name the split is grouped by author — confirm in ml_editor.data_processing.
train_df, test_df = get_split_by_author(
    df,
    test_size=0.2,
    random_state=40,
)

In [3]:
# Run the same vectorization helper over both splits (train first, then test)
# and unpack each (inputs, label) pair.
(X_train, y_train), (X_test, y_test) = map(
    get_vectorized_inputs_and_label, (train_df, test_df)
)

In [4]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn,
# so importing it fails on current installations. Prefer the standalone joblib
# package and only fall back to the vendored copy on old environments where
# joblib is not installed separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
model_path = Path("../models/model_1.pkl")
clf = joblib.load(model_path)

# Hard predictions and predicted probabilities for the test inputs.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

In [5]:
from ml_editor.model_evaluation import get_top_k

# Build a copy of the test rows with the model score and the ground-truth
# label attached, so predictions can be read next to the text and features.
# Column 1 of the predict_proba output is presumably the positive-class
# probability — confirm against clf.classes_.
test_analysis_df = test_df.assign(
    predicted_proba=y_predicted_proba[:, 1],
    true_label=y_test,
)

# Columns selected when displaying each group of examples.
to_display = [
    "predicted_proba", "true_label",
    "Title", "body_text", "text_len",
    "action_verb_full", "question_mark_full", "language_question",
]
threshold = 0.5  # not used by any cell visible here

# Ask get_top_k for k=2 rows per group, keyed on the score and label columns.
(top_pos, top_neg, worst_pos, worst_neg, unsure) = get_top_k(
    test_analysis_df, "predicted_proba", "true_label", k=2
)
# Widen text columns so titles and bodies are readable in the tables.
pd.set_option("display.max_colwidth", 100)

In [6]:
# Correct positive predictions the model was most confident about
# (true label positive, high predicted probability).
top_pos.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
38394,0.82,True,Are chapters with a single character inherently more difficult for an average reader to connect ...,On the topic of keeping a reader engaged:\nDialog is a great way to balance out a scene of descr...,511,True,True,False
24729,0.79,True,"Can I use ""fuck"" as a non-vulgar verb in a fantasy/steampunk world?",I've been sending my fourth-ish novel through the my writing group. It is about a trio of teenag...,397,True,True,False


In [7]:
# Correct negative predictions the model was most confident about
# (true label negative, low predicted probability).
top_neg.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7878,0.22,False,"When quoting a person's informal speech, how much liberty do you have to make changes to what th...","Even during a formal interview for a news article, people speak informally. They say ""uhm"", they...",116,True,True,False
8204,0.24,False,Separate paragraphs without line breaks,"I have a medium which does not have line breaks, and a few paragraphs of text. How can I separat...",217,True,True,False


In [8]:
# Most confident false negatives: the true label is positive but the
# model assigned a low predicted probability.
worst_pos.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18613,0.22,True,"Addressing ""logo-ification"" of an organization's name in their literature",I need help finding some style rules to address an issue with a client. I'm working with an orga...,230,False,True,False
19509,0.27,True,How to copyright a book without lawyer and outside USA?,I would like to publish an ebook with amazon and i dont have time/money to keep copyrights with ...,56,True,True,False


In [9]:
# Most confident false positives: the true label is negative but the
# model assigned a high predicted probability.
worst_neg.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
39613,0.79,False,How to write an interview-style story without it being an infodump?,"Inspired by the Underworld setting where vampires slept in steampunk-styled sarcophagi, slowly d...",489,True,True,False
4107,0.75,False,Single character POV vs. two POVs - how to decide?,"I'm starting to look at my next novel, and I'm trying to decide whether I should tell it from on...",281,True,True,False


In [10]:
# Questions the model was least sure about: predicted probability
# closest to 0.5, whatever the true label.
unsure.loc[:, to_display]

Unnamed: 0_level_0,predicted_proba,true_label,Title,body_text,text_len,action_verb_full,question_mark_full,language_question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8271,0.5,True,About the Author description,"Whether we like it or not, the ""About the Author"" description is a marketing tool. It contains s...",149,False,True,False
8306,0.5,False,Publishing price comparisons. Is it allowed?,I'm currently in the process of writing a small localised book on arranging funerals yourself (D...,166,False,True,False
