# Inspect feature importance

A good way to diagnose a model is to look at which features it relies on most. First, we load the data and the trained model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import sys
sys.path.append("..")
from ml_editor.data_processing import format_raw_df

# Read the raw CSV, then pass a copy of it through format_raw_df.
# `df` ends up holding the formatted frame; the raw frame is not kept.
data_path = Path('../data/writers.csv')
df = format_raw_df(pd.read_csv(data_path).copy())

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
from ml_editor.data_processing import get_split_by_author, get_vectorized_inputs_and_label, add_features_to_df

# Keep only the rows flagged by the "is_question" column, then generate
# features on a copy of that subset.
is_question_mask = df["is_question"]
df = add_features_to_df(
    df.loc[is_question_mask].copy(),
    pretrained_vectors=True,
)

# Fixed random_state keeps the train/test partition reproducible across runs.
train_df, test_df = get_split_by_author(
    df,
    test_size=0.2,
    random_state=40,
)

In [3]:
# Vectorize both splits with the same helper: train first, then test.
(X_train, y_train), (X_test, y_test) = [
    get_vectorized_inputs_and_label(split_df)
    for split_df in (train_df, test_df)
]

In [4]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# only fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model_path = Path("../models/model_1.pkl")
clf = joblib.load(model_path)

# Hard class predictions and per-class probabilities on the held-out split.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

Now let's look at the most and least important features.

In [5]:
from ml_editor.model_evaluation import get_feature_importance

# Names of the hand-crafted features.
# NOTE(review): the order here is presumably the column order produced by
# get_vectorized_inputs_and_label — confirm before changing it.
feature_names = [
    "action_verb_full",
    "question_mark_full",
    "norm_text_len",
    "language_question",
]

# 300 placeholder names (word_vector_index_0 .. word_vector_index_299),
# followed by the hand-crafted feature names.
w_indices = ["word_vector_index_{}".format(dim) for dim in range(300)] + feature_names
all_feature_names = np.array(w_indices)



In [6]:

# Compute the ranking once and reuse it for both listings, instead of calling
# get_feature_importance twice with identical arguments.
# NOTE(review): the "Top"/"Bottom" labels assume the result is ordered from most
# to least important — confirm against get_feature_importance.
ranked_importances = get_feature_importance(clf, all_feature_names)

print("Top 5 importances:\n")
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in ranked_importances[:5]]))

print("\nBottom 5 importances:\n")
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in ranked_importances[-5:]]))

Top 5 importances:

norm_text_len: 0.0065
word_vector_index_160: 0.0052
word_vector_index_122: 0.0044
word_vector_index_90: 0.0044
word_vector_index_267: 0.0043

Bottom 5 importances:

word_vector_index_106: 0.0026
word_vector_index_137: 0.0024
action_verb_full: 0.00046
question_mark_full: 0.00035
language_question: 0.00011


The word vector indices aren't easy to interpret on their own, but we can see that the text length (`norm_text_len`) is the most important feature. On the other hand, our other generated features (`action_verb_full`, `question_mark_full`, and `language_question`) end up being the least important.