In this notebook, we will cover three ways to use a model to generate recommendations:

- Model score
- Global feature importance
- Local feature importance

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import sys

# Make the parent directory importable so the local `ml_editor` package
# imported below can be resolved (assumes the notebook is run from a
# subdirectory of the project root -- confirm against the repo layout).
sys.path.append("..")

from ml_editor.data_processing import format_raw_df

# Read the CSV into a DataFrame; judging by the file name it already holds
# the generated feature columns used further down.
data_path = Path("../data/writers_with_features.csv")
df = pd.read_csv(data_path)

In [2]:
# scikit-learn deprecated its vendored `sklearn.externals.joblib` in 0.21 and
# removed it in 0.23. Prefer the standalone `joblib` package (a scikit-learn
# dependency) and only fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
clf = joblib.load(Path("../models/model_3.pkl"))

In [5]:
from ml_editor.data_processing import get_split_by_author

# Split the loaded frame into train and test portions using the project's
# helper. Presumably this keeps each author's rows within a single split
# (verify in ml_editor.data_processing); random_state is pinned.
train_df, test_df = get_split_by_author(
    df,
    test_size=0.2,
    random_state=40,
)

In [6]:
from ml_editor.data_processing import get_vectorized_series, get_feature_vector_and_label
from ml_editor.model_v2 import POS_NAMES



# Columns fed to the model: the hand-listed count/length/polarity columns
# followed by every key of POS_NAMES (presumably one column per
# part-of-speech tag -- confirm in ml_editor.model_v2).
features = [
    "num_questions",
    "num_periods",
    "num_commas",
    "num_exclam",
    "num_quotes",
    "num_colon",
    "num_stops",
    "num_semicolon",
    "num_words",
    "num_chars",
    "num_diff_words",
    "avg_word_len",
    "polarity",
] + list(POS_NAMES.keys())

# Boolean target: True where a row's Score is strictly above the median Score
# of the test split itself.
y_test = test_df["Score"].gt(test_df["Score"].median())

# Feature matrix for the test split, restricted to the selected columns and
# cast to float.
X_test = test_df.loc[:, features].astype(float)

## Using feature values

In [16]:
# Build a frame holding only the boolean label and the selected feature
# columns. The label is True where Score is strictly above the median Score
# of the full frame `df` (train and test rows together).
features_and_labels = (
    df.assign(label=df["Score"] > df["Score"].median())
    .loc[:, ["label", *features]]
    .copy()
)

In [17]:
# Average every feature within each label group, rounded to 3 decimals.
class_feature_values = features_and_labels.groupby("label").mean().round(3)

# Display with features as rows and the label values (False / True) as
# columns, so the two classes can be compared side by side.
class_feature_values.T

label,False,True
num_questions,0.432,0.409
num_periods,0.814,0.754
num_commas,0.673,0.728
num_exclam,0.019,0.015
num_quotes,0.216,0.199
num_colon,0.094,0.081
num_stops,10.537,10.61
num_semicolon,0.013,0.014
num_words,21.638,21.48
num_chars,822.104,967.032
