In [1]:
# Import defaultdict
from collections import defaultdict

# Numpy and pandas for manipulating the data
import numpy as np
import pandas as pd

# Matplotlib and seaborn for visualization
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

# GridSearchCV for training 
from sklearn.model_selection import GridSearchCV

# Performance metrics from sklearn
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Prophet for time forecasting
from fbprophet import Prophet

# Classification models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# To hide stdout because Prophet can be loud
import logging
logging.getLogger('fbprophet').setLevel(logging.WARNING)

In [2]:
feature_file = './data/cleaned/feature.csv'
by_date_total_file = './data/cleaned/by_date_total.csv'
provisions_file = './data/raw/provisions.csv'
useful_provisions_file = './data/cleaned/useful_provisions.csv'

feature_df = pd.read_csv(feature_file, parse_dates=True, index_col=0)
by_date_total_df = pd.read_csv(by_date_total_file, parse_dates=True, index_col=0)
provisions_df = pd.read_csv(provisions_file, parse_dates=True)
useful_provisions_df = pd.read_csv(useful_provisions_file, parse_dates=True, index_col=0)

### Fixing features
There are still a couple of features to be tweaked. Although the data was cleaned in `clean-data` journal, it is more convenient to add the provisions features here, as they are likely to be changed along with changes to the model.

In [3]:
# Add only the useful provisions to our feature_df (k from this year and k from n years prior)
def add_provisions(feature_df, provisions_df, useful_provisions_df, k=30, n=5):
    # Get the state and year columns for a join later and lawtotal to account for excluded provisions
    columns = list(useful_provisions_df.head(k)['provision'].values)
#     columns = list(useful_provisions_df.sort_values('diff').tail(k)['provision'].values)
    columns.extend(['year', 'state', 'lawtotal'])     
    
    # Get the years 
    years = feature_df.groupby('this_year').count().index.values

    # Keep track of provisions for this year and n years prior
    current_provisions = []
    old_provisions = []

    # Add the provisions from each year to a list
    for year in years:
        current_provisions.append(provisions_df[provisions_df['year'] == year][columns])
        old_provisions.append(provisions_df[provisions_df['year'] == year - n][columns])

    # Put the provisions into a DataFrame
    current_provisions = pd.concat(current_provisions)
    old_provisions = pd.concat(old_provisions)
    old_provisions['year'] += n # Match the year which we want to join onto

    # Merge the provisions
    all_provisions = pd.merge(current_provisions, old_provisions, on=['state', 'year'], suffixes=('', '_old'))

    # Add provisions to feature_df and return the new feature_df
    feature_df = pd.merge(feature_df, all_provisions, left_on=['this_year', 'state'], right_on=['year', 'state'])
    return feature_df.drop('year', axis=1)

In [4]:
# Include provision information
feature_df = add_provisions(feature_df, provisions_df, useful_provisions_df, k=10, n=5)

# Exclude states that have a very low average number of deaths. In this case, we choose 2 as our threshold
min_deaths = 2
average_deaths = feature_df.groupby('state')['next_deaths'].mean()
excluded = average_deaths[average_deaths < min_deaths].index
states = average_deaths[average_deaths >= min_deaths].index

# Filter out the states
feature_df = feature_df[~feature_df['state'].isin(excluded)]

# Add the label. The label will be whether gun violence will increase by more than 30% for a given month
feature_df['label'] = (feature_df['deviation'] > 1).astype(int)
# Sort the data chronologically by date, then alphabetically by state name  
feature_df = feature_df.sort_values(['next_date', 'state']).reset_index().drop('index', axis=1)

# Drop the columns directly related to the label
feature_df = feature_df.drop(['rate_change', 'next_deaths', 'deviation'], axis=1)  
# Drop columns regarding year 
feature_df = feature_df.drop(['next_year', 'this_year'], axis=1)

# By dropping null values, we lose the first month because there is no previous date for the first month
feature_df = feature_df.dropna()

# Make placeholders for two additional features: one for predictions of this month, and one for next month
feature_df['this_preds'] = 0
feature_df['next_preds'] = 0

# Finally, make a weekly DataFrame for the time series predictions
resampled_df = by_date_total_df.resample('W').sum()

### Modeling
Now that all the `feature_df` is ready with all of the features and information, it's time to model the data. Here is an image of the pipeline:

In [5]:
# First, define need a function to easily get time series predictions
def get_trend_predictions(resampled_df, date, state, n_periods=14):
    time_series = resampled_df[:date][state]
    time_series = time_series.reset_index()
    time_series.columns = ['ds', 'y']

    # Model the data and make predictions
    model = Prophet(yearly_seasonality=True)
    model = model.fit(time_series)
    future = model.make_future_dataframe(periods=n_periods)
    output = model.predict(future)[['ds', 'yhat']]

    # Resample monthly and add the state as a column
    output = output.set_index('ds').resample('M').sum()

    # Get predictions for this month and next month
    this_output = output[:date].reset_index()
    this_output.columns = ['this_date', 'this_preds']
    next_output = output.reset_index().shift(-1).dropna()
    next_output.columns = ['next_date', 'next_preds']

    return this_output, next_output

# Update trend predictions up to the current observation's state
def update_trends(feature_df, resampled_df, date, states): 
    for state in states:
        this_trend_preds, next_trend_preds = get_trend_predictions(resampled_df, this_date, state)
        to_update = (feature_df['state'] == state) & (feature_df['this_date'] <= this_date)
        feature_df.loc[to_update, 'this_preds'] = this_trend_preds['this_preds'].values
        feature_df.loc[to_update, 'next_preds'] = next_trend_preds['next_preds'].values
    
    return feature_df.dropna()

In [6]:
# These are columns unnecessary classification models;drop them when feeding features to the models
extra_columns = ['this_date', 'next_date', 'state', 'label']
dates = feature_df['next_date'].unique()

# Make a list w/ pairs of models and parameters to look through when doing GridSearchCV

# Parameters for XGBClassifier
xgb_params = {
  'max_depth': [3, 5, 7, 9], 
  'n_estimators': [30, 50, 100, 300]
}

# Parameters for LogisitcRegression
logi_regr_params = {
    'penalty': ['l1', 'l2'],
    'C': [1e-2, 1e-1, 1, 10, 1e3, 1e5]
}

# Parameters for RandomForest
random_forest_params = {
  'max_depth': [3, 5, 7, 9],
  'n_estimators': [30, 50, 100, 300]
}

# Parameters for AdaBoost
adaboost_params = {
  'n_estimators': [30, 50, 100, 300]
}

# Parameters for GaussianNB
percent_positive = feature_df['label'].mean() # Percentage of positive labels
percent_negative = 1 - percent_positive # Percentage of negative features 
bayes_params = {'priors': [None, [percent_negative, percent_positive]]}

# model{ 'model name': (model_object, parameters) } 
models = {
    'XGBoost': (XGBClassifier(), xgb_params), 
    'Logistic Reg': (LogisticRegression(), logi_regr_params),
    'Random Forest': (RandomForestClassifier(), random_forest_params),
    'AdaBoost': (AdaBoostClassifier(), adaboost_params),
    'Gaussian NB': (GaussianNB(), bayes_params)   
}

Train each model on 2014 and 2015, then make predictions on every month for 2016

In [7]:
# Start at 2016-01-31 and stop (before) 2017-01-31 
start = np.where(dates == '2016-01-31')[0][0]
end = np.where(dates == '2017-01-31')[0][0]

training_history = defaultdict(list)
testing_history = defaultdict(list)
testing_history_probs = defaultdict(list)
trained_models = defaultdict(dict)
for index in range(start, end):
    this_date = dates[index - 1]
    next_date = dates[index]
    
    print("Updating trends for with this_date {}...".format(this_date))
    feature_df = update_trends(feature_df, resampled_df, this_date, states) 

    ### CUT ####

    for state in states:
        # Training data is all data before next_date
        # Testing data all data during next_date
        state_filter = feature_df['state'] == state
        train_date_filter = feature_df['next_date'] < next_date
        test_date_filter = feature_df['next_date'] == next_date
        train_filter = train_date_filter & state_filter
        test_filter = test_date_filter & state_filter
        
        X_train = feature_df.loc[train_filter].drop(extra_columns, axis=1).values
        y_train = feature_df.loc[train_filter, 'label']

        X_test = feature_df.loc[test_filter].drop(extra_columns, axis=1).values
        y_test = feature_df.loc[test_filter, 'label']

        meta_train = []
        meta_test = []
        for name, (model, parameters) in models.items():
            print("Training {} | next_date: {}... ".format(name, next_date))
            clf = GridSearchCV(model, parameters)
            clf.fit(X_train, y_train)

            # Make predictions on training set
            train_preds = clf.predict(X_train)
            test_preds = clf.predict(X_test)
            train_probs = clf.best_estimator_.predict_proba(X_train)[:, 0]
            test_probs = clf.best_estimator_.predict_proba(X_test)[:, 0]

            # Make meta features to train the meta model on
            meta_train.append(train_probs)
            meta_test.append(test_probs)

            # Keep track of the predictions
            training_history[name].append(train_preds)
            testing_history[name].extend(test_preds)
            testing_history_probs[name].extend(test_probs)

            # Remember the last model
            trained_models[name][state] = clf

        # Take transpose of meta features so that observations are rows
        meta_train = np.array(meta_train).T
        meta_test = np.array(meta_test).T

        # Create and train the meta model
        clf = GridSearchCV(XGBClassifier(), xgb_params)
        clf.fit(meta_train, y_train)

        # Make training and testing predictions
        train_preds = clf.predict(meta_train)
        test_preds = clf.predict(meta_test)

        # Keep track of the predictions
        training_history['meta'].append(train_preds)
        testing_history['meta'].extend(test_preds)

Updating trends for with this_date 2015-12-31...
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 20

Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 2016-01-31... 
Training XGBoost | next_date: 2016-01-31... 
Training Logistic Reg | next_date: 2016-01-31... 
Training Random Forest | next_date: 2016-01-31... 
Training AdaBoost | next_date: 2016-01-31... 
Training Gaussian NB | next_date: 20

Training AdaBoost | next_date: 2016-02-29... 
Training Gaussian NB | next_date: 2016-02-29... 
Training XGBoost | next_date: 2016-02-29... 
Training Logistic Reg | next_date: 2016-02-29... 
Training Random Forest | next_date: 2016-02-29... 
Training AdaBoost | next_date: 2016-02-29... 
Training Gaussian NB | next_date: 2016-02-29... 
Training XGBoost | next_date: 2016-02-29... 
Training Logistic Reg | next_date: 2016-02-29... 
Training Random Forest | next_date: 2016-02-29... 
Training AdaBoost | next_date: 2016-02-29... 
Training Gaussian NB | next_date: 2016-02-29... 
Training XGBoost | next_date: 2016-02-29... 
Training Logistic Reg | next_date: 2016-02-29... 
Training Random Forest | next_date: 2016-02-29... 
Training AdaBoost | next_date: 2016-02-29... 
Training Gaussian NB | next_date: 2016-02-29... 
Training XGBoost | next_date: 2016-02-29... 
Training Logistic Reg | next_date: 2016-02-29... 
Training Random Forest | next_date: 2016-02-29... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-03-31... 
Training Gaussian NB | next_date: 2016-03-31... 
Training XGBoost | next_date: 2016-03-31... 
Training Logistic Reg | next_date: 2016-03-31... 
Training Random Forest | next_date: 2016-03-31... 
Training AdaBoost | next_date: 2016-03-31... 
Training Gaussian NB | next_date: 2016-03-31... 
Training XGBoost | next_date: 2016-03-31... 
Training Logistic Reg | next_date: 2016-03-31... 
Training Random Forest | next_date: 2016-03-31... 
Training AdaBoost | next_date: 2016-03-31... 
Training Gaussian NB | next_date: 2016-03-31... 
Training XGBoost | next_date: 2016-03-31... 
Training Logistic Reg | next_date: 2016-03-31... 
Training Random Forest | next_date: 2016-03-31... 
Training AdaBoost | next_date: 2016-03-31... 
Training Gaussian NB | next_date: 2016-03-31... 
Training XGBoost | next_date: 2016-03-31... 
Training Logistic Reg | next_date: 2016-03-31... 
Training Random Forest | next_date: 2016-03-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Training XGBoost | next_date: 2016-04-30... 
Training Logistic Reg | next_date: 2016-04-30... 
Training Random Forest | next_date: 2016-04-30... 
Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Training XGBoost | next_date: 2016-04-30... 
Training Logistic Reg | next_date: 2016-04-30... 
Training Random Forest | next_date: 2016-04-30... 
Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Training XGBoost | next_date: 2016-04-30... 
Training Logistic Reg | next_date: 2016-04-30... 
Training Random Forest | next_date: 2016-04-30... 
Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Training XGBoost | next_date: 2016-04-30... 
Training Logistic Reg | next_date: 2016-04-30... 
Training Random Forest | next_date: 2016-04-30... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Training XGBoost | next_date: 2016-04-30... 
Training Logistic Reg | next_date: 2016-04-30... 
Training Random Forest | next_date: 2016-04-30... 
Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Training XGBoost | next_date: 2016-04-30... 
Training Logistic Reg | next_date: 2016-04-30... 
Training Random Forest | next_date: 2016-04-30... 
Training AdaBoost | next_date: 2016-04-30... 
Training Gaussian NB | next_date: 2016-04-30... 
Updating trends for with this_date 2016-04-30...
Training XGBoost | next_date: 2016-05-31... 
Training Logistic Reg | next_date: 2016-05-31... 
Training Random Forest | next_date: 2016-05-31... 
Training AdaBoost | next_date: 2016-05-31... 
Training Gaussian NB | next_date: 2016-05-31... 
Training XGBoost | next_date: 2016-05-31... 
Training Logistic Reg | next_date: 2016-05-31... 
Training Random Forest | next_date: 20

Training AdaBoost | next_date: 2016-05-31... 
Training Gaussian NB | next_date: 2016-05-31... 
Training XGBoost | next_date: 2016-05-31... 
Training Logistic Reg | next_date: 2016-05-31... 
Training Random Forest | next_date: 2016-05-31... 
Training AdaBoost | next_date: 2016-05-31... 
Training Gaussian NB | next_date: 2016-05-31... 
Training XGBoost | next_date: 2016-05-31... 
Training Logistic Reg | next_date: 2016-05-31... 
Training Random Forest | next_date: 2016-05-31... 
Training AdaBoost | next_date: 2016-05-31... 
Training Gaussian NB | next_date: 2016-05-31... 
Training XGBoost | next_date: 2016-05-31... 
Training Logistic Reg | next_date: 2016-05-31... 
Training Random Forest | next_date: 2016-05-31... 
Training AdaBoost | next_date: 2016-05-31... 
Training Gaussian NB | next_date: 2016-05-31... 
Training XGBoost | next_date: 2016-05-31... 
Training Logistic Reg | next_date: 2016-05-31... 
Training Random Forest | next_date: 2016-05-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-06-30... 
Training Gaussian NB | next_date: 2016-06-30... 
Training XGBoost | next_date: 2016-06-30... 
Training Logistic Reg | next_date: 2016-06-30... 
Training Random Forest | next_date: 2016-06-30... 
Training AdaBoost | next_date: 2016-06-30... 
Training Gaussian NB | next_date: 2016-06-30... 
Training XGBoost | next_date: 2016-06-30... 
Training Logistic Reg | next_date: 2016-06-30... 
Training Random Forest | next_date: 2016-06-30... 
Training AdaBoost | next_date: 2016-06-30... 
Training Gaussian NB | next_date: 2016-06-30... 
Training XGBoost | next_date: 2016-06-30... 
Training Logistic Reg | next_date: 2016-06-30... 
Training Random Forest | next_date: 2016-06-30... 
Training AdaBoost | next_date: 2016-06-30... 
Training Gaussian NB | next_date: 2016-06-30... 
Training XGBoost | next_date: 2016-06-30... 
Training Logistic Reg | next_date: 2016-06-30... 
Training Random Forest | next_date: 2016-06-30... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-07-31... 
Training Gaussian NB | next_date: 2016-07-31... 
Training XGBoost | next_date: 2016-07-31... 
Training Logistic Reg | next_date: 2016-07-31... 
Training Random Forest | next_date: 2016-07-31... 
Training AdaBoost | next_date: 2016-07-31... 
Training Gaussian NB | next_date: 2016-07-31... 
Training XGBoost | next_date: 2016-07-31... 
Training Logistic Reg | next_date: 2016-07-31... 
Training Random Forest | next_date: 2016-07-31... 
Training AdaBoost | next_date: 2016-07-31... 
Training Gaussian NB | next_date: 2016-07-31... 
Training XGBoost | next_date: 2016-07-31... 
Training Logistic Reg | next_date: 2016-07-31... 
Training Random Forest | next_date: 2016-07-31... 
Training AdaBoost | next_date: 2016-07-31... 
Training Gaussian NB | next_date: 2016-07-31... 
Training XGBoost | next_date: 2016-07-31... 
Training Logistic Reg | next_date: 2016-07-31... 
Training Random Forest | next_date: 2016-07-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-08-31... 
Training Gaussian NB | next_date: 2016-08-31... 
Training XGBoost | next_date: 2016-08-31... 
Training Logistic Reg | next_date: 2016-08-31... 
Training Random Forest | next_date: 2016-08-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-09-30... 
Training Gaussian NB | next_date: 2016-09-30... 
Training XGBoost | next_date: 2016-09-30... 
Training Logistic Reg | next_date: 2016-09-30... 
Training Random Forest | next_date: 2016-09-30... 
Training AdaBoost | next_date: 2016-09-30... 
Training Gaussian NB | next_date: 2016-09-30... 
Training XGBoost | next_date: 2016-09-30... 
Training Logistic Reg | next_date: 2016-09-30... 
Training Random Forest | next_date: 2016-09-30... 
Training AdaBoost | next_date: 2016-09-30... 
Training Gaussian NB | next_date: 2016-09-30... 
Training XGBoost | next_date: 2016-09-30... 
Training Logistic Reg | next_date: 2016-09-30... 
Training Random Forest | next_date: 2016-09-30... 
Training AdaBoost | next_date: 2016-09-30... 
Training Gaussian NB | next_date: 2016-09-30... 
Training XGBoost | next_date: 2016-09-30... 
Training Logistic Reg | next_date: 2016-09-30... 
Training Random Forest | next_date: 2016-09-30... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-10-31... 
Training Gaussian NB | next_date: 2016-10-31... 
Training XGBoost | next_date: 2016-10-31... 
Training Logistic Reg | next_date: 2016-10-31... 
Training Random Forest | next_date: 2016-10-31... 
Training AdaBoost | next_date: 2016-10-31... 
Training Gaussian NB | next_date: 2016-10-31... 
Training XGBoost | next_date: 2016-10-31... 
Training Logistic Reg | next_date: 2016-10-31... 
Training Random Forest | next_date: 2016-10-31... 
Training AdaBoost | next_date: 2016-10-31... 
Training Gaussian NB | next_date: 2016-10-31... 
Training XGBoost | next_date: 2016-10-31... 
Training Logistic Reg | next_date: 2016-10-31... 
Training Random Forest | next_date: 2016-10-31... 
Training AdaBoost | next_date: 2016-10-31... 
Training Gaussian NB | next_date: 2016-10-31... 
Training XGBoost | next_date: 2016-10-31... 
Training Logistic Reg | next_date: 2016-10-31... 
Training Random Forest | next_date: 2016-10-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-11-30... 
Training Gaussian NB | next_date: 2016-11-30... 
Training XGBoost | next_date: 2016-11-30... 
Training Logistic Reg | next_date: 2016-11-30... 
Training Random Forest | next_date: 2016-11-30... 
Training AdaBoost | next_date: 2016-11-30... 
Training Gaussian NB | next_date: 2016-11-30... 
Training XGBoost | next_date: 2016-11-30... 
Training Logistic Reg | next_date: 2016-11-30... 
Training Random Forest | next_date: 2016-11-30... 
Training AdaBoost | next_date: 2016-11-30... 
Training Gaussian NB | next_date: 2016-11-30... 
Training XGBoost | next_date: 2016-11-30... 
Training Logistic Reg | next_date: 2016-11-30... 
Training Random Forest | next_date: 2016-11-30... 
Training AdaBoost | next_date: 2016-11-30... 
Training Gaussian NB | next_date: 2016-11-30... 
Training XGBoost | next_date: 2016-11-30... 
Training Logistic Reg | next_date: 2016-11-30... 
Training Random Forest | next_date: 2016-11-30... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-

Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-12-31... 
Training Gaussian NB | next_date: 2016-12-31... 
Training XGBoost | next_date: 2016-12-31... 
Training Logistic Reg | next_date: 2016-12-31... 
Training Random Forest | next_date: 2016-12-31... 
Training AdaBoost | next_date: 2016-

In [8]:
all_probs = []
for v in testing_history_probs.values():
    all_probs.append(v)
    
all_probs = np.array(all_probs).T
vote_by_probs = [int(x > 0.5) for x in all_probs.mean(axis=1)]

all_preds = []
for v in testing_history.values():
    all_preds.append(v)
    
all_preds = np.array(all_preds).T
vote_by_preds = [int(x > 0.5) for x in all_preds.mean(axis=1)]

testing_history['vote_by_probs'] = vote_by_probs
testing_history['vote_by_preds'] = vote_by_preds

In [9]:
truth = feature_df[(feature_df['next_date'] >= '2016-01-31') & (feature_df['next_date'] < '2017-01-31')]['label']
for name, preds in testing_history.items():
    plt.plot()
    print("{}: {} ".format(name, accuracy_score(truth, preds)))
    cm = confusion_matrix(truth, preds)
    tn, fp, fn, tp = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
#     print("True Negative: {}".format(tn))
#     print("False Positive: {}".format(fp))
#     print("False Negative: {}".format(fn))
#     print("True Positive: {}".format(tp))
    print("Recall: {}**".format(recall))
    print("Precision: {}".format(precision))
    print(cm)
    print('-'*10)

XGBoost: 0.5096899224806202 
Recall: 0.7018867924528301**
Precision: 0.5166666666666667
[[ 77 174]
 [ 79 186]]
----------
Logistic Reg: 0.5426356589147286 
Recall: 0.6716981132075471**
Precision: 0.5443425076452599
[[102 149]
 [ 87 178]]
----------
Random Forest: 0.5523255813953488 
Recall: 0.5811320754716981**
Precision: 0.5620437956204379
[[131 120]
 [111 154]]
----------
AdaBoost: 0.5310077519379846 
Recall: 0.6641509433962264**
Precision: 0.5349544072948328
[[ 98 153]
 [ 89 176]]
----------
Gaussian NB: 0.560077519379845 
Recall: 0.6377358490566037**
Precision: 0.5633333333333334
[[120 131]
 [ 96 169]]
----------
meta: 0.5329457364341085 
Recall: 0.6264150943396226**
Precision: 0.538961038961039
[[109 142]
 [ 99 166]]
----------
vote_by_probs: 0.44573643410852715 
Recall: 0.30943396226415093**
Precision: 0.44324324324324327
[[148 103]
 [183  82]]
----------
vote_by_preds: 0.5523255813953488 
Recall: 0.6150943396226415**
Precision: 0.5582191780821918
[[122 129]
 [102 163]]
---------

In [10]:
importances_df = pd.DataFrame()
importances_df['feature'] = feature_df.drop(extra_columns, axis=1).columns

for state, model in trained_models['XGBoost'].items():
    importances_df[state] = model.best_estimator_.feature_importances_
    
importances_df['avg'] = importances_df.mean(axis=1)
importances_df.sort_values('avg', ascending=False)[['feature', 'avg']].head(10)

Unnamed: 0,feature,avg
88,next_preds,0.408553
87,this_preds,0.307142
62,this_deaths,0.201221
0,last_year,0.039232
2,violent_crime,0.023795
4,murder_crime,0.008618
1,population,0.004724
14,rape_crime_old,0.002643
6,robbery_crime,0.001959
5,rape_crime,0.001289


In [11]:
importances_df = pd.DataFrame()
importances_df['feature'] = feature_df.drop(extra_columns, axis=1).columns

for state, model in trained_models['Logistic Reg'].items():
    importances_df[state] = np.abs(model.best_estimator_.coef_[0])
    
importances_df['avg'] = importances_df.mean(axis=1)
importances_df.sort_values('avg', ascending=False)[['feature', 'avg']].head(20)

Unnamed: 0,feature,avg
88,next_preds,0.178653
62,this_deaths,0.061333
87,this_preds,0.053559
70,mcdvdating,0.023414
23,wine,0.016045
57,cocaine_diff,0.014816
61,depression_diff,0.013246
79,statechecksh_old,0.011193
67,violentpartial,0.010769
25,alcohol_consumed,0.01073


In [12]:
probs = testing_history_probs['XGBoost']
cap_df = pd.DataFrame(truth)
cap_df['probs'] = probs
cap_df = cap_df.sort_values('probs')
cap_df['cap'] = cap_df['label'].cumsum() / cap_df['label'].sum()
cap_df.head(258).tail(5)

Unnamed: 0,label,probs,cap
1353,1,0.333563,0.501887
1391,0,0.335674,0.501887
1222,1,0.335752,0.50566
1149,0,0.337343,0.50566
1378,0,0.337817,0.50566


In [14]:
feature_df

Unnamed: 0,next_date,state,last_year,population,violent_crime,property_crime,murder_crime,rape_crime,robbery_crime,assault_crime,...,permith_old,mcdvdating_old,age21handgunsale_old,permitlaw_old,cap16_old,cap14_old,lawtotal_old,label,this_preds,next_preds
43,2014-02-28,Alabama,2012,4827660,20834,161835,346,1449,4645,13788,...,0,0,0,0,0,0,11,0,29.648398,22.333456
44,2014-02-28,Alaska,2012,736760,4709,21211,34,657,623,3127,...,0,0,0,0,0,0,5,0,2.162706,4.293674
45,2014-02-28,Arizona,2012,6616124,27576,223294,355,2344,6656,17391,...,0,1,0,0,0,0,13,0,11.279431,13.491958
46,2014-02-28,Arkansas,2012,2956780,13705,106477,158,1135,2261,9796,...,0,0,0,0,0,0,13,0,12.292441,10.061779
47,2014-02-28,California,2012,38347383,154739,1018907,1746,7464,53640,89029,...,1,1,1,0,1,1,93,0,98.063317,97.382315
48,2014-02-28,Colorado,2012,5262556,16099,139974,174,2198,3136,9826,...,0,0,0,0,0,0,17,1,10.853133,10.416974
49,2014-02-28,Connecticut,2012,3602470,9439,71274,91,647,3551,4906,...,1,1,1,1,1,1,70,0,3.258777,3.092879
50,2014-02-28,Delaware,2012,925114,4633,29001,41,278,1233,2960,...,0,0,1,0,1,1,26,0,6.213564,3.348792
51,2014-02-28,Florida,2012,19584927,91993,607170,972,4765,23200,61054,...,0,0,0,0,1,1,21,0,70.844627,64.604595
52,2014-02-28,Georgia,2012,9981773,37519,339808,563,2022,12704,21361,...,0,0,0,0,0,0,11,0,35.765349,38.814514
