In [1]:
import sys, os
sys.path.insert(0, '/home/jgutman/mvesc/ETL')
from mvesc_utility_functions import *
import pandas as pd

In [2]:
import re
import numpy as np

In [93]:
import pickle

In [10]:
def make_df_categorical(raw_data):
    """Convert every column of ``raw_data`` to a categorical representation.

    The frame is modified in place and the same object is returned.

    * object/bool columns become ``category`` dtype; a column left with fewer
      than two categories is dropped unless it is one of the label columns
      (``predicted_label``, ``true_label``, ``correct``).
    * numeric columns with fewer than two distinct values (NaN counts as a
      value) are dropped; columns with exactly two distinct values, one of
      them NaN, become an "is missing" categorical; every other numeric
      column is cut into at most five equal-width bins.
    """
    label_columns = ('predicted_label', 'true_label', 'correct')

    # Snapshots taken up front so that dropping columns from raw_data while
    # looping does not disturb the iteration.
    text_like = raw_data.select_dtypes(include=[object, bool])
    number_like = raw_data.select_dtypes(include=[np.number])

    for name in text_like.columns:
        raw_data[name] = text_like[name].astype('category')
        level_count = len(raw_data[name].cat.categories)
        if level_count < 2 and name not in label_columns:
            raw_data.drop(name, axis=1, inplace=True)

    for name in number_like.columns:
        column = number_like[name]
        distinct_count = len(column.unique())

        if distinct_count < 2:
            # Constant column: carries no information for a crosstab.
            raw_data.drop(name, axis=1, inplace=True)
            continue

        if distinct_count == 2 and column.isnull().sum() > 0:
            # One real value plus NaN: the only signal is missingness.
            raw_data[name] = column.isnull().astype('category')
            continue

        raw_data[name] = pd.cut(column, bins=min(5, distinct_count),
                                precision=1)

    return raw_data

In [4]:
def loop_through_top_models(optimization_criteria):
    """Build feature crosstabs for the best model of each (model_name, label).

    Parameters
    ----------
    optimization_criteria : list of str
        Column names of ``model.reports``. All of them are selected; the
        first one is the metric used (descending) to rank models.
        NOTE(review): the names are interpolated into the SQL text with
        ``str.format`` -- only pass trusted column names.

    Returns
    -------
    dict
        Maps ``(filename, split)`` -- with ``split`` in ``{'val', 'test'}`` --
        to whatever ``build_crosstabs`` returns for that model's predictions
        joined with its feature tables.

    Raises
    ------
    IndexError
        If ``optimization_criteria`` is empty.
    """
    with postgres_pgconnection_generator() as connection:
        with connection.cursor() as cursor:
            # Rank the reports within each (model_name, label) by the first
            # criterion and keep only the first row of each group.
            top_models_query = """
        create temporary table top_models as
        select distinct on (model_name, label)
        * from
            (select model_name, filename, label, feature_categories,
            feature_grades, {criteria},
            rank() over (partition by (model_name, label)
                order by {ranker} desc) as val_rank
            from model.reports
            where debug=false
                and feature_categories like 'snapshots,%'
                and cv_scheme = 'leave_cohort_out'
            order by model_name, label, val_rank) vr
        order by model_name, label, val_rank;
        """.format(criteria = ", ".join(optimization_criteria),
            ranker = optimization_criteria[0])
            cursor.execute(top_models_query)

            cursor.execute("""select filename, feature_categories,
                feature_grades from top_models;""")
            models_and_features = cursor.fetchall()
            crosstabs_by_model_and_feature = dict()
            print('done grabbing models')

            for (table_name, feature_tables, feature_grade_range) \
                    in models_and_features:
                print(table_name)
                # NOTE(review): assumes both fields are stored as
                # ", "-separated lists -- confirm against model.reports.
                feature_table_list = feature_tables.split(", ")
                feature_grades = [int(i) for i in
                        feature_grade_range.split(", ")]
                feature_grade_regex = '({})'.format('|'.join(
                    [str(i) for i in feature_grades]))
                # Keep columns that end in "_gr_<grade>" for one of the
                # model's grades, or that do not end in a digit at all.
                # Raw string: "\Z" and "\D" are regex escapes, not Python
                # string escapes.
                pattern = re.compile(r'(_gr_{rx}\Z)|(\D\Z)'.format(
                    rx=feature_grade_regex))

                for test_set in ['val', 'test']:
                    get_model_predictions = """select * from
                    (select student_lookup, true_label, predicted_label,
                    predicted_label = true_label as correct
                    from predictions."{table}" where split = '{test_set}') preds
                    """.format(table = table_name, test_set = test_set)

                    # Left-join every feature table used by the model.
                    for features in feature_table_list:
                        get_model_predictions += """ left join
                    (select * from model.{features}) {features}
                    using(student_lookup)
                    """.format(features = features)

                    cursor.execute(get_model_predictions)
                    predictions_and_features = cursor.fetchall()
                    colnames = [i[0] for i in cursor.description]
                    predictions = pd.DataFrame.from_records(
                        predictions_and_features, index = 'student_lookup',
                        columns = colnames)
                    predictions = predictions.filter(regex=pattern)
                    predictions[['true_label', 'predicted_label']] = \
                    predictions[['true_label', 'predicted_label']].astype(bool)
                    predictions = make_df_categorical(predictions)
                    print('building crosstabs')

                    crosstabs = build_crosstabs(predictions)
                    key = (table_name, test_set)
                    crosstabs_by_model_and_feature[key] = crosstabs
    return crosstabs_by_model_and_feature

In [5]:
def _labelled_crosstab(row_labels, feature_values):
    """Normalized crosstab (with margins) whose rows read '<label name>: <value>'."""
    table = pd.crosstab(index=row_labels, columns=feature_values,
                        margins=True, normalize=True)
    # Prefix each row with the label column's name so rows from different
    # label columns stay distinguishable once the tables are stacked.
    table.index = ['{name}: {value}'.format(name=table.index.name,
                                            value=value)
                   for value in table.index]
    return table


def build_crosstabs(prediction_data):
    """Cross-tabulate the label columns against every feature column.

    Parameters
    ----------
    prediction_data : pandas.DataFrame
        Must expose ``predicted_label``, ``true_label`` and ``correct``
        columns. The first three columns are skipped as non-features (the
        caller in this notebook puts the three label columns there); every
        remaining column is treated as a feature.

    Returns
    -------
    dict
        Maps each feature column name to one DataFrame that stacks, in this
        order, the ``predicted_label``, ``true_label`` and ``correct``
        crosstabs against that feature. Cells are fractions of the grand
        total (``normalize=True``) and include the ``All`` margins.
    """
    feature_columns = prediction_data.columns[3:]
    label_columns = (prediction_data.predicted_label,
                     prediction_data.true_label,
                     prediction_data.correct)

    predicted_plus_actual = dict()
    for feature in feature_columns:
        feature_values = prediction_data[feature]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        predicted_plus_actual[feature] = pd.concat(
            [_labelled_crosstab(labels, feature_values)
             for labels in label_columns])
    return predicted_plus_actual

In [6]:
# Columns of model.reports passed to loop_through_top_models: all of them are
# selected, and the first entry is the metric used to rank the models.
optimization_criteria = ['val_precision_5', 'val_recall_5']

In [11]:
# Queries the database and builds the crosstabs for every selected model.
# Progress is printed as one predictions-table name followed by one
# 'building crosstabs' line per split ('val', then 'test').
all_top_crosstabs = loop_through_top_models(optimization_criteria)

done grabbing models
param_set_35_AB_ht_7253
building crosstabs
building crosstabs
param_set_16_AB_ht_6623
building crosstabs
building crosstabs
param_set_3_AB_ht_6933
building crosstabs
building crosstabs
param_set_39_DT_ht_7290
building crosstabs
building crosstabs
param_set_28_DT_ht_7180
building crosstabs
building crosstabs
param_set_6_DT_ht_6960
building crosstabs
building crosstabs
param_set_39_ET_ht_7292
building crosstabs
building crosstabs
param_set_20_ET_ht_6698
building crosstabs
building crosstabs
param_set_11_ET_ht_6524
building crosstabs
building crosstabs
param_set_44_GB_ht_7345
building crosstabs
building crosstabs
param_set_30_GB_ht_7205
building crosstabs
building crosstabs
param_set_8_GB_ht_6465
building crosstabs
building crosstabs
param_set_40_KNN_ht_7308
building crosstabs
building crosstabs
param_set_28_KNN_ht_7188
building crosstabs
building crosstabs
param_set_3_KNN_ht_6938
building crosstabs
building crosstabs
param_set_41_logit_ht_7309
building crosstabs
buil

In [80]:
def get_specific_cross_tabs(cross_tabs, filename, feature, split = 'val'):
    """Return per-bin label percentages for one model and one feature.

    Parameters
    ----------
    cross_tabs : dict
        Mapping ``(filename, split) -> {feature: crosstab DataFrame}`` whose
        crosstab rows are labelled ``'predicted_label: True'``,
        ``'true_label: All'`` and so on.
    filename : str
        First element of the ``cross_tabs`` key.
    feature : str
        Feature whose crosstab should be summarised.
    split : str, default 'val'
        Second element of the ``cross_tabs`` key.

    Returns
    -------
    pandas.DataFrame
        Rows ``predicted_label: True/False`` and ``true_label: True/False``
        expressed as a percentage of each feature column's total, followed by
        the ``true_label: All`` row as a percentage of all rows. Values are
        rounded to two decimals. A requested True/False row that is absent
        from the crosstab comes back as NaN.

    Raises
    ------
    KeyError
        If the ``(filename, split)`` pair, the feature, or the
        ``'true_label: All'`` row is missing.
    """
    # Use the mapping that was passed in rather than a notebook global.
    crosstab = cross_tabs[(filename, split)][feature]
    totals = crosstab.loc['true_label: All']

    # reindex tolerates absent labels (NaN rows), matching what the removed
    # ``.ix[[...]]`` lookup used to do.
    predicted = 100 * crosstab.reindex(
        ['predicted_label: True', 'predicted_label: False']) / totals
    actual = 100 * crosstab.reindex(
        ['true_label: True', 'true_label: False']) / totals

    # The totals Series becomes a one-row frame labelled by its name.
    full = pd.concat([predicted, actual, (totals * 100).to_frame().T])
    return full.round(2)

In [81]:
# 'val' split (the default) for feature 'gpa_gr_9'. In the recorded output
# below this model's predicted_label is True for 0% of every bin.
get_specific_cross_tabs(all_top_crosstabs, 'param_set_43_RF_ht_7331', 'gpa_gr_9')

gpa_gr_9,"(-0.004, 0.8]","(0.8, 1.6]","(1.6, 2.4]","(2.4, 3.2]","(3.2, 4]",All
predicted_label: True,0.0,0.0,0.0,0.0,0.0,0.0
predicted_label: False,100.0,100.0,100.0,100.0,100.0,100.0
true_label: True,58.82,18.39,6.38,2.19,1.26,5.24
true_label: False,41.18,81.61,93.62,97.81,98.74,94.76
true_label: All,1.59,8.15,26.4,34.18,29.68,100.0


In [82]:
# Same model, feature 'seventh_read_normalized'. In the recorded output below
# only the lowest bin has any True predictions (2.44%).
get_specific_cross_tabs(all_top_crosstabs, 'param_set_43_RF_ht_7331', 'seventh_read_normalized')

seventh_read_normalized,"(-3.09, -1.9]","(-1.9, -0.7]","(-0.7, 0.5]","(0.5, 1.7]","(1.7, 2.9]",All
predicted_label: True,2.44,0.0,0.0,0.0,0.0,0.06
predicted_label: False,97.56,100.0,100.0,100.0,100.0,99.94
true_label: True,39.02,10.36,4.73,1.44,0.0,5.43
true_label: False,60.98,89.64,95.27,98.56,100.0,94.57
true_label: All,2.29,18.9,44.97,31.1,2.74,100.0


In [83]:
# Same model, feature 'days_present_gr_9'. In the recorded output below True
# predictions appear only in the (120, 160] bin (0.38%).
get_specific_cross_tabs(all_top_crosstabs, 'param_set_43_RF_ht_7331', 'days_present_gr_9')

days_present_gr_9,"(-0.2, 40]","(40, 80]","(80, 120]","(120, 160]","(160, 200]",All
predicted_label: True,0.0,0.0,0.0,0.38,0.0,0.05
predicted_label: False,100.0,100.0,100.0,99.62,100.0,99.95
true_label: True,15.63,20.0,27.27,9.16,4.56,5.94
true_label: False,84.38,80.0,72.73,90.84,95.44,94.06
true_label: All,1.64,1.28,1.69,13.41,81.99,100.0


In [84]:
# Same model, categorical feature 'gender' (columns F, M, All in the
# recorded output below).
get_specific_cross_tabs(all_top_crosstabs, 'param_set_43_RF_ht_7331', 'gender')

gender,F,M,All
predicted_label: True,0.11,0.0,0.05
predicted_label: False,99.89,100.0,99.95
true_label: True,4.76,7.0,5.94
true_label: False,95.24,93.0,94.06
true_label: All,47.34,52.66,100.0


In [86]:
# Second predictions table, feature 'gpa_gr_9'. The true_label rows in the
# recorded output match those shown for 'param_set_43_RF_ht_7331'; only the
# predicted_label rows differ.
get_specific_cross_tabs(all_top_crosstabs, 'param_set_39_DT_ht_7290', 'gpa_gr_9')

gpa_gr_9,"(-0.004, 0.8]","(0.8, 1.6]","(1.6, 2.4]","(2.4, 3.2]","(3.2, 4]",All
predicted_label: True,58.82,1.15,0.0,0.27,0.0,1.12
predicted_label: False,41.18,98.85,100.0,99.73,100.0,98.88
true_label: True,58.82,18.39,6.38,2.19,1.26,5.24
true_label: False,41.18,81.61,93.62,97.81,98.74,94.76
true_label: All,1.59,8.15,26.4,34.18,29.68,100.0


In [87]:
# Same predictions table, feature 'seventh_read_normalized'. In the recorded
# output below the share of True predictions falls from 4.88% in the lowest
# bin to 0% in the highest.
get_specific_cross_tabs(all_top_crosstabs, 'param_set_39_DT_ht_7290', 'seventh_read_normalized')

seventh_read_normalized,"(-3.09, -1.9]","(-1.9, -0.7]","(-0.7, 0.5]","(0.5, 1.7]","(1.7, 2.9]",All
predicted_label: True,4.88,1.18,1.0,0.72,0.0,1.01
predicted_label: False,95.12,98.82,99.0,99.28,100.0,98.99
true_label: True,39.02,10.36,4.73,1.44,0.0,5.43
true_label: False,60.98,89.64,95.27,98.56,100.0,94.57
true_label: All,2.29,18.9,44.97,31.1,2.74,100.0


In [88]:
# Same predictions table, feature 'days_present_gr_9'. In the recorded output
# below the two lowest bins receive no True predictions.
get_specific_cross_tabs(all_top_crosstabs, 'param_set_39_DT_ht_7290', 'days_present_gr_9')

days_present_gr_9,"(-0.2, 40]","(40, 80]","(80, 120]","(120, 160]","(160, 200]",All
predicted_label: True,0.0,0.0,6.06,2.29,0.62,0.92
predicted_label: False,100.0,100.0,93.94,97.71,99.38,99.08
true_label: True,15.63,20.0,27.27,9.16,4.56,5.94
true_label: False,84.38,80.0,72.73,90.84,95.44,94.06
true_label: All,1.64,1.28,1.69,13.41,81.99,100.0


In [95]:
# Same predictions table, boolean feature 'mid_year_withdraw_gr_9'.
# NOTE(review): the overall true_label: True rate here (12.0%) is higher than
# in the other tables for this model (5-6%) -- presumably this feature is
# populated for only a subset of rows; confirm.
get_specific_cross_tabs(all_top_crosstabs, 'param_set_39_DT_ht_7290', 'mid_year_withdraw_gr_9')

mid_year_withdraw_gr_9,False,True,All
predicted_label: True,0.0,7.14,2.0
predicted_label: False,100.0,92.86,98.0
true_label: True,11.11,14.29,12.0
true_label: False,88.89,85.71,88.0
true_label: All,72.0,28.0,100.0


In [91]:
# Directory that receives the pickled crosstabs.
# NOTE(review): absolute, user-specific path (the sys.path.insert at the top
# of the notebook hard-codes the same home directory) -- consider deriving
# both from one configurable base directory.
path = os.path.join('/home/jgutman/mvesc', 'Error_Feature_Analysis', 'pkls')

In [94]:
# Persist every crosstab so later sessions can reload them without re-querying
# the database. Create the target directory first: open(..., 'wb') raises
# FileNotFoundError when it does not exist yet.
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, 'crosstabs.pkl'), 'wb') as f:
    pickle.dump(all_top_crosstabs, f)