In [2]:
import os, sys
sys.path.insert(0, '/home/jtorrenc/mvesc/ETL')
from mvesc_utility_functions import *
import numpy as np
import pandas as pd
import random
from functools import partial

In [3]:
def color_gt(val,x=.5,c='blue',neg=True):
    """
    Colors values greater than x blue (or other provided color)

    Intended for use with ``DataFrame.style.applymap``, which calls it once
    per cell and expects a CSS declaration back.

    :param float val: the cell value being styled
    :param float x: lower bound on colored values
    :param str c: color used for values strictly greater than x
    :param bool neg: if true, color negative values red; if false, negative
        values are left black like every other unhighlighted value
    :returns: css string to color values in dataframe
    :rtype: str
    """
    if val > x:
        color = c
    elif neg and val < 0:
        # Only flag negatives when requested; previously `neg` was ignored.
        color = 'red'
    else:
        # Also reached for NaN, since every comparison with NaN is False.
        color = 'black'
    return 'color: %s' % color

def table_exists(cursor, table, schema='clean'):
    """
    Checks to see if a table exists in the database

    Looks the table up in ``information_schema.tables`` using bound query
    parameters (no string interpolation).

    :param pg.cursor cursor: open database cursor used to run the lookup
    :param str table: name of the table to look for
    :param str schema: schema the table is expected to live in
    :returns: True if at least one matching table is listed, False otherwise
    :rtype: bool
    """
    cursor.execute("""
            select count(*) from information_schema.tables
            where table_schema = %s and table_name = %s
            """, [schema, table])
    # count(*) always yields exactly one row; convert the count to the
    # documented boolean instead of leaking the raw integer to callers.
    return bool(cursor.fetchone()[0])

In [18]:
# Collect, for every model type, the predicted scores of its best run
# (highest val_precision_10) into one frame with one column per model.
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute('select distinct model_name from model.reports')
        model_list = [m[0] for m in cursor.fetchall()]
        # model name -> filename of that model's top-ranked report; the filename
        # is used below as the table name inside the `predictions` schema.
        best = dict()
        for m in model_list:
            cursor.execute("""select filename from model.reports where model_name = %s
                            order by val_precision_10 desc limit 1; """, [m])
            best[m] = cursor.fetchall()[0][0]
    prediction_frames = []
    for m, f in best.items():
        df = read_table_to_df(connection, f,columns = ['student_lookup','predicted_score'],
                                          schema='predictions',nrows=-1)
        scores = df.set_index('student_lookup')
        scores.columns = [m]  # label the score column with the model name
        prediction_frames.append(scores)
    # Join once at the end, keeping only students scored by every model.
    # Growing the result from an empty DataFrame with join='inner' is fragile:
    # the empty seed shares no index labels with the first table, so the
    # intersection can come out empty, and repeated concat is quadratic.
    if prediction_frames:
        predictions = pd.concat(prediction_frames, axis=1, join='inner')
    else:
        predictions = pd.DataFrame()


In [19]:
# Pairwise Spearman rank correlation between the models' predicted scores.
spearman_pred = predictions.corr(method='spearman')

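# Spearman Rank Correlation of Model Predictions

Pairwise Spearman rank correlation between the predicted scores of each model type's best run (highest `val_precision_10`). Correlations above 0.75 are highlighted.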

In [20]:
# Show the prediction correlation matrix; color_gt colours cells above 0.75 blue and negative cells red.
spearman_pred.style.applymap(partial(color_gt,x=.75))

Unnamed: 0,RF,SVM,KNN,GB,AB,logit,NB,ET,SGD,DT
RF,1.0,0.658894,0.485803,0.764697,0.6954,0.762888,0.631302,0.777807,0.562709,0.637199
SVM,0.658894,1.0,0.456741,0.66069,0.71511,0.85489,0.64038,0.54291,0.748221,0.528705
KNN,0.485803,0.456741,1.0,0.453817,0.458658,0.493082,0.414185,0.456968,0.425518,0.335798
GB,0.764697,0.66069,0.453817,1.0,0.755792,0.757075,0.642585,0.612558,0.610131,0.631266
AB,0.6954,0.71511,0.458658,0.755792,1.0,0.779364,0.657852,0.583288,0.615551,0.614656
logit,0.762888,0.85489,0.493082,0.757075,0.779364,1.0,0.695403,0.636051,0.739407,0.647433
NB,0.631302,0.64038,0.414185,0.642585,0.657852,0.695403,1.0,0.574956,0.501256,0.490113
ET,0.777807,0.54291,0.456968,0.612558,0.583288,0.636051,0.574956,1.0,0.450722,0.483682
SGD,0.562709,0.748221,0.425518,0.610131,0.615551,0.739407,0.501256,0.450722,1.0,0.46858
DT,0.637199,0.528705,0.335798,0.631266,0.614656,0.647433,0.490113,0.483682,0.46858,1.0


In [54]:
# For every feature category, find each model type's best run, load its
# feature importances and compute the between-model Spearman correlation.
#
# Results (all keyed by feature category):
#   best[f][m]   -> filename of the top report for model m on category f
#   features[f]  -> DataFrame of importances, rows = features, columns = models
#   spearman[f]  -> Spearman correlation of the absolute importances

# Only runs trained on this grade range are compared.
FEATURE_GRADES = '5, 6, 7, 8, 9'

with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute('select distinct model_name from model.reports')
        model_list = [row[0] for row in cursor.fetchall()]
        cursor.execute('select distinct feature_categories from model.reports')
        stored_categories = [row[0] for row in cursor.fetchall()]

        # Keep single categories as they are and fold every comma-separated
        # combination into one pseudo-category, 'all'. A new list is built
        # because removing items from a list while iterating over it skips
        # the element that follows each removal.
        # NOTE: this won't work anymore when there are other combinations of
        # features, since every combination is treated as "all".
        feature_list = [cat for cat in stored_categories if ',' not in cat]
        if len(feature_list) != len(stored_categories) and 'all' not in feature_list:
            feature_list.append('all')

        best = dict()
        features = dict()
        spearman = dict()
        for f in feature_list:
            best[f] = dict()
            for m in model_list:
                if f != 'all':
                    cursor.execute("""select filename from model.reports
                                    where model_name = %s
                                    and feature_categories = %s
                                    and feature_grades = %s
                                    order by val_precision_10 desc limit 1; """,
                                   [m, f, FEATURE_GRADES])
                else:
                    # Bound parameters instead of str.format. Because parameters
                    # are supplied, the literal % of the LIKE pattern has to be
                    # doubled (pyformat drivers such as psycopg2).
                    cursor.execute("""select filename from model.reports
                                    where model_name = %s
                                    and feature_categories like '%%,%%'
                                    and feature_grades = %s
                                    order by val_precision_10 desc limit 1; """,
                                   [m, FEATURE_GRADES])
                top_report = cursor.fetchone()
                if top_report is None:
                    # No run of this model for this category / grade range.
                    continue
                best[f][m] = top_report[0]

            importance_frames = []
            for m, table_name in best[f].items():
                # Models without a table in the feature_scores schema are skipped.
                if table_exists(cursor, table_name, 'feature_scores'):
                    df = read_table_to_df(connection, table_name, columns=['feature','importance'],
                                          schema='feature_scores', nrows=-1)
                    importances = df.set_index('feature')
                    importances.columns = [m]  # label the column with the model name
                    importance_frames.append(importances)
            # One outer join at the end keeps every feature seen by any model
            # (missing values become NaN) and avoids quadratic concat-in-loop.
            if importance_frames:
                temp = pd.concat(importance_frames, axis=1, join='outer')
            else:
                temp = pd.DataFrame()
            features[f] = temp
            # Correlate magnitudes only: some models report signed scores.
            spearman[f] = temp.abs().corr('spearman')

In [55]:
# Feature categories for which importance tables were collected.
features.keys()

dict_keys(['all', 'oaa_normalized', 'absence', 'snapshots', 'grades', 'mobility'])

In [56]:
# Per model type, the filename of the top run (by val_precision_10) for the 'mobility' category.
best['mobility']

{'AB': 'param_set_453_AB_ht_4524',
 'DT': 'param_set_458_DT_ht_4571',
 'ET': 'param_set_453_ET_ht_4523',
 'GB': 'param_set_468_GB_ht_4676',
 'KNN': 'param_set_458_KNN_ht_4579',
 'NB': 'param_set_463_NB_ht_4627',
 'RF': 'param_set_463_RF_ht_4622',
 'SGD': 'param_set_453_SGD_ht_4528',
 'SVM': 'param_set_453_SVM_ht_4525',
 'logit': 'param_set_453_logit_ht_4520'}

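# Spearman Rank Correlation of Feature Importances

For each feature category, the cells below preview the per-model feature importances and then show the pairwise Spearman rank correlation of their absolute values. Correlations above 0.7 are highlighted.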


In [57]:
# Preview the 'absence' importances: one row per feature, one column per model.
features['absence'].head()

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
absence_consec_gr_7,0.000789,0.087852,0.0,0.0,0.262957,0.000924,22.639499,0.0
absence_consec_gr_8,0.028656,-1.6e-05,0.011905,0.003216,0.031852,0.028532,1.331735,0.0
absence_consec_gr_9,0.046819,5.5e-05,0.060845,0.099215,0.269176,0.046311,9.322147,0.040512
absence_gr_5,0.064056,-2.3e-05,0.078696,,-0.046104,0.073802,-8.101389,0.020813
absence_gr_6,0.079172,8e-06,0.081279,0.052965,-0.222509,0.079043,-9.943623,0.127702


In [58]:
# Between-model correlation of 'absence' importances; color_gt colours cells above 0.7 blue and negative cells red.
spearman['absence'].style.applymap(partial(color_gt,x=.7))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,-0.661654,0.952172,0.725255,0.307229,0.995489,-0.501754,0.747127
SVM,-0.661654,1.0,-0.673447,-0.630183,-0.144021,-0.657143,0.232506,-0.655301
GB,0.952172,-0.673447,1.0,0.829173,0.41799,0.946907,-0.363956,0.79009
AB,0.725255,-0.630183,0.829173,1.0,0.532081,0.710109,-0.0665971,0.851009
logit,0.307229,-0.144021,0.41799,0.532081,1.0,0.281627,0.382674,0.348315
ET,0.995489,-0.657143,0.946907,0.710109,0.281627,1.0,-0.515789,0.733469
SGD,-0.501754,0.232506,-0.363956,-0.0665971,0.382674,-0.515789,1.0,-0.1482
DT,0.747127,-0.655301,0.79009,0.851009,0.348315,0.733469,-0.1482,1.0


In [59]:
# Preview the 'mobility' importances: one row per feature, one column per model.
features['mobility'].head()

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
avg_address_change_to_gr_5,0.002644,0.418592,0.006284,0.0,0.0,0.001653,1.331735,0.0
avg_address_change_to_gr_5_isnull,0.004189,0.28075,1e-05,0.0,0.041814,0.00289,-1.512494e-14,0.007397
avg_address_change_to_gr_6,0.005937,0.071333,0.000757,0.0,0.0,0.004239,5.553688e-15,0.0
avg_address_change_to_gr_6_isnull,0.002541,-6.9e-05,0.0,0.0,0.00564,0.002361,-1.276167e-14,0.0
avg_address_change_to_gr_7,0.004179,-0.014458,0.008888,0.0,0.0,0.003427,-0.2219559,0.0


In [60]:
# Between-model correlation of 'mobility' importances; color_gt colours cells above 0.7 blue and negative cells red.
spearman['mobility'].style.applymap(partial(color_gt,x=.7))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,0.295473,0.772469,0.178241,0.0924542,0.78457,0.233936,0.189319
SVM,0.295473,1.0,0.253196,0.331941,0.225591,0.277223,0.438181,0.243914
GB,0.772469,0.253196,1.0,0.186848,0.091333,0.669802,0.187159,0.219173
AB,0.178241,0.331941,0.186848,1.0,0.44096,0.249274,0.416172,0.246055
logit,0.0924542,0.225591,0.091333,0.44096,1.0,0.249687,0.15756,0.241969
ET,0.78457,0.277223,0.669802,0.249274,0.249687,1.0,0.179368,0.231862
SGD,0.233936,0.438181,0.187159,0.416172,0.15756,0.179368,1.0,0.225803
DT,0.189319,0.243914,0.219173,0.246055,0.241969,0.231862,0.225803,1.0


In [61]:
# Preview the 'oaa_normalized' importances: one row per feature, one column per model.
features['oaa_normalized'].head()

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
eighth_math_normalized,0.039856,0.085535,0.038644,0.127352,0.0,0.02115,2.138664,0.0
eighth_math_normalized_isnull,0.006814,0.169345,0.0,0.0,0.0,0.004572,5.326941,0.0
eighth_math_percentile,0.034348,-0.055547,0.01872,0.044069,0.0,0.035541,-2.593598,0.08065
eighth_math_percentile_isnull,0.006687,0.169345,0.004537,0.0,0.0,,5.326941,0.025544
eighth_math_pl_Accelerated,0.002173,-0.008385,0.0,0.0,-0.233912,0.005935,-2.663471,0.0


In [62]:
# Between-model correlation of 'oaa_normalized' importances; color_gt colours cells above 0.7 blue and negative cells red.
spearman['oaa_normalized'].style.applymap(partial(color_gt,x=.7))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,0.19034,0.625103,0.576083,0.00225876,0.702344,0.0473235,0.388032
SVM,0.19034,1.0,0.148321,0.231872,0.383882,0.242479,0.518153,0.212397
GB,0.625103,0.148321,1.0,0.574008,0.0434368,0.501239,0.0818554,0.34524
AB,0.576083,0.231872,0.574008,1.0,0.135187,0.548473,0.174985,0.351726
logit,0.00225876,0.383882,0.0434368,0.135187,1.0,0.145132,0.545815,0.158714
ET,0.702344,0.242479,0.501239,0.548473,0.145132,1.0,0.193151,0.312255
SGD,0.0473235,0.518153,0.0818554,0.174985,0.545815,0.193151,1.0,0.132765
DT,0.388032,0.212397,0.34524,0.351726,0.158714,0.312255,0.132765,1.0


In [63]:
# Preview the 'snapshots' importances: one row per feature, one column per model.
features['snapshots'].head()


Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
days_absent_excused_gr_7,0.026768,-0.713126,0.014806,0.023076,-0.505254,0.019405,-156.283049,0.0
days_absent_excused_gr_8,0.023392,0.100783,0.010918,0.0,0.0,0.022353,17.032193,0.0
days_absent_excused_gr_9,0.069043,0.394523,0.024187,0.0,0.351827,0.057544,-0.003329,0.032535
days_absent_gr_5,0.020311,-0.063759,0.02297,0.0,-0.06179,0.016444,-159.031385,0.017877
days_absent_gr_6,0.019935,-0.036667,0.017842,0.014429,-0.096762,0.016859,-145.958184,0.0


In [64]:
# Between-model correlation of 'snapshots' importances; color_gt colours cells above 0.7 blue and negative cells red.
spearman['snapshots'].style.applymap(partial(color_gt,x=.7))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,0.095025,0.721762,0.417523,0.615553,0.939921,0.751488,0.409048
SVM,0.095025,1.0,-0.00486802,0.0181376,0.210424,0.0690795,-0.0074204,-0.0790659
GB,0.721762,-0.00486802,1.0,0.44731,0.573944,0.682544,0.555307,0.456449
AB,0.417523,0.0181376,0.44731,1.0,0.436809,0.378938,0.395561,0.45455
logit,0.615553,0.210424,0.573944,0.436809,1.0,0.608232,0.56302,0.272844
ET,0.939921,0.0690795,0.682544,0.378938,0.608232,1.0,0.769211,0.377824
SGD,0.751488,-0.0074204,0.555307,0.395561,0.56302,0.769211,1.0,0.264742
DT,0.409048,-0.0790659,0.456449,0.45455,0.272844,0.377824,0.264742,1.0


In [65]:
# Preview the 'grades' importances: one row per feature, one column per model.
features['grades'].head()

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
art_gpa_gr_5,0.004507,-0.215649,0.0,0.0326,-0.932123,0.003461,-1.832721,0.023892
art_gpa_gr_5_isnull,0.000518,0.255441,0.0,0.0,0.0,0.001457,-19.976029,0.0
art_gpa_gr_6,0.009637,0.211025,0.005868,0.025582,0.814532,0.00543,45.837587,0.0
art_gpa_gr_6_isnull,0.001762,-0.830613,0.0,0.0,-0.977115,0.006097,-30.629911,0.0
art_gpa_gr_7,0.011735,0.055578,0.004427,0.027485,0.093914,0.010712,33.371786,0.030191


In [66]:
# Between-model correlation of 'grades' importances; color_gt colours cells above 0.7 blue and negative cells red.
spearman['grades'].style.applymap(partial(color_gt,x=.7))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,0.423696,0.804328,0.67396,0.214112,0.927995,0.410105,0.449434
SVM,0.423696,1.0,0.257204,0.167192,0.290103,0.463219,0.424557,0.130005
GB,0.804328,0.257204,1.0,0.627556,0.103742,0.774236,0.339666,0.406669
AB,0.67396,0.167192,0.627556,1.0,0.160981,0.595843,0.179715,0.555375
logit,0.214112,0.290103,0.103742,0.160981,1.0,0.192141,0.171102,0.04392
ET,0.927995,0.463219,0.774236,0.595843,0.192141,1.0,0.439974,0.398672
SGD,0.410105,0.424557,0.339666,0.179715,0.171102,0.439974,1.0,0.0623742
DT,0.449434,0.130005,0.406669,0.555375,0.04392,0.398672,0.0623742,1.0


In [67]:
# Preview the importances of runs trained on a combination of categories (stored under 'all').
features['all'].head()

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
absence_consec_gr_7,0.0,0.141226,0.0,0.0,0.249384,0.0,90.557997,0.0
absence_consec_gr_8,0.001153,0.015964,0.006129,0.0,0.027541,0.001173,194.433347,0.0
absence_consec_gr_9,0.006888,0.017218,0.012071,0.017715,0.142771,0.002286,857.637502,0.0
absence_gr_5,0.002341,0.007709,0.002997,0.0,-0.014128,0.001349,-80.902917,0.0
absence_gr_6,0.00386,0.025194,0.012652,0.0,-0.063706,0.004162,-94.730768,0.0


In [68]:
# Between-model correlation of 'all' importances; color_gt colours cells above 0.7 blue and negative cells red.
spearman['all'].style.applymap(partial(color_gt,x=.7))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,0.0292079,0.524172,0.433862,0.402804,0.709972,0.553976,0.252606
SVM,0.0292079,1.0,0.0269234,0.0636731,0.330684,0.112872,-0.146474,0.0277437
GB,0.524172,0.0269234,1.0,0.37718,0.298748,0.456028,0.352286,0.24795
AB,0.433862,0.0636731,0.37718,1.0,0.282503,0.371578,0.302282,0.234729
logit,0.402804,0.330684,0.298748,0.282503,1.0,0.411164,0.221979,0.167592
ET,0.709972,0.112872,0.456028,0.371578,0.411164,1.0,0.540943,0.202544
SGD,0.553976,-0.146474,0.352286,0.302282,0.221979,0.540943,1.0,0.181013
DT,0.252606,0.0277437,0.24795,0.234729,0.167592,0.202544,0.181013,1.0


In [69]:
# Average every pairwise correlation over all feature categories.
# pd.Panel has been removed from pandas, so the per-category matrices are
# stacked instead (category becomes the outer index level) and averaged per
# model row; sort=False keeps the models in their original row order.
# As with Panel.mean, NaN entries are skipped.
(pd.concat(spearman)
   .groupby(level=1, sort=False)
   .mean()
   .style.applymap(partial(color_gt,x=.7,c='blue')))

Unnamed: 0,RF,SVM,GB,AB,logit,ET,SGD,DT
RF,1.0,0.0620146,0.733334,0.500821,0.272402,0.843382,0.249179,0.405928
SVM,0.0620146,1.0,0.00122154,0.0304387,0.216111,0.0846216,0.24325,-0.0200512
GB,0.733334,0.00122154,1.0,0.507013,0.254866,0.671793,0.192053,0.410929
AB,0.500821,0.0304387,0.507013,1.0,0.33142,0.475702,0.233686,0.448907
logit,0.272402,0.216111,0.254866,0.33142,1.0,0.314664,0.340358,0.205559
ET,0.843382,0.0846216,0.671793,0.475702,0.314664,1.0,0.26781,0.376104
SGD,0.249179,0.24325,0.192053,0.233686,0.340358,0.26781,1.0,0.11975
DT,0.405928,-0.0200512,0.410929,0.448907,0.205559,0.376104,0.11975,1.0


In [None]:
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute("""select, """ 