In [90]:
import os, sys
sys.path.insert(0, '/home/jtorrenc/mvesc/ETL')
from mvesc_utility_functions import *
import numpy as np
import pandas as pd
import random
from functools import partial

In [286]:
def color_gt(val,x=.5,c='blue',neg=True):
    """
    Builds the CSS color rule for a single dataframe cell.

    Values strictly greater than x get color c. Otherwise, negative values
    are colored red when neg is true. Everything else is black (this includes
    NaN, for which both comparisons are false).

    :param float val: cell value to style
    :param float x: lower bound (exclusive) on colored values
    :param str c: color used for values above x
    :param bool neg: if true, color negative values red
    :returns: css string to color values in dataframe
    :rtype: str
    """
    if val > x:
        color = c
    elif neg and val < 0:
        # only flag negatives when the caller asked for it
        color = 'red'
    else:
        color = 'black'
    return 'color: %s' % color

def table_exists(cursor, table, schema='clean'):
    """
    Checks to see if a table exists in the database

    Counts the entries of information_schema.tables that match the given
    schema and table name and reports whether there is at least one.

    :param pg.cursor cursor: open cursor used to run the lookup query
    :param str table: name of the table to look for
    :param str schema: schema the table is expected to live in
    :returns: True if a matching table is listed, False otherwise
    :rtype: bool
    """
    cursor.execute("""                                                   
            select count(*) from information_schema.tables                       
            where table_schema = %s and table_name = %s
            """, [schema, table])
    # count(*) always yields exactly one row; turn the count into a real bool
    return cursor.fetchall()[0][0] > 0

In [129]:
# For every model in model.reports, pick the run ranked first by
# val_precision_10 and load its predicted scores as one column per model.
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute('select distinct model_name from model.reports')
        model_list = [m[0] for m in cursor.fetchall()]
        best = dict()
        for m in model_list:
            cursor.execute("""select filename from model.reports where model_name = %s
                            order by val_precision_10 desc limit 1; """, [m])
            best[m] = cursor.fetchall()[0][0]
    prediction_frames = []
    for m, f in best.items():
        df = read_table_to_df(connection, f,columns = ['student_lookup','predicted_score'],
                                          schema='predictions')
        df = df.set_index('student_lookup')
        df.columns = [m]
        prediction_frames.append(df)
    # Join once at the end. Seeding the result with an empty DataFrame and
    # inner-joining onto it intersects with an empty index on newer pandas,
    # which silently drops every row.
    if prediction_frames:
        predictions = pd.concat(prediction_frames, axis=1, join='inner')
    else:
        predictions = pd.DataFrame()


In [269]:
# Rank-based correlation matrices between the models' predicted scores.
spearman_pred = predictions.corr(method='spearman')
kendall_pred = predictions.corr(method='kendall')

# Kendall's Tau Prediction Correlation

In [272]:
# Color Kendall correlations above 0.65; the threshold is forwarded to color_gt by the Styler.
kendall_pred.style.applymap(color_gt, x=.65)

Unnamed: 0,GB,AB,KNN,RF,DT,SVM,SGD,NB,ET,logit
GB,1.0,0.648979,0.386624,0.543964,0.563825,0.505984,0.31899,0.700666,0.331506,0.681978
AB,0.648979,1.0,0.279236,0.591049,0.698979,0.557895,0.421053,0.443405,0.577832,0.663158
KNN,0.386624,0.279236,1.0,0.568184,0.243975,0.265274,0.293198,0.537914,0.0300557,0.446778
RF,0.543964,0.591049,0.568184,1.0,0.619064,0.471754,0.504289,0.551534,0.326843,0.677808
DT,0.563825,0.698979,0.243975,0.619064,1.0,0.515037,0.404672,0.472456,0.514765,0.576351
SVM,0.505984,0.557895,0.265274,0.471754,0.515037,1.0,0.442105,0.508294,0.657142,0.684211
SGD,0.31899,0.421053,0.293198,0.504289,0.404672,0.442105,1.0,0.270369,0.577832,0.505263
NB,0.700666,0.443405,0.537914,0.551534,0.472456,0.508294,0.270369,1.0,0.273552,0.638071
ET,0.331506,0.577832,0.0300557,0.326843,0.514765,0.657142,0.577832,0.273552,1.0,0.543842
logit,0.681978,0.663158,0.446778,0.677808,0.576351,0.684211,0.505263,0.638071,0.543842,1.0


# Spearman's Prediction Correlation

In [277]:
# Color Spearman correlations above 0.75; the threshold is forwarded to color_gt by the Styler.
spearman_pred.style.applymap(color_gt, x=.75)

Unnamed: 0,GB,AB,KNN,RF,DT,SVM,SGD,NB,ET,logit
GB,1.0,0.77675,0.464516,0.694979,0.669951,0.653263,0.501572,0.847174,0.442681,0.791995
AB,0.77675,1.0,0.363542,0.751824,0.819069,0.714286,0.56391,0.631836,0.701552,0.807519
KNN,0.464516,0.363542,1.0,0.669589,0.278314,0.337006,0.368849,0.664877,0.0419427,0.555485
RF,0.694979,0.751824,0.669589,1.0,0.74225,0.606309,0.682856,0.712486,0.435887,0.835949
DT,0.669951,0.819069,0.278314,0.74225,1.0,0.625037,0.487465,0.59774,0.592371,0.71092
SVM,0.653263,0.714286,0.337006,0.606309,0.625037,1.0,0.654135,0.66214,0.808127,0.839098
SGD,0.501572,0.56391,0.368849,0.682856,0.487465,0.654135,1.0,0.436376,0.732988,0.717293
NB,0.847174,0.631836,0.664877,0.712486,0.59774,0.66214,0.436376,1.0,0.360783,0.81669
ET,0.442681,0.701552,0.0419427,0.435887,0.592371,0.808127,0.732988,0.360783,1.0,0.693885
logit,0.791995,0.807519,0.555485,0.835949,0.71092,0.839098,0.717293,0.81669,0.693885,1.0


In [229]:
# For each single feature category, pick every model's top run (by
# val_precision_10) among runs with the grade range below, load that run's
# feature scores, and rank-correlate the scores across models.
grade_range = '5, 6, 7, 8, 9'  # feature_grades value all compared runs must share
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute('select distinct model_name from model.reports')
        model_list = [m[0] for m in cursor.fetchall()]
        cursor.execute('select distinct feature_categories from model.reports')
        # Drop comma-separated values (presumably runs combining several
        # categories -- confirm) and NULLs, which could never match the
        # equality filter in the query below.
        feature_list = [row[0] for row in cursor.fetchall()
                        if row[0] is not None and ',' not in row[0]]
        best = dict()
        features = dict()
        spearman = dict()
        for f in feature_list:
            best[f] = dict()
            for m in model_list:
                cursor.execute("""select filename from model.reports 
                                where model_name = %s
                                    and feature_categories = %s
                                    and feature_grades = %s
                                order by val_precision_10 desc limit 1; """, [m, f, grade_range])
                rows = cursor.fetchall()
                if not rows:
                    # this model has no run for this category / grade range
                    continue
                best[f][m] = rows[0][0]
            importance_frames = []
            for m, score_table in best[f].items():
                # not every run has a table in the feature_scores schema
                if table_exists(cursor, score_table, 'feature_scores'):
                    df = read_table_to_df(connection, score_table ,columns=['feature','importance'],schema='feature_scores',nrows = -1)
                    df = df.set_index('feature')
                    df.columns = [m]
                    importance_frames.append(df)
            # outer join keeps features that only some models report (NaN elsewhere)
            if importance_frames:
                temp = pd.concat(importance_frames, axis=1, join='outer')
            else:
                temp = pd.DataFrame()
            features[f] = temp
            spearman[f] = temp.corr('spearman')

In [252]:
# Show the report filename selected for each model in the 'mobility' feature category.
best['mobility']

{'AB': 'param_set_453_AB_ht_4524',
 'DT': 'param_set_458_DT_ht_4571',
 'ET': 'param_set_453_ET_ht_4523',
 'GB': 'param_set_468_GB_ht_4676',
 'KNN': 'param_set_458_KNN_ht_4579',
 'NB': 'param_set_463_NB_ht_4627',
 'RF': 'param_set_463_RF_ht_4622',
 'SGD': 'param_set_453_SGD_ht_4528',
 'SVM': 'param_set_453_SVM_ht_4525',
 'logit': 'param_set_453_logit_ht_4520'}

# Spearman's Feature Correlation


In [253]:
# Preview the per-model feature scores (one column per model) for the 'absence' category.
features['absence'].head()

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
absence_consec_gr_7,0.0,0.0,0.000789,0.0,0.087852,22.639499,0.000924,0.262957
absence_consec_gr_8,0.011905,0.003216,0.028656,0.0,-1.6e-05,1.331735,0.028532,0.031852
absence_consec_gr_9,0.060845,0.099215,0.046819,0.040512,5.5e-05,9.322147,0.046311,0.269176
absence_gr_5,0.078696,,0.064056,0.020813,-2.3e-05,-8.101389,0.073802,-0.046104
absence_gr_6,0.081279,0.052965,0.079172,0.127702,8e-06,-9.943623,0.079043,-0.222509


In [287]:
# Color cross-model Spearman correlations above 0.8 for the 'absence' category.
spearman['absence'].style.applymap(color_gt, x=.8)

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.829173,0.952172,0.79009,-0.289266,-0.112528,0.946907,0.0279668
AB,0.829173,1.0,0.725255,0.851009,-0.158314,-0.0521977,0.710109,0.303826
RF,0.952172,0.725255,1.0,0.747127,-0.42406,-0.273684,0.995489,-0.0903615
DT,0.79009,0.851009,0.747127,1.0,-0.0926121,-0.18278,0.733469,-0.0372944
SVM,-0.289266,-0.158314,-0.42406,-0.0926121,1.0,0.653875,-0.418045,0.314762
SGD,-0.112528,-0.0521977,-0.273684,-0.18278,0.653875,1.0,-0.257895,0.647835
ET,0.946907,0.710109,0.995489,0.733469,-0.418045,-0.257895,1.0,-0.10241
logit,0.0279668,0.303826,-0.0903615,-0.0372944,0.314762,0.647835,-0.10241,1.0


In [283]:
# Preview the per-model feature scores (one column per model) for the 'mobility' category.
features['mobility'].head()


Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
avg_address_change_to_gr_5,0.006284,0.0,0.002644,0.0,0.418592,1.331735,0.001653,0.0
avg_address_change_to_gr_5_isnull,1e-05,0.0,0.004189,0.007397,0.28075,-1.512494e-14,0.00289,0.041814
avg_address_change_to_gr_6,0.000757,0.0,0.005937,0.0,0.071333,5.553688e-15,0.004239,0.0
avg_address_change_to_gr_6_isnull,0.0,0.0,0.002541,0.0,-6.9e-05,-1.276167e-14,0.002361,0.00564
avg_address_change_to_gr_7,0.008888,0.0,0.004179,0.0,-0.014458,-0.2219559,0.003427,0.0


In [288]:
# Color cross-model Spearman correlations above 0.8 for the 'mobility' category.
spearman['mobility'].style.applymap(color_gt, x=.8)

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.186848,0.772469,0.219173,0.135896,0.332366,0.669802,-0.0459065
AB,0.186848,1.0,0.178241,0.246055,-0.172866,-0.148659,0.249274,-0.248216
RF,0.772469,0.178241,1.0,0.189319,0.127738,0.348208,0.78457,-0.0339414
DT,0.219173,0.246055,0.189319,1.0,0.139271,0.144549,0.231862,0.134365
SVM,0.135896,-0.172866,0.127738,0.139271,1.0,0.650012,0.145963,0.588837
SGD,0.332366,-0.148659,0.348208,0.144549,0.650012,1.0,0.301345,0.545441
ET,0.669802,0.249274,0.78457,0.231862,0.145963,0.301345,1.0,0.0800012
logit,-0.0459065,-0.248216,-0.0339414,0.134365,0.588837,0.545441,0.0800012,1.0


In [257]:
# Preview the per-model feature scores (one column per model) for the 'oaa_normalized' category.
features['oaa_normalized'].head()

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
eighth_math_normalized,0.038644,0.127352,0.039856,0.0,0.085535,2.138664,0.02115,0.0
eighth_math_normalized_isnull,0.0,0.0,0.006814,0.0,0.169345,5.326941,0.004572,0.0
eighth_math_percentile,0.01872,0.044069,0.034348,0.08065,-0.055547,-2.593598,0.035541,0.0
eighth_math_percentile_isnull,0.004537,0.0,0.006687,0.025544,0.169345,5.326941,,0.0
eighth_math_pl_Accelerated,0.0,0.0,0.002173,0.0,-0.008385,-2.663471,0.005935,-0.233912


In [289]:
# Color cross-model Spearman correlations above 0.8 for the 'oaa_normalized' category.
spearman['oaa_normalized'].style.applymap(color_gt, x=.8)

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.574008,0.625103,0.34524,0.125889,0.0546042,0.501239,0.0804976
AB,0.574008,1.0,0.576083,0.351726,0.0977113,0.0129869,0.548473,0.048788
RF,0.625103,0.576083,1.0,0.388032,0.130376,0.177038,0.702344,0.140083
DT,0.34524,0.351726,0.388032,1.0,0.11749,0.0455147,0.312255,0.080076
SVM,0.125889,0.0977113,0.130376,0.11749,1.0,0.615144,0.151398,0.670097
SGD,0.0546042,0.0129869,0.177038,0.0455147,0.615144,1.0,0.0862026,0.794284
ET,0.501239,0.548473,0.702344,0.312255,0.151398,0.0862026,1.0,0.0313837
logit,0.0804976,0.048788,0.140083,0.080076,0.670097,0.794284,0.0313837,1.0


In [259]:
# Preview the per-model feature scores (one column per model) for the 'snapshots' category.
features['snapshots'].head()


Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
days_absent_excused_gr_7,0.014806,0.023076,0.026768,0.0,-0.713126,-156.283049,0.019405,-0.505254
days_absent_excused_gr_8,0.010918,0.0,0.023392,0.0,0.100783,17.032193,0.022353,0.0
days_absent_excused_gr_9,0.024187,0.0,0.069043,0.032535,0.394523,-0.003329,0.057544,0.351827
days_absent_gr_5,0.02297,0.0,0.020311,0.017877,-0.063759,-159.031385,0.016444,-0.06179
days_absent_gr_6,0.017842,0.014429,0.019935,0.0,-0.036667,-145.958184,0.016859,-0.096762


In [290]:
# Color cross-model Spearman correlations above 0.7 for the 'snapshots' category.
spearman['snapshots'].style.applymap(color_gt, x=.7)

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.44731,0.721762,0.456449,0.0287634,0.19359,0.682544,0.140485
AB,0.44731,1.0,0.417523,0.45455,0.081602,0.234726,0.378938,0.169644
RF,0.721762,0.417523,1.0,0.409048,0.104049,0.253087,0.939921,0.191395
DT,0.456449,0.45455,0.409048,1.0,-0.0163132,0.0872259,0.377824,0.0783942
SVM,0.0287634,0.081602,0.104049,-0.0163132,1.0,0.319294,0.0457418,0.551301
SGD,0.19359,0.234726,0.253087,0.0872259,0.319294,1.0,0.209446,0.569496
ET,0.682544,0.378938,0.939921,0.377824,0.0457418,0.209446,1.0,0.168933
logit,0.140485,0.169644,0.191395,0.0783942,0.551301,0.569496,0.168933,1.0


In [291]:
# Average the per-category Spearman matrices cell by cell (NaNs are skipped).
# pd.Panel has been removed from pandas, so instead stack the matrices under
# their category key and average the rows that share a model label
# (level 1 of the stacked index). sort=False keeps the models in their
# first-seen order rather than sorting them alphabetically.
(pd.concat(spearman)
   .groupby(level=1, sort=False)
   .mean()
   .style.applymap(partial(color_gt,x=.8,c='blue')))

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.532979,0.775167,0.443524,-0.00621298,0.0254781,0.714946,0.00612979
AB,0.532979,1.0,0.514213,0.491743,-0.0394751,-0.0521639,0.496527,0.0228263
RF,0.775167,0.514213,1.0,0.436592,-0.021192,0.0351897,0.870064,-0.00800135
DT,0.443524,0.491743,0.436592,1.0,0.018182,-0.0191072,0.410817,0.0216724
SVM,-0.00621298,-0.0394751,-0.021192,0.018182,1.0,0.471227,-0.027056,0.537002
SGD,0.0254781,-0.0521639,0.0351897,-0.0191072,0.471227,1.0,0.00695617,0.599916
ET,0.714946,0.496527,0.870064,0.410817,-0.027056,0.00695617,1.0,-0.0131029
logit,0.00612979,0.0228263,-0.00800135,0.0216724,0.537002,0.599916,-0.0131029,1.0
