In [90]:
import os, sys
sys.path.insert(0, '/home/jtorrenc/mvesc/ETL')
from mvesc_utility_functions import *
import numpy as np
import pandas as pd
import random
from functools import partial

In [116]:
def color_gt(val,x=.5,c='blue'):
    """
    Colors values greater than x blue (or other provided color)
    
    :param float x: lower bound on colored values
    :param str c: color
    :returns: css string to color values in dataframe
    :rtype: str
    """
    color = c if val > x else 'black'
    return 'color: %s' % color

def table_exists(cursor, table, schema='clean'):
    """
    Checks to see if a table exists in the database
    
    :param pg.cursor cursor: 
    :param str table:
    :param str schema:
    :rtype: bool
    """
    cursor.execute("""                                                   
            select count(*) from information_schema.tables                       
            where table_schema = %s and table_name = %s
            """, [schema, table])
    return cursor.fetchall()[0][0]

In [129]:
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute('select distinct model_name from model.reports')
        model_list = [m[0] for m in cursor.fetchall()]
        best = dict()
        for m in model_list:
            cursor.execute("""select filename from model.reports where model_name = %s
                            order by val_precision_10 desc limit 1; """, [m])
            best[m] = cursor.fetchall()[0][0]
    predictions = pd.DataFrame()
    for m, f in best.items():
        df = read_table_to_df(connection, f,columns = ['student_lookup','predicted_score'],
                                          schema='predictions')
        df.set_index('student_lookup',inplace=True)
        df.columns = [m]
        predictions = pd.concat((predictions,df), axis=1, join='inner')


In [120]:
kendall = predictions.corr('kendall')
spearman = predictions.corr('spearman')

# Kendall's Tau Prediction Correlation

In [121]:
kendall.style.applymap(partial(color_gt,x=.65,c='blue'))

Unnamed: 0,GB,AB,KNN,RF,DT,SVM,SGD,NB,ET,logit
GB,1.0,0.648979,0.386624,0.543964,0.563825,0.505984,0.31899,0.700666,0.331506,0.681978
AB,0.648979,1.0,0.279236,0.591049,0.698979,0.557895,0.421053,0.443405,0.577832,0.663158
KNN,0.386624,0.279236,1.0,0.568184,0.243975,0.265274,0.293198,0.537914,0.0300557,0.446778
RF,0.543964,0.591049,0.568184,1.0,0.619064,0.471754,0.504289,0.551534,0.326843,0.677808
DT,0.563825,0.698979,0.243975,0.619064,1.0,0.515037,0.404672,0.472456,0.514765,0.576351
SVM,0.505984,0.557895,0.265274,0.471754,0.515037,1.0,0.442105,0.508294,0.657142,0.684211
SGD,0.31899,0.421053,0.293198,0.504289,0.404672,0.442105,1.0,0.270369,0.577832,0.505263
NB,0.700666,0.443405,0.537914,0.551534,0.472456,0.508294,0.270369,1.0,0.273552,0.638071
ET,0.331506,0.577832,0.0300557,0.326843,0.514765,0.657142,0.577832,0.273552,1.0,0.543842
logit,0.681978,0.663158,0.446778,0.677808,0.576351,0.684211,0.505263,0.638071,0.543842,1.0


# Spearman's Prediction Correlation

In [122]:
spearman.style.applymap(partial(color_gt,x=.75,c='blue'))

Unnamed: 0,GB,AB,KNN,RF,DT,SVM,SGD,NB,ET,logit
GB,1.0,0.77675,0.464516,0.694979,0.669951,0.653263,0.501572,0.847174,0.442681,0.791995
AB,0.77675,1.0,0.363542,0.751824,0.819069,0.714286,0.56391,0.631836,0.701552,0.807519
KNN,0.464516,0.363542,1.0,0.669589,0.278314,0.337006,0.368849,0.664877,0.0419427,0.555485
RF,0.694979,0.751824,0.669589,1.0,0.74225,0.606309,0.682856,0.712486,0.435887,0.835949
DT,0.669951,0.819069,0.278314,0.74225,1.0,0.625037,0.487465,0.59774,0.592371,0.71092
SVM,0.653263,0.714286,0.337006,0.606309,0.625037,1.0,0.654135,0.66214,0.808127,0.839098
SGD,0.501572,0.56391,0.368849,0.682856,0.487465,0.654135,1.0,0.436376,0.732988,0.717293
NB,0.847174,0.631836,0.664877,0.712486,0.59774,0.66214,0.436376,1.0,0.360783,0.81669
ET,0.442681,0.701552,0.0419427,0.435887,0.592371,0.808127,0.732988,0.360783,1.0,0.693885
logit,0.791995,0.807519,0.555485,0.835949,0.71092,0.839098,0.717293,0.81669,0.693885,1.0


In [174]:
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute('select distinct model_name from model.reports')
        model_list = [m[0] for m in cursor.fetchall()]
        cursor.execute('select distinct feature_categories from model.reports')
        feature_list = [f[0] for f in cursor.fetchall()]
        feature_list = [f if ',' not in f  for f in feature_list]
        best = dict()
        features = dict()
        for f in feature_list:
            best[f] = dict()
            for m in model_list:
                cursor.execute("""select filename from model.reports 
                                where model_name = %s
                                    and feature_categories = %s
                                    and feature_grades like '5, 6, 7, 8, 9'
                                order by val_precision_10 desc limit 1; """, [m,f])
#                best[f][m] = 
                print(m, f)
                print(cursor.fetchall()[0][0])
            temp = pd.DataFrame()
            for m, file in best[f].items():
                if table_exists(cursor, file, 'feature_scores'):
                    df = read_table_to_df(connection, file ,columns=['feature','importance'],schema='feature_scores')
                    df.set_index('feature',inplace=True)
                    df.columns = [m]
                    temp = pd.concat((temp,df), axis=1, join='inner')
                    features[f] = temp

SyntaxError: invalid syntax (<ipython-input-174-6ce9a8c6aa7d>, line 7)

In [154]:
kendall_f = features['absence'].corr('kendall')
spearman_f = features['absence'].corr('spearman')   

# Kendall's Feature Correlation


In [155]:
kendall_f.style.applymap(partial(color_gt,x=.65,c='blue'))

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.4,0.8,0.6,0.2,0.8,1.0,0.4
AB,0.4,1.0,0.2,0.0,0.8,0.6,0.4,0.6
RF,0.8,0.2,1.0,0.4,0.4,0.6,0.8,0.2
DT,0.6,0.0,0.4,1.0,-0.2,0.4,0.6,0.4
SVM,0.2,0.8,0.4,-0.2,1.0,0.4,0.2,0.4
SGD,0.8,0.6,0.6,0.4,0.4,1.0,0.8,0.6
ET,1.0,0.4,0.8,0.6,0.2,0.8,1.0,0.4
logit,0.4,0.6,0.2,0.4,0.4,0.6,0.4,1.0


# Spearman's Feature Correlation


In [157]:
spearman_f.style.applymap(partial(color_gt,x=.8,c='blue'))

Unnamed: 0,GB,AB,RF,DT,SVM,SGD,ET,logit
GB,1.0,0.4,0.9,0.7,0.3,0.9,1.0,0.6
AB,0.4,1.0,0.3,0.1,0.9,0.7,0.4,0.8
RF,0.9,0.3,1.0,0.6,0.4,0.8,0.9,0.3
DT,0.7,0.1,0.6,1.0,0.0,0.4,0.7,0.5
SVM,0.3,0.9,0.4,0.0,1.0,0.6,0.3,0.5
SGD,0.9,0.7,0.8,0.4,0.6,1.0,0.9,0.7
ET,1.0,0.4,0.9,0.7,0.3,0.9,1.0,0.6
logit,0.6,0.8,0.3,0.5,0.5,0.7,0.6,1.0


In [158]:
features['absence']

Unnamed: 0_level_0,GB,AB,RF,DT,SVM,SGD,ET,logit
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
absence_gr_9,0.318075,0.594929,0.575692,0.577405,0.000303,1.624373,0.356194,0.710943
tardy_gr_9,0.062354,0.100142,0.04316,0.023294,-1.8e-05,-2.188515,0.024161,0.108634
tardy_unexcused_gr_9,0.027932,0.035173,0.045942,0.01965,1.6e-05,-2.854402,0.023865,-0.052338
absence_consec_gr_9,0.066715,0.152964,0.097122,0.008563,5e-05,1.588377,0.047326,0.04669
absence_unexcused_gr_9,0.082752,0.024632,0.18505,0.053947,-3.5e-05,0.952575,0.115825,0.01785
