In [2]:
import sys, os
sys.path.insert(0, '/home/jgutman/mvesc/ETL')
from mvesc_utility_functions import *
import pandas as pd

In [71]:
import re
import numpy as np

In [47]:
# For the top-ranked model of every (model_name, label) pair in model.reports,
# load its stored predictions joined with the feature tables it was built on.
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        # Every criterion is selected into the temporary table; only the
        # first one is used to rank the models.
        optimization_criteria = ['val_precision_5', 'val_recall_5']
        predictions = None
        top_models_query = """
    create temporary table top_models as
    select distinct on (model_name, label)
    * from
        (select model_name, filename, label, feature_categories,
        feature_grades, {criteria},
        rank() over (partition by (model_name, label)
            order by {ranker} desc) as val_rank
        from model.reports
        where debug=false
        order by model_name, label, val_rank) vr
    order by model_name, label, val_rank;
    """.format(criteria = ", ".join(optimization_criteria),
        ranker = optimization_criteria[0])
        cursor.execute(top_models_query)

        cursor.execute("""select filename, feature_categories,
            feature_grades from top_models;""")
        models_and_features = cursor.fetchall()

        for (table_name, feature_tables, feature_grade_range) \
                in models_and_features:
            # Both columns are read as comma-plus-space separated strings.
            feature_table_list = feature_tables.split(", ")
            feature_grades = [int(i) for i in
                    feature_grade_range.split(", ")]
            feature_grade_regex = '({})'.format('|'.join(
                str(i) for i in feature_grades))
            # Keep a column when its name ends in `_gr_<grade>` for one of the
            # model's grades, or when its name does not end in a digit.
            # Raw string: `\Z` and `\D` are regex escapes, not Python string
            # escapes, and a non-raw literal triggers an invalid-escape warning.
            pattern = re.compile(r'(_gr_{rx}\Z)|(\D\Z)'.format(
                rx=feature_grade_regex))

            # The join clause depends only on the model's feature tables, so
            # it is built once here and not once per test set.
            feature_joins = "".join(""" left join
                (select * from model.{features}) {features}
                using(student_lookup)
                """.format(features = features)
                for features in feature_table_list)

            for test_set in ['val', 'test']:
                get_model_predictions = """select * from
                (select student_lookup, true_label, predicted_label,
                predicted_label = true_label as correct
                from predictions."{table}" where split = '{test_set}') preds
                """.format(table = table_name, test_set = test_set)
                get_model_predictions += feature_joins

                cursor.execute(get_model_predictions)
                predictions_and_features = cursor.fetchall()
                colnames = [i[0] for i in cursor.description]
                # NOTE(review): `predictions` is rebound on every iteration, so
                # after the loops only the frame for the last model and the
                # 'test' split remains -- confirm that this is intended.
                predictions = pd.DataFrame.from_records(
                    predictions_and_features, index = 'student_lookup',
                    columns = colnames)
                predictions = predictions.filter(regex=pattern)

In [48]:
# Snapshot the column labels of the filtered frame as a plain Python list.
colnames = predictions.columns.tolist()

In [66]:
# Cast both label columns to boolean in one assignment.
label_columns = ['true_label', 'predicted_label']
predictions[label_columns] = predictions[label_columns].astype(bool)

In [68]:
# Display the dtype of every column currently in `predictions`.
predictions.dtypes

true_label                           bool
predicted_label                      bool
correct                              bool
days_absent_gr_9                  float64
days_absent_excused_gr_9          float64
days_absent_unexcused_gr_9        float64
days_present_gr_9                 float64
disability_gr_9                    object
disadvantagement_gr_9              object
discipline_incidents_gr_9         float64
district_gr_9                      object
gifted_gr_9                        object
iss_gr_9                          float64
limited_english_gr_9               object
oss_gr_9                          float64
section_504_plan_gr_9              object
special_ed_gr_9                    object
status_gr_9                        object
absence_gr_9                      float64
absence_unexcused_gr_9            float64
tardy_gr_9                          int64
tardy_unexcused_gr_9                int64
medical_gr_9                        int64
absence_consec_gr_9               

In [80]:
# Bin grade-9 unexcused absences into 5 equal-width intervals.
# The numeric columns are selected inline so this cell depends only on
# `predictions`; `numeric_features` is assigned by a cell further down the
# notebook, so referencing it here fails when the notebook is run top to bottom.
pd.cut(
    predictions.select_dtypes(include=[np.number])['days_absent_unexcused_gr_9'],
    bins = 5, precision = 1)

student_lookup
13440.0     (-0.02, 4.4]
6015.0      (-0.02, 4.4]
5781.0      (-0.02, 4.4]
27716.0     (-0.02, 4.4]
874.0       (-0.02, 4.4]
10661.0     (-0.02, 4.4]
1694.0      (-0.02, 4.4]
14514.0     (-0.02, 4.4]
1718.0      (-0.02, 4.4]
6236.0      (-0.02, 4.4]
13421.0     (-0.02, 4.4]
6154.0      (-0.02, 4.4]
1691.0      (-0.02, 4.4]
4712.0      (-0.02, 4.4]
13372.0       (4.4, 8.8]
14080.0     (-0.02, 4.4]
20775.0     (-0.02, 4.4]
33420.0     (-0.02, 4.4]
14048.0     (-0.02, 4.4]
9042.0      (-0.02, 4.4]
20860.0     (-0.02, 4.4]
14070.0     (-0.02, 4.4]
1368.0      (-0.02, 4.4]
6659.0      (-0.02, 4.4]
1768.0      (-0.02, 4.4]
1752.0      (-0.02, 4.4]
16553.0     (-0.02, 4.4]
6571.0      (-0.02, 4.4]
19154.0     (-0.02, 4.4]
27396.0     (-0.02, 4.4]
                ...     
14347.0     (-0.02, 4.4]
14462.0     (-0.02, 4.4]
14468.0     (-0.02, 4.4]
14513.0     (-0.02, 4.4]
14519.0     (-0.02, 4.4]
14528.0     (-0.02, 4.4]
14565.0     (-0.02, 4.4]
14615.0     (-0.02, 4.4]
14978.0   

In [74]:
# Subset of `predictions` holding only the columns selected by np.number.
numeric_features = predictions.select_dtypes(include=[np.number])

In [77]:
# Display the raw grade-9 unexcused-absence values, indexed by student_lookup.
numeric_features['days_absent_unexcused_gr_9']

student_lookup
13440.0     0.0
6015.0      0.0
5781.0      1.0
27716.0     0.0
874.0       0.0
10661.0     0.0
1694.0      0.0
14514.0     0.0
1718.0      0.0
6236.0      0.0
13421.0     0.5
6154.0      0.0
1691.0      0.0
4712.0      0.0
13372.0     4.5
14080.0     1.0
20775.0     0.0
33420.0     0.0
14048.0     4.0
9042.0      0.0
20860.0     0.0
14070.0     0.0
1368.0      0.0
6659.0      0.0
1768.0      0.0
1752.0      0.0
16553.0     4.0
6571.0      4.0
19154.0     0.0
27396.0     0.0
           ... 
14347.0     0.0
14462.0     0.0
14468.0     0.0
14513.0     4.0
14519.0     3.0
14528.0     0.0
14565.0     0.0
14615.0     0.0
14978.0     1.0
16608.0     0.0
16609.0     0.0
16799.0     0.0
20867.0     5.0
28024.0     1.0
33383.0     0.0
50810.0     0.0
54771.0     2.0
55118.0     1.0
63051.0     0.0
69522.0     0.0
70160.0     0.0
70537.0     0.0
70542.0     0.0
70869.0     0.0
699763.0    0.0
699791.0    0.0
699814.0    0.0
700017.0    0.0
700122.0    0.0
700208.0    0.0
Name: day

In [87]:
# Display the object- and bool-typed columns of `predictions`.
predictions.select_dtypes(include=['object', 'bool'])

Unnamed: 0_level_0,true_label,predicted_label,correct,disability_gr_9,disadvantagement_gr_9,district_gr_9,gifted_gr_9,limited_english_gr_9,section_504_plan_gr_9,special_ed_gr_9,...,eighth_read_pl,eighth_math_pl,eighth_science_pl,ethnicity,gender,street_transition_in_gr_9,district_transition_in_gr_9,city_transition_in_gr_9,mid_year_withdraw_gr_9,interventions_gpa_gr_9
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13440.0,False,False,True,none,none,Ridgewood,N,N,N,0,...,Accelerated,Accelerated,Accelerated,W,M,False,False,False,,
6015.0,False,False,True,none,none,Ridgewood,N,N,N,0,...,Proficient,Proficient,Proficient,W,F,False,False,False,,
5781.0,False,False,True,none,none,Maysville,Y,N,N,0,...,Accelerated,Proficient,Accelerated,W,F,False,False,False,,
27716.0,False,False,True,none,economic,Coshocton,N,N,N,0,...,Accelerated,Accelerated,Accelerated,W,F,False,False,False,,
874.0,False,False,True,none,economic,Coshocton,N,N,N,0,...,Accelerated,Accelerated,Accelerated,W,F,False,False,False,,
10661.0,False,False,True,none,economic,Ridgewood,N,N,N,0,...,Advanced,Accelerated,Accelerated,W,F,False,False,False,,
1694.0,False,False,True,none,economic,Logan_Hocking,N,N,,0,...,Basic,Proficient,Basic,W,F,False,False,False,,
14514.0,True,False,False,none,economic,Logan_Hocking,N,N,,0,...,Basic,Proficient,Basic,W,F,False,False,False,,
1718.0,False,False,True,none,none,Logan_Hocking,N,N,,0,...,Proficient,Accelerated,Accelerated,W,F,False,False,False,,
6236.0,False,False,True,none,none,East Muskingum,N,N,N,0,...,Accelerated,Proficient,Basic,W,F,False,False,False,,


In [86]:
# List the distinct raw values stored in the section-504 flag column.
predictions['section_504_plan_gr_9'].unique()

array(['N', None, '1', 'Y', '0'], dtype=object)