In [114]:
import os, sys
from os.path import isfile, join, abspath, basename
from optparse import OptionParser
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import sqlalchemy
import json

import os, sys
# Make the project's ETL directory importable: take the directory of the
# running script (sys.argv[0]), keep everything before the first "mvesc" in
# its absolute path, and prepend <that prefix>/mvesc/ETL to sys.path.
# NOTE(review): inside a Jupyter kernel sys.argv[0] is normally the kernel
# launcher rather than this notebook, so this presumably only resolves
# correctly when that launcher path itself contains "mvesc" — confirm.
pathname = os.path.dirname(sys.argv[0])
full_pathname = os.path.abspath(pathname)
# If "mvesc" does not occur in the path, split() returns the whole path as
# its only element and "mvesc" is simply appended to it below.
split_pathname = full_pathname.split(sep="mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
parentdir = os.path.join(base_pathname, "ETL")
# Position 0 so these modules take precedence over same-named modules elsewhere.
sys.path.insert(0,parentdir)
from mvesc_utility_functions import *
import numpy as np
import pandas as pd
from feature_utilities import *


from mvesc_utility_functions import *


In [123]:
def main():
    """Build ``clean.intervention`` from the per-district intervention tables.

    Steps:
      1. Read every table in ``public_tables`` and the ``INV_MembershipCodes``
         lookup table from the ``public`` schema.
      2. Rename the first nine columns of each table *by position* to the
         shared names in ``new_column_names``, stack the tables and drop
         ``hmrm``.
      3. Normalise two membership codes, convert the textual grade to a
         number (unknown grades become missing) and attach ``inv_group``,
         the abbreviation of the membership group of each code ('' when the
         code is not in the lookup table).
      4. Write the result with ``df2postgres`` to ``clean.intervention``,
         replacing an existing table.

    :raises ValueError: if a district table has fewer columns than
        ``new_column_names`` (the positional rename would be wrong).
    :raises KeyError: if a membership group from the lookup table has no
        entry in ``dict_group2abbrev``.
    """
    schema = 'public'
    nrows = -1  # forwarded to read_table_to_df; presumably "all rows" — TODO confirm
    public_tables = ['INV_06_16_CO_M', 'INV_06_16_FR_M', 'INV_06_16_MA_M', 'INV_06_16_RV_M', 'INV_06_16_RW_M',
                     'INV_06_16_TV_M', 'INV_06_16_WM_M', 'INV_10_16_CEVSD_M', 'INV_10_16_EM_M']
    new_column_names = ['student_lookup', 'status', 'grade', 'gender', 'hmrm',
                        'membership_code', 'description', 'school_year', 'district']

    print(" - Reading intervention tables of different districts...")
    table_df = {}
    with postgres_pgconnection_generator() as conn:
        conn.autocommit = True
        for tab in public_tables:
            table_df[tab] = read_table_to_df(conn, tab, schema=schema, nrows=nrows)
        codes_df = read_table_to_df(conn, 'INV_MembershipCodes', schema=schema, nrows=nrows)
    # add missed `district` column in table for CO; it must be appended before
    # the positional rename below so that it lines up with 'district'.
    table_df['INV_06_16_CO_M']['District'] = 'Coshocton'

    dict_code2group = dict(zip(codes_df['membership_code'], codes_df['membership_group']))
    dict_group2abbrev = {'Post-secondary Enrollment Options Program':'post_secondary',
               'Academic Intervention':'academic_inv',
               'Specialized Instructions':'spec_instruc',
               'Placement Options':'placement',
               'Disadvantaged Pupil Programs (DPPF)':'DPPF',
               'Title I':'titleI', 'Vocational Programs':'vocational',
               'Extracurricular/Intracurricular Programs and Services':'extracurr_program',
               'Academic Intracurricular Descriptions (Vocational)':'academic_intracurr',
               'School Related Service Program':'school_program',
               'Interscholastic Athletics':'atheletics',
               'Other':'other',
               'Dropout':'dropout'}

    # Harmonise column names positionally, then stack all districts at once
    # (DataFrame.append was removed from pandas; a single concat also avoids
    # re-copying the accumulated frame for every table).
    n_named = len(new_column_names)
    frames = []
    for tab in public_tables:
        frame = table_df[tab]
        if len(frame.columns) < n_named:
            raise ValueError(
                "table {t} has {n} columns, expected at least {m}".format(
                    t=tab, n=len(frame.columns), m=n_named))
        new_col_dict = dict(zip(frame.columns[:n_named], new_column_names))
        frames.append(frame.rename(columns=new_col_dict))
    df = pd.concat(frames, ignore_index=True).drop('hmrm', axis=1)

    print(" - Integrating and cleaning table...")
    # update and clean columns
    grade_converter = {'06':6, '05':5, '04':4, '03':3, '02':2, '01':1,
                       'KG':0, 'PS':-1, '12':12, '11':11, '09':9,'10':10,
                       '07':7, '08':8, '23':23, 'UG':None, 'GR':None, '13':13}
    codes_converter = {231101:231001, 231105:231005}

    # Fold the two alternate codes into their canonical code first, because
    # the group lookup below must see the converted value.
    df['membership_code'] = df['membership_code'].replace(codes_converter)

    # Grades absent from the converter (and 'UG'/'GR') become None. Every
    # mapped value is already an int, so no further int conversion is needed.
    new_grades = [grade_converter.get(grade) for grade in df['grade']]

    # Codes missing from the lookup table get an empty group; a group without
    # an abbreviation is a data/config mismatch and raises KeyError.
    inv_groups = [
        dict_group2abbrev[dict_code2group[code]] if code in dict_code2group else ''
        for code in df['membership_code']
    ]

    # Drop and re-add so the numeric `grade` ends up after the other columns,
    # followed by `inv_group`.
    df = df.drop('grade', axis=1)
    df['grade'] = new_grades
    df['inv_group'] = inv_groups

    print(" - Saving intervention data frame to postgres... ")
    df2postgres(df, 'intervention', nrows=-1, if_exists='replace', schema='clean')
    print(" - clean.intervention generated!")
main()

 - Reading intervention tables of different districts...
 - Integrating and cleaning table...
 - Saving intervention data frame to postgres... 


OperationalError: (psycopg2.OperationalError) terminating connection due to administrator command
SSL connection has been closed unexpectedly
 [SQL: '\nDROP TABLE clean.intervention']

In [83]:
# Scratch copy of the two lookup dictionaries used by the cleaning step:
#   dict_code2group   : membership_code -> membership_group (from codes_df)
#   dict_group2abbrev : membership_group -> short name stored in `inv_group`
# NOTE(review): in the visible notebook `codes_df` is only assigned inside the
# cleaning `main()`, so on a fresh kernel this cell presumably raises
# NameError — confirm where `codes_df` is meant to come from.
dict_code2group = dict(zip(codes_df.membership_code, codes_df.membership_group))
# 'spec_instruc' uses an underscore so that it agrees with the abbreviation
# written by the cleaning step and with the feature names built from it.
dict_group2abbrev = {'Post-secondary Enrollment Options Program':'post_secondary',
               'Academic Intervention':'academic_inv',
               'Specialized Instructions':'spec_instruc',
               'Placement Options':'placement',
               'Disadvantaged Pupil Programs (DPPF)':'DPPF',
               'Title I':'titleI', 'Vocational Programs':'vocational',
               'Extracurricular/Intracurricular Programs and Services':'extracurr_program',
               'Academic Intracurricular Descriptions (Vocational)':'academic_intracurr',
               'School Related Service Program':'school_program',
               'Interscholastic Athletics':'atheletics',
               'Other':'other',
               'Dropout':'dropout'}

In [149]:

def set_null_as_0(cursor, columns, schema='model', table='intervention'):
    """ Replace NULL with 0 in the given columns of `schema.table`.

    One UPDATE statement is executed per column. Only use this where a
    missing value really means "zero".

    :param pg.connection.cursor cursor: postgres cursor used to run the updates
    :param list columns: names of the columns to fill
    :param str schema: schema name
    :param str table: table name
    :return: None
    """
    target = "{schema}.{table}".format(schema=schema, table=table)
    for col_name in columns:
        statement = (
            "\n        update {target}"
            "\n        set {col}=0"
            "\n        where {col} is null;"
        ).format(target=target, col=col_name)
        cursor.execute(statement)

def create_temp_intervention(conn, cursor, grade_range, table = 'intervention_1type_temp_table',
    source_schema = 'clean', type_str = 'academic_inv', source_table = 'intervention'):
    """ Create a temporary table of per-grade indicators for one intervention type.

    The temporary table has one row per distinct `student_lookup` found in
    `source_schema.source_table` and, for every grade in `grade_range`, a
    column named `{type_str}_gr_{grade}` that is 1 when the student has at
    least one source row with that grade whose `inv_group` matches
    `like '%{type_str}%'`, and NULL otherwise.

    :param conn: not used by this function; kept so existing callers keep working
    :param pg.connection.cursor cursor: cursor used to run the statements
    :param grade_range: iterable of grades, one indicator column per grade
    :param str table: name of the temporary table (dropped first if it exists)
    :param str source_schema: schema of the source table
    :param str type_str: intervention abbreviation to match in `inv_group`
    :param str source_table: name of the source table
    :return: column names of the temporary table without the first column
        (`student_lookup`)
    :rtype: list[str]
    """
    # base: one row per student that appears in the source table at all
    base_query = """
    drop table if exists {t};
    create temporary table {t} as
    select * from
        (
            select distinct(student_lookup)
            from {source_schema}.{source_table}
        ) student_inv_list
    """.format(t=table, source_schema=source_schema, source_table=source_table)

    # one left join per grade; `distinct` keeps a student with several
    # matching rows in the same grade from duplicating rows in the result
    # (the duplication would otherwise multiply across the grade joins)
    grade_joins = [
        """
        left join
        (
            select distinct student_lookup, 1 as {type_str}_gr_{gr}
            from {source_schema}.{source_table}
            where grade={gr} and inv_group like '%{type_str}%'
        ) inv_{type_str}_{gr}
        using(student_lookup)
        """.format(gr=gr, type_str=type_str,
            source_schema=source_schema, source_table=source_table)
        for gr in grade_range
    ]
    cursor.execute(base_query + "".join(grade_joins))

    # only the column metadata is needed, so do not fetch any rows
    cursor.execute("select * from {t} limit 0".format(t=table))
    col_names = [col[0] for col in cursor.description]
    # first column is student_lookup, which is the join key, not a feature
    return col_names[1:]
def main():
    """ Generate per-grade intervention indicator features.

    For each intervention type in `features2run`, builds the temporary
    indicator table for grades `min_grd`..`max_grd` (inclusive), copies its
    columns into `model.intervention_test` through `update_column_with_join`
    and then replaces the remaining NULLs in those columns with 0.
    """
    source_schema = 'clean'
    source_table = 'intervention'
    schema = 'model'
    table = 'intervention_test'
    temp_table = 'intervention_1type_temp_table'
    min_grd = 7
    max_grd = 9
    # Every available intervention type; only the short list below is run.
    # NOTE(review): 'titlei' is lower-case here while the cleaning step's
    # abbreviation table spells it 'titleI'; the `like` match used downstream
    # would presumably miss it unless inv_group is lower-cased elsewhere — confirm.
    all_features_list = ['extracurr_program','post_secondary', 'academic_inv',  'atheletics',
                    'placement',  'spec_instruc','vocational', 'academic_intracurr',
                    'school_program', 'titlei']
    top_feature_list = ['academic_inv',  'atheletics', 'placement']
    features2run = top_feature_list
    grades = list(range(min_grd, max_grd+1))

    with postgres_pgconnection_generator() as conn:
        conn.autocommit = True
        with conn.cursor() as cursor:
            # start from a fresh feature table
            create_feature_table(cursor, table, schema=schema, replace=True)
            for inv_type in features2run:
                # the same temporary table name is reused for every type
                columns = create_temp_intervention(
                    conn, cursor, grades,
                    table = temp_table,
                    source_schema = source_schema,
                    type_str = inv_type,
                    source_table = source_table)
                update_column_with_join(cursor, table, columns, source_table=temp_table)
                # students without this intervention are NULL after the join
                set_null_as_0(cursor, columns, schema=schema, table=table)
            conn.commit()
            print(" - Intervention features generated")
main()

 - Table model.intervention_test created!
 - updated academic_inv_gr_7, academic_inv_gr_8, academic_inv_gr_9 in model.intervention_test
    from intervention_1type_temp_table; 
 - updated atheletics_gr_7, atheletics_gr_8, atheletics_gr_9 in model.intervention_test
    from intervention_1type_temp_table; 
 - updated placement_gr_7, placement_gr_8, placement_gr_9 in model.intervention_test
    from intervention_1type_temp_table; 
 - Intervention features generated


In [156]:
# Read the generated feature table back into pandas for inspection.
# read_table_to_df is given the connection itself, so no cursor is opened here.
with postgres_pgconnection_generator() as conn:
    conn.autocommit = True
    df = read_table_to_df(conn, 'intervention_test', schema='model', nrows=-1)
df.head()

Unnamed: 0,student_lookup,spec_instruc_gr_3,spec_instruc_gr_4,spec_instruc_gr_5,spec_instruc_gr_6,spec_instruc_gr_7,spec_instruc_gr_8,spec_instruc_gr_9,spec_instruc_gr_10,spec_instruc_gr_11,...,vocational_gr_3,vocational_gr_4,vocational_gr_5,vocational_gr_6,vocational_gr_7,vocational_gr_8,vocational_gr_9,vocational_gr_10,vocational_gr_11,vocational_gr_12
0,35763.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
1,35772.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
2,36277.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
3,36755.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
4,36779.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1


In [157]:
# Column totals, showing only the columns whose total is positive.
column_totals = df.sum(axis=0)
column_totals[column_totals > 0]

student_lookup             583796087.0
spec_instruc_gr_3                 69.0
spec_instruc_gr_4                290.0
spec_instruc_gr_5                287.0
spec_instruc_gr_6                348.0
spec_instruc_gr_7                364.0
spec_instruc_gr_8                504.0
spec_instruc_gr_9                166.0
spec_instruc_gr_10               171.0
spec_instruc_gr_11               381.0
spec_instruc_gr_12               612.0
titlei_gr_3                       53.0
titlei_gr_4                       88.0
titlei_gr_5                       96.0
titlei_gr_6                      266.0
titlei_gr_7                       60.0
titlei_gr_8                       59.0
titlei_gr_9                       66.0
titlei_gr_10                      12.0
titlei_gr_11                       4.0
titlei_gr_12                       1.0
post_secondary_gr_9                3.0
post_secondary_gr_10              37.0
post_secondary_gr_11             197.0
post_secondary_gr_12             294.0
academic_inv_gr_3        

In [154]:
# Display the student_lookup column of the frame loaded above.
df["student_lookup"]

0     35763.0
1     35772.0
2     36277.0
3     36755.0
4     36779.0
5     57243.0
6     57301.0
7     35803.0
8     36265.0
9     36708.0
10    36756.0
11    36758.0
12    57117.0
13    57198.0
14    35837.0
15    36227.0
16    57290.0
17    57921.0
18    34345.0
19    35709.0
Name: student_lookup, dtype: float64

In [68]:
# Intervention-type abbreviations, spelled as they appear as prefixes of the
# feature columns (e.g. `spec_instruc_gr_3`, `titlei_gr_3`).
['spec_instruc', 
 'titlei',
'post_secondary',
'academic_inv',
'academic_intracurr',
'atheletics',
'extracurr_program',
'school_program',
'placement',
'vocational']
[extracurr_program*, post_secondary*, academic_inv*,  
atheletics*,  placement*,  spec_instruc*, vocational*, 
academic_intracurr*,  school_program*, titlei*]

student_lookup                                           6.31788e+07
not_on_time                                                      235
is_dropout                                                        56
definite                                                          56
cohort_10th                                              3.50441e+06
cohort_9th                                                   3554921
cohort_8th                                                2.8829e+06
cohort_7th                                               2.19931e+06
cohort_6th                                               1.63633e+06
status             AAAAAAOOQAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAACAA...
grade              1010111210101011101010101011121011101110111210...
gender             MMMMMMFFMMMMMMMMMMMMMMMMMMMMMMMFFMMMMMMMMFFMMM...
membership_code                                            705974135
description        FFA (formerly\r\nFuture Farmers of\r\nAmerica)...
school_year        11/19/200820071

In [133]:
# Scratch check: str.lower() lower-cases the letters and leaves digits and
# underscores unchanged.
'aB1_2c'.lower()

'ab1_2c'