In [1]:
import os, sys
from os.path import isfile, join, abspath, basename
from optparse import OptionParser
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import sqlalchemy
import json

# Locate the project's "mvesc" root from the running script's path and put its
# ETL directory at the front of sys.path so modules stored there can be imported.
# NOTE(review): under a Jupyter kernel sys.argv[0] may point at the kernel
# launcher rather than this notebook -- confirm the path resolves as intended.
pathname = os.path.dirname(sys.argv[0])
full_pathname = os.path.abspath(pathname)
# Keep everything before the first "mvesc" in the path; if "mvesc" does not
# occur, split() returns the whole path and "mvesc" is simply appended to it.
split_pathname = full_pathname.split(sep="mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
parentdir = os.path.join(base_pathname, "ETL")
# Index 0 so this directory takes precedence over other sys.path entries.
sys.path.insert(0,parentdir)
from mvesc_utility_functions import *

In [44]:
def _stack_district_tables(table_df, table_order, new_column_names):
    """Rename each table's leading columns by position and stack the tables.

    Parameters
    ----------
    table_df : dict
        Maps table name to the DataFrame read from that table.
    table_order : list of str
        Table names, in the order their rows should appear in the result.
    new_column_names : list of str
        Names given, by position, to the first ``len(new_column_names)``
        columns of every table. Must contain ``'hmrm'``, which is dropped.

    Returns
    -------
    pandas.DataFrame
        All tables concatenated under a fresh 0..n-1 index, without the
        ``'hmrm'`` column.

    Raises
    ------
    IndexError
        If a table has fewer columns than ``new_column_names``.
    """
    n_named = len(new_column_names)
    renamed_frames = []
    for tab in table_order:
        frame = table_df[tab]
        if len(frame.columns) < n_named:
            raise IndexError(
                "table {t} has {c} columns; at least {n} are required".format(
                    t=tab, c=len(frame.columns), n=n_named))
        positional_names = dict(zip(frame.columns[:n_named], new_column_names))
        renamed_frames.append(frame.rename(columns=positional_names))
    # One concat instead of repeated DataFrame.append: linear rather than
    # quadratic copying, and DataFrame.append no longer exists in pandas >= 2.0.
    stacked = pd.concat(renamed_frames, ignore_index=True)
    return stacked.drop('hmrm', axis=1)


def _clean_intervention_columns(df, grade_converter, codes_converter):
    """Remap membership codes and convert grade labels to numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked intervention rows with ``membership_code`` and ``grade`` columns.
    grade_converter : dict
        Maps a grade label to its integer value (or None); labels that are
        not keys become missing.
    codes_converter : dict
        Maps a membership code to its replacement; other codes are unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``membership_code`` remapped and ``grade`` replaced by
        the converted values, moved to the last column.
    """
    # dict.get returns None both for unknown labels and for labels mapped to None.
    converted_grades = [grade_converter.get(label) for label in df['grade']]
    cleaned = df.drop(['grade'], axis=1)
    cleaned['membership_code'] = cleaned['membership_code'].map(
        lambda code: codes_converter.get(code, code))
    cleaned['grade'] = converted_grades
    return cleaned


def main():
    """Combine the per-district intervention tables, clean them and save them.

    Reads every table in ``public_tables`` from the ``public`` schema, stacks
    them under a common set of column names, remaps two membership codes,
    converts grade labels to numbers and passes the resulting frame to
    ``df2postgres`` as ``intervention`` in schema ``clean``.
    """
    schema = 'public'
    public_tables = ['INV_06_16_CO_M', 'INV_06_16_FR_M', 'INV_06_16_MA_M', 'INV_06_16_RV_M', 'INV_06_16_RW_M',
                     'INV_06_16_TV_M', 'INV_06_16_WM_M', 'INV_10_16_CEVSD_M', 'INV_10_16_EM_M']
    new_column_names = ['student_lookup', 'status', 'grade', 'gender', 'hmrm',
                        'membership_code', 'description', 'school_year', 'district']
    # Row cap applied to every SELECT; a table with more rows is truncated.
    nrows = 1000000

    table_df = {}

    print(" - Reading intervention tables of different districts...")
    with postgres_pgconnection_generator() as conn:
        for tab in public_tables:
            sql_read_table = """select * from {s}."{t}" limit {n};""".format(s=schema, t=tab, n=nrows)
            table_df[tab] = pd.read_sql_query(sql_read_table, conn)
        table_df['INV_06_16_CO_M']['District'] = 'Coshocton' # add missed `district` column in table for CO
        # NOTE(review): codes_df and outcome are not used inside main(); they
        # appear to exist for the exploratory cells further down -- confirm
        # whether these two reads can be moved out of this function.
        codes_df = pd.read_sql_query("select * from public.\"INV_MembershipCodes\";", conn)
        outcome = pd.read_sql_query("select * from model.outcome;", conn)

    # Stack in the declared table order so the row order is deterministic.
    df = _stack_district_tables(table_df, public_tables, new_column_names)

    print(" - Integrating and cleaning table...")
    # update and clean columns
    grade_converter = {'06':6, '05':5, '04':4, '03':3, '02':2, '01':1,
                       'KG':0, 'PS':-1, '12':12, '11':11, '09':9,'10':10,
                       '07':7, '08':8, '23':23, 'UG':None, 'GR':None, '13':13}
    codes_converter = {231101:231001, 231105:231005}
    df = _clean_intervention_columns(df, grade_converter, codes_converter)

    print(" - Saving intervention data frame to postgres... ")
    df2postgres(df, 'intervention', nrows=-1, if_exists='replace', schema='clean')

Unnamed: 0,student_lookup,status,gender,membership_code,description,school_year,district,grade
0,9997,A,F,152330,Student received intervention provided during ...,2007,TriValley,3.0
1,9454,A,F,152330,Student received intervention provided during ...,2007,TriValley,3.0
2,13426,A,F,152330,Student received intervention provided during ...,2007,TriValley,3.0
3,10279,A,F,152330,Student received intervention provided during ...,2007,TriValley,3.0
4,13701,A,M,152330,Student received intervention provided during ...,2007,TriValley,3.0


In [46]:
# Hand the kernel-level `df` to df2postgres as 'intervention' in schema 'clean'.
# NOTE(review): this repeats the final call inside main() and depends on a
# global `df` that no visible cell defines -- confirm it is still needed.
df2postgres(df, 'intervention', nrows=-1, if_exists='replace', schema='clean')

'intervention'

AttributeError: 'DataFrame' object has no attribute 'grade'

In [37]:
# Preview the first rows of the intervention frame.
# NOTE(review): relies on a kernel-level `df`; in the visible source `df` is
# only created inside main().
df.head()

Unnamed: 0,student_lookup,status,gender,membership_code,description,school_year,district,grade2
0,9997,A,F,152330,Student received intervention provided during ...,2007,TriValley,3
1,9454,A,F,152330,Student received intervention provided during ...,2007,TriValley,3
2,13426,A,F,152330,Student received intervention provided during ...,2007,TriValley,3
3,10279,A,F,152330,Student received intervention provided during ...,2007,TriValley,3
4,13701,A,M,152330,Student received intervention provided during ...,2007,TriValley,3


In [37]:
# Membership codes that occur in the intervention rows but are missing from
# the codes lookup table.
unique_codes = df.membership_code.unique()
# Build the lookup once; the original rebuilt a list of every known code for
# each candidate, which is needless O(n*m) work.
known_codes = set(codes_df.membership_code)
code_not_in_codes = [code for code in unique_codes if code not in known_codes]

In [49]:
# Rows whose membership code is absent from the codes lookup table.
# Series.isin yields a mask that carries df's own index, so the selection
# stays correct even when df is not indexed 0..n-1, and it avoids a
# Python-level membership test per row.
rows_not_in_codes = df[df.membership_code.isin(code_not_in_codes)]
# Number of such rows per (code, description) pair.
rows_not_in_codes.groupby(by=['membership_code', 'description']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,student_lookup,status,grade,gender,school_year,district
membership_code,description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
231101,Title I Reading\r\n(Summer School),174,174,174,174,174,174
231105,Title I Mathematics\r\n(Summer School),261,261,261,261,261,261
410003,FFA (formerly\r\nFuture Farmers of\r\nAmerica),2754,2754,2754,2754,2754,2754


In [71]:
# Sanity check: is 231105 one of the membership codes that gets remapped?
# NOTE(review): in the visible source codes_converter is local to main(), so
# this cell depends on a kernel-level variable of the same name.
231105 in codes_converter

True

In [66]:
# Outcome records joined to the interventions whose code is missing from the
# lookup table; only students present on both sides are kept.
check_drop = pd.merge(
    outcome,
    rows_not_in_codes,
    how='inner',
    on=['student_lookup'],
)

In [68]:
# Column-wise totals of the joined frame (axis=0 reduces over rows, giving one
# value per column).
check_drop.sum(axis=0)

student_lookup                                           6.31788e+07
not_on_time                                                      235
is_dropout                                                        56
definite                                                          56
cohort_10th                                              3.50441e+06
cohort_9th                                                   3554921
cohort_8th                                                2.8829e+06
cohort_7th                                               2.19931e+06
cohort_6th                                               1.63633e+06
status             AAAAAAOOQAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAACAA...
grade              1010111210101011101010101011121011101110111210...
gender             MMMMMMFFMMMMMMMMMMMMMMMMMMMMMMMFFMMMMMMMMFFMMM...
membership_code                                            705974135
description        FFA (formerly\r\nFuture Farmers of\r\nAmerica)...
school_year        11/19/200820071