In [137]:
import pandas as pd
import numpy as np
import psycopg2 as pg
import matplotlib.pyplot as plt
import itertools
%matplotlib inline
# Gloabal Constants
pass_file = "/mnt/data/mvesc/pgpass" # username, db information

# Table names
sqlcmd_table_names = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"

# The table names in a string, quotes added with chr(34) = "
ALL_DISTRICTS_TABLES = [chr(34) + """AllDistricts""" + str(num-1) + str(num) + chr(34) for num in range(15,11, -1)]
ALL_DISTRICTS_TABLES.insert(0, chr(34)+"CurrentStudents"+chr(34))
# ['CurrentStudents', 'AllDistricts1112', 'AllDistricts1213', 'AllDistricts1314', 'AllDistricts1415']
# with double quotes!

# Set up connection to postgres database
with open(pass_file, 'r') as f:
    passinfo = f.read()
passinfo = passinfo.strip().split(':')

host_address = passinfo[0]
user_name = passinfo[2]
name_of_database = passinfo[3]
user_password = passinfo[4]
connection = pg.connect(host=host_address, database=name_of_database, user=user_name, password=user_password)
cursor = connection.cursor()
#table_names = pd.read_sql(sqlcmd_table_names, connection)
cursor.execute(sqlcmd_table_names)
table_names = cursor.fetchall()

for i in range(len(table_names)):
    print(table_names[i][0])

DistrictSchoolIDs
all_lookup
CurrentMobility
CurrentStudents
ASQ_Preschool
ActScores
AllDistricts1112
HSGrades
AllDistricts1213
AIRScores
AllDistricts1314
CurrentAbsenceDiscipline
DIBELSv2
AllDistricts1415
AllGradsTotal
OAAOGT
PARCC
StarRead
StarEL
StarMath
TerraNova


In [138]:
for i in range(len(ALL_DISTRICTS_TABLES)):
    print ("""select count(*) from """ + ALL_DISTRICTS_TABLES[i] + """ where "StudentLookup" = NULL;""")
    cursor.execute("""select count(*) from """ + ALL_DISTRICTS_TABLES[i] + """ where "StudentLookup"= NULL;""")
    print(cursor.fetchall())

select count(*) from "CurrentStudents" where "StudentLookup" = NULL;
[(0,)]
select count(*) from "AllDistricts1415" where "StudentLookup" = NULL;
[(0,)]
select count(*) from "AllDistricts1314" where "StudentLookup" = NULL;
[(0,)]
select count(*) from "AllDistricts1213" where "StudentLookup" = NULL;
[(0,)]
select count(*) from "AllDistricts1112" where "StudentLookup" = NULL;
[(0,)]


In [139]:
# functions explore a table from XC
def get_column_names(table, connection):
    """
    Get column names of a table 
    
    :param pg.extensions.connection object connection: sql connection
    :param string table: table name in the database
    :rtype: list 
    """
    temp_table = pd.read_sql("select * FROM \"%s\" limit 1" % table, connection)
    return list(temp_table.columns)

def read_table_to_df(table_name, connection, maxStudentLookup=1e6):
    """ from Jackie
    Takes a table name as input and returns postgres table as pandas data frame.
    
    :param string table_name: Name of table to read in
    :param psycopg2.connection connection: POSTGRESQL connection object for Python
    :return: a Pandas dataframe object containing the desired table
    :rtype: Pandas.dataframe
    """
    sql_query = "SELECT * FROM \"" + table_name + "\";"
    data = pd.read_sql(sql_query, connection)
    return data

def different_columns(table1, table2, connection):
    """ find different columns names of two tables
    
    :param string table1: name of table 1
    :param string table2: name of table 2
    :param pg.connection connection: sql connection
    :return: a list with 2 sets of different columns names in 2 tables
    :rtype: list[set, set]
    """
    columns1 = get_column_names(table1, connection)
    columns2 = get_column_names(table2, connection)
    diff_col1 = set(columns1) - set(columns2)
    diff_col2 = set(columns2) - set(columns1)
    return([diff_col1, diff_col2])

def generate_colnames(schoolyear='1415'):
    """ generate the colnames based on the shcool year, e.g. 1415
    
    :para string schoolyear: school year, e.g. 1415
    :return: a list of columns names
    :rtype: list[string]
    """
    columns_name_base = ['StudentLookup', 'CURRENT_GRADE', 'HANDICAP', 'HANDICAP_DESC', 'DISADVANTAGEMENT', 'DISADVANTAGEMENT_DESC', 'DATE_OF_BIRTH', 'STUDENT_STATUS', 'STUDENT_STATUS_DESC', 'GENDER', 'RACIAL_ETHNIC_DESC', 'LIMITED_ENGLISH', 'GIFTED', 'ADMISSION_DATE', 'PERCENT_ATTEND', 'DAYS_IN_ATTENDANCE', 'DAYS_ABSENT', 'TARDY', 'PERCENT_ATTEND', 'DAYS_IN_ATTENDANCE', 'DAYS_ABSENT', 'TARDY', 'DISC_INCIDENTS', 'IN_SCHOOL_SUSP', 'OUT_OF_SCHOOL_SUSP', 'EXPUL', 'District', 'School']
    current_year=int(schoolyear[-2:]) # last 2 characters of a table name
    year_append1 = "_20"+str(current_year-2)+"_"+str(current_year-1)
    year_append2 = "_20"+str(current_year-3)+"_"+str(current_year-2)
    year_append = [year_append1]*4 + [year_append2]*4 + [year_append1]*4
    col_append = ['']*14 + year_append + ['']*2
    columns = [columns_name_base[i]+col_append[i] for i in range(len(columns_name_base))]
    return(columns)

In [143]:
c1 = get_column_names("AllDistricts1112", connection)
c2 = get_column_names("AllDistricts1213", connection)

my_query = """
select "AllDistricts1112"."StudentLookup" from "AllDistricts1112" 
left outer join "AllDistricts1213" 
on "AllDistricts1112"."StudentLookup" = "AllDistricts1213"."StudentLookup"
union all
select "AllDistricts1213"."StudentLookup" from "AllDistricts1112" 
left outer join "AllDistricts1213" 
on "AllDistricts1112"."StudentLookup" = "AllDistricts1213"."StudentLookup";
"""
#print(my_query)
cursor.execute(my_query)
names = [ x[0] for x in cursor.description]
rows = cursor.fetchall()
res_table = pd.DataFrame( rows, columns=names)
print(len(names))
print(names)
print(res_table)

1
['StudentLookup']
       StudentLookup
0            38730.0
1            34940.0
2            34943.0
3            22403.0
4              287.0
5            39576.0
6            37263.0
7             3505.0
8            37919.0
9            39314.0
10            3219.0
11            3264.0
12           38128.0
13           38211.0
14            3439.0
15           34903.0
16            3726.0
17            3797.0
18            6995.0
19           34886.0
20            2928.0
21            6090.0
22            1807.0
23           31505.0
24           18823.0
25           15703.0
26           38491.0
27           38531.0
28           43424.0
29           38145.0
...              ...
68498            NaN
68499            NaN
68500            NaN
68501            NaN
68502            NaN
68503            NaN
68504            NaN
68505            NaN
68506            NaN
68507            NaN
68508            NaN
68509            NaN
68510            NaN
68511            NaN
68512         