In [19]:
import pandas as pd
import numpy as np
import psycopg2 as pg
import matplotlib.pyplot as plt
import matplotlib
import itertools
matplotlib.style.use('ggplot')
%matplotlib inline

# Global constants
pass_file = "/mnt/data/mvesc/pgpass" # username, db information

# Query that lists every table in the "public" schema
sqlcmd_table_names = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"

# Tables that must not take part in the per-table processing below
EXCLUDED_TABLES = {"DistrictSchoolIDs", "all_lookup"}

# Table names wrapped in literal double quotes, most recent school year first:
# ['"CurrentStudents"', '"AllDistricts1415"', '"AllDistricts1314"',
#  '"AllDistricts1213"', '"AllDistricts1112"']
ALL_DISTRICTS_TABLES = ['"AllDistricts%d%d"' % (num - 1, num) for num in range(15, 11, -1)]
ALL_DISTRICTS_TABLES.insert(0, '"CurrentStudents"')

# Read the colon-separated credentials file.
with open(pass_file, 'r') as f:
    passinfo = f.read()
passinfo = passinfo.strip().split(':')

# Field positions as this notebook reads them: 0 = host, 2 = user,
# 3 = database, 4 = password (field 1 is not used).
# NOTE(review): the standard PostgreSQL .pgpass layout is
# host:port:database:username:password, i.e. database before user - confirm
# that this file really stores the user at index 2 and the database at index 3.
host_address = passinfo[0]
user_name = passinfo[2]
name_of_database = passinfo[3]
user_password = passinfo[4]

# Set up the connection to the postgres database
connection = pg.connect(host=host_address, database=name_of_database, user=user_name, password=user_password)
cursor = connection.cursor()

# Fetch the public table names and drop the excluded ones. Filtering (rather
# than list.remove) keeps the cell working when an excluded table is absent.
cursor.execute(sqlcmd_table_names)
table_names = [row[0] for row in cursor.fetchall() if row[0] not in EXCLUDED_TABLES]
print(table_names)

['CurrentMobility', 'CurrentStudents', 'ASQ_Preschool', 'ActScores', 'AllDistricts1112', 'HSGrades', 'AllDistricts1213', 'AIRScores', 'AllDistricts1314', 'CurrentAbsenceDiscipline', 'DIBELSv2', 'AllDistricts1415', 'AllGradsTotal', 'OAAOGT', 'PARCC', 'StarRead', 'StarEL', 'StarMath', 'TerraNova']


In [20]:
# For each table, report how many rows have no StudentLookup value next to
# the table's total number of rows.
for t in table_names:
    cursor.execute('select count(*) from "{}" where "StudentLookup" is NULL;'.format(t))
    null_count = cursor.fetchone()[0]
    cursor.execute('select count(*) from "{}"'.format(t))
    total_count = cursor.fetchone()[0]
    print("NULL student lookups in", t, ": ",null_count, " out of", total_count)

# Distinct versus total StudentLookup counts in the StarRead table
cursor.execute('select count (distinct "StudentLookup") from "StarRead";')
uniq_num = cursor.fetchone()[0]
cursor.execute('select count (*) from "StarRead";')
all_num = cursor.fetchone()[0]

NULL student lookups in CurrentMobility :  0  out of 850
NULL student lookups in CurrentStudents :  0  out of 34327
NULL student lookups in ASQ_Preschool :  260  out of 1690
NULL student lookups in ActScores :  1302  out of 10775
NULL student lookups in AllDistricts1112 :  5772  out of 33623
NULL student lookups in HSGrades :  0  out of 250506
NULL student lookups in AllDistricts1213 :  4740  out of 34090
NULL student lookups in AIRScores :  1127  out of 13465
NULL student lookups in AllDistricts1314 :  3479  out of 33164
NULL student lookups in CurrentAbsenceDiscipline :  0  out of 34259
NULL student lookups in DIBELSv2 :  1915  out of 10067
NULL student lookups in AllDistricts1415 :  2123  out of 34101
NULL student lookups in AllGradsTotal :  0  out of 9198
NULL student lookups in OAAOGT :  0  out of 24688
NULL student lookups in PARCC :  311  out of 23851
NULL student lookups in StarRead :  28071  out of 162394
NULL student lookups in StarEL :  6510  out of 32075
NULL student lookup

In [21]:
# functions explore a table from XC
def get_column_names(table, connection):
    """
    Return the column names of a database table.

    One row is selected from the table and the column labels of the
    resulting data frame are returned in query order.

    :param string table: table name in the database
    :param pg.extensions.connection object connection: sql connection
    :return: the column names of the table
    :rtype: list
    """
    one_row_query = "select * FROM \"%s\" limit 1" % table
    return pd.read_sql(one_row_query, connection).columns.tolist()

def read_table_to_df(table_name, connection, maxStudentLookup=1e6):
    """ from Jackie
    Load an entire database table into a pandas data frame.

    :param string table_name: Name of table to read in
    :param psycopg2.connection connection: POSTGRESQL connection object for Python
    :param maxStudentLookup: accepted but not used by this function
    :return: a Pandas dataframe object containing the desired table
    :rtype: Pandas.dataframe
    """
    # The table name is wrapped in double quotes so mixed-case names are kept as-is.
    return pd.read_sql('SELECT * FROM "' + table_name + '";', connection)

def different_columns(table1, table2, connection):
    """ report the column names that only one of two tables has

    :param string table1: name of table 1
    :param string table2: name of table 2
    :param pg.connection connection: sql connection
    :return: [columns only in table 1, columns only in table 2]
    :rtype: list[set, set]
    """
    names_first = set(get_column_names(table1, connection))
    names_second = set(get_column_names(table2, connection))
    only_in_first = names_first.difference(names_second)
    only_in_second = names_second.difference(names_first)
    return [only_in_first, only_in_second]

def generate_colnames(schoolyear='1415'):
    """ generate the colnames based on the school year, e.g. 1415

    The last two characters of ``schoolyear`` are read as the ending year.
    The attendance columns appear twice: first suffixed with the school year
    two-to-one years before that ending year, then with the school year
    three-to-two years before it. The discipline columns carry the first of
    those two suffixes. All other columns get no suffix.

    Two-digit years are zero-padded, so '1112' yields '_2009_10' rather than
    '_209_10'.

    :param string schoolyear: school year, e.g. 1415 (a table name ending in
        the school year also works, since only the last 2 characters are used)
    :return: a list of columns names
    :rtype: list[string]
    """
    columns_name_base = [
        'StudentLookup', 'CURRENT_GRADE', 'HANDICAP', 'HANDICAP_DESC',
        'DISADVANTAGEMENT', 'DISADVANTAGEMENT_DESC', 'DATE_OF_BIRTH',
        'STUDENT_STATUS', 'STUDENT_STATUS_DESC', 'GENDER', 'RACIAL_ETHNIC_DESC',
        'LIMITED_ENGLISH', 'GIFTED', 'ADMISSION_DATE',
        'PERCENT_ATTEND', 'DAYS_IN_ATTENDANCE', 'DAYS_ABSENT', 'TARDY',
        'PERCENT_ATTEND', 'DAYS_IN_ATTENDANCE', 'DAYS_ABSENT', 'TARDY',
        'DISC_INCIDENTS', 'IN_SCHOOL_SUSP', 'OUT_OF_SCHOOL_SUSP', 'EXPUL',
        'District', 'School',
    ]
    current_year = int(schoolyear[-2:])  # last 2 characters of a table name

    # %02d keeps the leading zero for years below 10 (e.g. 9 -> "09").
    year_append1 = "_20%02d_%02d" % (current_year - 2, current_year - 1)
    year_append2 = "_20%02d_%02d" % (current_year - 3, current_year - 2)

    # 14 unsuffixed columns, 4 + 4 attendance columns, 4 discipline columns,
    # then District and School unsuffixed.
    col_append = [''] * 14 + [year_append1] * 4 + [year_append2] * 4 + [year_append1] * 4 + [''] * 2
    return [name + suffix for name, suffix in zip(columns_name_base, col_append)]

In [22]:
# Column names of every table, in the same order as table_names
col_names = [get_column_names(t, connection) for t in table_names]
print(table_names)

['CurrentMobility', 'CurrentStudents', 'ASQ_Preschool', 'ActScores', 'AllDistricts1112', 'HSGrades', 'AllDistricts1213', 'AIRScores', 'AllDistricts1314', 'CurrentAbsenceDiscipline', 'DIBELSv2', 'AllDistricts1415', 'AllGradsTotal', 'OAAOGT', 'PARCC', 'StarRead', 'StarEL', 'StarMath', 'TerraNova']


In [23]:
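# WARNING: the code below is left commented out on purpose - it builds a single query that joins every table to all_lookup, and running it will crash!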

# my_query = "select "
# for t in table_names:
#     tb_cols = get_column_names(t, connection)
#     for col in tb_cols:
#         if col != "RecCounter":
#             my_query +=  "\"" + t + "\".\"" + col + "\" as \"" + t + "_" + col + "\","
# my_query = my_query[:-1]
# base = "all_lookup"
# my_query += """ from \"""" + base + "\""
# for t in table_names:
#     if t!=base:
#         my_query +=  """ left outer join \"""" + t + """\" on \"""" + base + """\".\"StudentLookup\"= \"""" + t + "\".\"StudentLookup\""
# my_query += ";"
# print(my_query)


In [27]:
# Gather every distinct StudentLookup value found in any table: one
# "select distinct" per table, combined with UNION.
my_query = " union ".join(
    'select distinct "StudentLookup" from "' + t + '"' for t in table_names
) + " ;"

# (Re)create the temporary table holding the combined lookup values and
# print how many rows it contains.
cursor.execute("drop table if exists my_temp;")
cursor.execute("create temp table my_temp as " + my_query)
cursor.execute("select count(*) from my_temp;")
rows = cursor.fetchmany(10)
print(rows)

[(43527,)]


In [25]:
# Remove any existing clean.master table so that a later cell can create it
# again; "if exists" makes this a no-op when the table is not there.
cursor.execute("""drop table if exists clean.master;""")

In [26]:
# Build clean.master by repeatedly left-outer-joining the temp table my_temp
# (created in an earlier cell) with each source table on "StudentLookup".
# Every iteration writes the join result to my_temp_next, which then replaces
# my_temp, and prints the resulting row count.
#
# NOTE(review): the SELECT list of each iteration contains only
# my_temp."StudentLookup" plus the columns of the current table, so columns
# joined in earlier iterations are not carried into my_temp_next. It looks like
# the final table keeps only the last table's columns - confirm whether
# selecting my_temp.* was intended.
# NOTE(review): the recorded output shows the row count growing from 43957 to
# 10439468 across the joins - presumably because tables hold several rows per
# StudentLookup; confirm this multiplication is wanted.
print(table_names)
for t in table_names:
    # Guard against joining the temp table with itself.
    if t != "my_temp":
        print(t)
        # Start of the statement: recreate my_temp_next from a SELECT whose
        # column list is completed below.
        my_query = """drop table if exists my_temp_next;
        create temp table my_temp_next as select my_temp."StudentLookup", """
        tb_cols = get_column_names(t, connection)
        # Add every column of the current table except "RecCounter", aliased
        # as "<table>_<column>".
        for col in tb_cols:
            if col != "RecCounter":
                my_query +=  "\"" + t + "\".\"" + col + "\" as \"" + t + "_" + col + "\","
        # Drop the trailing comma left by the loop above.
        my_query = my_query[:-1]
        my_query += """ from my_temp """
        my_query +=  """ left outer join \"""" + t + """\""""
        my_query += """ on my_temp.\"StudentLookup\"= \"""" + t + "\".\"StudentLookup\""
        my_query += ";"
        # print(my_query)
        cursor.execute(my_query)
        # Replace the previous my_temp with the freshly joined table.
        cursor.execute("drop table if exists my_temp; alter table my_temp_next rename to my_temp;")
        connection.commit()
        # Show the row count after this join.
        cursor.execute("select count(*) from my_temp;")
        names = [ x[0] for x in cursor.description]
        rows = cursor.fetchmany(10)
        res_table = pd.DataFrame( rows, columns=names)
        print(res_table)
# Persist the final temp table as clean.master (no "if not exists" here, so
# the table must not already exist) and commit.
cursor.execute("create table clean.master as select * from my_temp;")
connection.commit()

['CurrentMobility', 'CurrentStudents', 'ASQ_Preschool', 'ActScores', 'AllDistricts1112', 'HSGrades', 'AllDistricts1213', 'AIRScores', 'AllDistricts1314', 'CurrentAbsenceDiscipline', 'DIBELSv2', 'AllDistricts1415', 'AllGradsTotal', 'OAAOGT', 'PARCC', 'StarRead', 'StarEL', 'StarMath', 'TerraNova']
CurrentMobility
   count
0  43957
CurrentStudents
   count
0  43957
ASQ_Preschool
   count
0  43957
ActScores
   count
0  48093
AllDistricts1112
   count
0  48609
HSGrades
    count
0  369375
AllDistricts1213
    count
0  373080
AIRScores
    count
0  377628
AllDistricts1314
    count
0  381282
CurrentAbsenceDiscipline
    count
0  381282
DIBELSv2
    count
0  388316
AllDistricts1415
    count
0  393474
AllGradsTotal
    count
0  393474
OAAOGT
    count
0  393474
PARCC
    count
0  426325
StarRead
     count
0  1098139
StarEL
     count
0  1428748
StarMath
     count
0  9666072
TerraNova
      count
0  10439468


In [10]:
# Check the persisted master table by counting its rows.
cursor.execute("select count (*) from clean.master;")
names = [column[0] for column in cursor.description]
rows = cursor.fetchmany(10)
res_table = pd.DataFrame(data=rows, columns=names)
print(res_table)

      count
0  10439468


In [12]:
# List the tables that currently exist in the "clean" schema.
# NOTE(review): this rebinds table_names - earlier a list of plain table names
# from the public schema - to the raw fetchall() result (a list of 1-tuples).
sqlcmd_clean_table_names = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'clean'"
cursor.execute(sqlcmd_clean_table_names)
table_names = cursor.fetchall()
print(table_names)

[('act_scores',), ('master',)]


In [1]:
# Release the database resources: close the cursor first, then the connection.
# NOTE(review): the recorded output of this cell is a NameError for `cursor`,
# so it was presumably run on a kernel where the setup cell had not been
# executed - run the setup cell first.
cursor.close()
connection.close()

NameError: name 'cursor' is not defined