In [2]:
import pandas
pandas.__version__

'0.18.1'

In [3]:
import pandas as pd
import psycopg2 as pg
# Location of the profile file that holds the database connection settings.
pass_file = "/mnt/data/mvesc/PartnerData/default_profile"
# Open the file exactly once; the context manager closes the handle on exit,
# so no separate open()/close() pair is needed (the extra open() leaked a handle).
with open(pass_file, 'r') as f:
    passinfo = f.readlines()

In [4]:
# Keep only the text after the "=" on each profile line.
# Split on the first "=" only, so a value that itself contains "=" is kept whole,
# and skip lines without "=" (e.g. blank lines), which would otherwise raise IndexError.
database_credentials = [line.strip().split("=", 1)[1] for line in passinfo if "=" in line]
#database_credentials # don't print this out in Jupyter notebook for world to see!

In [5]:
# The credentials list is consumed positionally: host, user, database name, password.
host_address = database_credentials[0]
user_name = database_credentials[1]
name_of_database = database_credentials[2]
user_password = database_credentials[3]

# Open the PostgreSQL connection used by the rest of the notebook.
conn = pg.connect(
    host=host_address,
    database=name_of_database,
    user=user_name,
    password=user_password,
)

In [6]:
def read_table_to_df(table_name, connection):
    """ Takes a table name as input and returns the whole table as a pandas data frame.

    The table name is embedded in the query as a double-quoted identifier, which
    keeps mixed-case names such as "AllGradsTotal" intact. Any double quote inside
    ``table_name`` is doubled so that it cannot end the identifier early.

    :param string table_name: Name of table to read in
    :param psycopg2.connection connection: POSTGRESQL connection object for Python
    :return: a Pandas dataframe object containing the desired table
    :rtype: pandas.DataFrame
    """
    # An identifier cannot be passed as a bound query parameter, so it is escaped
    # by hand: a literal double quote inside a quoted identifier is written as "".
    quoted_table_name = '"' + table_name.replace('"', '""') + '"'
    sql_query = "SELECT * FROM " + quoted_table_name + ";"
    data = pd.read_sql(sql_query, connection)
    return data

In [7]:
# One table per school year, named by the two-digit start and end years
# (range(11, 15) yields "AllDistricts1112" through "AllDistricts1415").
ALL_DISTRICTS_TABLES = [
    "AllDistricts%d%d" % (start_year, start_year + 1)
    for start_year in range(11, 15)
]
ALL_DISTRICTS_TABLES

['AllDistricts1112',
 'AllDistricts1213',
 'AllDistricts1314',
 'AllDistricts1415']

In [8]:
# Guarded so that re-running this cell does not add a duplicate entry
# (a bare append grows the list by one on every execution).
if "CurrentStudents" not in ALL_DISTRICTS_TABLES:
    ALL_DISTRICTS_TABLES.append("CurrentStudents")

In [9]:
# Load every table listed in ALL_DISTRICTS_TABLES, keeping the same order as the names.
all_districts_data = list(
    map(lambda district_table: read_table_to_df(district_table, conn), ALL_DISTRICTS_TABLES)
)

In [11]:
# Pair each table name with its data frame. The pairs are materialised as a list
# because on Python 3 a bare zip() is a one-shot iterator: the comprehension below
# would consume it and leave `zipped_all_districts` empty for any later use.
zipped_all_districts = list(zip(ALL_DISTRICTS_TABLES, all_districts_data))
# Show "<table name>: <(rows, columns)>" for each loaded table.
["%s: %s" % (k, v.shape) for (k, v) in zipped_all_districts]

['AllDistricts1112: (33623, 29)',
 'AllDistricts1213: (34090, 29)',
 'AllDistricts1314: (33164, 29)',
 'AllDistricts1415: (34101, 29)',
 'CurrentStudents: (34327, 28)']

In [59]:
# Load the three remaining tables used below, each read in full.
all_grads = read_table_to_df(table_name="AllGradsTotal", connection=conn)
current_mobility = read_table_to_df(table_name="CurrentMobility", connection=conn)
current_absence_discipline = read_table_to_df(
    table_name="CurrentAbsenceDiscipline", connection=conn
)

In [15]:
# Dimensions on the first line, column index after it.
print("%s\n%s" % (all_grads.shape, all_grads.columns))

(9198, 85)
Index(['StudentLookup', 'RECORD_FOUND_Y_N', 'HIGH_SCHOOL_GRAD_DATE', 'College',
       'SWD', 'Disadvantaged', 'Att12th', 'Att11th', 'Att10th', 'Att9th',
       'IncPrev', 'OSSPrev', 'ISSPrev', 'IncPrev2', 'OSSPrev2', 'ISSPrev2',
       'Third_Read_PL', 'Third_Read_SS', 'Third_Math_PL', 'Third_Math_SS',
       'Fourth_Read_PL', 'Fourth_Read_SS', 'Fourth_Math_PL', 'Fourth_Math_SS',
       'Fourth_Write_PL', 'Fourth_Write_SS', 'Fourth_Ctz_PL', 'Fourth_Ctz_SS',
       'Fourth_Science_PL', 'Fourth_Science_SS', 'Fifth_Read_PL',
       'Fifth_Read_SS', 'Fifth_Math_PL', 'Fifth_Math_SS',
       'Fifth_SocStudies_PL', 'Fifth_SocStudies_SS', 'Fifth_Science_PL',
       'Fifth_Science_SS', 'Sixth_Read_PL', 'Sixth_Read_SS', 'Sixth_Math_PL',
       'Sixth_Math_SS', 'Sixth_Write_PL', 'Sixth_Write_SS', 'Sixth_Ctz_PL',
       'Sixth_Ctz_SS', 'Sixth_Science_PL', 'Sixth_Science_SS',
       'Seventh_Read_PL', 'Seventh_Read_SS', 'Seventh_Math_PL',
       'Seventh_Math_SS', 'Seventh_Write_PL', 'S

In [60]:
# Dimensions on the first line, column index after it.
print("%s\n%s" % (current_mobility.shape, current_mobility.columns))

(850, 14)
Index(['StudentLookup', 'CURRENT_GRADE', 'HANDICAP', 'HANDICAP_DESC',
       'DISADVANTAGEMENT', 'DISADVANTAGEMENT_DESC', 'DATE_OF_BIRTH',
       'STUDENT_STATUS', 'STUDENT_STATUS_DESC', 'GENDER', 'RACIAL_ETHNIC_DESC',
       'ADMISSION_DATE', 'District', 'School'],
      dtype='object')


In [61]:
# Dimensions on the first line, column index after it.
print("%s\n%s" % (current_absence_discipline.shape, current_absence_discipline.columns))

(34259, 8)
Index(['StudentLookup', 'Absent', 'Lates', 'AttPCt', 'Incidents', 'OSS', 'ISS',
       'Expul'],
      dtype='object')


In [52]:
# There are two very similar looking codes for economic/academic disadvantage: 
# "1 - Economic disadvantagement" and "Economic Disadvantagement"
# Let's just check if the coding scheme has changed over time or varies district to district

def get_unique_stripped_sorted_values(sort_column, coding, 
                                      code_column = "Disadvantaged", data = None):
    """Subsets a dataframe to the rows whose code column equals a particular value, then
       strips surrounding whitespace from the sort-column values of those rows and returns
       the distinct results as a sorted list.

       Whitespace is stripped before duplicates are removed, so values that differ only in
       padding (e.g. "D1" and "D1 ") appear once. Missing values are ignored.
       
    :param string sort_column: Name of the column to find unique values of (for matching rows)
    :param string coding: Value of the code to subset the data by
    :param string code_column: Column to find the code to subset the data by
    :param Pandas.dataframe data: input dataframe; when omitted, the notebook-level 
                                  ``all_grads`` frame is used
    :return: a sorted list of the distinct stripped values in the specified column for 
             the matching rows (empty if no row matches)
    :rtype: list[dtype of sort_column] 
    """
    if data is None:
        # Resolve the default when the function is called, not when it is defined,
        # so a re-loaded or re-assigned all_grads is picked up.
        data = all_grads
    # Missing values have no .strip(), so drop them before cleaning.
    matching_values = data.loc[data[code_column] == coding, sort_column].dropna()
    # The set removes duplicates *after* stripping; sorted() returns a new list.
    stripped_values = {value.strip() for value in matching_values.unique()}
    return sorted(stripped_values)

# Which districts use each of the two spellings of the economic-disadvantage code?
districts_codetype1 = get_unique_stripped_sorted_values(
    "DistrictCode", "1 - Economic disadvantagement")
districts_codetype2 = get_unique_stripped_sorted_values(
    "DistrictCode", "Economic Disadvantagement")
print(districts_codetype1)
print(districts_codetype2)

# Which graduation dates does each spelling appear with?
grad_date_codetype1 = get_unique_stripped_sorted_values(
    "HIGH_SCHOOL_GRAD_DATE", "1 - Economic disadvantagement")
grad_date_codetype2 = get_unique_stripped_sorted_values(
    "HIGH_SCHOOL_GRAD_DATE", "Economic Disadvantagement")

# The lists are sorted, so the first and last entries bound the range for each spelling.
first_date_type1, last_date_type1 = grad_date_codetype1[0], grad_date_codetype1[-1]
print("%s to %s" % (first_date_type1, last_date_type1))
first_date_type2, last_date_type2 = grad_date_codetype2[0], grad_date_codetype2[-1]
print("%s to %s" % (first_date_type2, last_date_type2))

['D13', 'D15', 'D5', 'D7', 'D8']
['D1', 'D10', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8']
20110114 to 20150630
20110516 to 20150701


In [None]:
# all_grads.Disadvantaged.replace(to_replace="1 - Economic disadvantagement", 
#                                value="Economic Disadvantagement", inplace=True)
# all_grads.Disadvantaged.replace(to_replace="2 - Academic disadvantagement", 
#                                value="Academic Disadvantagement", inplace=True)

In [29]:
# Frequency table for every column of all_grads, keyed by column name;
# dropna=False keeps missing values as their own category.
all_grads_valuecounts = {
    column_name: all_grads[column_name].value_counts(dropna=False)
    for column_name in all_grads.columns
}

How many students are not disadvantaged? How many are economically disadvantaged, and how many academically? It isn't clear why there are two coding types for each category, or why there are so few academically disadvantaged students in the data -- but this table contains graduates only.

In [32]:
# Display the counts per Disadvantaged code directly. The previous trailing .groupby()
# had no `by`/`level` argument, which pandas rejects with a TypeError.
all_grads_valuecounts["Disadvantaged"]

                                          5488
Economic Disadvantagement                 3208
1 - Economic disadvantagement              475
Academic Disadvantagement                   19
Economic and Academic Disadvantagement       7
2 - Academic disadvantagement                1
Name: Disadvantaged, dtype: int64

In [58]:
# Count each Disadvantaged code within every district, keeping missing values as a category.
district_by_district = all_grads.groupby("DistrictCode")
district_disadvantaged_counts = (
    district_by_district["Disadvantaged"].value_counts(dropna=False)
)
# Render the counts as a one-column frame.
district_disadvantaged_counts.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Disadvantaged
DistrictCode,Disadvantaged,Unnamed: 2_level_1
D1,Economic Disadvantagement,138
D1,,127
D1,Academic Disadvantagement,5
D10,Economic Disadvantagement,189
D10,,132
D12,Economic Disadvantagement,265
D12,,251
D13,,390
D13,Economic Disadvantagement,354
D13,1 - Economic disadvantagement,1
