In [17]:
# loading the database credentials from the .gitignore protected password file
import pandas as pd
import psycopg2 as pg
pass_file = "/mnt/data/mvesc/pgpass"
# Open the password file exactly once, inside a context manager, so the
# handle is closed as soon as its contents have been read.
with open(pass_file, 'r') as f:
    passinfo = f.read()
# The file holds a single colon-separated record.
passinfo = passinfo.strip().split(':')
if len(passinfo) < 5:
    # Fail with a clear message instead of a bare IndexError further down.
    # Only the field count is reported so no credential ends up in the output.
    raise ValueError("expected at least 5 colon-separated fields in %s, found %d"
                     % (pass_file, len(passinfo)))
# NOTE(review): the standard PostgreSQL pgpass order is
# host:port:database:username:password, but this code reads the user from
# field 2 and the database from field 3 -- confirm the layout of this file.
host_address = passinfo[0]
user_name = passinfo[2]
name_of_database = passinfo[3]
user_password = passinfo[4]
# Connection object shared by the cells that read tables from the database.
conn = pg.connect(host=host_address, database=name_of_database, user=user_name, password=user_password)

# notebook options
# With ast_node_interactivity set to "all", a cell displays the value of every
# expression statement rather than only the last one. The cells that echo a
# table name and then show several results in a single cell depend on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1]:
# Tables Zhe is in charge of; later cells pick a table by its position here.
list_of_table_names = [
    'AIRScores',
    'ASQ_Preschool',
    'ActScores',
    'DIBELSv2',
    'HSGrades',
    'OAAOGT',
    'PARCC',
    'StarEL',
    'StarMath',
    'StarRead',
    'TerraNova',
]

In [6]:
# functions explore a table
def get_column_names(table, connection):
    """
    Get column names of a table

    Runs a one-row ``select *`` against the table and returns the column
    labels of the resulting DataFrame. The table name is sent as a
    double-quoted identifier, so its exact case is preserved; any double
    quote inside the name is doubled so the identifier stays well formed.

    :param string table: table name in the database
    :param pg.extensions.connection object connection: sql connection
    :return: column names, in the order the query returns them
    :rtype: list
    """
    # Escape embedded double quotes before wrapping the name in quotes;
    # otherwise a name containing '"' would terminate the identifier early.
    quoted_table = '"%s"' % table.replace('"', '""')
    temp_table = pd.read_sql("select * FROM %s limit 1" % quoted_table, connection)
    return list(temp_table.columns)

def read_table_to_df(table_name, connection, maxStudentLookup=1e6):
    """ from Jackie
    Takes a table name as input and returns postgres table as pandas data frame.

    The whole table is read with ``SELECT *``. The table name is sent as a
    double-quoted identifier, so its exact case is preserved; any double
    quote inside the name is doubled so the identifier stays well formed.

    :param string table_name: Name of table to read in
    :param psycopg2.connection connection: POSTGRESQL connection object for Python
    :param float maxStudentLookup: currently unused by this function; kept so
        existing calls that pass it keep working
    :return: a Pandas dataframe object containing the desired table
    :rtype: Pandas.dataframe
    """
    # Escape embedded double quotes before wrapping the name in quotes;
    # otherwise a name containing '"' would terminate the identifier early.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    sql_query = "SELECT * FROM " + quoted_table + ";"
    data = pd.read_sql(sql_query, connection)
    return data

def different_columns(table1, table2, connection):
    """ Report which column names are not shared by two tables

    :param string table1: name of table 1
    :param string table2: name of table 2
    :param pg.connection connection: sql connection
    :return: a two-element list: the column names found only in table 1,
             then the column names found only in table 2
    :rtype: list[set, set]
    """
    names_first = set(get_column_names(table1, connection))
    names_second = set(get_column_names(table2, connection))
    only_in_first = names_first.difference(names_second)
    only_in_second = names_second.difference(names_first)
    return [only_in_first, only_in_second]

def generate_colnames(schoolyear='1415'):
    """ Generate the column names based on the school year, e.g. 1415

    Most names are returned unchanged. The four attendance columns
    (PERCENT_ATTEND, DAYS_IN_ATTENDANCE, DAYS_ABSENT, TARDY) appear twice,
    each copy carrying a different year suffix, and the four discipline
    columns (DISC_INCIDENTS, IN_SCHOOL_SUSP, OUT_OF_SCHOOL_SUSP, EXPUL) carry
    the first of those suffixes. For '1415' the suffixes are '_2013_14' and
    '_2012_13'.

    Suffixes are always written as ``_YYYY_yy`` with a zero-padded two-digit
    end year, so early school years such as '1112' produce '_2009_10'.
    The last two digits are assumed to refer to a year in the 2000s.

    :param string schoolyear: school year, e.g. 1415
    :return: a list of columns names
    :rtype: list[string]
    """
    columns_name_base = ['StudentLookup', 'CURRENT_GRADE', 'HANDICAP', 'HANDICAP_DESC',
                         'DISADVANTAGEMENT', 'DISADVANTAGEMENT_DESC', 'DATE_OF_BIRTH',
                         'STUDENT_STATUS', 'STUDENT_STATUS_DESC', 'GENDER',
                         'RACIAL_ETHNIC_DESC', 'LIMITED_ENGLISH', 'GIFTED', 'ADMISSION_DATE',
                         'PERCENT_ATTEND', 'DAYS_IN_ATTENDANCE', 'DAYS_ABSENT', 'TARDY',
                         'PERCENT_ATTEND', 'DAYS_IN_ATTENDANCE', 'DAYS_ABSENT', 'TARDY',
                         'DISC_INCIDENTS', 'IN_SCHOOL_SUSP', 'OUT_OF_SCHOOL_SUSP', 'EXPUL',
                         'District', 'School']
    # last 2 characters of the school year string; str() also accepts an int
    current_year = int(str(schoolyear)[-2:])

    def _year_suffix(end_year):
        # Build "_YYYY_yy" for the span ending in 2000 + end_year. Going
        # through the full start year keeps single-digit years padded and
        # lets the span cross the century boundary (e.g. "_1999_00").
        return "_%d_%02d" % (2000 + end_year - 1, end_year % 100)

    year_append1 = _year_suffix(current_year - 1)
    year_append2 = _year_suffix(current_year - 2)
    # 4 attendance columns per year, then the 4 discipline columns
    year_append = [year_append1]*4 + [year_append2]*4 + [year_append1]*4
    # 14 unsuffixed leading columns, 12 suffixed ones, 2 unsuffixed trailing ones
    col_append = ['']*14 + year_append + ['']*2
    columns = [base + suffix for base, suffix in zip(columns_name_base, col_append)]
    return columns

### Overall Test Scores Exploration
We're interested in seeing what the different test scores look like. So far, we've only taken a brief look at each test; the findings are discussed below.

#### Leftover Tasks
- We need to merge on the student lookup numbers to enable district-level (or school-level) analysis
- Figure out what the 'Ctr' column means
- Get test-specific means and averages for the MVESC area -- to compare students with their peers.

#### AIRScores
In our data, this test appears only for the "Fall2015" and "Spring2015" administrations. We believe Ohio started giving it in the 2014-15 school year. AIR Scores is an end-of-course exam given to students in multiple subjects.

#### ASQ_Preschool
It is unclear exactly what this table represents. It contains a score for a student lookup number and whether that student passed. There is also a column named 'RecCounter', whose meaning is unclear.

In [18]:
# Echo the name of the first table, then load that whole table into a DataFrame.
# The bare expression on the first line is displayed even though it is not the
# last statement, because ast_node_interactivity was set to "all" above.
list_of_table_names[0]
air_scores = read_table_to_df(list_of_table_names[0], conn)

'AIRScores'

In [26]:
# Preview the first rows of the AIRScores table, then list each column's dtype.
air_scores.head()
air_scores.dtypes

Unnamed: 0,StudentLookup,Ctr,EnrolledGrade,AdministrationDate,GiftedandTalented,504Plan,IEP,TestName,Overallscaledscore,Overallperformancelevel,GraduationPoints,PBA tested,EOY tested,Subscore1,Subscore2,Subscore3,Subscore4
0,3445.0,5256,10,Spring2015,,N,N,American History,702,Proficient,3.0,Y,Y,At Std,At Std,At Std,
1,11069.0,5257,9,Spring2015,,N,N,Physical Science,680,Limited,1.0,Y,Y,At Std,Below Std,Below Std,At Std
2,11068.0,5258,9,Spring2015,,N,N,Physical Science,672,Limited,1.0,Y,Y,Below Std,Below Std,At Std,Below Std
3,3524.0,5259,8,Spring2015,,N,N,Grade 8 Science,726,Accelerated,,Y,Y,Above Std,At Std,At Std,
4,3725.0,5260,10,Spring2015,,N,N,American History,738,Advanced,5.0,Y,Y,Above Std,Above Std,Above Std,


StudentLookup              float64
Ctr                          int64
EnrolledGrade               object
AdministrationDate          object
GiftedandTalented           object
504Plan                     object
IEP                         object
TestName                    object
Overallscaledscore          object
Overallperformancelevel     object
GraduationPoints           float64
PBA tested                  object
EOY tested                  object
Subscore1                   object
Subscore2                   object
Subscore3                   object
Subscore4                   object
dtype: object

In [25]:
# Show, for every column except the identifier-like and raw-score ones, how
# many rows fall under each distinct value of that column.
# NOTE(review): the saved output still includes an Overallscaledscore table,
# so it appears to predate that exclusion -- re-run the cell to refresh it.
for col_name in air_scores.columns:
    if col_name in ('StudentLookup', 'Ctr', 'Overallscaledscore'):
        continue
    air_scores.groupby(col_name).agg({col_name: 'count'})

Unnamed: 0_level_0,EnrolledGrade
EnrolledGrade,Unnamed: 1_level_1
,1144
3.0,382
4.0,1124
5.0,1039
6.0,1037
7.0,40
8.0,1120
9.0,1411
10.0,2176
11.0,554


Unnamed: 0_level_0,AdministrationDate
AdministrationDate,Unnamed: 1_level_1
Fall2015,1525
Spring2015,11940


Unnamed: 0_level_0,GiftedandTalented
GiftedandTalented,Unnamed: 1_level_1
,11162
N,2195
Y,108


Unnamed: 0_level_0,504Plan
504Plan,Unnamed: 1_level_1
,496
N,12687
Y,282


Unnamed: 0_level_0,IEP
IEP,Unnamed: 1_level_1
,2345
N,9536
Y,1584


Unnamed: 0_level_0,TestName
TestName,Unnamed: 1_level_1
Algebra 1,187
Algebra I,88
American Government,1430
American History,2101
Biology,140
ELA I,79
ELA II,2
English Language Arts 1,169
English Language Arts 2,124
Geometry,72


Unnamed: 0_level_0,Overallscaledscore
Overallscaledscore,Unnamed: 1_level_1
541,7
554,1
557,1
559,4
564,4
567,1
570,1
575,16
578,2
579,3


Unnamed: 0_level_0,Overallperformancelevel
Overallperformancelevel,Unnamed: 1_level_1
,8
Accelerated,2531
Advanced,1139
Basic,3002
Limited,2385
Proficient,4400


Unnamed: 0_level_0,GraduationPoints
GraduationPoints,Unnamed: 1_level_1
1.0,960
2.0,1392
3.0,2389
4.0,878
5.0,543


Unnamed: 0_level_0,PBA tested
PBA tested,Unnamed: 1_level_1
,278
N,229
Y,12958


Unnamed: 0_level_0,EOY tested
EOY tested,Unnamed: 1_level_1
,278
N,452
Y,12735


Unnamed: 0_level_0,Subscore1
Subscore1,Unnamed: 1_level_1
,10
Above Std,5005
At Std,5051
Below Std,3399


Unnamed: 0_level_0,Subscore2
Subscore2,Unnamed: 1_level_1
,8
Above Std,5469
At Std,4476
Below Std,3512


Unnamed: 0_level_0,Subscore3
Subscore3,Unnamed: 1_level_1
,8
Above Std,4835
At Std,5228
Below Std,3394


Unnamed: 0_level_0,Subscore4
Subscore4,Unnamed: 1_level_1
,11201
Above Std,797
At Std,958
Below Std,509


In [21]:
# Summary statistics for AIRScores; the saved output covers only the numeric
# columns (StudentLookup, Ctr, GraduationPoints).
air_scores.describe()



Unnamed: 0,StudentLookup,Ctr,GraduationPoints
count,12338.0,13465.0,6162.0
mean,16932.026585,8563.98381,2.78124
std,9483.079019,5031.821931,1.138595
min,20.0,1.0,1.0
25%,,3367.0,
50%,,9545.0,
75%,,12911.0,
max,34235.0,16277.0,5.0


In [27]:
# Echo the name of the second table, load that whole table into a DataFrame,
# then preview its first rows and list each column's dtype.
list_of_table_names[1]
asq = read_table_to_df(list_of_table_names[1], conn)
asq.head()
asq.dtypes

'ASQ_Preschool'

Unnamed: 0,StudentLookup,RecCounter,Score,Plevel
0,33022.0,1,30,Pass
1,,2,5,Pass
2,33032.0,3,25,Pass
3,33034.0,4,50,Pass
4,33039.0,5,0,Pass


StudentLookup    float64
RecCounter         int64
Score             object
Plevel            object
dtype: object

In [29]:
# Show, for every column except the two identifier-like ones, how many rows
# fall under each distinct value of that column.
for col_name in asq.columns:
    if col_name in ('StudentLookup', 'RecCounter'):
        continue
    asq.groupby(col_name).agg({col_name: 'count'})

Unnamed: 0_level_0,Score
Score,Unnamed: 1_level_1
***,14
0,154
000,4
005,16
010,12
015,13
020,11
025,13
030,10
035,6


Unnamed: 0_level_0,Plevel
Plevel,Unnamed: 1_level_1
,14
Fail,266
Pass,1410
