In [1]:
import pandas as pd
import pickle

# Notebook-wide pandas display settings: show every column, cap displayed
# rows at 25, and render floats with 3 decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 25),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

In [2]:
# List the pickle files available under data/pickles.
!ls data/pickles

hs_data_16-18.pickle


In [3]:
# Load the pickled dataset into `df` (used as a DataFrame in the cells below).
# NOTE: unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
df = pd.read_pickle('data/pickles/hs_data_16-18.pickle')

## Drop Unnecessary Data

In [4]:
# Columns removed before the analysis. The original list named 'Short_Name'
# twice; the duplicate entry has been removed.
unused_columns = [
    'Address', 'Administrator_Title', 'CPS_School_Profile',
    'City', 'Classification_Description', 'Closed_For_Enrollment_Date',
    'College_Enrollment_Rate_Mean', 'College_Enrollment_Rate_School',
    'Demographic_Description', 'Earliest_Drop_Off_Time', 'Facebook',
    'Fax', 'Fifth_Contact_Name', 'Fifth_Contact_Title', 'Finance_ID',
    'Fourth_Contact_Name', 'Fourth_Contact_Title', 'Freshman_Start_End_Time',
    'Grades_Offered', 'Graduation_Rate_Mean', 'Is_Elementary_School',
    'Is_GoCPS_Elementary', 'Is_GoCPS_High_School', 'Is_GoCPS_PreK',
    'Is_High_School', 'Is_Middle_School', 'Is_Pre_School', 'Kindergarten_School_Day',
    'Legacy_Unit_ID', 'Long_Name', 'Mean_ACT', 'Open_For_Enrollment_Date',
    'Overall_Rating', 'Phone', 'Pinterest', 'PreK_School_Day', 'PreSchool_Inclusive',
    'Preschool_Instructional', 'Primary_Category', 'Rating_Statement', 'Rating_Status',
    'Refugee_Services', 'School_Type', 'Secondary_Contact', 'Secondary_Contact_Title',
    'Seventh_Contact_Name', 'Seventh_Contact_Title', 'Short_Name',
    'Significantly_Modified', 'Sixth_Contact_Name', 'Sixth_Contact_Title', 'State',
    'Statistics_Description', 'Summary', 'Third_Contact_Name', 'Third_Contact_Title',
    'Title_1_Eligible', 'Twitter', 'Visual_Impairments', 'Website',
    'Youtube', 'Zip_x',
]
# Reassign rather than mutate in place; a missing column still raises KeyError.
df = df.drop(columns=unused_columns)

## Merge Income Data Per Zip for 16/17 and 17/18

In [5]:
# Attach the 2016 mean household income (per zip code) to the 2016-2017 rows.
is_2016 = df['School_Year'] == 'School Year 2016-2017'
# The income CSV has no header row, so column names are assigned here.
df_2016_income = pd.read_csv('data/meanHHincome_2016.csv', header=None)
df_2016_income.columns = ['Zip_y', 'Zip_Mean_Income']
# NOTE(review): default inner join -- rows whose Zip_y is absent from the
# income file are dropped; confirm this is intended.
df_2016 = df.loc[is_2016].merge(df_2016_income, on='Zip_y')

In [6]:
# Attach the 2017 mean household income (per zip code) to the 2017-2018 rows.
is_2017 = df['School_Year'] == 'School Year 2017-2018'
# The income CSV has no header row, so column names are assigned here.
df_2017_income = pd.read_csv('data/meanHHincome_2017.csv', header=None)
df_2017_income.columns = ['Zip_y', 'Zip_Mean_Income']
# NOTE(review): default inner join -- rows whose Zip_y is absent from the
# income file are dropped; confirm this is intended.
df_2017 = df.loc[is_2017].merge(df_2017_income, on='Zip_y')

In [7]:
# Stack the two per-year frames back into one (2016-2017 rows first).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the result is the same, with row labels preserved.
df = pd.concat([df_2016, df_2017])

In [8]:
# Show the column names that remain after the income merge.
print(df.columns.tolist())

['ADA_Accessible', 'Administrator', 'After_School_Hours', 'Attendance_Boundaries', 'Average_ACT_School', 'Bilingual_Services', 'Classroom_Languages', 'Dress_Code', 'Grades_Offered_All', 'Graduation_Rate_School', 'Hard_Of_Hearing', 'Is_GoCPS_Participant', 'Location', 'Network', 'School_Hours', 'School_ID', 'School_Latitude', 'School_Longitude', 'School_Year', 'Student_Count_Asian', 'Student_Count_Asian_Pacific_Islander', 'Student_Count_Black', 'Student_Count_English_Learners', 'Student_Count_Ethnicity_Not_Available', 'Student_Count_Hawaiian_Pacific_Islander', 'Student_Count_Hispanic', 'Student_Count_Low_Income', 'Student_Count_Multi', 'Student_Count_Native_American', 'Student_Count_Other_Ethnicity', 'Student_Count_Special_Ed', 'Student_Count_Total', 'Student_Count_White', 'Transportation_Bus', 'Transportation_El', 'Transportation_Metra', 'Zip_y', 'Zip_Mean_Income']


In [9]:
# Discard five more columns from the combined frame.
extra_columns = [
    'Is_GoCPS_Participant',
    'Location',
    'School_Latitude',
    'School_Longitude',
    'Average_ACT_School',
]
df = df.drop(columns=extra_columns)

In [10]:
# Number of rows per school year.
school_year_counts = df['School_Year'].value_counts()
school_year_counts

School Year 2016-2017    121
School Year 2017-2018    118
Name: School_Year, dtype: int64

## Fill in ADA accessible

In [12]:
# ADA_Accessible value and School_ID taken from the 2016-2017 rows only.
in_2016_2017 = df['School_Year'] == 'School Year 2016-2017'
ada = df.loc[in_2016_2017, ['ADA_Accessible', 'School_ID']]

In [13]:
# Attach the 2016-2017 ADA value to every row with the same School_ID.
# Both frames carry an ADA_Accessible column, so the merge suffixes them
# _x (the row's own value) and _y (the 2016-2017 value); only _y is kept.
# NOTE(review): inner join -- a School_ID with no 2016-2017 row is dropped.
df = df.merge(ada, on='School_ID').drop(columns='ADA_Accessible_x')

## Demographic Percentage Transformation

In [15]:
# Express each demographic head-count as a fraction of the school's total
# enrollment (new '<column>_perc' columns), then remove the raw count columns.
print(list(df))
demog_headings = [
    'Student_Count_Asian',
    'Student_Count_Asian_Pacific_Islander',
    'Student_Count_Black',
    'Student_Count_English_Learners',
    'Student_Count_Ethnicity_Not_Available',
    'Student_Count_Hawaiian_Pacific_Islander',
    'Student_Count_Hispanic',
    'Student_Count_Low_Income',
    'Student_Count_Multi',
    'Student_Count_Native_American',
    'Student_Count_Other_Ethnicity',
    'Student_Count_Special_Ed',
    'Student_Count_White',
]
# Divide all count columns by the total in one row-aligned operation.
demog_shares = df[demog_headings].div(df['Student_Count_Total'], axis=0)
for count_col in demog_headings:
    df[count_col + '_perc'] = demog_shares[count_col]
df.drop(columns=demog_headings, inplace=True)

['Administrator', 'After_School_Hours', 'Attendance_Boundaries', 'Bilingual_Services', 'Classroom_Languages', 'Dress_Code', 'Grades_Offered_All', 'Graduation_Rate_School', 'Hard_Of_Hearing', 'Network', 'School_Hours', 'School_ID', 'School_Year', 'Student_Count_Asian', 'Student_Count_Asian_Pacific_Islander', 'Student_Count_Black', 'Student_Count_English_Learners', 'Student_Count_Ethnicity_Not_Available', 'Student_Count_Hawaiian_Pacific_Islander', 'Student_Count_Hispanic', 'Student_Count_Low_Income', 'Student_Count_Multi', 'Student_Count_Native_American', 'Student_Count_Other_Ethnicity', 'Student_Count_Special_Ed', 'Student_Count_Total', 'Student_Count_White', 'Transportation_Bus', 'Transportation_El', 'Transportation_Metra', 'Zip_y', 'Zip_Mean_Income', 'ADA_Accessible_y']


In [18]:
# Inspect the last 30 rows of the transformed frame.
df.iloc[-30:]

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Classroom_Languages,Dress_Code,Grades_Offered_All,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Total,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,ADA_Accessible_y,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc
209,Falilat O Shokunbi,3:09 PM - 6:00 PM,Y,N,Spanish,Y,9101112,57.7,,,8:00 AM - 3:08 PM,609723,School Year 2016-2017,343,"20, 52, 82, 126",,,60624.0,35831,No/unknown accessibility,0.000,0.000e+00,0.980,0.000,0.000e+00,0.000,0.015,0.977,0.006,0.000e+00,0.0,0.271,0.000
210,Mr.Jammie Tenn Poole Jr,3:09 PM - 6:00 PM,Y,N,Spanish,Y,9101112,46.1,,Network 15,8:00 AM - 3:08 PM,609723,School Year 2017-2018,265,"20, 52, 82, 126",,,60624.0,36426,No/unknown accessibility,0.000,0.000e+00,0.985,0.004,0.000e+00,0.000,0.015,0.800,0.000,0.000e+00,0.0,0.287,0.000
211,Dr.Femi S. Skanes,"3:30 PM - 6:00 PM M,T,Th,F",N,N,Spanish,Y,9101112,73.4,,,"8:00 - 3:23 M, T, Th, F 8:00 - 2:24 W",610334,School Year 2016-2017,379,82,Green,Union Pacific West (UP-W),60624.0,35831,Generally accessible,0.003,0.000e+00,0.931,0.026,0.000e+00,0.003,0.058,0.987,0.000,0.000e+00,0.0,0.264,0.005
212,Ms.Michelle Theresa Harrell,"3:30 PM - 6:00 PM M,T,Th,F",N,N,Spanish,Y,9101112,67.5,,Network 15,"8:00 - 3:22 M, T, Th, F 8:00 - 2:24 W",610334,School Year 2017-2018,343,82,Green,Union Pacific West (UP-W),60624.0,36426,Generally accessible,0.003,0.000e+00,0.921,0.038,0.000e+00,0.003,0.061,0.860,0.000,0.000e+00,0.0,0.280,0.012
213,Patrick Mcgill,3:15 PM -6:00 PM,N,N,"Chinese, French, Spanish",N,9101112,97.6,,,8:00 AM-3:15 PM,609693,School Year 2016-2017,1199,"52, 82",Green,Union Pacific West (UP-W),60624.0,35831,Fully Accessible,0.040,0.000e+00,0.538,0.014,2.502e-03,0.000,0.389,0.818,0.008,8.340e-04,0.0,0.069,0.021
214,Ms.Kerry Ellen Leuschel,3:15 PM -6:00 PM,N,N,"Chinese, French, Spanish",N,9101112,93.3,,Network 15,8:00 AM-3:15 PM,609693,School Year 2017-2018,1203,"52, 82",Green,Union Pacific West (UP-W),60624.0,36426,Fully Accessible,0.047,0.000e+00,0.529,0.019,4.988e-03,0.000,0.394,0.669,0.006,0.000e+00,0.0,0.071,0.020
215,George Z Szkapiak,2:45 - 8:00 PM,Y,Y,"Polish, Spanish",N,9101112,68.8,,,7:30 AM-2:45 PM,609718,School Year 2016-2017,1686,"55N, 62, 62H",Orange,,60638.0,75639,Fully Accessible,0.006,0.000e+00,0.039,0.063,5.931e-04,0.001,0.730,0.736,0.005,1.068e-02,0.0,0.175,0.208
216,George Z Szkapiak,2:45 - 8:00 PM,Y,Y,"Polish, Spanish",N,9101112,73.8,,Network 16,7:30 AM-2:45 PM,609718,School Year 2017-2018,1600,"55N, 62, 62H",Orange,,60638.0,79245,Fully Accessible,0.007,0.000e+00,0.031,0.083,6.250e-04,0.001,0.744,0.448,0.006,1.000e-02,0.0,0.179,0.199
217,Mr.Wayne Joseph Bevis,3:05 PM - 8:30 PM,N,N,"Arabic, Chinese",N,789101112,89.2,,,8:00 - 3:05pm,610391,School Year 2016-2017,1331,"9, 48, 63",Green,,60636.0,39577,Generally accessible,0.010,0.000e+00,0.707,0.014,3.757e-03,0.000,0.244,0.663,0.008,2.254e-03,0.0,0.054,0.026
218,Mr.Wayne Joseph Bevis,3:05 PM - 8:30 PM,N,N,"Arabic, Chinese",N,789101112,89.3,,Network 16,8:00 - 3:05pm,610391,School Year 2017-2018,1364,"9, 48, 63",Green,,60636.0,40691,Generally accessible,0.009,0.000e+00,0.710,0.015,0.000e+00,0.000,0.243,0.626,0.007,3.666e-03,0.0,0.052,0.028


## Language Count Transform

In [19]:
# Frequency of each distinct Classroom_Languages string.
df['Classroom_Languages'].value_counts()

Spanish                                                                                                       96
French, Spanish                                                                                               39
Spanish, Spanish for Heritage Speakers                                                                        25
French, Spanish, Spanish for Heritage Speakers                                                                11
Chinese, French, Spanish                                                                                       6
Chinese, French, Japanese, Latin, Spanish                                                                      4
Chinese, Spanish                                                                                               4
Mandarin, Spanish                                                                                              3
Chinese, French, Latin, Spanish                                                                 

In [24]:
# The data appears to show the same value for languages across the two years,
# so the transform is done on the 2016-2017 rows only to prevent duplication
# at the time of the merge back into df.
# .copy() makes df_languages an independent frame, so the column added and the
# columns dropped in later cells modify it directly rather than a slice of df
# (this avoids pandas' SettingWithCopyWarning).
languages_2016 = df['School_Year'] == 'School Year 2016-2017'
df_languages = df.loc[
    languages_2016, ['School_ID', 'Classroom_Languages', 'School_Year']
].copy()

In [25]:
# Languages are comma-separated, so the comma count is one less than the
# number of languages: 0 commas -> 1 language, 1 comma -> 2 languages, etc.
comma_counts = df_languages['Classroom_Languages'].str.count(',')
comma_counts.value_counts()

0.0    51
1.0    41
2.0    11
3.0     5
4.0     3
5.0     2
9.0     1
7.0     1
Name: Classroom_Languages, dtype: int64

In [26]:
# Print dtypes and non-null counts for the language lookup frame.
df_languages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121 entries, 0 to 237
Data columns (total 3 columns):
School_ID              121 non-null int64
Classroom_Languages    115 non-null object
School_Year            121 non-null object
dtypes: int64(1), object(2)
memory usage: 3.8+ KB


In [27]:
# Number of languages = number of commas + 1; missing values stay NaN.
language_commas = df_languages['Classroom_Languages'].str.count(',')
df_languages['Classroom_Languages_count'] = language_commas + 1
    

In [28]:
# Keep only the merge key (School_ID) and the derived language count.
# The sort_values call that used to precede this line discarded its result
# (not assigned, not the cell's last expression), so it had no effect and
# was removed.
df_languages = df_languages.drop(columns=['Classroom_Languages', 'School_Year'])

In [29]:
# Bring the language count onto every row of df with a matching School_ID
# (default inner join).
df = df.merge(df_languages, on='School_ID')

In [30]:
# Count of non-null language counts. 228 of 239 rows means 11 rows are NaN,
# which matches the 11 missing Classroom_Languages values in the source data.
# (The earlier bare value_counts() line was not displayed and was removed.)
df['Classroom_Languages_count'].count()


228

In [31]:
# Print dtypes and non-null counts for the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239 entries, 0 to 238
Data columns (total 34 columns):
Administrator                                   239 non-null object
After_School_Hours                              143 non-null object
Attendance_Boundaries                           239 non-null object
Bilingual_Services                              228 non-null object
Classroom_Languages                             228 non-null object
Dress_Code                                      239 non-null object
Grades_Offered_All                              239 non-null object
Graduation_Rate_School                          239 non-null float64
Hard_Of_Hearing                                 11 non-null object
Network                                         116 non-null object
School_Hours                                    234 non-null object
School_ID                                       239 non-null int64
School_Year                                     239 non-null object
Student_Coun