In [1]:
import pandas as pd
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

In [2]:
!ls data/pickles

hs_data_16-18.pickle


In [3]:
with open('data/pickles/hs_data_16-18.pickle', 'rb') as f:
    df = pickle.load(f)

## Drop Unecessary Data

In [4]:
df.drop(columns = ['Address', 'Administrator_Title', 'CPS_School_Profile', 
                   'City', 'Classification_Description', 'Closed_For_Enrollment_Date',
                  'College_Enrollment_Rate_Mean', 'College_Enrollment_Rate_School',
                  'Demographic_Description', 'Earliest_Drop_Off_Time', 'Facebook', 
                  'Fax', 'Fifth_Contact_Name', 'Fifth_Contact_Title', 'Finance_ID', 
                  'Fourth_Contact_Name', 'Fourth_Contact_Title', 'Freshman_Start_End_Time',
                  'Grades_Offered', 'Graduation_Rate_Mean', 'Is_Elementary_School', 
                  'Is_GoCPS_Elementary', 'Is_GoCPS_High_School', 'Is_GoCPS_PreK', 
                  'Is_High_School', 'Is_Middle_School', 'Is_Pre_School', 'Kindergarten_School_Day',
                  'Legacy_Unit_ID', 'Long_Name', 'Mean_ACT', 'Open_For_Enrollment_Date', 
                  'Overall_Rating', 'Phone', 'Pinterest', 'PreK_School_Day', 'PreSchool_Inclusive', 
                  'Preschool_Instructional', 'Primary_Category', 'Rating_Statement', 'Rating_Status', 
                  'Refugee_Services', 'School_Type','Secondary_Contact', 'Secondary_Contact_Title', 
                  'Seventh_Contact_Name', 'Seventh_Contact_Title', 'Short_Name', 'Short_Name', 
                  'Significantly_Modified', 'Sixth_Contact_Name', 'Sixth_Contact_Title', 'State',
                  'Statistics_Description', 'Summary', 'Third_Contact_Name', 'Third_Contact_Title', 
                  'Title_1_Eligible', 'Twitter', 'Visual_Impairments', 'Website',
                  'Youtube', 'Zip_x'], inplace = True)

## Merge Income Data Per Zip for 16/17 and 17/18

In [5]:
df_2016 = df[df['School_Year'] == 'School Year 2016-2017']
df_2016_income = pd.read_csv('data/meanHHincome_2016.csv', header=None)
df_2016_income.columns = ['Zip_y', 'Zip_Mean_Income']
df_2016 = pd.merge(df_2016, df_2016_income, on = 'Zip_y' )

In [6]:
df_2017 = df[df['School_Year'] == 'School Year 2017-2018']
df_2017_income = pd.read_csv('data/meanHHincome_2017.csv', header=None)
df_2017_income.columns = ['Zip_y', 'Zip_Mean_Income']
df_2017 = pd.merge(df_2017, df_2017_income, on = 'Zip_y' )

In [7]:
df = df_2016.append(df_2017)

In [8]:
print(list(df))

['ADA_Accessible', 'Administrator', 'After_School_Hours', 'Attendance_Boundaries', 'Average_ACT_School', 'Bilingual_Services', 'Classroom_Languages', 'Dress_Code', 'Grades_Offered_All', 'Graduation_Rate_School', 'Hard_Of_Hearing', 'Is_GoCPS_Participant', 'Location', 'Network', 'School_Hours', 'School_ID', 'School_Latitude', 'School_Longitude', 'School_Year', 'Student_Count_Asian', 'Student_Count_Asian_Pacific_Islander', 'Student_Count_Black', 'Student_Count_English_Learners', 'Student_Count_Ethnicity_Not_Available', 'Student_Count_Hawaiian_Pacific_Islander', 'Student_Count_Hispanic', 'Student_Count_Low_Income', 'Student_Count_Multi', 'Student_Count_Native_American', 'Student_Count_Other_Ethnicity', 'Student_Count_Special_Ed', 'Student_Count_Total', 'Student_Count_White', 'Transportation_Bus', 'Transportation_El', 'Transportation_Metra', 'Zip_y', 'Zip_Mean_Income']


In [9]:
df.drop(columns = ['Is_GoCPS_Participant','Location', 'School_Latitude', 'School_Longitude', 'Average_ACT_School'], inplace=True)

In [10]:
df['School_Year'].value_counts()

School Year 2016-2017    121
School Year 2017-2018    118
Name: School_Year, dtype: int64

## Fill in ADA accessible

In [11]:
ada = df[df['School_Year'] == 'School Year 2016-2017'][['ADA_Accessible', 'School_ID']]

In [12]:
df = pd.merge(df, ada, on = 'School_ID' )
df.drop(columns = 'ADA_Accessible_x', inplace = True)

### ADA Dummie

In [13]:
df['ADA_Accessible_y'].value_counts()

Generally accessible        130
Fully Accessible             66
No/unknown accessibility     43
Name: ADA_Accessible_y, dtype: int64

In [14]:
df = pd.concat((df, pd.get_dummies(df['ADA_Accessible_y'])), axis=1)

In [15]:
df.head()

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Classroom_Languages,Dress_Code,Grades_Offered_All,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Asian,Student_Count_Asian_Pacific_Islander,Student_Count_Black,Student_Count_English_Learners,Student_Count_Ethnicity_Not_Available,Student_Count_Hawaiian_Pacific_Islander,Student_Count_Hispanic,Student_Count_Low_Income,Student_Count_Multi,Student_Count_Native_American,Student_Count_Other_Ethnicity,Student_Count_Special_Ed,Student_Count_Total,Student_Count_White,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,ADA_Accessible_y,Fully Accessible,Generally accessible,No/unknown accessibility
0,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",N,9101112,80.9,,,7:45 AM - 2:35 PM,609764,School Year 2016-2017,5,0,58,331,11,0,1644,1659,3,6,0,281,1739,12,"9, 21, 60","Blue, Pink, Red",,60608.0,53353,Generally accessible,0,1,0
1,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",N,9101112,78.6,,ISP,7:45 AM - 2:35 PM,609764,School Year 2017-2018,10,0,53,343,13,0,1608,1068,6,6,0,277,1706,10,"9, 21, 60","Blue, Pink, Red",,60608.0,57770,Generally accessible,0,1,0
2,Dr. Hillyn Senuholtz,3:45 PM - 5:30 PM,N,Y,Spanish,Y,9101112,82.8,,,7:45AM - 3:44PM,400104,School Year 2016-2017,0,0,24,169,0,0,714,720,0,1,0,146,742,3,"49, 60",,,60608.0,53353,No/unknown accessibility,0,0,1
3,Ms. Kimberly Burks,3:45 PM - 4:30 PM,N,Y,Spanish,Y,9101112,85.0,,Charter,7:45AM - 3:44PM,400104,School Year 2017-2018,0,0,14,213,1,0,726,693,0,2,0,146,745,2,"49, 60","Blue, Orange, Pink",BNSF Railway (BNSF),60608.0,57770,No/unknown accessibility,0,0,1
4,Sharnette Sims,,N,,,N,9101112,1.7,,,7:00 am - 4:00 pm,609748,School Year 2016-2017,0,0,154,2,0,0,36,174,0,3,0,58,199,6,,,,60608.0,53353,No/unknown accessibility,0,0,1


In [16]:
#having [0,0] for fully and generally accessible implies no or unknown accessibility
df.drop(columns = ['No/unknown accessibility','ADA_Accessible_y'], inplace=True)

In [17]:
df.head()

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Classroom_Languages,Dress_Code,Grades_Offered_All,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Asian,Student_Count_Asian_Pacific_Islander,Student_Count_Black,Student_Count_English_Learners,Student_Count_Ethnicity_Not_Available,Student_Count_Hawaiian_Pacific_Islander,Student_Count_Hispanic,Student_Count_Low_Income,Student_Count_Multi,Student_Count_Native_American,Student_Count_Other_Ethnicity,Student_Count_Special_Ed,Student_Count_Total,Student_Count_White,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible
0,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",N,9101112,80.9,,,7:45 AM - 2:35 PM,609764,School Year 2016-2017,5,0,58,331,11,0,1644,1659,3,6,0,281,1739,12,"9, 21, 60","Blue, Pink, Red",,60608.0,53353,0,1
1,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",N,9101112,78.6,,ISP,7:45 AM - 2:35 PM,609764,School Year 2017-2018,10,0,53,343,13,0,1608,1068,6,6,0,277,1706,10,"9, 21, 60","Blue, Pink, Red",,60608.0,57770,0,1
2,Dr. Hillyn Senuholtz,3:45 PM - 5:30 PM,N,Y,Spanish,Y,9101112,82.8,,,7:45AM - 3:44PM,400104,School Year 2016-2017,0,0,24,169,0,0,714,720,0,1,0,146,742,3,"49, 60",,,60608.0,53353,0,0
3,Ms. Kimberly Burks,3:45 PM - 4:30 PM,N,Y,Spanish,Y,9101112,85.0,,Charter,7:45AM - 3:44PM,400104,School Year 2017-2018,0,0,14,213,1,0,726,693,0,2,0,146,745,2,"49, 60","Blue, Orange, Pink",BNSF Railway (BNSF),60608.0,57770,0,0
4,Sharnette Sims,,N,,,N,9101112,1.7,,,7:00 am - 4:00 pm,609748,School Year 2016-2017,0,0,154,2,0,0,36,174,0,3,0,58,199,6,,,,60608.0,53353,0,0


## Demographic Percentage Transformation

In [18]:
#Convert student demographic counts to percent of total population
print(list(df))
demog_headings = ['Student_Count_Asian', 'Student_Count_Asian_Pacific_Islander', 'Student_Count_Black', 
                  'Student_Count_English_Learners', 'Student_Count_Ethnicity_Not_Available', 
                  'Student_Count_Hawaiian_Pacific_Islander', 'Student_Count_Hispanic', 
                  'Student_Count_Low_Income', 'Student_Count_Multi', 'Student_Count_Native_American', 
                  'Student_Count_Other_Ethnicity', 'Student_Count_Special_Ed', 'Student_Count_White']
for dem in demog_headings:
    dem_per_heading = dem + '_perc'
    df[dem_per_heading] = df[dem]/df['Student_Count_Total']
    df.drop(columns = dem, inplace=True)

['Administrator', 'After_School_Hours', 'Attendance_Boundaries', 'Bilingual_Services', 'Classroom_Languages', 'Dress_Code', 'Grades_Offered_All', 'Graduation_Rate_School', 'Hard_Of_Hearing', 'Network', 'School_Hours', 'School_ID', 'School_Year', 'Student_Count_Asian', 'Student_Count_Asian_Pacific_Islander', 'Student_Count_Black', 'Student_Count_English_Learners', 'Student_Count_Ethnicity_Not_Available', 'Student_Count_Hawaiian_Pacific_Islander', 'Student_Count_Hispanic', 'Student_Count_Low_Income', 'Student_Count_Multi', 'Student_Count_Native_American', 'Student_Count_Other_Ethnicity', 'Student_Count_Special_Ed', 'Student_Count_Total', 'Student_Count_White', 'Transportation_Bus', 'Transportation_El', 'Transportation_Metra', 'Zip_y', 'Zip_Mean_Income', 'Fully Accessible', 'Generally accessible']


In [19]:
df.tail(30)

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Classroom_Languages,Dress_Code,Grades_Offered_All,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Total,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc
209,Falilat O Shokunbi,3:09 PM - 6:00 PM,Y,N,Spanish,Y,9101112,57.7,,,8:00 AM - 3:08 PM,609723,School Year 2016-2017,343,"20, 52, 82, 126",,,60624.0,35831,0,0,0.000,0.000e+00,0.980,0.000,0.000e+00,0.000,0.015,0.977,0.006,0.000e+00,0.0,0.271,0.000
210,Mr.Jammie Tenn Poole Jr,3:09 PM - 6:00 PM,Y,N,Spanish,Y,9101112,46.1,,Network 15,8:00 AM - 3:08 PM,609723,School Year 2017-2018,265,"20, 52, 82, 126",,,60624.0,36426,0,0,0.000,0.000e+00,0.985,0.004,0.000e+00,0.000,0.015,0.800,0.000,0.000e+00,0.0,0.287,0.000
211,Dr.Femi S. Skanes,"3:30 PM - 6:00 PM M,T,Th,F",N,N,Spanish,Y,9101112,73.4,,,"8:00 - 3:23 M, T, Th, F 8:00 - 2:24 W",610334,School Year 2016-2017,379,82,Green,Union Pacific West (UP-W),60624.0,35831,0,1,0.003,0.000e+00,0.931,0.026,0.000e+00,0.003,0.058,0.987,0.000,0.000e+00,0.0,0.264,0.005
212,Ms.Michelle Theresa Harrell,"3:30 PM - 6:00 PM M,T,Th,F",N,N,Spanish,Y,9101112,67.5,,Network 15,"8:00 - 3:22 M, T, Th, F 8:00 - 2:24 W",610334,School Year 2017-2018,343,82,Green,Union Pacific West (UP-W),60624.0,36426,0,1,0.003,0.000e+00,0.921,0.038,0.000e+00,0.003,0.061,0.860,0.000,0.000e+00,0.0,0.280,0.012
213,Patrick Mcgill,3:15 PM -6:00 PM,N,N,"Chinese, French, Spanish",N,9101112,97.6,,,8:00 AM-3:15 PM,609693,School Year 2016-2017,1199,"52, 82",Green,Union Pacific West (UP-W),60624.0,35831,1,0,0.040,0.000e+00,0.538,0.014,2.502e-03,0.000,0.389,0.818,0.008,8.340e-04,0.0,0.069,0.021
214,Ms.Kerry Ellen Leuschel,3:15 PM -6:00 PM,N,N,"Chinese, French, Spanish",N,9101112,93.3,,Network 15,8:00 AM-3:15 PM,609693,School Year 2017-2018,1203,"52, 82",Green,Union Pacific West (UP-W),60624.0,36426,1,0,0.047,0.000e+00,0.529,0.019,4.988e-03,0.000,0.394,0.669,0.006,0.000e+00,0.0,0.071,0.020
215,George Z Szkapiak,2:45 - 8:00 PM,Y,Y,"Polish, Spanish",N,9101112,68.8,,,7:30 AM-2:45 PM,609718,School Year 2016-2017,1686,"55N, 62, 62H",Orange,,60638.0,75639,1,0,0.006,0.000e+00,0.039,0.063,5.931e-04,0.001,0.730,0.736,0.005,1.068e-02,0.0,0.175,0.208
216,George Z Szkapiak,2:45 - 8:00 PM,Y,Y,"Polish, Spanish",N,9101112,73.8,,Network 16,7:30 AM-2:45 PM,609718,School Year 2017-2018,1600,"55N, 62, 62H",Orange,,60638.0,79245,1,0,0.007,0.000e+00,0.031,0.083,6.250e-04,0.001,0.744,0.448,0.006,1.000e-02,0.0,0.179,0.199
217,Mr.Wayne Joseph Bevis,3:05 PM - 8:30 PM,N,N,"Arabic, Chinese",N,789101112,89.2,,,8:00 - 3:05pm,610391,School Year 2016-2017,1331,"9, 48, 63",Green,,60636.0,39577,0,1,0.010,0.000e+00,0.707,0.014,3.757e-03,0.000,0.244,0.663,0.008,2.254e-03,0.0,0.054,0.026
218,Mr.Wayne Joseph Bevis,3:05 PM - 8:30 PM,N,N,"Arabic, Chinese",N,789101112,89.3,,Network 16,8:00 - 3:05pm,610391,School Year 2017-2018,1364,"9, 48, 63",Green,,60636.0,40691,0,1,0.009,0.000e+00,0.710,0.015,0.000e+00,0.000,0.243,0.626,0.007,3.666e-03,0.0,0.052,0.028


## Language Count Transform

In [20]:
df.Classroom_Languages.value_counts()

Spanish                                                                                                        96
French, Spanish                                                                                                39
Spanish, Spanish for Heritage Speakers                                                                         25
French, Spanish, Spanish for Heritage Speakers                                                                 11
Chinese, French, Spanish                                                                                        6
Chinese, French, Japanese, Latin, Spanish                                                                       4
Chinese, Spanish                                                                                                4
Mandarin, Spanish                                                                                               3
Chinese, French, German, Japanese, Latin, Spanish                                       

In [21]:
df_languages = df[['School_ID', 'Classroom_Languages', 'School_Year']]
#the data appears to show the same value for languages across the two years
#so the data will be transformed on one year to prevent duplication at time of merge
df_languages = df_languages[df_languages['School_Year'] == 'School Year 2016-2017']

In [22]:
#Because this is a comma count, the values with zero commas have 1 language, 
#one comma have 2, etc.
df_languages.Classroom_Languages.str.count(',').value_counts()

0.0    51
1.0    41
2.0    11
3.0     5
4.0     3
5.0     2
9.0     1
7.0     1
Name: Classroom_Languages, dtype: int64

In [23]:
df_languages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121 entries, 0 to 237
Data columns (total 3 columns):
School_ID              121 non-null int64
Classroom_Languages    115 non-null object
School_Year            121 non-null object
dtypes: int64(1), object(2)
memory usage: 3.8+ KB


In [24]:
df_languages['Classroom_Languages_count'] = (
                                            df_languages['Classroom_Languages'].str.count(',') 
                                            + 1
                                            )
    

In [25]:
df_languages.sort_values('Classroom_Languages_count', ascending = False)
df_languages.drop(columns = ['Classroom_Languages', 'School_Year'], inplace=True)

In [26]:
df = pd.merge(df, df_languages, on='School_ID')

In [27]:
df['Classroom_Languages_count'].value_counts()
df['Classroom_Languages_count'].value_counts().sum() #shows there are 11 NaN number, matches original set.


228

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239 entries, 0 to 238
Data columns (total 35 columns):
Administrator                                   239 non-null object
After_School_Hours                              143 non-null object
Attendance_Boundaries                           239 non-null object
Bilingual_Services                              228 non-null object
Classroom_Languages                             228 non-null object
Dress_Code                                      239 non-null object
Grades_Offered_All                              239 non-null object
Graduation_Rate_School                          239 non-null float64
Hard_Of_Hearing                                 11 non-null object
Network                                         116 non-null object
School_Hours                                    234 non-null object
School_ID                                       239 non-null int64
School_Year                                     239 non-null object
Student_Coun

## El Dummie Transform

In [29]:
el_df = df[['School_ID', 'Transportation_El', 'School_Year']]

In [30]:
el_df.head(10)

Unnamed: 0,School_ID,Transportation_El,School_Year
0,609764,"Blue, Pink, Red",School Year 2016-2017
1,609764,"Blue, Pink, Red",School Year 2017-2018
2,400104,,School Year 2016-2017
3,400104,"Blue, Orange, Pink",School Year 2017-2018
4,609748,,School Year 2016-2017
5,609748,,School Year 2017-2018
6,400057,"Blue, Pink",School Year 2016-2017
7,400057,"Blue, Pink",School Year 2017-2018
8,609750,Pink,School Year 2016-2017
9,609750,Pink,School Year 2017-2018


In [31]:
#Again, like languages, looks like El values are consistent across years.
el_df = el_df[el_df['School_Year'] == 'School Year 2017-2018']

In [32]:
el_df['Transportation_El'].value_counts()

Red                                              16
Blue                                             15
Green, Red                                       12
Green                                            10
Orange                                            7
Blue, Green                                       6
Blue, Red                                         4
Pink                                              3
Brown                                             3
Blue, Brown                                       3
Blue, Pink, Red                                   2
Brown, Red                                        2
Blue, Brown, Green, Orange, Pink, Purple, Red     2
Blue, Pink                                        1
Blue, Brown, Pink, Red                            1
Blue, Orange, Pink                                1
Blue, Brown, Purple, Red                          1
Blue, Orange, Red                                 1
Blue, Brown, Green                                1
Name: Transp

In [33]:
#replace NaN with no_el so that with dummy variable I can drop No_El
el_df.fillna(value = 'No_El', inplace=True)

In [34]:
el_df

Unnamed: 0,School_ID,Transportation_El,School_Year
1,609764,"Blue, Pink, Red",School Year 2017-2018
3,400104,"Blue, Orange, Pink",School Year 2017-2018
5,609748,No_El,School Year 2017-2018
7,400057,"Blue, Pink",School Year 2017-2018
9,609750,Pink,School Year 2017-2018
11,400102,Blue,School Year 2017-2018
13,400091,"Blue, Pink, Red",School Year 2017-2018
15,400054,"Blue, Brown",School Year 2017-2018
17,609691,No_El,School Year 2017-2018
19,609716,Blue,School Year 2017-2018


In [35]:
el_dummies = el_df['Transportation_El'].str.get_dummies(sep = ', ')

In [36]:
el_dummies

Unnamed: 0,Blue,Brown,Green,No_El,Orange,Pink,Purple,Red
1,1,0,0,0,0,1,0,1
3,1,0,0,0,1,1,0,0
5,0,0,0,1,0,0,0,0
7,1,0,0,0,0,1,0,0
9,0,0,0,0,0,1,0,0
11,1,0,0,0,0,0,0,0
13,1,0,0,0,0,1,0,1
15,1,1,0,0,0,0,0,0
17,0,0,0,1,0,0,0,0
19,1,0,0,0,0,0,0,0


### Double Check with Alice/John

In [37]:
el_dummies.drop(columns = 'No_El', inplace=True)

In [38]:
el_df = pd.merge(el_df,el_dummies, left_index=True, right_index=True)

In [39]:
el_df.head()

Unnamed: 0,School_ID,Transportation_El,School_Year,Blue,Brown,Green,Orange,Pink,Purple,Red
1,609764,"Blue, Pink, Red",School Year 2017-2018,1,0,0,0,1,0,1
3,400104,"Blue, Orange, Pink",School Year 2017-2018,1,0,0,1,1,0,0
5,609748,No_El,School Year 2017-2018,0,0,0,0,0,0,0
7,400057,"Blue, Pink",School Year 2017-2018,1,0,0,0,1,0,0
9,609750,Pink,School Year 2017-2018,0,0,0,0,1,0,0


In [40]:
el_df.drop(columns = ['Transportation_El', 'School_Year'], inplace=True)

In [41]:
#decide which column to drop for dummies based on column count
line_list = ['Blue','Brown','Green','Orange','Pink','Purple','Red']
for line in line_list:
    print(el_df[line].value_counts())

0    80
1    38
Name: Blue, dtype: int64
0    105
1     13
Name: Brown, dtype: int64
0    87
1    31
Name: Green, dtype: int64
0    107
1     11
Name: Orange, dtype: int64
0    108
1     10
Name: Pink, dtype: int64
0    115
1      3
Name: Purple, dtype: int64
0    77
1    41
Name: Red, dtype: int64


In [42]:
#drop pink because there are 115 entries, the highest value, without it.

In [43]:
df = pd.merge(df, el_df, on='School_ID')

In [44]:
df

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Classroom_Languages,Dress_Code,Grades_Offered_All,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Total,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc,Classroom_Languages_count,Blue,Brown,Green,Orange,Pink,Purple,Red
0,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",N,9101112,80.9,,,7:45 AM - 2:35 PM,609764,School Year 2016-2017,1739,"9, 21, 60","Blue, Pink, Red",,60608.0,53353,0,1,0.003,0.000e+00,0.033,0.190,6.325e-03,0.000,0.945,0.954,0.002,0.003,0.0,0.162,0.007,2.0,1,0,0,0,1,0,1
1,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",N,9101112,78.6,,ISP,7:45 AM - 2:35 PM,609764,School Year 2017-2018,1706,"9, 21, 60","Blue, Pink, Red",,60608.0,57770,0,1,0.006,0.000e+00,0.031,0.201,7.620e-03,0.000,0.943,0.626,0.004,0.004,0.0,0.162,0.006,2.0,1,0,0,0,1,0,1
2,Dr. Hillyn Senuholtz,3:45 PM - 5:30 PM,N,Y,Spanish,Y,9101112,82.8,,,7:45AM - 3:44PM,400104,School Year 2016-2017,742,"49, 60",,,60608.0,53353,0,0,0.000,0.000e+00,0.032,0.228,0.000e+00,0.000,0.962,0.970,0.000,0.001,0.0,0.197,0.004,1.0,1,0,0,1,1,0,0
3,Ms. Kimberly Burks,3:45 PM - 4:30 PM,N,Y,Spanish,Y,9101112,85.0,,Charter,7:45AM - 3:44PM,400104,School Year 2017-2018,745,"49, 60","Blue, Orange, Pink",BNSF Railway (BNSF),60608.0,57770,0,0,0.000,0.000e+00,0.019,0.286,1.342e-03,0.000,0.974,0.930,0.000,0.003,0.0,0.196,0.003,1.0,1,0,0,1,1,0,0
4,Sharnette Sims,,N,,,N,9101112,1.7,,,7:00 am - 4:00 pm,609748,School Year 2016-2017,199,,,,60608.0,53353,0,0,0.000,0.000e+00,0.774,0.010,0.000e+00,0.000,0.181,0.874,0.000,0.015,0.0,0.291,0.030,,0,0,0,0,0,0,0
5,Sharnette Sims,,N,,,N,9101112,2.3,,Network 16,7:00 am - 4:00 pm,609748,School Year 2017-2018,184,,,,60608.0,57770,0,0,0.000,0.000e+00,0.777,0.027,0.000e+00,0.000,0.196,0.967,0.011,0.011,0.0,0.272,0.005,,0,0,0,0,0,0,0
6,Tressie McDonough,,N,Y,"Spanish, Spanish for Heritage Speakers",Y,9101112,83.3,,,8:00 AM-3:35 PM,400057,School Year 2016-2017,954,"50, 12","Blue, Pink",,60608.0,53353,0,1,0.034,0.000e+00,0.309,0.084,2.096e-03,0.002,0.635,0.853,0.002,0.003,0.0,0.148,0.013,2.0,1,0,0,0,1,0,0
7,Ms. Audrey Borling,,N,Y,"Spanish, Spanish for Heritage Speakers",Y,9101112,92.4,,Charter,8:00 AM-3:35 PM,400057,School Year 2017-2018,931,"50, 12","Blue, Pink",,60608.0,57770,0,1,0.031,0.000e+00,0.303,0.104,3.222e-03,0.001,0.642,0.840,0.004,0.003,0.0,0.142,0.012,2.0,1,0,0,0,1,0,0
8,Mrs.Sherita D Carter-King,,N,,Spanish,N,6789101112,21.4,,,8:00 AM - 3:05 PM,609750,School Year 2016-2017,38,,Pink,,60608.0,53353,0,0,0.000,0.000e+00,0.737,0.053,0.000e+00,0.000,0.211,0.974,0.000,0.000,0.0,0.158,0.053,1.0,0,0,0,0,1,0,0
9,Mrs.Sherita D Carter-King,,N,,Spanish,N,6789101112,23.1,,Network 15,8:00 AM - 3:00 PM,609750,School Year 2017-2018,34,,Pink,,60608.0,57770,0,0,0.000,0.000e+00,0.735,0.088,0.000e+00,0.000,0.265,0.735,0.000,0.000,0.0,0.088,0.000,1.0,0,0,0,0,1,0,0


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 0 to 235
Data columns (total 42 columns):
Administrator                                   236 non-null object
After_School_Hours                              143 non-null object
Attendance_Boundaries                           236 non-null object
Bilingual_Services                              226 non-null object
Classroom_Languages                             226 non-null object
Dress_Code                                      236 non-null object
Grades_Offered_All                              236 non-null object
Graduation_Rate_School                          236 non-null float64
Hard_Of_Hearing                                 11 non-null object
Network                                         116 non-null object
School_Hours                                    232 non-null object
School_ID                                       236 non-null int64
School_Year                                     236 non-null object
Student_Coun

## Make Dress Code Binary

In [46]:
df["Dress_Code_dummie"] = pd.get_dummies(df['Dress_Code'], drop_first = True)


In [47]:
df.drop(columns='Dress_Code', inplace=True)

In [48]:
df.head()

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Classroom_Languages,Grades_Offered_All,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Total,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc,Classroom_Languages_count,Blue,Brown,Green,Orange,Pink,Purple,Red,Dress_Code_dummie
0,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",9101112,80.9,,,7:45 AM - 2:35 PM,609764,School Year 2016-2017,1739,"9, 21, 60","Blue, Pink, Red",,60608.0,53353,0,1,0.003,0.0,0.033,0.19,0.006,0.0,0.945,0.954,0.002,0.003,0.0,0.162,0.007,2.0,1,0,0,0,1,0,1,0
1,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,"French, Spanish",9101112,78.6,,ISP,7:45 AM - 2:35 PM,609764,School Year 2017-2018,1706,"9, 21, 60","Blue, Pink, Red",,60608.0,57770,0,1,0.006,0.0,0.031,0.201,0.008,0.0,0.943,0.626,0.004,0.004,0.0,0.162,0.006,2.0,1,0,0,0,1,0,1,0
2,Dr. Hillyn Senuholtz,3:45 PM - 5:30 PM,N,Y,Spanish,9101112,82.8,,,7:45AM - 3:44PM,400104,School Year 2016-2017,742,"49, 60",,,60608.0,53353,0,0,0.0,0.0,0.032,0.228,0.0,0.0,0.962,0.97,0.0,0.001,0.0,0.197,0.004,1.0,1,0,0,1,1,0,0,1
3,Ms. Kimberly Burks,3:45 PM - 4:30 PM,N,Y,Spanish,9101112,85.0,,Charter,7:45AM - 3:44PM,400104,School Year 2017-2018,745,"49, 60","Blue, Orange, Pink",BNSF Railway (BNSF),60608.0,57770,0,0,0.0,0.0,0.019,0.286,0.001,0.0,0.974,0.93,0.0,0.003,0.0,0.196,0.003,1.0,1,0,0,1,1,0,0,1
4,Sharnette Sims,,N,,,9101112,1.7,,,7:00 am - 4:00 pm,609748,School Year 2016-2017,199,,,,60608.0,53353,0,0,0.0,0.0,0.774,0.01,0.0,0.0,0.181,0.874,0.0,0.015,0.0,0.291,0.03,,0,0,0,0,0,0,0,0


## Grades offered count

In [49]:
df.Grades_Offered_All.value_counts()

9,10,11,12                      191
7,8,9,10,11,12                   18
6,7,8,9,10,11,12                 17
10,11,12                          3
K,1,2,3,4,5,6,7,8,9,10,11,12      2
3,4,5,6,7,8,9,10,11,12            2
PK,9,10,11,12                     2
8,9,10,11,12                      1
Name: Grades_Offered_All, dtype: int64

In [50]:
df_grades = df[['Grades_Offered_All', 'School_ID']]

                                            

In [51]:
df_grades['grades_offered_count'] = (
                                            df_grades['Grades_Offered_All'].str.count(',') 
                                            + 1
                                            )
  
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
df_grades.grades_offered_count.value_counts()

4     191
6      18
7      17
5       3
3       3
13      2
10      2
Name: grades_offered_count, dtype: int64

In [53]:
df_grades.drop(columns = ['Grades_Offered_All'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [54]:
df_grades.drop_duplicates('School_ID', inplace=True)
df_grades.sort_values('grades_offered_count')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,School_ID,grades_offered_count
0,609764,4
168,609727,4
166,609754,4
162,609695,4
160,609749,4
158,609737,4
154,400013,4
150,610524,4
148,609698,4
146,400087,4


In [55]:
df = pd.merge(df, df_grades, on='School_ID')

In [56]:
df.drop(columns = ['Classroom_Languages', 'Grades_Offered_All'], inplace=True)

In [57]:
df.head()

Unnamed: 0,Administrator,After_School_Hours,Attendance_Boundaries,Bilingual_Services,Graduation_Rate_School,Hard_Of_Hearing,Network,School_Hours,School_ID,School_Year,Student_Count_Total,Transportation_Bus,Transportation_El,Transportation_Metra,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc,Classroom_Languages_count,Blue,Brown,Green,Orange,Pink,Purple,Red,Dress_Code_dummie,grades_offered_count
0,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,80.9,,,7:45 AM - 2:35 PM,609764,School Year 2016-2017,1739,"9, 21, 60","Blue, Pink, Red",,60608.0,53353,0,1,0.003,0.0,0.033,0.19,0.006,0.0,0.945,0.954,0.002,0.003,0.0,0.162,0.007,2.0,1,0,0,0,1,0,1,0,4
1,Juan Carlos Ocon,2:40 to 8:00 P.M.,Y,Y,78.6,,ISP,7:45 AM - 2:35 PM,609764,School Year 2017-2018,1706,"9, 21, 60","Blue, Pink, Red",,60608.0,57770,0,1,0.006,0.0,0.031,0.201,0.008,0.0,0.943,0.626,0.004,0.004,0.0,0.162,0.006,2.0,1,0,0,0,1,0,1,0,4
2,Dr. Hillyn Senuholtz,3:45 PM - 5:30 PM,N,Y,82.8,,,7:45AM - 3:44PM,400104,School Year 2016-2017,742,"49, 60",,,60608.0,53353,0,0,0.0,0.0,0.032,0.228,0.0,0.0,0.962,0.97,0.0,0.001,0.0,0.197,0.004,1.0,1,0,0,1,1,0,0,1,4
3,Ms. Kimberly Burks,3:45 PM - 4:30 PM,N,Y,85.0,,Charter,7:45AM - 3:44PM,400104,School Year 2017-2018,745,"49, 60","Blue, Orange, Pink",BNSF Railway (BNSF),60608.0,57770,0,0,0.0,0.0,0.019,0.286,0.001,0.0,0.974,0.93,0.0,0.003,0.0,0.196,0.003,1.0,1,0,0,1,1,0,0,1,4
4,Sharnette Sims,,N,,1.7,,,7:00 am - 4:00 pm,609748,School Year 2016-2017,199,,,,60608.0,53353,0,0,0.0,0.0,0.774,0.01,0.0,0.0,0.181,0.874,0.0,0.015,0.0,0.291,0.03,,0,0,0,0,0,0,0,0,4


In [58]:
df.drop(columns = ['Attendance_Boundaries', 'Bilingual_Services', 
                   'Hard_Of_Hearing', 'Transportation_Bus', 
                   'Transportation_El', 'Transportation_Metra'], inplace=True)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 0 to 235
Data columns (total 35 columns):
Administrator                                   236 non-null object
After_School_Hours                              143 non-null object
Graduation_Rate_School                          236 non-null float64
Network                                         116 non-null object
School_Hours                                    232 non-null object
School_ID                                       236 non-null int64
School_Year                                     236 non-null object
Student_Count_Total                             236 non-null int64
Zip_y                                           236 non-null float64
Zip_Mean_Income                                 236 non-null object
Fully Accessible                                236 non-null uint8
Generally accessible                            236 non-null uint8
Student_Count_Asian_perc                        236 non-null float64
Student_Coun

## Charter Dummie

In [60]:
df_networks = df[['School_ID', 'Network', 'School_Year']]

In [61]:
df_networks.Network.value_counts()

Charter       30
Network 16    18
Network 17    18
Network 15    17
Network 14    15
ISP           11
AUSL           5
Contract       2
Name: Network, dtype: int64

In [62]:
df_networks2017 = df_networks[df_networks['School_Year']=='School Year 2017-2018']

In [63]:
df_networks2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118 entries, 1 to 235
Data columns (total 3 columns):
School_ID      118 non-null int64
Network        116 non-null object
School_Year    118 non-null object
dtypes: int64(1), object(2)
memory usage: 3.7+ KB


In [64]:
def charter(row):
    if row['Network'] == 'Charter':
        return 1
    else:
        return 0


In [65]:
df_networks2017['charter'] = df_networks2017.apply(lambda row: charter(row), axis=1)
df_networks2017.drop(columns=['School_Year', 'Network'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [66]:
df_networks2017.head()

Unnamed: 0,School_ID,charter
1,609764,0
3,400104,1
5,609748,0
7,400057,1
9,609750,0


In [67]:
df_networks2017.charter.value_counts()

0    88
1    30
Name: charter, dtype: int64

In [68]:
df = pd.merge(df, df_networks2017, on='School_ID')

In [69]:
df.drop(columns='Network', inplace=True)

In [70]:
df.School_Hours.value_counts()

9:00  AM - 4:15  PM                                  10
8:00  AM - 3:15  PM                                  10
8:00 am-3:15 pm                                       8
8:00 AM-3:20 PM                                       8
8:00 AM-3:30 PM                                       6
8:30 AM-4:30 PM                                       6
8:00 AM-3:15 PM                                       5
7:30 AM-2:45 PM                                       4
7:45  AM - 3:00  PM                                   4
7:45 AM-3:00 PM                                       4
9:00  AM - 4:08  PM                                   3
8:00  AM - 3:23  PM                                   2
                                                     ..
8:00  AM - 3:15 PM                                    1
9:00 AM - 4:15 PM                                     1
7:45--3:16; 7:45--1:31 Wednesday                      1
8:40 AM - 3:53 PM (M,T,Th, F)   8:40AM - 3:01 (W)     1
7:45am - 2:42pm (M,T,TH,F); 7:45am - 1:32pm (W) 

## Admin. Gender

In [74]:
import re

In [115]:
df_admin = df[['Administrator', 'School_ID']]

In [116]:
df_admin.drop_duplicates('School_ID')

Unnamed: 0,Administrator,School_ID
0,Juan Carlos Ocon,609764
2,Dr. Hillyn Senuholtz,400104
4,Sharnette Sims,609748
6,Tressie McDonough,400057
8,Mrs.Sherita D Carter-King,609750
10,Mr. Patrick Robinson,400102
12,Ms. Linnea Garrett,400091
14,Ms. Carrie Spitz,400054
16,Ms.Emily Ann Feltes,609691
18,Leticia Hernandez,609716


In [117]:
gender = re.compile(r'Mrs|Ms|Mr')
#female_title = re.compile(r'Mrs|Ms')

In [126]:
df_admin['gender_marker'] = df_admin['Administrator'].astype(str).str.match(gender)
df_admin.drop_duplicates('School_ID', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [127]:
df_admin.gender_marker.value_counts()

False    72
True     46
Name: gender_marker, dtype: int64

In [162]:
male_marker = re.compile(r'Mr|Juan|Richard|Ali|Kevin|Douglas|Raul|Victor|Abdul|Charles|Antonio|Brian|Francisco|\
        Sheldon|Michael|Stephen|Peter|Gregory|Trent|Myron|Gerald|Elias|Octavio|Matthew|\
        David|Leonard|Ferdinand|Fernando|Mark|Patrick|George|Wayne|Anthony|William|\
        Stephen|Timothy|Paul')
female_marker = re.compile(r'Mrs|Ms|Dr. Hillyn|Sharnette|Tressie|Leticia|Priscilla|Joyce|Stephanie|Tanya|Veronica|Kathy|\
         Sandra|Torry|Stephanie|Carolyn|Milena|Vanesa|Breanda|Laura|Kelly|Anna|Nancy|\
         Tamika|Janice|Mary|Shanele|Falilat|Dr.Femi|Noel|Tawanna|Tonya|Sandra|Dr. Vanesa|Tamika')
print(df_admin[df_admin['gender_marker']==False].Administrator.values)


['Juan Carlos Ocon' 'Dr. Hillyn Senuholtz' 'Sharnette Sims'
 'Tressie McDonough' 'Leticia Hernandez' 'Richard C Smith Jr'
 'Ali N Muhammad' 'Tawanna Patton' 'Kevin John Gallick'
 'Douglas Lloyd Maclin' 'Priscilla Horton' 'Joyce Dorsey Kenner'
 'Dr. Richard Lebron' 'Raul Magdaleno' 'Victor Iturralde'
 'Abdul K Muhammad' 'Charles Anderson Jr' 'Antonio Deangelo Ross'
 'Stephanie Yvette Moore' 'Tonya Hammaker' 'Veronica M Iturralde'
 'Kathy Farr' 'Charles Naphtali Smith' 'Brian Joseph Rogers'
 'Francisco Albert Borras' 'Sheldon Dion House' 'Michael S Beyer'
 'Stephen Joseph Ngo' 'Sandra Kay Carlson' 'Peter J Auffant'
 'Torry A. Bennett' 'Michael Joseph Boraz' 'Gregory L Jones'
 'Stephanie K Glover-Douglas' 'Trent Kelly' 'Myron L Hester'
 'Carolyn Dolores Epps' 'Michael Wm Durr' 'Milena Nedeljkovic'
 'Gerald J Morrow' 'Dr. Vanesa Scott-Thompson' 'Elias Estrada'
 'Brian Craig Tennison' 'Brenda Stolle' 'Laura A Lemone' 'Kelly L Mest'
 'Anna Pavichevich' 'Octavio Casas' 'Matthew G Sullivan' 'D

In [163]:
df_admin['gender_f'] = df_admin['Administrator'].astype(str).str.match(female_marker)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [164]:
df_admin[df_admin['gender_f'] == False].values

array([['Juan Carlos Ocon', 609764, False, False],
       ['Mr. Patrick Robinson', 400102, True, False],
       ['Mr.Mark Schall', 609679, True, False],
       ['Richard C Smith Jr', 609705, False, False],
       ['Ali N Muhammad', 609761, False, False],
       ['Kevin John Gallick', 609739, False, False],
       ['Douglas Lloyd Maclin', 609674, False, False],
       ['Dr. Richard Lebron', 400036, False, False],
       ['Raul Magdaleno', 609715, False, False],
       ['Victor Iturralde', 610543, False, False],
       ['Abdul K Muhammad', 610245, False, False],
       ['Charles Anderson Jr', 610244, False, False],
       ['Antonio Deangelo Ross', 609713, False, False],
       ['Charles Naphtali Smith', 610384, False, False],
       ['Brian Joseph Rogers', 610392, False, False],
       ['Francisco Albert Borras', 610357, False, False],
       ['Sheldon Dion House', 609692, False, False],
       ['Mr. Eron Powell', 400062, True, False],
       ['Mr. Eron Powell', 400061, True, False],
   

In [166]:
df_admin.gender_f.value_counts()

False    60
True     58
Name: gender_f, dtype: int64

In [167]:
list(df_admin)

['Administrator', 'School_ID', 'gender_marker', 'gender_f']

In [169]:
df_admin.drop(columns = ['Administrator', 'gender_marker'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [172]:
df = pd.merge(df, df_admin, on='School_ID')


In [175]:
df.head(5)

Unnamed: 0,Administrator,After_School_Hours,Graduation_Rate_School,School_Hours,School_ID,School_Year,Student_Count_Total,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc,Classroom_Languages_count,Blue,Brown,Green,Orange,Pink,Purple,Red,Dress_Code_dummie,grades_offered_count,charter,gender_f
0,Juan Carlos Ocon,2:40 to 8:00 P.M.,80.9,7:45 AM - 2:35 PM,609764,School Year 2016-2017,1739,60608.0,53353,0,1,0.003,0.0,0.033,0.19,0.006,0.0,0.945,0.954,0.002,0.003,0.0,0.162,0.007,2.0,1,0,0,0,1,0,1,0,4,0,False
1,Juan Carlos Ocon,2:40 to 8:00 P.M.,78.6,7:45 AM - 2:35 PM,609764,School Year 2017-2018,1706,60608.0,57770,0,1,0.006,0.0,0.031,0.201,0.008,0.0,0.943,0.626,0.004,0.004,0.0,0.162,0.006,2.0,1,0,0,0,1,0,1,0,4,0,False
2,Dr. Hillyn Senuholtz,3:45 PM - 5:30 PM,82.8,7:45AM - 3:44PM,400104,School Year 2016-2017,742,60608.0,53353,0,0,0.0,0.0,0.032,0.228,0.0,0.0,0.962,0.97,0.0,0.001,0.0,0.197,0.004,1.0,1,0,0,1,1,0,0,1,4,1,True
3,Ms. Kimberly Burks,3:45 PM - 4:30 PM,85.0,7:45AM - 3:44PM,400104,School Year 2017-2018,745,60608.0,57770,0,0,0.0,0.0,0.019,0.286,0.001,0.0,0.974,0.93,0.0,0.003,0.0,0.196,0.003,1.0,1,0,0,1,1,0,0,1,4,1,True
4,Sharnette Sims,,1.7,7:00 am - 4:00 pm,609748,School Year 2016-2017,199,60608.0,53353,0,0,0.0,0.0,0.774,0.01,0.0,0.0,0.181,0.874,0.0,0.015,0.0,0.291,0.03,,0,0,0,0,0,0,0,0,4,0,True


In [177]:
df.drop(columns = ['Administrator','After_School_Hours', 'School_Hours'], inplace = True)

In [178]:
df.head()

Unnamed: 0,Graduation_Rate_School,School_ID,School_Year,Student_Count_Total,Zip_y,Zip_Mean_Income,Fully Accessible,Generally accessible,Student_Count_Asian_perc,Student_Count_Asian_Pacific_Islander_perc,Student_Count_Black_perc,Student_Count_English_Learners_perc,Student_Count_Ethnicity_Not_Available_perc,Student_Count_Hawaiian_Pacific_Islander_perc,Student_Count_Hispanic_perc,Student_Count_Low_Income_perc,Student_Count_Multi_perc,Student_Count_Native_American_perc,Student_Count_Other_Ethnicity_perc,Student_Count_Special_Ed_perc,Student_Count_White_perc,Classroom_Languages_count,Blue,Brown,Green,Orange,Pink,Purple,Red,Dress_Code_dummie,grades_offered_count,charter,gender_f
0,80.9,609764,School Year 2016-2017,1739,60608.0,53353,0,1,0.003,0.0,0.033,0.19,0.006,0.0,0.945,0.954,0.002,0.003,0.0,0.162,0.007,2.0,1,0,0,0,1,0,1,0,4,0,False
1,78.6,609764,School Year 2017-2018,1706,60608.0,57770,0,1,0.006,0.0,0.031,0.201,0.008,0.0,0.943,0.626,0.004,0.004,0.0,0.162,0.006,2.0,1,0,0,0,1,0,1,0,4,0,False
2,82.8,400104,School Year 2016-2017,742,60608.0,53353,0,0,0.0,0.0,0.032,0.228,0.0,0.0,0.962,0.97,0.0,0.001,0.0,0.197,0.004,1.0,1,0,0,1,1,0,0,1,4,1,True
3,85.0,400104,School Year 2017-2018,745,60608.0,57770,0,0,0.0,0.0,0.019,0.286,0.001,0.0,0.974,0.93,0.0,0.003,0.0,0.196,0.003,1.0,1,0,0,1,1,0,0,1,4,1,True
4,1.7,609748,School Year 2016-2017,199,60608.0,53353,0,0,0.0,0.0,0.774,0.01,0.0,0.0,0.181,0.874,0.0,0.015,0.0,0.291,0.03,,0,0,0,0,0,0,0,0,4,0,True


## Pickle 29 Feature Columns

In [179]:
with open('data/pickles/features_29columns.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

## Can't Get Working: After School Hours

In [None]:
#df_as_hours = df[['After_School_Hours', 'School_ID']]

In [None]:
#print(df_as_hours['After_School_Hours'].value_counts())

In [None]:
#df_as_hours['as_start'] = df_as_hours['After_School_Hours'].astype(str).str[0:4]

In [None]:
#df_as_hours.head(30)

In [None]:
#df_as_hours.as_start.value_counts()

In [None]:
#df_as_hours['no_pm'] = df_as_hours.After_School_Hours.str.replace('P.M.', '')

In [None]:
#df_as_hours['no_pm'] = df_as_hours.no_pm.str.replace('PM', '')

In [None]:
#df_as_hours['as_end'] = df_as_hours['no_pm'].astype(str).str[-5:]

In [None]:
#df_as_hours.as_end = df_as_hours.as_end.str.replace('-', '')

In [None]:
#df_as_hours.as_end.value_counts()

In [None]:
#import re

In [None]:
#after_end_dash = re.compile(r"\-(.*)")
#after_end_to = re.compile(r"t(.*)")

In [None]:
'''
for hour_range in df['After_School_Hours'].items():
    #print(hour_range[1])
    if '-' in str(hour_range[1]):
        df_as_hours['as_stop'] = after_end_dash.search(hour_range[1])
    elif 'to' in str(hour_range[1]):
            df_as_hours['as_stop']=after_end_to.search(hour_range[1])

'''



In [None]:
#df_as_hours.as_stop.value_counts()