In [1]:
import pandas as pd
import numpy as np
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

import psycopg2 as pg
import pandas.io.sql as pd_sql

In [2]:
# Connection settings for the PostgreSQL database named 'cps' on this machine.
connection_args = dict(host='localhost', dbname='cps', port=5432)

# Open the connection that the SQL queries in this notebook run against.
conn = pg.connect(**connection_args)

# The following columns are not considered in this model:
#     'Address', 'Administrator_Title', 'CPS_School_Profile',
#     'City', 'Classification_Description', 'Closed_For_Enrollment_Date',
#     'College_Enrollment_Rate_Mean', 'College_Enrollment_Rate_School',
#     'Demographic_Description', 'Earliest_Drop_Off_Time', 'Facebook',
#     'Fax', 'Fifth_Contact_Name', 'Fifth_Contact_Title', 'Finance_ID',
#     'Fourth_Contact_Name', 'Fourth_Contact_Title', 'Freshman_Start_End_Time',
#     'Grades_Offered', 'Graduation_Rate_Mean', 'Is_Elementary_School',
#     'Is_GoCPS_Elementary', 'Is_GoCPS_High_School', 'Is_GoCPS_PreK',
#     'Is_High_School', 'Is_Middle_School', 'Is_Pre_School', 'Kindergarten_School_Day',
#     'Legacy_Unit_ID', 'Long_Name', 'Mean_ACT', 'Open_For_Enrollment_Date',
#     'Overall_Rating', 'Phone', 'Pinterest', 'PreK_School_Day', 'PreSchool_Inclusive',
#     'Preschool_Instructional', 'Primary_Category', 'Rating_Statement', 'Rating_Status',
#     'Refugee_Services', 'School_Type', 'Secondary_Contact', 'Secondary_Contact_Title',
#     'Seventh_Contact_Name', 'Seventh_Contact_Title', 'Short_Name', 'Short_Name',
#     'Significantly_Modified', 'Sixth_Contact_Name', 'Sixth_Contact_Title', 'State',
#     'Statistics_Description', 'Summary', 'Third_Contact_Name', 'Third_Contact_Title',
#     'Title_1_Eligible', 'Twitter', 'Visual_Impairments', 'Website',
#     'Youtube', 'Zip_x', 'Attendance_Boundaries', 'Average_ACT_School',
#     'Bilingual_Services', 'Hard_Of_Hearing', 'Is_GoCPS_Participant',
#     'School_Latitude', 'School_Longitude', 'Average_ACT_School', 'Location',
#     'Transportation_Bus', 'Transportation_Metra', 'After_School_Hours', 'School_Hours'

In [3]:
# Comma-separated column list for the 2016-17 table; it is interpolated into the
# SELECT statement that queries allschools_1617.
target_columns_1617 = '''School_Id,
                      Administrator,
                      Grades_Offered_All,
                      Student_Count_Total,
                      Student_Count_Low_Income,
                      Student_Count_Special_Ed,
                      Student_Count_English_Learners,
                      Student_Count_Black,
                      Student_Count_Hispanic,
                      Student_Count_White,
                      Student_Count_Asian,
                      Student_Count_Native_American,
                      Student_Count_Other_Ethnicity,
                      Student_Count_Asian_Pacific_Islander,
                      Student_Count_Multi,
                      Student_Count_Hawaiian_Pacific_Islander,
                      Student_Count_Ethnicity_Not_Available,
                      ADA_Accessible,
                      Dress_Code,
                      Classroom_Languages,
                      Transportation_El,
                      Graduation_Rate_School,
                      School_Year,
                      Zip
                '''

In [4]:
# Comma-separated column list for the 2017-18 table; it is interpolated into the
# SELECT statement that queries allschools_1718.
# Compared with the 2016-17 list there is no ADA-accessible field and no Zip
# column (zip is not differentiated for 1718); Network is selected instead.
target_columns_1718 = '''School_Id,
                      Administrator,
                      Grades_Offered_All,
                      Student_Count_Total,
                      Student_Count_Low_Income,
                      Student_Count_Special_Ed,
                      Student_Count_English_Learners,
                      Student_Count_Black,
                      Student_Count_Hispanic,
                      Student_Count_White,
                      Student_Count_Asian,
                      Student_Count_Native_American,
                      Student_Count_Other_Ethnicity,
                      Student_Count_Asian_Pacific_Islander,
                      Student_Count_Multi,
                      Student_Count_Hawaiian_Pacific_Islander,
                      Student_Count_Ethnicity_Not_Available,
                      Dress_Code,
                      Classroom_Languages,
                      Transportation_El,
                      Graduation_Rate_School,
                      School_Year,
                      Network
                '''

In [5]:
# Pull the selected columns for high schools only, one query per school year,
# then stack both years into a single frame.
cur = conn.cursor()  # NOTE(review): not used anywhere in the visible notebook
query_1617 = '''SELECT %s FROM allschools_1617 WHERE is_high_school = 'Y' ''' % target_columns_1617
df_hs_201617 = pd_sql.read_sql(query_1617, conn)
query_1718 = '''SELECT %s FROM allschools_1718 WHERE is_high_school = 'Y' ''' % target_columns_1718
df_hs_201718 = pd_sql.read_sql(query_1718, conn)
dfs = [df_hs_201617, df_hs_201718]
# The two column lists differ (ADA_Accessible/Zip vs. Network), so the frames are
# not column-aligned. sort=True states explicitly the alphabetical column ordering
# that pandas was applying implicitly, which removes the FutureWarning.
df = pd.concat(dfs, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 368 entries, 0 to 183
Data columns (total 25 columns):
ada_accessible                             184 non-null object
administrator                              366 non-null object
classroom_languages                        279 non-null object
dress_code                                 368 non-null object
grades_offered_all                         368 non-null object
graduation_rate_school                     245 non-null float64
network                                    179 non-null object
school_id                                  368 non-null int64
school_year                                368 non-null object
student_count_asian                        368 non-null int64
student_count_asian_pacific_islander       368 non-null int64
student_count_black                        368 non-null int64
student_count_english_learners             368 non-null int64
student_count_ethnicity_not_available      368 non-null int64
student_count_hawa

## Fill in ADA accessibility and create ADA dummy variables

In [7]:
# 2016-17 has ADA accessibility data, and I will assume it is the same for 17-18.

In [8]:
# Copy each school's 2016-17 ADA rating onto all of its rows, then one-hot encode it.
is_1617 = df['school_year'] == 'School Year 2016-2017'
ada = df.loc[is_1617, ['ada_accessible', 'school_id']]
# The merge suffixes the two rating columns: _x is the per-row original, _y the 2016-17 value.
df = df.merge(ada, on='school_id').drop(columns='ada_accessible_x')
ada_dummies = pd.get_dummies(df['ada_accessible_y'])
# 'No/unknown accessibility' is the dropped baseline: [0, 0] in the two remaining
# dummy columns therefore means no or unknown accessibility.
df = pd.concat([df, ada_dummies], axis=1).drop(
    columns=['No/unknown accessibility', 'ada_accessible_y']
)

## Demographic Percentage Transformation

In [9]:
# Express every demographic head-count as a fraction of the school's total enrolment.
print(list(df))
demog_headings = [
    'student_count_asian',
    'student_count_asian_pacific_islander',
    'student_count_black',
    'student_count_english_learners',
    'student_count_ethnicity_not_available',
    'student_count_hawaiian_pacific_islander',
    'student_count_hispanic',
    'student_count_low_income',
    'student_count_multi',
    'student_count_native_american',
    'student_count_other_ethnicity',
    'student_count_special_ed',
    'student_count_white',
]
# Divide all count columns row-wise by the total in one step; each result column
# gets the '_perc' suffix and replaces its raw count column.
demog_fractions = (
    df[demog_headings]
    .div(df['student_count_total'], axis=0)
    .add_suffix('_perc')
)
df = pd.concat([df.drop(columns=demog_headings), demog_fractions], axis=1)

['administrator', 'classroom_languages', 'dress_code', 'grades_offered_all', 'graduation_rate_school', 'network', 'school_id', 'school_year', 'student_count_asian', 'student_count_asian_pacific_islander', 'student_count_black', 'student_count_english_learners', 'student_count_ethnicity_not_available', 'student_count_hawaiian_pacific_islander', 'student_count_hispanic', 'student_count_low_income', 'student_count_multi', 'student_count_native_american', 'student_count_other_ethnicity', 'student_count_special_ed', 'student_count_total', 'student_count_white', 'transportation_el', 'zip', 'Fully Accessible', 'Generally accessible']


## Language Count Transform

In [10]:
df.classroom_languages.value_counts()

Spanish                                                                                                        121
French, Spanish                                                                                                 45
Spanish, Spanish for Heritage Speakers                                                                          31
French, Spanish, Spanish for Heritage Speakers                                                                  13
Chinese, French, Spanish                                                                                         8
Mandarin, Spanish                                                                                                7
American Sign Language                                                                                           4
Chinese, Spanish                                                                                                 4
Chinese, French, Japanese, Latin, Spanish                                       

In [11]:
df.classroom_languages.isna().sum()

87

In [12]:
# The data appears to show the same languages for a school in both years, so the
# count is derived from the 2016-17 rows only; this prevents duplicated rows when
# the result is merged back on school_id.
# .copy() makes df_languages an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or write to a temporary.
df_languages = df.loc[
    df['school_year'] == 'School Year 2016-2017',
    ['school_id', 'classroom_languages', 'school_year'],
].copy()

In [13]:
#Because this is a comma count, the values with zero commas have 1 language, 
#one comma have 2, etc.
df_languages.classroom_languages.str.count(',').value_counts()

0.0    64
1.0    50
2.0    14
3.0     5
4.0     3
5.0     2
9.0     1
7.0     1
Name: classroom_languages, dtype: int64

In [14]:
# Languages are comma separated, so the count is (number of commas + 1).
# Schools with no language listed (NaN) get a count of 0. This is computed
# directly rather than through a placeholder string of commas, which would
# also zero out a school that genuinely offers 15 languages.
language_counts = (
    df_languages['classroom_languages']
    .str.count(',')      # NaN stays NaN here
    .add(1)
    .fillna(0)
    .astype('int64')
)
# assign() returns a new frame, avoiding chained assignment on a slice of df.
df_languages = df_languages.assign(Classroom_Languages_count=language_counts)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
#count values to show that now there are 44 schools with 0 languages
df_languages['Classroom_Languages_count'].value_counts()

1     64
2     50
0     44
3     14
4      5
5      3
6      2
10     1
8      1
Name: Classroom_Languages_count, dtype: int64

In [16]:
# Keep only the merge key and the new count, then attach the count to every row
# of df for that school. The previous sort_values() call discarded its result,
# so it had no effect and has been removed.
df_languages = df_languages.drop(columns=['classroom_languages', 'school_year'])
df = pd.merge(df, df_languages, on='school_id')

In [17]:
df.head()

Unnamed: 0,administrator,classroom_languages,dress_code,grades_offered_all,graduation_rate_school,network,school_id,school_year,student_count_total,transportation_el,zip,Fully Accessible,Generally accessible,student_count_asian_perc,student_count_asian_pacific_islander_perc,student_count_black_perc,student_count_english_learners_perc,student_count_ethnicity_not_available_perc,student_count_hawaiian_pacific_islander_perc,student_count_hispanic_perc,student_count_low_income_perc,student_count_multi_perc,student_count_native_american_perc,student_count_other_ethnicity_perc,student_count_special_ed_perc,student_count_white_perc,Classroom_Languages_count
0,Ms. Irma Plaxico,,N,9101112,,,400124,School Year 2016-2017,124,Green,60616.0,0,0,0.0,0.0,0.944,0.016,0.0,0.0,0.056,0.976,0.0,0.0,0.0,0.234,0.0,0
1,Ms. Irma Plaxico,,N,9101112,,Options,400124,School Year 2017-2018,153,Green,,0,0,0.0,0.0,0.974,0.007,0.0,0.0,0.02,0.634,0.0,0.0,0.0,0.242,0.007,0
2,Juan Carlos Ocon,"French, Spanish",N,9101112,80.9,,609764,School Year 2016-2017,1739,"Blue, Pink, Red",60608.0,0,1,0.003,0.0,0.033,0.19,0.006,0.0,0.945,0.954,0.002,0.003,0.0,0.162,0.007,2
3,Juan Carlos Ocon,"French, Spanish",N,9101112,78.6,ISP,609764,School Year 2017-2018,1706,"Blue, Pink, Red",,0,1,0.006,0.0,0.031,0.201,0.008,0.0,0.943,0.626,0.004,0.004,0.0,0.162,0.006,2
4,Ms. Carrie Spitz,"Russian, Spanish",Y,9101112,90.3,,400054,School Year 2016-2017,992,"Blue, Brown",60639.0,0,1,0.001,0.0,0.03,0.121,0.0,0.0,0.953,0.944,0.002,0.005,0.0,0.107,0.009,2


## El Dummy Transform

In [18]:
# As with languages, the El values look consistent across years, so the dummies
# are built from the 2017-18 rows only and merged back on school_id.
# .copy() keeps el_df independent of df (no SettingWithCopyWarning).
el_df = df.loc[
    df['school_year'] == 'School Year 2017-2018',
    ['school_id', 'transportation_el'],
].copy()
# One indicator column per El line. Missing values become the placeholder
# 'No_El', whose column is dropped so that all-zero means "no El line".
# errors='ignore' keeps the cell working when there are no missing values.
el_dummies = (
    el_df['transportation_el']
    .fillna('No_El')
    .str.get_dummies(sep=', ')
    .drop(columns='No_El', errors='ignore')
)
# el_df ends up as school_id plus one 0/1 column per line.
el_df = el_df[['school_id']].join(el_dummies)
df = pd.merge(df, el_df, on='school_id').drop(columns=['transportation_el'])

In [19]:
# Print the 0/1 split of every El line to help decide which dummy column to drop.
line_list = ['Blue','Brown','Green','Orange','Pink','Purple','Red']
print(*(el_df[line].value_counts() for line in line_list), sep='\n')

0    128
1     54
Name: Blue, dtype: int64
0    168
1     14
Name: Brown, dtype: int64
0    138
1     44
Name: Green, dtype: int64
0    164
1     18
Name: Orange, dtype: int64
0    170
1     12
Name: Pink, dtype: int64
0    177
1      5
Name: Purple, dtype: int64
0    123
1     59
Name: Red, dtype: int64


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364 entries, 0 to 363
Data columns (total 33 columns):
administrator                                   364 non-null object
classroom_languages                             277 non-null object
dress_code                                      364 non-null object
grades_offered_all                              364 non-null object
graduation_rate_school                          243 non-null float64
network                                         179 non-null object
school_id                                       364 non-null int64
school_year                                     364 non-null object
student_count_total                             364 non-null int64
zip                                             182 non-null float64
Fully Accessible                                364 non-null uint8
Generally accessible                            364 non-null uint8
student_count_asian_perc                        362 non-null float64
student_coun

## Make Dress Code Binary

In [21]:
# Binary dress-code flag: 1 where dress_code is 'Y', 0 otherwise.
# The previous get_dummies(..., drop_first=True) produced a DataFrame and only
# worked when exactly two categories were present; the explicit comparison gives
# the same 0/1 column for Y/N data and a fixed dtype on every pandas version.
# NOTE(review): assumes dress_code holds only 'Y'/'N', as in the rows shown — confirm.
df['Dress_Code_dummie'] = (df['dress_code'] == 'Y').astype('uint8')
df = df.drop(columns='dress_code')

## Grades offered count

In [22]:
# Grades are comma separated, so grades offered = (number of commas + 1).
# Built as one chain on a de-duplicated selection, so nothing is assigned onto a
# slice of df (no SettingWithCopyWarning). The earlier sort_values() call
# discarded its result and has been removed.
df_grades = (
    df[['grades_offered_all', 'school_id']]
    .drop_duplicates('school_id')
    .assign(
        grades_offered_count=lambda grades: grades['grades_offered_all'].str.count(',') + 1
    )
    .drop(columns='grades_offered_all')
)
# Attach the count to every row, then drop the raw text columns that have now
# been replaced by counts.
df = pd.merge(df, df_grades, on='school_id').drop(
    columns=['classroom_languages', 'grades_offered_all']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [23]:
list(df)

['administrator',
 'graduation_rate_school',
 'network',
 'school_id',
 'school_year',
 'student_count_total',
 'zip',
 'Fully Accessible',
 'Generally accessible',
 'student_count_asian_perc',
 'student_count_asian_pacific_islander_perc',
 'student_count_black_perc',
 'student_count_english_learners_perc',
 'student_count_ethnicity_not_available_perc',
 'student_count_hawaiian_pacific_islander_perc',
 'student_count_hispanic_perc',
 'student_count_low_income_perc',
 'student_count_multi_perc',
 'student_count_native_american_perc',
 'student_count_other_ethnicity_perc',
 'student_count_special_ed_perc',
 'student_count_white_perc',
 'Classroom_Languages_count',
 'Blue',
 'Brown',
 'Green',
 'Orange',
 'Pink',
 'Purple',
 'Red',
 'Dress_Code_dummie',
 'grades_offered_count']

## Charter Dummy

In [24]:
df_networks = df[['school_id', 'network', 'school_year']]

In [25]:
# Network is only populated for 2017-18, so take those rows. .copy() makes the
# result an independent frame, so the column added to it later does not raise
# SettingWithCopyWarning.
df_networks2017 = df_networks[df_networks['school_year']=='School Year 2017-2018'].copy()

In [26]:
def charter(row):
    """Return 1 when the row's 'network' value is 'Charter', otherwise 0."""
    return 1 if row['network'] == 'Charter' else 0


In [27]:
# Flag charter schools from the 2017-18 network value and attach the flag to all
# rows of the school. assign() builds a new frame rather than writing a column
# onto a slice, which is what raised SettingWithCopyWarning; charter is passed
# to apply directly, without a wrapping lambda.
df_networks2017 = (
    df_networks2017
    .assign(charter=df_networks2017.apply(charter, axis=1))
    .drop(columns=['school_year', 'network'])
)
df = pd.merge(df, df_networks2017, on='school_id').drop(columns='network')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Admin. Gender

In [28]:
import re

In [29]:
df_admin = df[['administrator', 'school_id']]

In [30]:
df_admin.drop_duplicates('school_id')

Unnamed: 0,administrator,school_id
0,Ms. Irma Plaxico,400124
2,Juan Carlos Ocon,609764
4,Ms. Carrie Spitz,400054
6,Ms.Shannae Bea Jackson,609726
8,Tawanna Patton,400094
10,Ms.Mary Bradley,400175
12,Joyce Dorsey Kenner,609755
14,Mr. Christopher Goins,400156
16,Ms.Allison C Tingwall,609756
18,Mrs.Patricia J Harper Reynolds,610518


In [31]:
# Pattern for the honorifics Mrs / Ms / Mr; the next cell applies it with
# str.match, which anchors it at the start of the administrator name.
# NOTE(review): the name `gender` is rebound to a function in a later cell, so
# this cell and the one using the pattern must run before that definition.
gender = re.compile(r'Mrs|Ms|Mr')
#female_title = re.compile(r'Mrs|Ms')

In [32]:
# Mark administrators whose name starts with an honorific, then keep one row per
# school. Built with assign() / a returned frame rather than in-place edits on a
# slice of df, which raised SettingWithCopyWarning.
df_admin = (
    df_admin
    .assign(gender_marker=df_admin['administrator'].astype(str).str.match(gender))
    .drop_duplicates('school_id')
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
def _name_prefix_pattern(prefixes):
    """Compile a regex that matches any of the given literal name prefixes."""
    return re.compile('|'.join(re.escape(prefix) for prefix in prefixes))


# The patterns used to be single raw strings continued with a backslash-newline.
# Inside a raw string the backslash, the newline and the next line's indentation
# stay in the pattern, so the first name on each continuation line could never
# match. Building the alternation from a list avoids this, and re.escape makes
# the '.' in "Dr." prefixes literal.
male_marker = _name_prefix_pattern([
    'Mr', 'Juan', 'Richard', 'Ali', 'Kevin', 'Douglas', 'Raul', 'Victor', 'Abdul',
    'Charles', 'Antonio', 'Brian', 'Francisco', 'Sheldon', 'Michael', 'Stephen',
    'Peter', 'Gregory', 'Trent', 'Myron', 'Gerald', 'Elias', 'Octavio', 'Matthew',
    'David', 'Leonard', 'Ferdinand', 'Fernando', 'Mark', 'Patrick', 'George',
    'Wayne', 'Anthony', 'William', 'Timothy', 'Paul',
])
female_marker = _name_prefix_pattern([
    'Mrs', 'Ms', 'Dr. Hillyn', 'Sharnette', 'Tressie', 'Leticia', 'Priscilla',
    'Joyce', 'Stephanie', 'Tanya', 'Veronica', 'Kathy', 'Sandra', 'Torry',
    'Carolyn', 'Milena', 'Vanesa', 'Breanda', 'Laura', 'Kelly', 'Anna', 'Nancy',
    'Tamika', 'Janice', 'Mary', 'Shanele', 'Falilat', 'Dr.Femi', 'Noel',
    'Tawanna', 'Tonya', 'Dr. Vanesa',
])
# str.match anchors at the start of the string, so gender_f is True when the
# administrator name begins with a female title or listed first name.
# assign() returns a new frame instead of writing onto a slice of df.
df_admin = (
    df_admin
    .assign(gender_f=df_admin['administrator'].astype(str).str.match(female_marker))
    .drop(columns=['administrator', 'gender_marker'])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [34]:
# Attach the per-school columns of df_admin to every row of df.
df = df.merge(df_admin, on='school_id')
def gender(row):
    """Return 1 when the row's 'gender_f' value equals True, otherwise 0."""
    return 1 if row['gender_f'] == True else 0
# Turn the boolean gender_f flag into a 0/1 column, then drop the helper columns.
df['gender_f_bn'] = df.apply(gender, axis=1)
df = df.drop(columns=['administrator', 'gender_f'])

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364 entries, 0 to 363
Data columns (total 32 columns):
graduation_rate_school                          243 non-null float64
school_id                                       364 non-null int64
school_year                                     364 non-null object
student_count_total                             364 non-null int64
zip                                             182 non-null float64
Fully Accessible                                364 non-null uint8
Generally accessible                            364 non-null uint8
student_count_asian_perc                        362 non-null float64
student_count_asian_pacific_islander_perc       362 non-null float64
student_count_black_perc                        362 non-null float64
student_count_english_learners_perc             362 non-null float64
student_count_ethnicity_not_available_perc      362 non-null float64
student_count_hawaiian_pacific_islander_perc    362 non-null float64
student

## Reflect and log transform graduation rate

In [36]:
import seaborn as sns
import matplotlib.pyplot as plt

In [37]:
# Reflect the graduation rate around 100, i.e. 100 minus the rate.
df['Grad_Rate_Reflected'] = df['graduation_rate_school'].rsub(100)

In [38]:
# Natural log of the reflected rate (100 - graduation rate).
# NOTE(review): a graduation rate of exactly 100 reflects to 0 and np.log(0) is
# -inf — confirm no such rows reach the model.
df['log_grad_rate'] = np.log(df['Grad_Rate_Reflected'])

In [39]:
# Only the log-transformed column is kept; drop the raw and reflected rates.
df = df.drop(columns=['graduation_rate_school', 'Grad_Rate_Reflected'])

## Merge Income Data Per Zip for 16/17 and 17/18

In [40]:
# Give every row of a school that school's 2016-17 zip value.
zip_2016 = df.loc[df.school_year == 'School Year 2016-2017', ['school_id', 'zip']]
# After the merge, zip_x is the per-row original and zip_y the 2016-17 value;
# keep the latter under the plain name 'zip'.
df = (
    pd.merge(df, zip_2016, on='school_id')
    .drop('zip_x', axis=1)
    .rename(columns={'zip_y': 'zip'})
)

In [41]:
# 2016 mean household income per zip, joined onto the 2016-17 school rows.
df_2016_income = pd.read_csv('data/meanHHincome_2016.csv', header=None)
df_2016_income.columns = ['zip', 'Zip_Mean_Income']
in_2016 = df['school_year'] == 'School Year 2016-2017'
df_2016 = df.loc[in_2016].merge(df_2016_income, on='zip')

In [42]:
# 2017 mean household income per zip, joined onto the 2017-18 school rows.
df_2017_income = pd.read_csv('data/meanHHincome_2017.csv', header=None)
df_2017_income.columns = ['zip', 'Zip_Mean_Income']
in_2017 = df['school_year'] == 'School Year 2017-2018'
df_2017 = df.loc[in_2017].merge(df_2017_income, on='zip')

In [43]:
# Stack the two yearly frames back together. DataFrame.append is deprecated and
# removed in pandas 2.0; pd.concat gives the same result with the same index.
df = pd.concat([df_2016, df_2017])

In [44]:
print(list(df))

['school_id', 'school_year', 'student_count_total', 'Fully Accessible', 'Generally accessible', 'student_count_asian_perc', 'student_count_asian_pacific_islander_perc', 'student_count_black_perc', 'student_count_english_learners_perc', 'student_count_ethnicity_not_available_perc', 'student_count_hawaiian_pacific_islander_perc', 'student_count_hispanic_perc', 'student_count_low_income_perc', 'student_count_multi_perc', 'student_count_native_american_perc', 'student_count_other_ethnicity_perc', 'student_count_special_ed_perc', 'student_count_white_perc', 'Classroom_Languages_count', 'Blue', 'Brown', 'Green', 'Orange', 'Pink', 'Purple', 'Red', 'Dress_Code_dummie', 'grades_offered_count', 'charter', 'gender_f_bn', 'log_grad_rate', 'zip', 'Zip_Mean_Income']


In [45]:
def replace_comma(row):
    """Return the row's 'Zip_Mean_Income' as a float, ignoring thousands separators.

    The value is converted to text first, so values that are already numeric
    (for example when the cell is re-run after the column was converted) are
    accepted instead of raising AttributeError.
    """
    return float(str(row['Zip_Mean_Income']).replace(',', ''))

# Convert the comma-formatted income text of every row to a float.
df['Zip_Mean_Income'] = df.apply(replace_comma, axis=1)


In [46]:
df.head()

Unnamed: 0,school_id,school_year,student_count_total,Fully Accessible,Generally accessible,student_count_asian_perc,student_count_asian_pacific_islander_perc,student_count_black_perc,student_count_english_learners_perc,student_count_ethnicity_not_available_perc,student_count_hawaiian_pacific_islander_perc,student_count_hispanic_perc,student_count_low_income_perc,student_count_multi_perc,student_count_native_american_perc,student_count_other_ethnicity_perc,student_count_special_ed_perc,student_count_white_perc,Classroom_Languages_count,Blue,Brown,Green,Orange,Pink,Purple,Red,Dress_Code_dummie,grades_offered_count,charter,gender_f_bn,log_grad_rate,zip,Zip_Mean_Income
0,400124,School Year 2016-2017,124,0,0,0.0,0.0,0.944,0.016,0.0,0.0,0.056,0.976,0.0,0.0,0.0,0.234,0.0,0,0,0,1,0,0,0,0,0,4,0,1,,60616.0,64486.0
1,400134,School Year 2016-2017,161,0,0,0.006,0.0,0.267,0.062,0.0,0.0,0.683,0.963,0.006,0.0,0.0,0.149,0.037,0,0,0,0,1,0,0,0,1,4,0,1,,60616.0,64486.0
2,400064,School Year 2016-2017,388,1,0,0.01,0.0,0.869,0.021,0.0,0.005,0.101,0.863,0.01,0.0,0.0,0.193,0.005,1,0,0,1,0,0,0,1,1,7,1,1,2.66,60616.0,64486.0
3,400105,School Year 2016-2017,322,0,0,0.0,0.0,0.975,0.0,0.0,0.0,0.012,0.814,0.009,0.003,0.0,0.193,0.0,2,0,0,1,0,0,0,1,1,4,1,0,2.868,60616.0,64486.0
4,609676,School Year 2016-2017,455,0,1,0.0,0.0,0.965,0.02,0.0,0.0,0.033,0.965,0.0,0.002,0.0,0.273,0.0,2,0,0,0,0,0,0,0,0,4,0,0,3.35,60616.0,64486.0


In [47]:
# df.to_csv('data/hs_formodeling.csv', index=False)

## Pickle 29 Feature Columns

In [None]:
# Remove the identifier-style columns before the frame is saved.
df = df.drop(columns=['school_id', 'zip', 'school_year'])

In [None]:
# Serialize df to disk with pickle.
# NOTE(review): open() does not create directories, so data/pickles/ must
# already exist — confirm.
with open('data/pickles/from_sql_29feat_formodeling.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)