In [208]:
import os, sys

# Build the absolute path to the project root from the current working
# directory and append it to sys.path.
home_folder = 'CPS_GradRate_Analysis'
full_path = os.getcwd()
# Keep everything before the first occurrence of the project folder name, then
# re-attach the folder name with a trailing slash. If the name does not occur
# in the working directory, it is simply appended to the working directory.
root = f"{full_path.partition(home_folder)[0]}{home_folder}/"
sys.path.append(root)


import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt

import sklearn.pipeline
from sklearn.linear_model import LinearRegression

from sklearn.impute import SimpleImputer
from statsmodels.graphics.gofplots import qqplot


from src.preprocessing.preprocessing_schoolid import merge_pr_and_sp
from src.preprocessing.preprocessing_schoolid import create_df_for_modeling
from src.preprocessing.preprocessing_schoolid import isolate_high_schools,\
                                                     drop_no_gr_schools,\
                                                     drop_specialed_options,\
                                                     make_percent_low_income,\
                                                     prep_teacher_attendance

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Exploring Pipeline 

In [24]:
from sklearn.model_selection import train_test_split, cross_validate

# Relative locations of the SY1819 School Profile (sp) and Progress Report (pr) CSV files.
path_to_sp_csv = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Profile_Information_SY1819.csv'
path_to_pr_csv = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Progress_Reports_SY1819.csv'

# Project helper; presumably joins the two files into one frame -- see src.preprocessing.
hs_201819 = merge_pr_and_sp(path_to_sp_csv, path_to_pr_csv)

# Separate the target column from the remaining columns.
y = hs_201819.loc[:, 'Graduation_Rate_School']
X = hs_201819.drop(columns='Graduation_Rate_School')

# Seeded split so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Re-attach the target to the training rows (aligned on index) so the
# row-filtering helpers can work on a single frame.
train_df = X_train.merge(y_train, left_index=True, right_index=True)



In [27]:
# Run the training frame through the three project filtering helpers, in order.
# NOTE(review): each helper is assumed to return a DataFrame -- confirm in src.preprocessing.
train_df = (
    train_df
    .pipe(isolate_high_schools)
    .pipe(drop_no_gr_schools)
    .pipe(drop_specialed_options)
)

In [62]:
# Pipeline for TTS with Creative School Encoding, Dress Code

from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, Binarizer

In [63]:
# Encode the two categorical predictors and discard every other column.
col_trans = ColumnTransformer(
    transformers=[
        # One-hot encode the certification level, dropping the 'EMERGING' category.
        (
            'creative_school',
            OneHotEncoder(drop=['EMERGING']),
            ['Creative_School_Certification'],
        ),
        # Binarizer with its default threshold on the dress-code flag.
        ('dress_code', Binarizer(), ['Dress_Code']),
    ],
    remainder='drop',
)

In [64]:
# Fit the transformer on the training frame and display the encoded matrix.
col_trans.fit_transform(train_df)

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0.

In [50]:
# This throws an error: Binarizer does not provide get_feature_names(), so the
# ColumnTransformer cannot assemble the output column names.
# The exception is the point of this cell, so catch and print it instead of
# letting it halt a Restart & Run All.
try:
    print(col_trans.get_feature_names())
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: Transformer dress_code (type Binarizer) does not provide get_feature_names.

One solution to the problem above is to use the OneHotEncoder on dress code instead of the Binarizer.

In [164]:
# Unlike col_trans, Dress_Code is one-hot encoded here (dropping the False
# category) rather than passed through Binarizer.
categorical_steps = [
    ('creative_school', OneHotEncoder(drop=['EMERGING']), ['Creative_School_Certification']),
    ('dress_code', OneHotEncoder(drop=[False]), ['Dress_Code']),
]
# Columns not listed above are discarded.
cc_trans = ColumnTransformer(transformers=categorical_steps, remainder='drop')

In [134]:
# Fit the all-OneHotEncoder transformer on the training frame and display the encoded matrix.
cc_trans.fit_transform(train_df)

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0.

In [136]:
# Now I can use get_feature_names(): both transformers in cc_trans are
# OneHotEncoders, so the call that failed on col_trans succeeds here.
cc_trans.get_feature_names()

['creative_school__x0_DEVELOPING',
 'creative_school__x0_EXCELLING',
 'creative_school__x0_INCOMPLETE DATA',
 'creative_school__x0_STRONG',
 'dress_code__x0_True']

In [137]:
from sklearn.pipeline import make_pipeline

# Chain the categorical encoder with a linear regression so that raw rows can
# be passed straight to fit/predict.
pipe = make_pipeline(
    cc_trans,
    LinearRegression(),
)

In [138]:
# Check the dimensions (rows, encoded columns) of the transformed training data.
cc_trans.fit_transform(train_df).shape

(97, 5)

In [139]:
# train_df still contains the target column, but cc_trans was built with
# remainder='drop', so only the two listed categorical columns reach the regression.
pipe.fit(X=train_df, y=train_df['Graduation_Rate_School'])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('creative_school',
                                                  OneHotEncoder(drop=['EMERGING']),
                                                  ['Creative_School_Certification']),
                                                 ('dress_code',
                                                  OneHotEncoder(drop=[False]),
                                                  ['Dress_Code'])])),
                ('linearregression', LinearRegression())])

In [162]:
# Feature names from the fitted ColumnTransformer step. Read them next to the
# regression's coef_ to see the effect of each feature by name.
pipe.named_steps['columntransformer'].get_feature_names()

['creative_school__x0_DEVELOPING',
 'creative_school__x0_EXCELLING',
 'creative_school__x0_INCOMPLETE DATA',
 'creative_school__x0_STRONG',
 'dress_code__x0_True']

In [163]:
# Coefficients of the fitted LinearRegression step.
pipe.named_steps['linearregression'].coef_

array([ 6.05432371, 13.00709102, 15.33317811,  7.7797619 , -4.97937909])

Is column Transformer really necessary here? Couldn't we have just specified the columns in the Pipeline?
Yes, if the data has already been preprocessed appropriately.  But if the data is coming in in raw form, the pipeline will not work correctly, because certain columns need to be specified.

# 5 Feature Model

Model with Student Count, % Low income, creative school certification, dress code, and teacher attendance

In [175]:
# Raw data intake and train-test split
from sklearn.model_selection import train_test_split, cross_validate

# Relative locations of the SY1819 School Profile (sp) and Progress Report (pr) CSV files.
path_to_sp_csv = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Profile_Information_SY1819.csv'
path_to_pr_csv = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Progress_Reports_SY1819.csv'

# Project helper; presumably joins the two files into one frame -- see src.preprocessing.
hs_201819 = merge_pr_and_sp(path_to_sp_csv, path_to_pr_csv)

# Separate the target column from the remaining columns.
y = hs_201819.loc[:, 'Graduation_Rate_School']
X = hs_201819.drop(columns='Graduation_Rate_School')

# Seeded split so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Re-attach the target to the training rows (aligned on index) so the
# preparation helpers can work on a single frame.
train_df = X_train.merge(y_train, left_index=True, right_index=True)

In [190]:
#prep train data
# NOTE(review): each helper is assumed to return a DataFrame -- confirm in src.preprocessing.
train_df = (
    train_df
    #remove elementary schools and preschools
    .pipe(isolate_high_schools)
    #remove schools with no graduation rates
    .pipe(drop_no_gr_schools)
    #drop both special ed schools and options schools because of non-standard grad rates
    .pipe(drop_specialed_options)
    #create column with percentage of low income students in a school
    .pipe(make_percent_low_income)
    #create column with teacher attendance numbers
    .pipe(prep_teacher_attendance)
)



In [204]:
#subset columns in training set
mask_5_predictors = [
    'Student_Count_Total',
    'perc_low_income',
    'teacher_attendance',
    'Creative_School_Certification',
    'Dress_Code',
]

# Feature frame restricted to the five predictors, and the matching target.
# Note: this rebinds `y`, which previously held the target for the full data set.
predictors = train_df.loc[:, mask_5_predictors]
y = train_df.loc[:, 'Graduation_Rate_School']

In [243]:
#The categorical transformer makes it easy to specify how to preprocess the categorical columns
categorical_steps = [
    ('creative_school', OneHotEncoder(drop=['EMERGING']), ['Creative_School_Certification']),
    ('dress_code', OneHotEncoder(drop=[False]), ['Dress_Code']),
]
# remainder='passthrough' keeps the columns not listed above instead of
# dropping them, so the other predictors reach the model unchanged.
cc_trans = ColumnTransformer(transformers=categorical_steps, remainder='passthrough')

In [244]:
# Encode the categoricals, impute any remaining missing values with
# SimpleImputer's defaults, then fit a linear regression.
pipe_5_features = make_pipeline(
    cc_trans,
    SimpleImputer(),
    LinearRegression(),
)

In [245]:
# Fit the full pipeline (encode -> impute -> regress) on the five predictors.
pipe_5_features.fit(predictors, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('creative_school',
                                                  OneHotEncoder(drop=['EMERGING']),
                                                  ['Creative_School_Certification']),
                                                 ('dress_code',
                                                  OneHotEncoder(drop=[False]),
                                                  ['Dress_Code'])])),
                ('simpleimputer', SimpleImputer()),
                ('linearregression', LinearRegression())])

In [235]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline; training-fold scores are kept
# as well so train and test performance can be compared.
cv_5_pred = cross_validate(
    estimator=pipe_5_features,
    X=predictors,
    y=y,
    cv=5,
    return_train_score=True,
)
cv_5_pred

{'fit_time': array([0.0077951 , 0.0064671 , 0.00620008, 0.00575423, 0.0057652 ]),
 'score_time': array([0.00408792, 0.00343704, 0.00352407, 0.00360894, 0.00372696]),
 'test_score': array([ 0.28076557,  0.08353311,  0.40431821, -0.01376877,  0.27190454]),
 'train_score': array([0.39249572, 0.42400898, 0.3742811 , 0.41371051, 0.40998664])}

In [236]:
# Average score over the five training folds.
cv_5_pred['train_score'].mean()

0.4028965897366016

In [237]:
# Average score over the five held-out folds.
cv_5_pred['test_score'].mean()

0.2053505295706576

In [238]:
# Display the full cross-validation result dict again (timings plus per-fold scores).
cv_5_pred

{'fit_time': array([0.0077951 , 0.0064671 , 0.00620008, 0.00575423, 0.0057652 ]),
 'score_time': array([0.00408792, 0.00343704, 0.00352407, 0.00360894, 0.00372696]),
 'test_score': array([ 0.28076557,  0.08353311,  0.40431821, -0.01376877,  0.27190454]),
 'train_score': array([0.39249572, 0.42400898, 0.3742811 , 0.41371051, 0.40998664])}

## Fit on entire predictor set to see params

In [250]:
# Refit on all of the training predictors (no folds) so the fitted parameters can be inspected.
pipe_5_features.fit(predictors, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('creative_school',
                                                  OneHotEncoder(drop=['EMERGING']),
                                                  ['Creative_School_Certification']),
                                                 ('dress_code',
                                                  OneHotEncoder(drop=[False]),
                                                  ['Dress_Code'])])),
                ('simpleimputer', SimpleImputer()),
                ('linearregression', LinearRegression())])

In [248]:
# Score on the same data the pipeline was fitted on, i.e. an in-sample score.
pipe_5_features.score(predictors, y)

0.3933455499426064

In [255]:
# Coefficients of the fitted regression step, one per column produced by the transformer.
pipe_5_features.named_steps['linearregression'].coef_

array([ 1.47879167e+00,  4.82406603e+00,  1.06141388e+01,  4.49528135e+00,
       -1.41306250e-01,  4.89175517e-03, -2.04129349e+01,  4.90461499e+00])

In [261]:
# List every parameter of the pipeline and of its steps.
pipe_5_features.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('creative_school',
                                    OneHotEncoder(drop=['EMERGING']),
                                    ['Creative_School_Certification']),
                                   ('dress_code', OneHotEncoder(drop=[False]),
                                    ['Dress_Code'])])),
  ('simpleimputer', SimpleImputer()),
  ('linearregression', LinearRegression())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('creative_school',
                                  OneHotEncoder(drop=['EMERGING']),
                                  ['Creative_School_Certification']),
                                 ('dress_code', OneHotEncoder(drop=[False]),
                                  ['Dress_Code'])]),
 'simpleimputer': SimpleImputer(),
 'linearregression': LinearRegression(),
 'column

Do I need to use FeatureUnion to be able to use get_feature_names() to get the column names from the transformer?