# LOOCV Modeling

This notebook uses Leave One Out Cross Validation (LOOCV). The model is trained on all high schools but one, and the held-out school is then used to generate a test score; this is repeated so that every school is held out once.

In [33]:
import sys
sys.path.append('../..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.cleaning import prep_high_school_dataframe
from src.cleaning import filter_cwoption_special_ed
from src.cleaning import isolate_important_columns

# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Pipelines
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# models
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor

# Validation and metrics
from src.cps_model import print_cv_results
from sklearn.model_selection import LeaveOneOut, cross_validate, cross_val_predict

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
# Source CSVs: school profile (sp) and progress report (pr) files for
# SY1819, plus the matching SY1718 ("prior") files.
path_to_sp = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Profile_Information_SY1819.csv'
path_to_pr = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Progress_Reports_SY1819.csv'
path_to_prior_sp = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Profile_Information_SY1718.csv'
path_to_prior_pr = '../../data/chicago_data_portal_csv_files/Chicago_Public_Schools_-_School_Progress_Reports_SY1718.csv'

# Build the high school frame from the four files, then run it through
# the filter_cwoption_special_ed step.
sy_1819_hs = filter_cwoption_special_ed(
    prep_high_school_dataframe(
        path_to_sp,
        path_to_pr,
        path_to_prior_sp,
        path_to_prior_pr,
    )
)

0 Student Count
2 schools
15     ENGLEWOOD STEM HS
582       YCCS - VIRTUAL
Name: Short_Name_sp, dtype: object
All 0 Student Count Schools Dropped
0 Graduation Rate
2 schools
261    PATHWAYS - AVONDALE HS
343     NORTHSIDE LEARNING HS
Name: Short_Name_sp, dtype: object
##########
NA Graduation Rates
38 schools
All 0/NA Graduation Rate Schools Dropped


In [35]:
# Create a copy of the original df (target column included) to use for
# EDA explanations
df_for_correlations = sy_1819_hs.copy()

# Separate target from features for modeling dataframes.
# drop() is used without inplace=True so the loaded frame is not mutated;
# sy_1819_hs is rebound to a new frame that no longer has the target.
grad_rates = sy_1819_hs['Graduation_Rate_School']
sy_1819_hs = sy_1819_hs.drop(columns='Graduation_Rate_School')

In [36]:
# Preview the feature frame now that the target column has been removed
sy_1819_hs.head()

Unnamed: 0,School_ID,Legacy_Unit_ID,Finance_ID,Short_Name_sp,Long_Name_sp,Primary_Category_sp,Is_High_School,Is_Middle_School,Is_Elementary_School,Is_Pre_School,...,perc_Student_Count_White,perc_Student_Count_Asian,perc_Student_Count_Native_American,perc_Student_Count_Other_Ethnicity,perc_Student_Count_Asian_Pacific_Islander,perc_Student_Count_Multi,perc_Student_Count_Hawaiian_Pacific_Islander,perc_Student_Count_Ethnicity_Not_Available,Student_Count_Total_1718,student_count_total_change_1_year
1,609754,1800,70070,CHICAGO MILITARY HS,Chicago Military Academy High School,HS,True,False,False,False,...,0.013559,0.0,0.0,0.0,0.0,0.010169,0.0,0.0,298,-3
2,400051,1930,66141,NOBLE - NOBLE HS,Noble - Noble College Prep,HS,True,False,False,False,...,0.012121,0.007576,0.006061,0.0,0.0,0.004545,0.0,0.001515,668,-8
3,609678,1060,47021,JONES HS,William Jones College Preparatory High School,HS,True,False,False,False,...,0.391623,0.142932,0.003141,0.0,0.0,0.041885,0.005759,0.0,1918,-8
4,400085,3343,66394,ACERO - GARCIA HS,Acero Charter Schools - Major Hector P. Garcia MD,HS,True,False,False,False,...,0.017742,0.003226,0.003226,0.0,0.0,0.0,0.0,0.0,625,-5
5,400117,9034,66574,NOBLE - HANSBERRY HS,Noble - Hansberry College Prep,HS,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001795,0.0,563,-6


## Dummy Regressor (FSM)

I use a Dummy Regressor for a first simple model.  It simply predicts the mean of the graduation rate of the schools in the training set.  

As shown below, the dummy model's LOOCV test RMSE is 12.84 graduation rate percentage points.  This will be the baseline that I will work to improve on. 


In [37]:
# Dictionary tracking the RMSEs of the various models,
# keyed by model name -> {'train': ..., 'test': ...}
rmse = {}


In [38]:
def loocv_model_evaluator(estimator, 
                         X=sy_1819_hs, y=grad_rates, 
                         features=None, 
                         return_resids=False):
    '''
    Perform Leave One Out Cross Validation on a regression model.
    Optionally build a residual array using cross_val_predict.

    Parameters:
    estimator: regression algorithm (or pipeline) to evaluate
    X: independent features.  NOTE: the default is the sy_1819_hs object
       that exists when this function is defined; reassigning sy_1819_hs
       afterwards does not change the default, so pass X explicitly
       once the name has been rebound.
    y: graduation rate target (default bound at definition time as well)
    features:  A list of column names to subset X
    return_resids: boolean to return residuals for inspecting

    Returns:
    (train_rmse, test_rmse), plus the residual array as a third element
    if return_resids = True.
    train_rmse: mean over the folds of each fold's training RMSE
    test_rmse: square root of the mean of the folds' test MSE
    resids: held-out prediction minus the matching value of y
    '''

    # Subset the independent variables if a feature list is passed
    if features:
        X = X[features]

    loo = LeaveOneOut()
    cv = cross_validate(estimator, X, y,
                        cv=loo, scoring=['neg_mean_squared_error'],
                        return_train_score=True)

    # Each test fold holds a single observation, so the test MSEs are
    # averaged across folds before the square root is taken.
    train_rmse = np.sqrt(-cv['train_neg_mean_squared_error']).mean()
    test_rmse = np.sqrt(-cv['test_neg_mean_squared_error'].mean())

    if not return_resids:
        return train_rmse, test_rmse

    # Only refit for the out-of-fold predictions when the residuals are
    # actually requested.
    y_hat_test = cross_val_predict(estimator, X, y, cv=loo)

    # Compare against the y that was passed in, not the notebook-level
    # grad_rates, so residuals stay correct for any target.
    resids = y_hat_test - y

    return train_rmse, test_rmse, resids


In [39]:
# Baseline model: DummyRegressor with its default settings
train, test = loocv_model_evaluator(DummyRegressor())
rmse['dummy'] = {'train': train, 'test': test}

# The evaluator returns root mean squared errors for both the train and
# test folds, so the printout is labelled as RMSE rather than MSE.
print(f'''RMSE - LOOCV (train / test):
{rmse['dummy']}''')

Mean Squared Error - Test Set LOOCV:
{'train': 12.733353663169186, 'test': 12.838397868216594}


## Student Count Total

I will use Student_Count_Total for the first model that includes predictive features.  I use this because Student_Count_Total has a relatively high correlation with graduation rate (about 0.45, shown below).

In [40]:
print("High correlation between total students and graduation rate:")

# Correlation matrix of the feature and the target; display the row
# belonging to Student_Count_Total.
(
    df_for_correlations[['Student_Count_Total', 'Graduation_Rate_School']]
    .corr()
    .loc['Student_Count_Total']
)

High correlation between total students and graduation rate:


Student_Count_Total       1.000000
Graduation_Rate_School    0.447154
Name: Student_Count_Total, dtype: float64

In [41]:
# Simple linear regression on the single Student_Count_Total feature
train, test = loocv_model_evaluator(LinearRegression(),
                                    features=['Student_Count_Total'])

rmse['lr_simple_sct'] = {'train': train, 'test': test}

# The evaluator returns root mean squared errors for both the train and
# test folds, so the printout is labelled as RMSE rather than MSE.
print(f'''RMSE - LOOCV (train / test):
{rmse['lr_simple_sct']}''')

Mean Squared Error - Test Set LOOCV:
{'train': 11.388919305634172, 'test': 11.627259011576475}


Using a simple linear regression with 1 relatively highly correlated feature decreases the test RMSE by 1.21 points compared to the dummy baseline.

In [42]:
# Test RMSE improvement of the single-feature model over the dummy baseline
# (positive = lower error than the baseline)
rmse['dummy']['test'] -  rmse['lr_simple_sct']['test']

1.2111388566401189

## Add Networks 

In [43]:
# Fill missing Network values with an explicit 'missing_network' label.
# The column is assigned in place on the existing frame: that same frame
# is the default X captured by loocv_model_evaluator, so the evaluator
# sees the filled values without X being passed explicitly.
si_net = SimpleImputer(strategy='constant', fill_value='missing_network')
sy_1819_hs['Network'] = si_net.fit_transform(sy_1819_hs[['Network']])

# One hot encode the networks (first level dropped), pass the remaining
# columns through untouched, and feed the result to a linear regression.
network_transformer = make_column_transformer(
    (OneHotEncoder(drop='first'), ['Network']),
    remainder='passthrough',
)
pipe = make_pipeline(network_transformer, LinearRegression())


In [44]:
# Student count plus one-hot encoded Network; displays (train_rmse, test_rmse).
# The result is only shown here, it is not stored in the rmse dict.
loocv_model_evaluator(pipe, features=['Student_Count_Total', 'Network'])

(9.966502158094304, 10.902603599286637)

## All Confident Columns


In [45]:
# Rebind sy_1819_hs to the frame returned by isolate_important_columns
# (called with confident_columns=True).
sy_1819_hs = isolate_important_columns(sy_1819_hs, confident_columns=True)
# Fill missing Network values with the 'missing_network' label.
# NOTE(review): Network was already imputed in the previous section, so this
# looks redundant unless isolate_important_columns reintroduces missing
# values - confirm.
si_net = SimpleImputer(strategy='constant', fill_value='missing_network')
sy_1819_hs['Network'] = si_net.fit_transform(sy_1819_hs[['Network']])

sy_1819_hs.head()

Unnamed: 0,perc_Student_Count_Low_Income,perc_Student_Count_Special_Ed,perc_Student_Count_English_Learners,perc_Student_Count_Black,perc_Student_Count_Hispanic,perc_Student_Count_White,perc_Student_Count_Asian,perc_Student_Count_Native_American,perc_Student_Count_Other_Ethnicity,perc_Student_Count_Asian_Pacific_Islander,perc_Student_Count_Multi,perc_Student_Count_Hawaiian_Pacific_Islander,perc_Student_Count_Ethnicity_Not_Available,Student_Count_Total_1718,student_count_total_change_1_year,Dress_Code,Network
1,0.715254,0.091525,0.061017,0.637288,0.338983,0.013559,0.0,0.0,0.0,0.0,0.010169,0.0,0.0,298,-3,True,Network 17
2,0.836364,0.140909,0.098485,0.112121,0.856061,0.012121,0.007576,0.006061,0.0,0.0,0.004545,0.0,0.001515,668,-8,True,Charter
3,0.373298,0.053927,0.010995,0.119372,0.295288,0.391623,0.142932,0.003141,0.0,0.0,0.041885,0.005759,0.0,1918,-8,False,ISP
4,0.935484,0.151613,0.248387,0.009677,0.966129,0.017742,0.003226,0.003226,0.0,0.0,0.0,0.0,0.0,625,-5,True,Charter
5,0.800718,0.213645,0.014363,0.989228,0.008977,0.0,0.0,0.0,0.0,0.0,0.0,0.001795,0.0,563,-6,True,Charter


In [46]:
# X is passed explicitly: the evaluator's default X was bound when the
# function was defined, before sy_1819_hs was reassigned above.
# NOTE(review): the next cell repeats this exact call and records the result,
# so this LOOCV run is redundant.
loocv_model_evaluator(pipe, sy_1819_hs)

(7.602764277593577, 10.988016065200995)

In [48]:
# Network pipeline fit on every column of the current sy_1819_hs frame.
# X is passed explicitly because the evaluator's default X was bound when
# the function was defined.
train, test = loocv_model_evaluator(pipe, sy_1819_hs)

rmse['lr_confident'] = {'train': train, 'test': test}

# The evaluator returns root mean squared errors for both the train and
# test folds, so the printout is labelled as RMSE rather than MSE.
print(f'''RMSE - LOOCV (train / test):
{rmse['lr_confident']}''')

Mean Squared Error - Test Set LOOCV:
{'train': 7.602764277593577, 'test': 10.988016065200995}


In [49]:
# Summary of the train/test RMSE recorded for each model so far
rmse

{'dummy': {'train': 12.733353663169186, 'test': 12.838397868216594},
 'lr_simple_sct': {'train': 11.388919305634172, 'test': 11.627259011576475},
 'lr_confident': {'train': 7.602764277593577, 'test': 10.988016065200995}}