In [74]:
%config IPCompleter.greedy=True
import numpy as np
import pandas as pd

# Read dataset off Github
hs_2017 = pd.read_csv("https://github.com/jeffweltman/MSDS7331/raw/master/hs_2017.csv",index_col=0)

In [75]:
Y=hs_2017['GraduationRate_5yr_All']
todrop=hs_2017.columns[hs_2017.columns.str.lower().str.contains('graduation')]
# todrop2=pd.Series('4-Year Cohort Graduation Rate Score')
# todrop=todrop.append(todrop2, ignore_index=True)
# todrop_final = todrop.rename(index=str, columns={0: "Column"})
#hs_2017 = hs_2017.drop(todrop, axis=1) #, inplace=True 
# hs_2017.info()
X_highSchools = hs_2017.drop(todrop, axis=1)

In [76]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GroupShuffleSplit

groups = hs_2017['nc_district'].values

# groupshufflesplit, have tried different parameters
cv = GroupShuffleSplit(test_size=0.1, n_splits=10).split(X_highSchools, Y, groups)

# regular shufflesplit
# cv = ShuffleSplit(n_splits=10, test_size=0.10, random_state=0)


In [77]:
#Use mean absolute error (MAE) to score the regression models created 
#(the scale of MAE is identical to the response variable)
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error

#Function for Root mean squared error
#https://stackoverflow.com/questions/17197492/root-mean-square-error-in-python
def rmse(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

#Function for Mean Absolute Percentage Error (MAPE) - Untested
#Adapted from - https://stackoverflow.com/questions/42250958/how-to-optimize-mape-code-in-python
def mape(y_actual, y_predicted): 
    mask = y_actual != 0
    return (np.fabs(y_actual - y_predicted)/y_actual)[mask].mean() * 100

#Create scorers for rmse and mape functions
mae_scorer = make_scorer(score_func=mean_absolute_error, greater_is_better=False)
rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)
mape_scorer = make_scorer(score_func=mape, greater_is_better=False)

#Make scorer array to pass into cross_validate() function for producing mutiple scores for each cv fold.
errorScoring = {'MAE':  mae_scorer, 
                'RMSE': rmse_scorer,
                'MAPE': mape_scorer
               }

In [78]:
from sklearn.model_selection import cross_validate

def EvaluateRegressionEstimator(regEstimator, X, y, cv):
    
    scores = cross_validate(regEstimator, X, y, scoring=errorScoring, cv=cv, return_train_score=True)

    #cross val score sign-flips the outputs of MAE
    # https://github.com/scikit-learn/scikit-learn/issues/2439
    scores['test_MAE'] = scores['test_MAE'] * -1
    scores['test_MAPE'] = scores['test_MAPE'] * -1
    scores['test_RMSE'] = scores['test_RMSE'] * -1

    #print mean MAE for all folds 
    maeAvg = scores['test_MAE'].mean()
    print_str = "The average MAE for all cv folds is: \t\t\t {maeAvg:.5}"
    print(print_str.format(maeAvg=maeAvg))

    #print mean test_MAPE for all folds
    scores['test_MAPE'] = scores['test_MAPE']
    mape_avg = scores['test_MAPE'].mean()
    print_str = "The average MAE percentage (MAPE) for all cv folds is: \t {mape_avg:.5}"
    print(print_str.format(mape_avg=mape_avg))

    #print mean MAE for all folds 
    RMSEavg = scores['test_RMSE'].mean()
    print_str = "The average RMSE for all cv folds is: \t\t\t {RMSEavg:.5}"
    print(print_str.format(RMSEavg=RMSEavg))
    print('*********************************************************')

    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame()
    scoresResults['MAE'] = scores['test_MAE']
    scoresResults['MAPE'] = scores['test_MAPE']
    scoresResults['RMSE'] = scores['test_RMSE']
    return scoresResults

In [34]:
#Make new estimator compatible for use with GridSearchCV() and cross_validate()
# -  Cap predict function for LinearRegression between 0 and 100
# -  See: Roll your own estimator links above for details. 
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression

class CappedLinearRegression(LinearRegression):

    def predict(self, X):
        return np.clip(super(CappedLinearRegression, self).predict(X), 0, 100)

In [64]:
#Create a Linear Regression object and perform a grid search to find the best parameters
linreg = CappedLinearRegression()
parameters = {'normalize':(True,False), 'fit_intercept':(True,False)}

#Create a grid search object using the  
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=linreg
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # KFolds = 10
                   , scoring=mae_scorer)

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(X_highSchools, Y, groups=strata)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.7s finished


GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, train_size=None),
       error_score='raise',
       estimator=CappedLinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
            normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'normalize': (True, False), 'fit_intercept': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_absolute_error, greater_is_better=False),
       verbose=1)

In [80]:
#Print the parameterization of the best estimator
regGridSearch.best_estimator_

HuberRegressor(alpha=1, epsilon=1.5, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=True)

In [81]:
#Create CappedLinearRegression predictions between 0 and 100% using the best parameters for our Linear Regression object
regEstimator = regGridSearch.best_estimator_

#Evaluate the regression estimator above using our pre-defined cross validation and scoring metrics. 
EvaluateRegressionEstimator(regEstimator, X_highSchools, Y, cv)

The average MAE for all cv folds is: 			 5.5196
The average MAE percentage (MAPE) for all cv folds is: 	 5.6467
The average RMSE for all cv folds is: 			 8.2922
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,MAE,MAPE,RMSE
0,4.718475,5.281303,6.046371
1,9.171422,7.224847,17.709978
2,5.381915,5.855133,7.209808
3,5.124662,4.2841,11.541613
4,5.381915,5.855133,7.209808
5,6.273958,6.932056,8.056124
6,3.661476,4.042716,4.682592
7,5.381915,5.855133,7.209808
8,5.381915,5.855133,7.209808
9,4.718475,5.281303,6.046371


In [83]:
#Create a regression object and perform a grid search to find the best parameters
from sklearn.linear_model import HuberRegressor 

reg = HuberRegressor(epsilon=1.50,fit_intercept=True, alpha=0.0, max_iter=100)

#Test parameters 
alpha = [0.001, 0.1, 1, 10, 20]
warm_start = [True, False]
parameters = {'alpha': alpha, 'warm_start': warm_start}
# epsilon = [1.35, 1.5, 1.75, 1.9]

#Create a grid search object using the parameters above
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=reg
                   , n_jobs=4 # jobs to run in parallel
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # KFolds = 10
                   , scoring=mae_scorer)

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(X_highSchools, Y)

Fitting 0 folds for each of 10 candidates, totalling 0 fits


[Parallel(n_jobs=4)]: Done   0 out of   0 | elapsed:    0.0s finished


ValueError: not enough values to unpack (expected 5, got 0)

In [84]:
#Display the best estimator parameters
regGridSearch.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [85]:
#Create a regression estimator with best parameters for cross validation
regEstimator = regGridSearch.best_estimator_

#Evaluate the regression estimator above using our pre-defined cross validation and scoring metrics.
EvaluateRegressionEstimator(regEstimator, X_highSchools, Y, cv)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

Exceptional Work - GridSearch with Scaling

In [None]:
# Create a regression object and perform a grid search to find the best parameters
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.linear_model import HuberRegressor 

robust_X = RobustScaler(quantile_range=(25, 75)).fit_transform(X_highSchools)
standard_X = StandardScaler().fit_transform(X_highSchools)
MinMax_X = MinMaxScaler().fit_transform(X_highSchools)
MaxAbs_X = MaxAbsScaler().fit_transform(X_highSchools)
UniformQuantile_X = QuantileTransformer(output_distribution='uniform').fit_transform(X_highSchools)
GaussianQuantile_X = QuantileTransformer(output_distribution='normal').fit_transform(X_highSchools)
Normalized_X = Normalizer().fit_transform(X_highSchools)

#Perform hyperparameter search to find the best combination of parameters for our data
for i in Scales:
    regGridSearch.fit(i, Y)
    #Create a regression estimator with best parameters for cross validation
    regEstimator = regGridSearch.best_estimator_
    #Evaluate the regression estimator above using our pre-defined cross validation and scoring metrics.
    EvaluateRegressionEstimator(regEstimator, X_highSchools, Y, cv)