In [1]:
%config IPCompleter.greedy=True
import numpy as np
import pandas as pd

# Load the hs_2017 dataset from GitHub; the first CSV column becomes the row index.
hs_2017 = pd.read_csv(
    "https://github.com/jeffweltman/MSDS7331/raw/master/hs_2017.csv",
    index_col=0,
)

In [2]:
# Regression target: the 'GraduationRate_5yr_All' column.
Y = hs_2017['GraduationRate_5yr_All']

# Columns whose name contains "graduation" (case-insensitive) are excluded from
# the features; this match also covers the target column itself.
todrop = hs_2017.columns[
    hs_2017.columns.str.lower().str.contains('graduation')
]
X_highSchools = hs_2017.drop(labels=todrop, axis='columns')

In [3]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

# Values of the 'nc_district' column; only referenced by the group-aware
# splitter alternative noted below.
groups = hs_2017['nc_district'].values

# Alternative that was tried with different parameters (kept for reference):
# cv = GroupShuffleSplit(test_size=0.1, n_splits=10).split(X_highSchools, Y, groups)

# Splitter used for regression: 10 shuffled splits, 10% of the rows held out in each.
cv = ShuffleSplit(
    test_size=0.10,
    n_splits=10,
    random_state=0,
)

# Splitter used for classification: same sizes and seed, stratified variant.
class_cv = StratifiedShuffleSplit(
    test_size=0.10,
    n_splits=10,
    random_state=0,
)

In [21]:
#Use mean absolute error (MAE) to score the regression models created 
#(the scale of MAE is identical to the response variable)
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error

#Root mean squared error (RMSE): the square root of mean_squared_error
#https://stackoverflow.com/questions/17197492/root-mean-square-error-in-python
def rmse(y_actual, y_predicted):
    """Return the square root of the mean squared error of the predictions."""
    mse = mean_squared_error(y_actual, y_predicted)
    return np.sqrt(mse)

#Function for Mean Absolute Percentage Error (MAPE)
#Adapted from - https://stackoverflow.com/questions/42250958/how-to-optimize-mape-code-in-python
def mape(y_actual, y_predicted):
    """Mean absolute percentage error, in percent, over rows with a non-zero actual.

    Parameters
    ----------
    y_actual : array-like of true values (pandas Series, list or ndarray).
    y_predicted : array-like of predictions, in the same order as y_actual.

    Returns
    -------
    float
        mean(|actual - predicted| / |actual|) * 100 computed over the rows whose
        actual value is not zero, or NaN when every actual value is zero.
    """
    # Work on plain float arrays so the arithmetic is positional; two pandas
    # objects with different indexes would otherwise be aligned by label.
    actual = np.asarray(y_actual, dtype=float)
    predicted = np.asarray(y_predicted, dtype=float)

    nonzero = actual != 0
    if not nonzero.any():
        # No row has a defined percentage error.
        return np.nan

    # Mask before dividing so zero actuals never reach the division, and divide
    # by the magnitude so negative actuals still give a non-negative error.
    abs_pct_error = np.abs(actual[nonzero] - predicted[nonzero]) / np.abs(actual[nonzero])
    return abs_pct_error.mean() * 100

#Scorers handed to cross_validate() so every cv fold reports several metrics.
#Each one is built with greater_is_better=False, so the reported values come
#back sign-flipped and are negated again where they are consumed.
errorScoring = {
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'RMSE': make_scorer(rmse, greater_is_better=False),
    'MAPE': make_scorer(mape, greater_is_better=False),
}

#Individual handles to the same scorer objects, for use as a single `scoring=` argument
mae_scorer = errorScoring['MAE']
rmse_scorer = errorScoring['RMSE']
mape_scorer = errorScoring['MAPE']

In [5]:
from sklearn.model_selection import cross_validate

def EvaluateRegressionEstimator(regEstimator, X, y, cv):
    """Cross-validate a regressor and report MAE, MAPE and RMSE for every fold.

    Runs cross_validate with the module-level ``errorScoring`` scorers, prints
    the mean of each test metric across the folds, and returns the per-fold
    values.

    Parameters
    ----------
    regEstimator : estimator forwarded to cross_validate.
    X, y : features and target forwarded to cross_validate.
    cv : cross-validation strategy forwarded to cross_validate.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns 'MAE', 'MAPE' and 'RMSE'.
    """
    cv_results = cross_validate(regEstimator, X, y, cv=cv, scoring=errorScoring,
                                return_train_score=True)

    # The error scorers come back sign-flipped, so negate them to get
    # ordinary (positive) error values.
    # https://github.com/scikit-learn/scikit-learn/issues/2439
    fold_mae = cv_results['test_MAE'] * -1
    fold_mape = cv_results['test_MAPE'] * -1
    fold_rmse = cv_results['test_RMSE'] * -1

    # Mean of each metric over all folds
    print("The average MAE for all cv folds is: \t\t\t {maeAvg:.5}".format(maeAvg=fold_mae.mean()))
    print("The average MAE percentage (MAPE) for all cv folds is: \t {mape_avg:.5}".format(mape_avg=fold_mape.mean()))
    print("The average RMSE for all cv folds is: \t\t\t {RMSEavg:.5}".format(RMSEavg=fold_rmse.mean()))
    print('*********************************************************')

    # Per-fold table handed back to the caller
    print('Cross Validation Fold Mean Error Scores')
    return pd.DataFrame(
        {'MAE': fold_mae, 'MAPE': fold_mape, 'RMSE': fold_rmse},
        columns=['MAE', 'MAPE', 'RMSE'],
    )

In [22]:
#Make new estimator compatible for use with GridSearchCV() and cross_validate()
# -  Cap predict function for LinearRegression between 0 and 100
# -  See: Roll your own estimator links above for details. 
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression

class CappedLinearRegression(LinearRegression):
    """LinearRegression whose predictions are clipped to the range [0, 100]."""

    def predict(self, X):
        """Return the parent class's predictions for X, limited to 0..100."""
        raw_predictions = super(CappedLinearRegression, self).predict(X)
        return np.clip(raw_predictions, 0, 100)

In [11]:
#Grid-search the CappedLinearRegression options, scoring every candidate by MAE
linreg = CappedLinearRegression()
parameters = {
    'normalize': (True, False),
    'fit_intercept': (True, False),
}

#Build the grid search; cv is the ShuffleSplit defined earlier (10 shuffled splits)
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(
    estimator=linreg,
    param_grid=parameters,
    scoring=mae_scorer,
    cv=cv,
    verbose=1,  # low verbosity
)

#Run the hyperparameter search on the regression features and target
regGridSearch.fit(X_highSchools, Y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.7s finished


GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, train_size=None),
       error_score='raise',
       estimator=CappedLinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
            normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'normalize': (True, False), 'fit_intercept': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_absolute_error, greater_is_better=False),
       verbose=1)

In [12]:
#Display the estimator configured with the best parameters found by the grid search
regGridSearch.best_estimator_

CappedLinearRegression(copy_X=True, fit_intercept=False, n_jobs=1,
            normalize=True)

In [13]:
#Take the best estimator from the linear-regression grid search (a CappedLinearRegression,
#so its predictions are clipped to the 0-100 range)
regEstimator = regGridSearch.best_estimator_

#Cross-validate that estimator with the shared splitter `cv`; prints the mean
#MAE / MAPE / RMSE and displays the per-fold table returned by the helper.
EvaluateRegressionEstimator(regEstimator, X_highSchools, Y, cv)

The average MAE for all cv folds is: 			 14.496
The average MAE percentage (MAPE) for all cv folds is: 	 14.782
The average RMSE for all cv folds is: 			 22.443
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,MAE,MAPE,RMSE
0,15.270329,16.865013,21.514295
1,14.691241,14.265926,22.227498
2,16.826449,18.115333,26.912093
3,12.882962,11.900412,21.067263
4,11.772602,13.72035,19.840574
5,22.693321,20.696769,35.36216
6,17.601925,18.87118,25.751776
7,12.348817,11.602191,20.401302
8,11.411585,13.14257,15.324814
9,9.465476,8.63614,16.02363


In [32]:
#Grid-search HuberRegressor hyperparameters, scoring every candidate by RMSE
from sklearn.linear_model import HuberRegressor
import numpy as np

reg = HuberRegressor(alpha=0.001, epsilon=1.50, fit_intercept=True, max_iter=100)

#Candidate values for each hyperparameter.
#NOTE(review): a 0.0025 step gives roughly 400 epsilon values, i.e. about
#400 * 10 * 2 * 2 = 16000 candidates. The saved output of this cell reports
#800 candidates and an MAE scorer, so it appears to come from an earlier,
#coarser version of this cell - re-run to refresh it.
epsilon_range = np.arange(1.0, 2.0, 0.0025)
alpha_options = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.05, 0.01, 0.1, 10, 20]
#NOTE(review): GridSearchCV is expected to fit each candidate on a fresh copy of
#the estimator, so warm_start probably has no effect here - confirm.
warm_start = [True, False]
fit_intercept = [True, False]
parameters = {
    'epsilon': epsilon_range,
    'alpha': alpha_options,
    'warm_start': warm_start,
    'fit_intercept': fit_intercept,
}

#Build the grid search; cv is the ShuffleSplit defined earlier (10 shuffled splits)
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(
    estimator=reg,
    param_grid=parameters,
    scoring=rmse_scorer,
    cv=cv,
    n_jobs=4,   # jobs to run in parallel
    verbose=1,  # low verbosity
)

#Run the hyperparameter search on the regression features and target
regGridSearch.fit(X_highSchools, Y)

Fitting 10 folds for each of 800 candidates, totalling 8000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   19.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   39.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  8.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 10.3min
[Parallel(n_jobs=4)]: Done 8000 out of 8000 | elapsed: 11.4min finished


GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, train_size=None),
       error_score='raise',
       estimator=HuberRegressor(alpha=0.001, epsilon=1.5, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'epsilon': array([1.  , 1.05, 1.1 , 1.15, 1.2 , 1.25, 1.3 , 1.35, 1.4 , 1.45, 1.5 ,
       1.55, 1.6 , 1.65, 1.7 , 1.75, 1.8 , 1.85, 1.9 , 1.95]), 'alpha': [1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.05, 0.01, 0.1, 10, 20], 'warm_start': [True, False], 'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_absolute_error, greater_is_better=False),
       verbose=1)

In [33]:
#Display the estimator configured with the best parameters found by the Huber grid search
regGridSearch.best_estimator_

HuberRegressor(alpha=1e-05, epsilon=1.8500000000000008, fit_intercept=False,
        max_iter=100, tol=1e-05, warm_start=True)

In [34]:
#Take the best estimator found by the Huber grid search
regEstimator = regGridSearch.best_estimator_

#Cross-validate that estimator with the shared splitter `cv`; prints the mean
#MAE / MAPE / RMSE and displays the per-fold table returned by the helper.
EvaluateRegressionEstimator(regEstimator, X_highSchools, Y, cv)

The average MAE for all cv folds is: 			 6.123
The average MAE percentage (MAPE) for all cv folds is: 	 5.2163
The average RMSE for all cv folds is: 			 12.069
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,MAE,MAPE,RMSE
0,6.133758,6.950699,8.516677
1,7.968258,6.277967,17.42046
2,4.722158,5.283158,6.041752
3,6.068924,4.770153,13.620533
4,5.846725,4.237147,14.480543
5,8.444336,4.484613,21.244071
6,4.852297,5.311391,5.975065
7,6.543793,5.098294,14.215473
8,4.063084,4.587298,5.020706
9,6.586974,5.162577,14.159705


Classification

In [4]:
from sklearn.model_selection import cross_validate

def EvaluateClassifierEstimator(classifierEstimator, X, y, cv):
    """Cross-validate a classifier and report accuracy, precision and recall per fold.

    Parameters
    ----------
    classifierEstimator : estimator forwarded to cross_validate.
    X, y : features and labels to evaluate on. These arguments are used
        directly; the function no longer reads the notebook globals
        X_Class / Y_Class, so it evaluates whatever data the caller passes.
    cv : cross-validation strategy forwarded to cross_validate.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns 'Accuracy', 'Precision' and 'Recall'.
    """
    #Perform cross validation on the data that was passed in
    scores = cross_validate(classifierEstimator, X, y, scoring=['accuracy', 'precision', 'recall']
                            , cv=cv, return_train_score=True)

    #Mean of each test metric over all folds
    Accavg = scores['test_accuracy'].mean()
    Preavg = scores['test_precision'].mean()
    Recavg = scores['test_recall'].mean()

    print_str = "The average accuracy for all cv folds is: \t\t\t {Accavg:.5}"
    print_str2 = "The average precision for all cv folds is: \t\t\t {Preavg:.5}"
    print_str3 = "The average recall for all cv folds is: \t\t\t {Recavg:.5}"

    print(print_str.format(Accavg=Accavg))
    print(print_str2.format(Preavg=Preavg))
    print(print_str3.format(Recavg=Recavg))
    print('*********************************************************')

    #Per-fold table handed back to the caller
    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame(
        {
            'Accuracy': scores['test_accuracy'],
            'Precision': scores['test_precision'],
            'Recall': scores['test_recall'],
        },
        columns=['Accuracy', 'Precision', 'Recall'],
    )

    return scoresResults

def EvaluateClassifierEstimator2(classifierEstimator, X, y, cv):
    """Print a classification report, confusion matrix and accuracy from cross-validated predictions.

    Parameters
    ----------
    classifierEstimator : estimator forwarded to cross_val_predict.
    X, y : features and labels. Predictions are produced from X / y and then
        compared against the same y; the function no longer reads the notebook
        globals X_Class / Y_Class, nor the regression target Y.
    cv : cross-validation strategy forwarded to cross_val_predict.
        NOTE(review): cross_val_predict needs a splitter that puts every sample
        in exactly one test fold (e.g. KFold); shuffle-split style splitters
        such as class_cv are expected to be rejected - confirm before use.

    Returns
    -------
    None. The three results are printed.
    """
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    #Out-of-fold predictions for the data that was passed in
    predictions = cross_val_predict(classifierEstimator, X, y, cv=cv)

    #Compare the true labels with the predictions
    classReport = classification_report(y, predictions)
    confMat = confusion_matrix(y, predictions)
    acc = accuracy_score(y, predictions)

    print(classReport)
    print(confMat)
    print(acc)


In [5]:
# Classification target: pop() returns the 'sat_high_level' column and removes it
# from hs_2017 in place, so re-running this cell on the same frame raises KeyError.
Y_sat_high = hs_2017.pop('sat_high_level')
Y_Class = Y_sat_high

# Features are all remaining columns. X_Class is the same object as hs_2017, not a copy.
X_Class = hs_2017

In [17]:
#K-nearest-neighbours classifier: grid search over k, weighting and distance metric
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import jaccard
import warnings
#Silences every warning from here on, for the rest of the session
warnings.filterwarnings('ignore')

knc = KNeighborsClassifier(n_neighbors=5)

#Candidate values: k = 1..29, both weighting schemes, five distance metrics
k_range = list(range(1, 30))
metrics = ['euclidean', 'chebyshev', 'manhattan', 'minkowski', 'jaccard']
weights_options = ['uniform', 'distance']

knn_parameters = {
    'n_neighbors': k_range,
    'weights': weights_options,
    'metric': metrics,
}

#Build the grid search; class_cv is the stratified shuffle splitter defined earlier
kGridSearch = GridSearchCV(
    knc,
    param_grid=knn_parameters,
    scoring='precision_macro',
    cv=class_cv,
    n_jobs=4,
    verbose=1,
)

#Run the hyperparameter search on the classification features and labels
kGridSearch.fit(X_Class, Y_Class)

Fitting 10 folds for each of 290 candidates, totalling 2900 fits


[Parallel(n_jobs=4)]: Done  62 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 662 tasks      | elapsed:   10.9s
[Parallel(n_jobs=4)]: Done 1662 tasks      | elapsed:   23.0s
[Parallel(n_jobs=4)]: Done 2553 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 2900 out of 2900 | elapsed:  1.8min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.1,
            train_size=None),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'chebyshev', 'manhattan', 'minkowski', 'jaccard']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision_macro', verbose=1)

In [18]:
#Display the best KNN estimator found by the grid search
# (kGridSearch.best_params_ is the commented-out alternative)
kGridSearch.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='jaccard',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [20]:
#Cross-validate the best KNN estimator on the classification data using the stratified splitter class_cv
EvaluateClassifierEstimator(kGridSearch.best_estimator_, X_Class, Y_Class, class_cv)

The average accuracy for all cv folds is: 			 0.78605
The average precision for all cv folds is: 			 0.72552
The average recall for all cv folds is: 			 0.7
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,Accuracy,Precision,Recall
0,0.697674,0.588235,0.625
1,0.744186,0.631579,0.75
2,0.744186,0.727273,0.5
3,0.790698,0.769231,0.625
4,0.813953,0.722222,0.8125
5,0.837209,0.8,0.75
6,0.953488,1.0,0.875
7,0.697674,0.6,0.5625
8,0.767442,0.666667,0.75
9,0.813953,0.75,0.75


Exceptional Work - GridSearch with Scaling

In [None]:
# Re-run the regression grid search on differently scaled copies of the features
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
# Import from the public package; sklearn.preprocessing.data is a private module path.
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import HuberRegressor

# NOTE(review): each scaler is fitted on the full feature matrix before cross
# validation, so held-out rows influence the scaling. Wrapping the scaler and
# the regressor in a Pipeline would keep the scaling inside each fold.
robust_X = RobustScaler(quantile_range=(25, 75)).fit_transform(X_highSchools)
standard_X = StandardScaler().fit_transform(X_highSchools)
MinMax_X = MinMaxScaler().fit_transform(X_highSchools)
MaxAbs_X = MaxAbsScaler().fit_transform(X_highSchools)
UniformQuantile_X = QuantileTransformer(output_distribution='uniform').fit_transform(X_highSchools)
GaussianQuantile_X = QuantileTransformer(output_distribution='normal').fit_transform(X_highSchools)
Normalized_X = Normalizer().fit_transform(X_highSchools)

# Scaled feature matrices keyed by a readable label (this name was previously undefined)
Scales = {
    'RobustScaler': robust_X,
    'StandardScaler': standard_X,
    'MinMaxScaler': MinMax_X,
    'MaxAbsScaler': MaxAbs_X,
    'QuantileTransformer (uniform)': UniformQuantile_X,
    'QuantileTransformer (normal)': GaussianQuantile_X,
    'Normalizer': Normalized_X,
}

# Per-fold score tables for every scaling, so the results can be compared afterwards
ScaledResults = {}

#Perform hyperparameter search to find the best combination of parameters for each scaling
for scale_name, scaled_X in Scales.items():
    print('Scaling: ' + scale_name)
    regGridSearch.fit(scaled_X, Y)
    #Create a regression estimator with best parameters for cross validation
    regEstimator = regGridSearch.best_estimator_
    #Evaluate on the same scaled features the search was run on (not the unscaled X_highSchools)
    ScaledResults[scale_name] = EvaluateRegressionEstimator(regEstimator, scaled_X, Y, cv)

In [26]:
# Preview the values produced by np.arange(1.0, 2.0, 0.05): 1.0 up to 1.95.
# NOTE(review): despite the name, these match the epsilon values listed in the
# saved Huber grid-search output rather than any alpha setting - confirm intent.
alpha_range = np.arange(1.0, 2.0, 0.05)
print(alpha_range)

[1.   1.05 1.1  1.15 1.2  1.25 1.3  1.35 1.4  1.45 1.5  1.55 1.6  1.65
 1.7  1.75 1.8  1.85 1.9  1.95]
