# Regression Modeling

### Import Libraries and Data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [2]:
df = pd.read_csv('../../01_data/cleaned_data/school_df_v6.csv')

In [3]:
df = df.iloc[:,:45]
df.dropna(how='all', inplace=True)

In [4]:
# drop 2 schools that have no attendance data
df.drop(df[df['code'] == 201].index, inplace = True)
df.drop(df[df['code'] == 347].index, inplace = True)
df = df.replace(-1,np.nan)
df.dropna(how='any',inplace=True)

In [5]:
df = df.replace(-1,np.nan)
df.dropna(how='any',inplace=True)

In [6]:
df.shape

(81, 45)

In [7]:
df.head(2)

Unnamed: 0,code,name,grade_band,enrollment_SY1718,enrollment_SY1819,star_score_SY1718,star_score_SY1819,star_rating_SY1718,star_rating_SY1819,capacity_SY1718,...,count_20+_SY1718,pct_20+_SY1718,budgeted_amount_FY16,budgeted_enrollment_FY16,budgeted_amount_FY17,budgeted_enrollment_FY17,pct_meet_exceed_math_SY1718,pct_meet_exceed_ela_SY1718,pct_meet_exceed_math_SY1819,pct_meet_exceed_ela_SY1819
0,175.0,School-Within-School @ Goding,Elementary,308.0,313.0,88.55,88.26,5.0,5.0,444.0,...,0.0,0.0,10592800.0,5880.0,3815456.0,2674740.0,0.765,0.765,0.743,0.752
2,202.0,Aiton Elementary School,Elementary,243.0,244.0,43.85,10.79,3.0,1.0,529.0,...,12.0,0.049383,7429920.0,2650.0,3780814.0,2499672.0,0.129,0.171,0.153,0.056


In [8]:
df = df[df['star_rating_SY1718'] !=-1]

In [9]:
X=df[['enrollment_SY1718', 'enrollment_SY1819', 'capacity_SY1718', 
        'capacity_SY1819', 'latitude', 'longitude', 'cluster', 'ward','pct_0_SY1819', 
        'pct_1-5_SY1819', 'pct_6-10_SY1819', 'pct_11-20_SY1819', 'pct_20+_SY1819', 
        'pct_0_SY1718', 'pct_1-5_SY1718', 'pct_6-10__SY1718', 'pct_11-20_SY1718', 
        'pct_20+_SY1718', 'budgeted_amount_FY16', 'budgeted_enrollment_FY16', 
        'budgeted_amount_FY17', 'budgeted_enrollment_FY17', 'pct_meet_exceed_math_SY1718', 
        'pct_meet_exceed_ela_SY1718', 'pct_meet_exceed_math_SY1819', 'pct_meet_exceed_ela_SY1819']]

In [10]:
y = df['star_rating_SY1718']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [12]:
sscaler = StandardScaler()
X_train_scaled = sscaler.fit_transform(X_train)
X_test_scaled = sscaler.fit_transform(X_test)

### Null Model

In [13]:
y.value_counts()

3.0    29
2.0    20
4.0    13
5.0    11
1.0     8
Name: star_rating_SY1718, dtype: int64

In [14]:
#null model will be 34.5%
y.value_counts(normalize=True)

3.0    0.358025
2.0    0.246914
4.0    0.160494
5.0    0.135802
1.0    0.098765
Name: star_rating_SY1718, dtype: float64

### Linear Regression

In [15]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

-460845212343844.56

### Ridge

In [16]:
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
ridge.score(X_test_scaled, y_test)

0.6817591357894337

In [17]:
pipe = make_pipeline(StandardScaler(), Ridge())
params = {'ridge__alpha': ['.01', '0.1', '1', '10', '100', '200', '500', '1000', '10_000'],
          'ridge__normalize': [True, False]}
ridge_grid = GridSearchCV(estimator=pipe, param_grid=params, n_jobs=-1)
ridge_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'ridge__alpha': ['.01', '0.1', '1', '10', '100', '200',
                                          '500', '1000', '10_000'],
                         'ridge__normalize': [True, False]})

In [18]:
print(f' ridge score is         {ridge_grid.score(X_test, y_test)}')
print(f' ridge mse is           {mean_squared_error(y_test, ridge_grid.predict(X_test), squared=False)}')
print(f' ridge explained var is {explained_variance_score(y_test, ridge_grid.predict(X_test))}')

 ridge score is         0.7278814777927329
 ridge mse is           0.6618947425880253
 ridge explained var is 0.7440290733622903


### Lasso

In [19]:
lasso = Lasso()
lasso.fit(X_train_scaled, y_train)

Lasso()

In [20]:
print(f' lasso score is         {lasso.score(X_test, y_test)}')
print(f' lasso mse is           {mean_squared_error(y_test, lasso.predict(X_test), squared=False)}')

 lasso score is         -0.057690140845070514
 lasso mse is           1.3049356853336271


### Decision Tree Regressor

In [21]:
dtree = DecisionTreeRegressor()
dtree.fit(X_train_scaled, y_train)
dtree.score(X_test_scaled, y_test)

0.4971830985915493

In [22]:
dtree_pipe = make_pipeline(StandardScaler(), DecisionTreeRegressor())
params = {'decisiontreeregressor__min_samples_split': [1, 2, 3, 5, 7, 10, 15, 20, 30]}
dtree_grid = GridSearchCV(estimator=dtree_pipe, param_grid=params, n_jobs=-1)
dtree_grid.fit(X_train, y_train)

 0.31583968 0.48970377 0.52352962]


GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeregressor',
                                        DecisionTreeRegressor())]),
             n_jobs=-1,
             param_grid={'decisiontreeregressor__min_samples_split': [1, 2, 3,
                                                                      5, 7, 10,
                                                                      15, 20,
                                                                      30]})

In [23]:
print(f' decision tree score is    {dtree_grid.score(X_test, y_test)}')
print(f' decision tree mse is      {mean_squared_error(y_test, dtree_grid.predict(X_test), squared=False)}')

 decision tree score is    0.49232914059781996
 decision tree mse is      0.9040677916100314


### KNN Regressor

In [24]:
knn = KNeighborsRegressor()
knn.fit(X_train_scaled, y_train)
knn.score(X_test_scaled, y_test)

0.620225352112676

In [25]:
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
params = {'kneighborsregressor__n_neighbors': [1, 2, 3, 5, 7, 10, 15, 20, 30],
         'kneighborsregressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
knn_grid = GridSearchCV(estimator=knn_pipe, param_grid=params, n_jobs=-1)
knn_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('kneighborsregressor',
                                        KNeighborsRegressor())]),
             n_jobs=-1,
             param_grid={'kneighborsregressor__algorithm': ['auto', 'ball_tree',
                                                            'kd_tree',
                                                            'brute'],
                         'kneighborsregressor__n_neighbors': [1, 2, 3, 5, 7, 10,
                                                              15, 20, 30]})

In [26]:
print(f' knn score is    {knn_grid.score(X_test, y_test)}')
print(f' knn mse is      {mean_squared_error(y_test, knn_grid.predict(X_test), squared=False)}')

 knn score is    0.6909456740442657
 knn mse is      0.7053867426838487


### Random Forest Regressor

In [27]:
rforest = RandomForestRegressor()
rforest.fit(X_train_scaled, y_train)
rforest.score(X_test_scaled, y_test)

0.5917895774647888

In [28]:
rforest_pipe = make_pipeline(StandardScaler(), RandomForestRegressor())
params = {'randomforestregressor__n_estimators': [50, 100, 200, 500],
         'randomforestregressor__criterion': ["mse", "mae"],
#          'randomforestregressor__max_depth': ["None", 2, 5, 10, 30],
         'randomforestregressor__min_samples_split': [2, 5, 10, 15],
         'randomforestregressor__ccp_alpha': [0.0, 0.0001, 0.01, 0.1, 1, 10],}
rforest_grid = GridSearchCV(estimator=rforest_pipe, param_grid=params, n_jobs=-1)
rforest_grid.fit(X_train, y_train)
rforest_grid.score(X_test, y_test)

0.6620486422525172

In [29]:
rforest_grid.best_params_

{'randomforestregressor__ccp_alpha': 0.01,
 'randomforestregressor__criterion': 'mse',
 'randomforestregressor__min_samples_split': 15,
 'randomforestregressor__n_estimators': 200}

In [30]:
print(f' random forest score is    {rforest_grid.score(X_test, y_test)}')
print(f' random forest mse is      {mean_squared_error(y_test, rforest_grid.predict(X_test), squared=False)}')

 random forest score is    0.6620486422525172
 random forest mse is      0.7376272925247229


### Elastic Net

In [31]:
enet_pipe = make_pipeline(StandardScaler(), ElasticNet())
enet_pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('elasticnet', ElasticNet())])

In [32]:
print(f' enet score is    {enet_pipe.score(X_test, y_test)}')
print(f' enet mse is      {mean_squared_error(y_test, enet_pipe.predict(X_test), squared=False)}')

 enet score is    0.38567383419938306
 enet mse is      0.9945105311862962


### Bagging Regressor

In [33]:
bag = BaggingRegressor()
bag.fit(X_train_scaled, y_train)

BaggingRegressor()

In [34]:
print(f' bagging regressor score is    {bag.score(X_test, y_test)}')
print(f' bagging regressor mse is      {mean_squared_error(y_test, bag.predict(X_test), squared=False)}')

 bagging regressor score is    0.0535211267605632
 bagging regressor mse is      1.2344267996967355


### Adaboost

In [35]:
ada = AdaBoostRegressor()
ada.fit(X_train_scaled, y_train)
ada.score(X_test_scaled, y_test)

0.6049541517767574

In [None]:
ada_pipe = make_pipeline(StandardScaler(), AdaBoostRegressor())
params = {'adaboostregressor__n_estimators': [50, 100, 200, 300, 500, 1000],
         'adaboostregressor__loss': ['linear', 'square', 'exponential']}
ada_grid = GridSearchCV(estimator=ada_pipe, param_grid=params, n_jobs=-1)
ada_grid.fit(X_train, y_train)

In [None]:
print(f' Adaboost score is    {ada_grid.score(X_test, y_test)}')
print(f' Adaboost mse is      {mean_squared_error(y_test, ada_grid.predict(X_test), squared=False)}')

### Gradient Boosting Regressor

In [None]:
gboost = GradientBoostingRegressor()
gboost.fit(X_train_scaled, y_train)
gboost.score(X_test_scaled, y_test)

In [None]:
gboost_pipe = make_pipeline(StandardScaler(), GradientBoostingRegressor())
params = {'gradientboostingregressor__n_estimators': [50, 100, 200, 300, 500, 1000]}
gboost_grid = GridSearchCV(estimator=gboost_pipe, param_grid=params, n_jobs=-1)
gboost_grid.fit(X_train, y_train)

In [None]:
print(f' gradient boost score is    {gboost_grid.score(X_test, y_test)}')
print(f' gradient boost mse is      {mean_squared_error(y_test, gboost_grid.predict(X_test), squared=False)}')

### Support Vector

In [None]:
svr = SVR(C=3)
svr.fit(X_train_scaled, y_train)

In [None]:
print(f' svr score is    {svr.score(X_test, y_test)}')
print(f' svr mse is      {mean_squared_error(y_test, svr.predict(X_test), squared=False)}')

# Feature Importance & Model Evaluation

In [None]:
# Random Forest Feature Importance - Top Features
feat_imp = pd.DataFrame(rforest.feature_importances_, index=X.columns)
feat_imp.sort_values(by=0, ascending=False).head(5)
# Top 5 Features:
# pct_meet_exceed_math_SY1819
# pct_meet_exceed_ela_SY1718
# pct_meet_exceed_math_SY1718
# pct_meet_exceed_ela_SY1819
# pct_11-20_SY1819

In [None]:
# Random Forest Feature Importance - Bottom Features
feat_imp = pd.DataFrame(rforest.feature_importances_, index=X.columns)
feat_imp.sort_values(by=0, ascending=True).head(5)

# Bottom 5 Features:
# ward
# pct_0_SY1819
# budgeted_enrollment_FY17
# pct_11-20_SY1718
# enrollment_SY1718

In [None]:
fig = plt.figure(figsize=(10, 5))

ax = fig.add_subplot(1, 2, 1)
y_pred_forest = rforest_grid.predict(X_test)
plt.scatter(y_test, y_pred_forest)
ax.plot([0,1], [0,1], transform=ax.transAxes, ls="--", c="orange")
plt.title('Random Forest Grid Model')
plt.xlabel('School Star Rating')
plt.ylabel('Model Predictions')

ax = fig.add_subplot(1, 2, 2)
y_pred_ridge = ridge_grid.predict(X_test)
plt.scatter(y_test, y_pred_ridge)
ax.plot([0,1], [0,1], transform=ax.transAxes, ls="--", c="orange")
plt.title('Ridge Model')
plt.xlabel('School Star Rating')
plt.ylabel('Model Predictions');

In [None]:
print(f' ridge score is        {ridge_grid.score(X_test, y_test)}')
print(f' ridge mse is          {mean_squared_error(y_test, ridge_grid.predict(X_test), squared=False)}')
print()
print(f' lasso score is        {lasso.score(X_test_scaled, y_test)}')
print(f' lasso mse is          {mean_squared_error(y_test, lasso.predict(X_test), squared=False)}')
print()
print(f' dtree_grid score is   {dtree_grid.score(X_test, y_test)}')
print(f' dtree_grid mse is     {mean_squared_error(y_test, dtree_grid.predict(X_test), squared=False)}')
print()
print(f' knn_grid score is     {knn_grid.score(X_test, y_test)}')
print(f' knn_grid mse is       {mean_squared_error(y_test, knn_grid.predict(X_test), squared=False)}')
print()
print(f' rforest_grid score is {rforest_grid.score(X_test, y_test)}')
print(f' rforest_grid mse is   {mean_squared_error(y_test, rforest_grid.predict(X_test), squared=False)}')
print()
print(f' enet_pipe score is    {enet_pipe.score(X_test, y_test)}')
print(f' enet_pipe mse is      {mean_squared_error(y_test, enet_pipe.predict(X_test), squared=False)}')
print()
print(f' bag score is          {bag.score(X_test_scaled, y_test)}')
print(f' bag mse is            {mean_squared_error(y_test, bag.predict(X_test), squared=False)}')
print()
print(f' ada_grid score is     {ada_grid.score(X_test, y_test)}')
print(f' ada_grid mse is       {mean_squared_error(y_test, ada_grid.predict(X_test), squared=False)}')
print()
print(f' gboost score is       {gboost_grid.score(X_test, y_test)}')
print(f' gboost mse is         {mean_squared_error(y_test, gboost_grid.predict(X_test), squared=False)}')
print()
print(f' svr score is          {svr.score(X_test_scaled, y_test)}')
print(f' svr mse is            {mean_squared_error(y_test, svr.predict(X_test), squared=False)}')

In [None]:
# Models that performed well:
# - Ridge Model
# - Decision Trees
# - KNN
# - Random Forest Model
# - Neural Network

In [None]:
# Models that did not performed well:
# - Lasso Model
# - Elastic Net Modelnet_pipe
# - Bagging Regressor
# - Adaboost & Gradientboost
# - SVR