## Regression Models (Mixed Years)
* looks at mixed years

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from category_encoders import OneHotEncoder 

In [10]:
data = pd.read_csv('../../01_data/cleaned_data/school_df_v6.csv')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844 entries, 0 to 843
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   code                         109 non-null    float64
 1   name                         109 non-null    object 
 2   grade_band                   109 non-null    object 
 3   enrollment_SY1718            109 non-null    float64
 4   enrollment_SY1819            109 non-null    float64
 5   star_score_SY1718            109 non-null    float64
 6   star_score_SY1819            109 non-null    float64
 7   star_rating_SY1718           109 non-null    float64
 8   star_rating_SY1819           109 non-null    float64
 9   capacity_SY1718              109 non-null    float64
 10  capacity_SY1819              109 non-null    float64
 11  latitude                     109 non-null    float64
 12  longitude                    109 non-null    float64
 13  cluster             

In [12]:
# drop unnamed columns and "bad import" full-nan rows
data = data.iloc[:,:45]
data.dropna(how='all', inplace=True)

In [13]:
# drop 2 schools that have no attendance data
data.drop(data[data['code'] == 201].index, inplace = True) # Oyster Adams Bilingual School (Adams) has no attendance data
data.drop(data[data['code'] == 347].index, inplace = True) # Brookland Middle School has no attendance data

In [14]:
# drop columns looking at "count" of absences (non-relatable across schools of different sizes)
X = data.drop(columns = data.filter(regex='^count',axis=1)).drop(columns='name')

In [15]:
# drop school_code
X.drop('code', axis = 1, inplace = True)

In [16]:
# OHE grade_band column
X_ohe = OneHotEncoder(cols=['grade_band']).fit_transform(X)

  elif pd.api.types.is_categorical(cols):


<br><br>

# Create no_nan_df
### Only looks at 82 rows
* Need a better solution for dealing with np.nan (originally cast as -1)

In [17]:
no_nan_df = X_ohe.replace(-1,np.nan)

In [18]:
no_nan_df.dropna(how='any',inplace=True)

In [19]:
no_nan_y = no_nan_df['star_rating_SY1718']
no_nan_X = no_nan_df.drop(columns=['star_score_SY1718','star_score_SY1819','star_rating_SY1718','star_rating_SY1819'])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(no_nan_X, no_nan_y)

<br><br>
# Regression Models

In [21]:
def print_results(grid, X_train, X_test, y_train, y_test):
    #print(f'Estimator : {grid.get_estimator_}')
    
    print('*** TRAIN set ***')
    print(f'Score : {grid.score(X_train, y_train)}')
    print(f'RMSE: {mean_squared_error(y_train, grid.predict(X_train), squared = False)}')
    print(f'MSE: {mean_squared_error(y_train, grid.predict(X_train))}')
    print()
    print('*** TEST set ***')
    print(f'Score : {grid.score(X_test, y_test)}')
    print(f'RMSE: {mean_squared_error(y_test, grid.predict(X_test), squared = False)}')
    print(f'MSE: {mean_squared_error(y_test, grid.predict(X_test))}')
    print()
    print(f'Best Params : {grid.best_params_}')


<br><br>

### Multiple Linear Regression Model

In [22]:
pipe = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('lr', LinearRegression())])
param = [
    {'lr__normalize': ['True','False']}
]
grid = GridSearchCV(pipe, param, n_jobs = -1)
grid.fit(X_train, y_train)
print_results(grid, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 1.0
RMSE: 3.456776372894296e-15
MSE: 1.1949302892200244e-29

*** TEST set ***
Score : 0.11891441535789982
RMSE: 1.102152126268666
MSE: 1.2147393094385417

Best Params : {'lr__normalize': 'True'}


<br><br>

### Ridge Model

In [23]:
pipe1 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('ridge', Ridge())])
param1 = [
    {'ridge__alpha': [.01, .1, 1, 10, 100]}
]
grid1 = GridSearchCV(pipe1, param1, n_jobs = -1)
grid1.fit(X_train, y_train)
print_results(grid1, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.8414409072301295
RMSE: 0.4588061340109381
MSE: 0.21050306860606288

*** TEST set ***
Score : 0.7296180453438974
RMSE: 0.6105501560401186
MSE: 0.3727714930406132

Best Params : {'ridge__alpha': 100}


<br><br>

### KNeighbors Model

In [25]:
pipe2 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('knearest', KNeighborsRegressor())])
param2 = [
    {'knearest__n_neighbors': [3,7,11,25,35]}
]
grid2 = GridSearchCV(pipe2, param2, n_jobs = -1)
grid2.fit(X_train, y_train)
print_results(grid2, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.665843179377014
RMSE: 0.666053024150676
MSE: 0.44362663098026095

*** TEST set ***
Score : 0.7011278195488722
RMSE: 0.6419116251475074
MSE: 0.41205053449951407

Best Params : {'knearest__n_neighbors': 7}


<br><br>

### Decision Tree Model

In [26]:
pipe3 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('dtree', DecisionTreeRegressor())])
param3 = [
    {'dtree__max_depth':[3,5,7,9],
    'dtree__min_samples_split':[3,5,7,9], 
    'dtree__min_samples_leaf':[3,5,7,9], 
    'dtree__ccp_alpha':[.01, .1, 1, 10, 100]}
]
grid3 = GridSearchCV(pipe3, param3, n_jobs = -1)
grid3.fit(X_train, y_train)
print_results(grid3, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.8351866381540897
RMSE: 0.46776728926223
MSE: 0.21880623690373474

*** TEST set ***
Score : 0.630891526936715
RMSE: 0.7133612297019641
MSE: 0.5088842440418985

Best Params : {'dtree__ccp_alpha': 0.01, 'dtree__max_depth': 3, 'dtree__min_samples_leaf': 7, 'dtree__min_samples_split': 3}


<br><br>

### Bagged Decision Tree Model

In [27]:
pipe4 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('btree', BaggingRegressor())])
param4 = [
    {'btree__n_estimators':[50, 100, 500]}
]
grid4 = GridSearchCV(pipe4, param4, n_jobs = -1)
grid4.fit(X_train, y_train)
print_results(grid4, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.9539894534412956
RMSE: 0.24715097927658927
MSE: 0.06108360655737706

*** TEST set ***
Score : 0.7199574013157894
RMSE: 0.621361791704701
MSE: 0.3860904761904762

Best Params : {'btree__n_estimators': 100}


<br><br>

### Random Forest Model

In [28]:
pipe5 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('randforest', RandomForestRegressor())])
param5 = [
    {'randforest__max_depth':[3,5,7,9],
    'randforest__min_samples_split':[3,5,7,9], 
    'randforest__min_samples_leaf':[3,5,7,9], 
    'randforest__n_estimators':[50, 100, 500]}
]
grid5 = GridSearchCV(pipe5, param5, n_jobs = -1)
grid5.fit(X_train, y_train)
print_results(grid5, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.8129958011407351
RMSE: 0.49826378004841426
MSE: 0.24826679450813455

*** TEST set ***
Score : 0.7631209655123949
RMSE: 0.5714731192327309
MSE: 0.3265815260055871

Best Params : {'randforest__max_depth': 3, 'randforest__min_samples_leaf': 9, 'randforest__min_samples_split': 5, 'randforest__n_estimators': 100}


<br><br>

### Adaboost Model

In [29]:
pipe6 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('ada', AdaBoostRegressor())])
param6 = [
    {'ada__n_estimators':[50, 100, 500]}
]
grid6 = GridSearchCV(pipe6, param6, n_jobs = -1)
grid6.fit(X_train, y_train)
print_results(grid6, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.9978257682001488
RMSE: 0.05372625402173552
MSE: 0.002886510371208052

*** TEST set ***
Score : 0.5240742282391654
RMSE: 0.8100318703041021
MSE: 0.6561516309083616

Best Params : {'ada__n_estimators': 50}


<br><br>

### Support Vector Regressor Model

#### Was only able to get a very limited SVM to ever execute fully.

In [30]:
pipe7 = Pipeline([('poly', PolynomialFeatures()), ('scaler', StandardScaler()), ('svr', SVR())])
param7 = [
    {'svr__kernel': ['linear'],
     'svr__C': [.01, 1, 10],
     'svr__gamma': np.logspace(-2, 2, 20)
    }
#     {'svr__kernel': ['linear', 'poly'],
#      'svr__C': [.01, 1, 100],
#      'svr__gamma': np.logspace(-5, 2, 20)
#     }
]
grid7 = GridSearchCV(pipe7, param7, n_jobs = -1)
grid7.fit(X_train, y_train)
print_results(grid7, X_train, X_test, y_train, y_test)

*** TRAIN set ***
Score : 0.862708128042659
RMSE: 0.4269293899102819
MSE: 0.18226870396916553

*** TEST set ***
Score : 0.6900500272360852
RMSE: 0.6536997158168981
MSE: 0.4273233184590934

Best Params : {'svr__C': 0.01, 'svr__gamma': 0.01, 'svr__kernel': 'linear'}
