In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error

# Completing the Workflow

- Load and prepare data
- Split train and test sets
- Build the model
 - Addition of Cross Validation
- Tune the Parameters
 - Add `GridSearchCV`
- Reflect on the Model

In [84]:
bikeshare = pd.read_csv('data/bikeshare.csv')
weather = pd.get_dummies(bikeshare.weathersit, prefix='weather')


modeldata = bikeshare[['temp', 'hum']].join(weather[['weather_1', 'weather_2', 'weather_3']])
y = bikeshare.casual

In [85]:
#split train-test set


In [86]:
#instantiate, fit, and evaluate the model





### Cross-Validation

Rather than using a single training set, we can iterate through a number of splits of the larger training set itself. One approach is called K-folds, where we take a separate fold each time through.  For example, if we perform a cross-validation with 5 folds, we would split the training data 5 times, each time fitting and evaluating a model.  

In [75]:
from sklearn.model_selection import cross_val_score

In [87]:
scores = cross_val_score(lm, modeldata, y, scoring = "neg_mean_squared_error", cv = 5)

In [88]:
rmse_scores = np.sqrt(-scores)

In [89]:
rmse_scores

array([38.17550295, 45.07309253, 35.58704297, 69.43371546, 56.6001007 ])

In [81]:
rmse_scores.mean()

48.11277828232687

In [82]:
rmse_scores.std()

13.627064849219837

In [159]:
kf = cross_validation.KFold(len(modeldata), n_folds = 5, shuffle = True)

In [164]:
mse_values = []
scores = []
n= 0
print( "~~~~ CROSS VALIDATION each fold ~~~~")
for train_index, test_index in kf:
    lm = LinearRegression().fit(modeldata.iloc[train_index], y.iloc[train_index])
    mse_values.append(mean_squared_error(y.iloc[test_index], lm.predict(modeldata.iloc[test_index])))
    scores.append(lm.score(modeldata, y))
    n+=1
    print( 'Model', n)
    print( 'MSE:', mse_values[n-1])
    print( 'R2:', scores[n-1])

print( "~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print( 'Mean of MSE for all folds:', np.mean(mse_values))
print( 'Mean of R2 for all folds:', np.mean(scores))

~~~~ CROSS VALIDATION each fold ~~~~
Model 1
MSE: 1705.2033037910232
R2: 0.31190233073841744
Model 2
MSE: 1631.8360098639637
R2: 0.3119241721003434
Model 3
MSE: 1731.4021669007136
R2: 0.31191900862279376
Model 4
MSE: 1586.1711382052615
R2: 0.31188912818902204
Model 5
MSE: 1711.5132782226212
R2: 0.3118918806170695
~~~~ SUMMARY OF CROSS VALIDATION ~~~~
Mean of MSE for all folds: 1673.2251793967166
Mean of R2 for all folds: 0.31190530405352923


### Grid Search

Now, suppose we are using our regularized methods here.  We want to be able to also experiment with parameters and determine something like our ideal value for $\alpha$ in the `Ridge()` model.  We can feed a list of values to the `GridSearchCV` and it will run through the possible combinations of these using cross validation.

In [92]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [97]:
param_grid = [
    {'alpha': [alpha for alpha in [1, 5, 10, 40]],
    'fit_intercept': [True, False]
    }
]

In [98]:
ridge = Ridge()
grid = GridSearchCV(ridge, param_grid, cv = 5, scoring = 'neg_mean_squared_error')

In [99]:
grid.fit(modeldata, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'alpha': [1, 5, 10, 40], 'fit_intercept': [True, False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

### Best Fit and Parameters

Now, we can investigate the model performance.  The `best_params_` will give us the ideal parameters from the search, `best_estimator_` gives the full model information, and `cv_results_` will give us full information including performance for each individual model attempt.

In [100]:
grid.best_params_

{'alpha': 40, 'fit_intercept': False}

In [101]:
grid.best_estimator_

Ridge(alpha=40, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [103]:
results = grid.cv_results_

In [107]:
results = zip(results['mean_test_score'], results['params'])

In [108]:
for mean_score, param in results:
    print(np.sqrt(-mean_score), param)

42.19907472429994 {'alpha': 1, 'fit_intercept': True}
42.206081583932566 {'alpha': 1, 'fit_intercept': False}
42.18885870516342 {'alpha': 5, 'fit_intercept': True}
42.18488903631506 {'alpha': 5, 'fit_intercept': False}
42.177685282479935 {'alpha': 10, 'fit_intercept': True}
42.162559420472945 {'alpha': 10, 'fit_intercept': False}
42.15168131166849 {'alpha': 40, 'fit_intercept': True}
42.10058388779278 {'alpha': 40, 'fit_intercept': False}


### Split, Search, Evaluate

In [149]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(modeldata, y)

In [150]:
#Grid Search
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'alpha': [1, 5, 10, 40], 'fit_intercept': [True, False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [151]:
#Examine results
results = grid.cv_results_
results = zip(results['mean_test_score'], results['params'])
for mean_score, param in results:
    print(np.sqrt(-mean_score), param)

41.00086619515355 {'alpha': 1, 'fit_intercept': True}
41.00615249220171 {'alpha': 1, 'fit_intercept': False}
41.002529715157365 {'alpha': 5, 'fit_intercept': True}
41.00825503357845 {'alpha': 5, 'fit_intercept': False}
41.006906030024126 {'alpha': 10, 'fit_intercept': True}
41.01467141911244 {'alpha': 10, 'fit_intercept': False}
41.082147559583596 {'alpha': 40, 'fit_intercept': True}
41.11453764421828 {'alpha': 40, 'fit_intercept': False}


In [152]:
grid.best_estimator_

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

### Fit on Test

In [153]:
model = Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [154]:
model.fit(X_test, y_test)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [155]:
model.score(X_test, y_test)

0.3150800266831697

In [156]:
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)

In [157]:
rmse = np.sqrt(mse)

In [158]:
rmse

40.65759825178781

### California Housing

The dataset comes from sklearn's dataset library.  We want to perform an end to end modeling project where we predict the median house value in a district.  You should follow a workflow similar to what we've encountered to this point, and may go something like:

- Investigate variable types and distributions
- Transformations?
- Collinearity?
- New Features?
 - Perhaps something like Rooms per Household, Bedrooms per Room, Population per Household
- Missing Values?
 - Examine total bedrooms; drop the districts, delete the attribute, or fill values with sensible number (like median) `.fillna()`
- Encode any Categorical Variables
- Feature Scaling? (`MaxMinScaler` or `StandardScaler`)
- Prepare numerical data
- Split Train and Test Set
- Use Cross Validation and Grid Search to explore models on training set
- Determine best model and evaluate on Test set
- Communicate your results including a visualization of the locations of the houses, colored by their median house value and sized by population (`plt.scatter`)

In [175]:
cali_houses = pd.read_csv('data/cali_housing.csv')