In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# `sklearn.cross_validation` was removed in scikit-learn 0.20. Guard the legacy
# import so this cell (and every import above) still runs on current releases.
try:
    from sklearn import cross_validation
except ImportError:
    cross_validation = None

# Completing the Workflow

- Load and prepare data
- Split train and test sets
- Build the model
  - Addition of Cross Validation
- Tune the Parameters
  - Add `GridSearchCV`
- Reflect on the Model

In [None]:
# Read the raw bikeshare records and one-hot encode the `weathersit` column.
bikeshare = pd.read_csv('data/bikeshare.csv')
weather = pd.get_dummies(bikeshare['weathersit'], prefix='weather')

# Features: temperature and humidity plus three of the weather indicator
# columns. Target: the `casual` column.
modeldata = bikeshare[['temp', 'hum']].join(
    weather[['weather_1', 'weather_2', 'weather_3']]
)
y = bikeshare['casual']

In [None]:
#split train-test set


In [None]:
#instantiate, fit, and evaluate the model





### Cross-Validation

Rather than using a single training set, we can iterate through a number of splits of the larger training set itself. One approach is called K-fold cross-validation: the training data is divided into K folds, and each fold takes one turn as the held-out validation set. For example, with 5 folds we fit and evaluate a model 5 times, each time training on four of the folds and evaluating on the remaining one.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Build the estimator in this cell so it runs on a fresh kernel: no earlier
# cell defines `lm`. cross_val_score clones the estimator for every fold, so an
# unfitted instance is all it needs.
lm = LinearRegression()
# scoring="neg_mean_squared_error" yields negated MSE values, one per fold.
scores = cross_val_score(lm, modeldata, y, scoring="neg_mean_squared_error", cv=5)

In [None]:
# `scores` holds negated MSE values (see the scoring string passed to
# cross_val_score), so flip the sign before taking the root to get RMSE.
rmse_scores = np.sqrt(-scores)

In [None]:
# Display the RMSE computed for each cross-validation fold.
rmse_scores

In [None]:
# Average RMSE across the folds.
np.mean(rmse_scores)

In [None]:
# Spread of the RMSE across the folds (population standard deviation).
np.std(rmse_scores)

In [None]:
# Five shuffled folds over the rows of `modeldata`.
# The legacy `cross_validation.KFold(n, n_folds=...)` API is no longer available,
# so use the `sklearn.model_selection.KFold` splitter instead. The
# (train_index, test_index) pairs are materialised into a list so `kf` can be
# iterated directly, and re-iterated, by later cells.
# random_state pins the shuffle so the folds are reproducible across runs.
kf = list(KFold(n_splits=5, shuffle=True, random_state=42).split(modeldata))

In [None]:
# Manual K-fold cross-validation: fit a fresh LinearRegression on each training
# fold and record its MSE and R2 on the corresponding held-out fold.
mse_values = []
scores = []
print( "~~~~ CROSS VALIDATION each fold ~~~~")
for n, (train_index, test_index) in enumerate(kf, start=1):
    # The fold indices are positional, hence `.iloc`.
    X_fold_train, y_fold_train = modeldata.iloc[train_index], y.iloc[train_index]
    X_fold_test, y_fold_test = modeldata.iloc[test_index], y.iloc[test_index]

    lm = LinearRegression().fit(X_fold_train, y_fold_train)
    mse_values.append(mean_squared_error(y_fold_test, lm.predict(X_fold_test)))
    # Score R2 on the held-out fold only. Scoring on the full dataset would
    # include the rows the model was trained on and overstate performance.
    scores.append(lm.score(X_fold_test, y_fold_test))
    print( 'Model', n)
    print( 'MSE:', mse_values[-1])
    print( 'R2:', scores[-1])

print( "~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print( 'Mean of MSE for all folds:', np.mean(mse_values))
print( 'Mean of R2 for all folds:', np.mean(scores))

### Grid Search

Now, suppose we are using our regularized methods here.  We want to be able to also experiment with parameters and determine something like our ideal value for $\alpha$ in the `Ridge()` model.  We can feed a list of values to the `GridSearchCV` and it will run through the possible combinations of these using cross validation.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid: every combination of regularisation strength `alpha`
# and `fit_intercept` setting listed here is tried by the grid search.
param_grid = [
    {
        'alpha': [1, 5, 10, 40],
        'fit_intercept': [True, False],
    },
]

In [None]:
# Grid search over `param_grid` for a Ridge model, using 5-fold
# cross-validation and negated MSE as the selection metric.
ridge = Ridge()
grid = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
# Run the grid search on the full feature frame and target.
grid.fit(X=modeldata, y=y)

### Best Fit and Parameters

Now, we can investigate the model performance.  The `best_params_` will give us the ideal parameters from the search, `best_estimator_` gives the full model information, and `cv_results_` will give us full information including performance for each individual model attempt.

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
grid.best_params_

In [None]:
# The estimator configured with the best parameter combination.
grid.best_estimator_

In [None]:
# Keep the grid search's full cross-validation results for inspection.
results = grid.cv_results_

In [None]:
# Pair every mean test score with the parameters that produced it.
# Read straight from `grid.cv_results_` so the cell can be re-run: it no longer
# depends on `results` still being the dict from the previous cell.
# Materialise as a list so the pairs can be iterated more than once; a bare
# `zip` is exhausted after a single pass.
results = list(zip(grid.cv_results_['mean_test_score'], grid.cv_results_['params']))

In [None]:
# Report the RMSE of each candidate. Scores are negated MSE, so flip the sign
# before taking the square root.
for neg_mse, candidate_params in results:
    candidate_rmse = np.sqrt(-neg_mse)
    print(candidate_rmse, candidate_params)

### Split, Search, Evaluate

In [None]:
# Train/test split.
# random_state pins the shuffle so the split, and every score computed from it
# below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(modeldata, y, random_state=42)

In [None]:
# Grid search, this time on the training split only so the test split stays
# unseen during model selection.
grid.fit(X=X_train, y=y_train)

In [None]:
# Examine results: print the RMSE of every parameter combination tried.
cv_table = grid.cv_results_
results = zip(cv_table['mean_test_score'], cv_table['params'])
for neg_mse, candidate_params in results:
    # Scores are negated MSE; negate back before taking the root.
    print(np.sqrt(-neg_mse), candidate_params)

In [None]:
# Best estimator found by the grid search that was fit on the training split.
grid.best_estimator_

### Evaluate on Test

In [None]:
# Build the final model from the parameters the grid search selected, instead
# of hard-coding values copied from an earlier run's output. This also avoids
# passing `normalize=`, which current scikit-learn's Ridge no longer accepts.
model = Ridge(**grid.best_params_)

In [None]:
# Fit on the training split. Fitting on the test split would leak the
# evaluation data into the model and make the test metrics below meaningless.
model.fit(X_train, y_train)

In [None]:
# R^2 of the model on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Predict on the test split and compare against the true targets.
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)

In [None]:
# Root of the test MSE, which puts the error back in the units of the target.
rmse = np.sqrt(mse)

In [None]:
# Display the test-set RMSE.
rmse

### California Housing

The dataset is the California Housing data (a version of which is also available through sklearn's dataset library); here it is loaded from a local CSV file.  We want to perform an end-to-end modeling project where we predict the median house value in a district.  You should follow a workflow similar to what we've encountered to this point, which may go something like:

- Investigate variable types and distributions
- Transformations?
- Collinearity?
- New Features?
  - Perhaps something like Rooms per Household, Bedrooms per Room, Population per Household
- Missing Values?
  - Examine total bedrooms; drop the districts, delete the attribute, or fill values with a sensible number (like the median) using `.fillna()`
- Encode any Categorical Variables
- Feature Scaling? (`MinMaxScaler` or `StandardScaler`)
- Prepare numerical data
- Split Train and Test Set
- Use Cross Validation and Grid Search to explore models on training set
- Determine best model and evaluate on Test set
- Communicate your results including a visualization of the locations of the houses, colored by their median house value and sized by population (`plt.scatter`)

In [None]:
# Load the California housing data from the local CSV file.
cali_houses = pd.read_csv(filepath_or_buffer='data/cali_housing.csv')