# Guided Practice/Demo

The following code samples are taken directly from the lesson and should serve as a jumping-off point for students to run the code on their own.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics
%matplotlib inline

# Baseline frame: columns x and y both hold 0..99, i.e. a perfect y = x line.
# Two independent copies are taken up front so each one can be altered
# separately without touching the baseline.
df = pd.DataFrame({axis: range(100) for axis in ('x', 'y')})
biased_df, varied_df = df.copy(), df.copy()

def append_jitter(series, scale=1):
    """Return the values of ``series`` with random noise added to each one.

    Every value gets its own draw from ``np.random.random_sample`` (uniform
    on ``[0, 1)``) multiplied by ``scale``.

    Parameters
    ----------
    series : sized iterable of numbers
        For example a pandas Series or a plain list.
    scale : number, default 1
        Multiplier applied to each random draw.

    Returns
    -------
    list
        One jittered value per input element, in input order.
    """
    jitter = np.random.random_sample(size=len(series))
    # Pair values with their noise positionally. Indexing ``series[i]`` is a
    # label lookup on a pandas Series and breaks when the index is not the
    # default 0..n-1; ``xrange`` is also unavailable on Python 3.
    return [value + noise * scale for value, noise in zip(series, jitter)]

# Seed NumPy's global generator so the jittered samples, and the plots and
# scores computed from them, are the same on every run.
np.random.seed(42)

# Baseline: unit-scale jitter on both axes of the y = x line.
df['x'] = append_jitter(df.x)
df['y'] = append_jitter(df.y)

# Biased: y is x squared, rescaled by the largest baseline y, then jittered.
biased_df['y'] = biased_df['x'] ** 2 / df.y.max()
biased_df['x'] = append_jitter(biased_df.x)
biased_df['y'] = append_jitter(biased_df.y)

# Varied: same line as the baseline, but the y jitter is 20 times larger.
jitter_scale = 20
varied_df['x'] = append_jitter(varied_df.x)
varied_df['y'] = append_jitter(varied_df.y, scale=jitter_scale)

# One regression plot per dataset, side by side on a shared y axis.
fig, ax = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
# x, y and data are passed by keyword: recent seaborn releases no longer
# accept them positionally.
for axis, frame in zip(ax, (df, biased_df, varied_df)):
    sns.regplot(x='x', y='y', data=frame, ax=axis)

# Fit one simple linear regression (y on x) per dataset.
## fit
lm = linear_model.LinearRegression().fit(df[['x']], df['y'])
## biased fit
lmb = linear_model.LinearRegression().fit(biased_df[['x']], biased_df['y'])
## varied fit
lmv = linear_model.LinearRegression().fit(varied_df[['x']], varied_df['y'])

# Report R2 for each model on the data it was fitted to.
print('{} : R2 for df'.format(lm.score(df[['x']], df['y'])))
print('{} : R2 for biased_df'.format(lmb.score(biased_df[['x']], biased_df['y'])))
# The varied_df line must use the varied model and the varied features; it
# previously scored lmb on biased_df's x against varied_df's y.
print('{} : R2 for varied_df'.format(lmv.score(varied_df[['x']], varied_df['y'])))

In [None]:
# Mean squared error of each model, evaluated on its matching dataset.
mse_reports = [
    ('{} : MSE for df using lm', df, lm),
    ('{} : MSE for biased_df using lmb', biased_df, lmb),
    ('{} : MSE for varied_df using lmv', varied_df, lmv),
]
for template, frame, model in mse_reports:
    predicted = model.predict(frame[['x']])
    print(template.format(metrics.mean_squared_error(frame['y'], predicted)))

In [None]:
# Apply the model held in lm to the biased data and report its error there.
biased_predictions = lm.predict(biased_df[['x']])
biased_mse = metrics.mean_squared_error(biased_df['y'], biased_predictions)
print('{} : MSE for biased_df using lm'.format(biased_mse))

In [None]:
# Residual plot for each of the three datasets.
# x and y are passed by keyword: recent seaborn releases reject positional
# data arguments, while the keyword form works on older releases too.
for frame in (df, biased_df, varied_df):
    sns.jointplot(x=frame.x, y=frame.y, kind='resid')

In [None]:
from sklearn import cross_validation
# Load the bikeshare data from the shared assets directory.
wd = '../../assets/dataset/'
bikeshare = pd.read_csv(wd + 'bikeshare.csv')

# Features: temp and hum plus three of the weathersit indicator columns.
# Target: the casual column.
weather = pd.get_dummies(bikeshare.weathersit, prefix='weather')
weather_columns = ['weather_1', 'weather_2', 'weather_3']
modeldata = bikeshare[['temp', 'hum']].join(weather[weather_columns])
y = bikeshare.casual

# 5-fold cross-validation (no shuffling): fit on the training rows of each
# fold and record the mean squared error on that fold's held-out rows.
kf = cross_validation.KFold(len(modeldata), n_folds=5)
scores = []
for train_index, test_index in kf:
    train_X, train_y = modeldata.iloc[train_index], y.iloc[train_index]
    test_X, test_y = modeldata.iloc[test_index], y.iloc[test_index]
    lm = linear_model.LinearRegression().fit(train_X, train_y)
    scores.append(metrics.mean_squared_error(test_y, lm.predict(test_X)))

print(np.mean(scores))

# Refit on every row and score on those same rows. This error is expected to
# be lower than the cross-validated mean because the model has already seen
# the data it is scored on; the cross-validated figure is the better guide to
# error on unseen data.
lm = linear_model.LinearRegression().fit(modeldata, y)
print(metrics.mean_squared_error(y, lm.predict(modeldata)))

In [None]:
# Same 5-fold evaluation as before, but with the rows shuffled before they
# are split into folds.
kf = cross_validation.KFold(len(modeldata), n_folds=5, shuffle=True)
scores = []
for train_index, test_index in kf:
    train_X, train_y = modeldata.iloc[train_index], y.iloc[train_index]
    test_X, test_y = modeldata.iloc[test_index], y.iloc[test_index]
    lm = linear_model.LinearRegression().fit(train_X, train_y)
    scores.append(metrics.mean_squared_error(test_y, lm.predict(test_X)))

print(np.mean(scores))

In [None]:
# In-sample mean squared error for plain least squares, Lasso and Ridge,
# each constructed with its default settings and fitted on the full data.
for estimator in (linear_model.LinearRegression(),
                  linear_model.Lasso(),
                  linear_model.Ridge()):
    lm = estimator.fit(modeldata, y)
    print(metrics.mean_squared_error(y, lm.predict(modeldata)))

In [None]:
# Sweep the Ridge penalty over 21 log-spaced values from 1e-10 to 1e10,
# printing the fitted coefficients and in-sample MSE for each alpha.
alphas = np.logspace(-10, 10, 21)
for a in alphas:
    print('{} {}'.format('Alpha:', a))
    lm = linear_model.Ridge(alpha=a).fit(modeldata, y)
    in_sample_mse = metrics.mean_squared_error(y, lm.predict(modeldata))
    print('                {}'.format(lm.coef_))
    print('{}\n'.format(in_sample_mse))

In [None]:
from sklearn import grid_search

# Let GridSearchCV pick the Ridge alpha from the same 21 log-spaced values,
# scored with the 'mean_squared_error' scorer.
alphas = np.logspace(-10, 10, 21)
ridge_grid = {'alpha': alphas}
gs = grid_search.GridSearchCV(
    estimator=linear_model.Ridge(),
    param_grid=ridge_grid,
    scoring='mean_squared_error',
)
gs.fit(modeldata, y)

# The scorer reports mean squared error as a negative number, so flip the
# sign to show it as a positive error.
print('best score: {}'.format(-gs.best_score_))
# The estimator configuration that performed best in the search.
print('best estimator: {}'.format(gs.best_estimator_))
# Every candidate in the grid together with its performance.
for entry in gs.grid_scores_:
    print(entry)

In [None]:
# Toy greedy search: starting from `start`, repeatedly try each offset in
# `steps` and move to any candidate that is closer to `num_to_approach`.
# Stop once no candidate improves on the current position.
num_to_approach, start, steps, optimized = 6.2, 0., [-1, 1], False
while not optimized:
    # Use the absolute distance so the comparison below is also correct when
    # the target lies below the current position.
    current_distance = np.abs(num_to_approach - start)
    got_better = False
    next_steps = [start + i for i in steps]
    for n in next_steps:
        distance = np.abs(num_to_approach - n)
        if distance < current_distance:
            got_better = True
            print('{} is better than {}'.format(distance, current_distance))
            current_distance = distance
            start = n
    if got_better:
        # Keep searching from the improved position. (A stray `a += 1` used
        # to sit here; `a` is not defined in this cell.)
        print('found better solution! using {}'.format(current_distance))
    else:
        optimized = True
        print('{} is closest to {}'.format(start, num_to_approach))


In [None]:
# Fit a default SGDRegressor on the full data, then report its score and
# its in-sample mean squared error.
sgd = linear_model.SGDRegressor()
lm = sgd.fit(modeldata, y)
print(lm.score(modeldata, y))
print(metrics.mean_squared_error(y, lm.predict(modeldata)))

# Independent Practice

Use the following code to work through the problems given.

In [None]:
params = {}  # put your gradient descent parameters here

# Grid-search an SGDRegressor over `params` using shuffled 5-fold
# cross-validation and the 'mean_squared_error' scorer.
shuffled_folds = cross_validation.KFold(len(modeldata), n_folds=5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=linear_model.SGDRegressor(),
    param_grid=params,
    scoring='mean_squared_error',
    cv=shuffled_folds,
)
gs.fit(modeldata, y)

print('BEST ESTIMATOR')
print(-gs.best_score_)  # sign flipped so the error prints as a positive number
print(gs.best_estimator_)
print('ALL ESTIMATORS')
print(gs.grid_scores_)