# Bias-variance lab

In this lab you'll explore how bias and variance changes using a dataset on college statistics.

---

In [75]:
import numpy as np
import scipy 
import seaborn as sns
import pandas as pd
import patsy

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.cross_validation import cross_val_score as CVS, KFold, train_test_split

import matplotlib
import matplotlib.pyplot as plt

from random import Random
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.style.use('fivethirtyeight')


---

### Load data

Feel free to choose a target variable on your own. I chose "Grad.Rate" as my target variable but it's not required.

You'll want to discard the name of the college, and if you're planning on using the "Private" variable it will have to be changed into 1s and 0s rather than yes/no.

In [51]:
college = pd.read_csv('/Users/tlee010/desktop/DSI-SF-2-timdavidlee/datasets/college_stats/College.csv')

In [52]:
print college.shape
college.head()



(777, 19)


Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [53]:

# split the variables
y = college['Grad.Rate'].copy()
x = college.iloc[:,range(1,len(college.columns)-1)].copy()

x['Private'].replace(to_replace=['Yes','No'], value = [1,0], inplace = True)



scalar = StandardScaler()
columns = x.columns
x.loc[:,columns] = scalar.fit_transform(x[columns])
x.head()



Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend
0,0.612553,-0.346882,-0.321205,-0.063509,-0.258583,-0.191827,-0.168116,-0.209207,-0.746356,-0.964905,-0.602312,1.270045,-0.163028,-0.115729,1.013776,-0.867574,-0.50191
1,0.612553,-0.210884,-0.038703,-0.288584,-0.655656,-1.353911,-0.209788,0.244307,0.457496,1.909208,1.21588,0.235515,-2.675646,-3.378176,-0.477704,-0.544572,0.16611
2,0.612553,-0.406866,-0.376318,-0.478121,-0.315307,-0.292878,-0.549565,-0.49709,0.201305,-0.554317,-0.905344,-0.259582,-1.204845,-0.931341,-0.300749,0.585935,-0.17729
3,0.612553,-0.668261,-0.681682,-0.692427,1.840231,1.677612,-0.658079,-0.520752,0.626633,0.996791,-0.602312,-0.688173,1.185206,1.175657,-1.615274,1.151188,1.792851
4,0.612553,-0.726176,-0.764555,-0.780735,-0.655656,-0.596031,-0.711924,0.009005,-0.716508,-0.216723,1.518912,0.235515,0.204672,-0.523535,-0.553542,-1.675079,0.241803


---

### Cross-validate a linear regression predicting your target variable from the other variables

How does it perform?

In [54]:
lm = LinearRegression()
score = CVS(lm, x,y,cv = 10)
print score.mean(), 'all scores: ', score

0.397422563045 all scores:  [ 0.44921644  0.35875931  0.53650887  0.48549145  0.17445097  0.38775448
  0.17393223  0.43964213  0.61095627  0.3575135 ]


---

### Create a function that will iteratively predict your target from different train-test splits

This will be used to calculate the bias and the variance after this.

Your function should:

1. Accept a model, X predictor matrix/dataframe, y target variable, and a number of random splits to do training and testing on.
2. The output should be a dataframe that has as its first column the true values of y, and all the other columns will be corresponding predicted values of y when that row was in the testing set.
3. It will iterate through the number of splits
4. Create a variable that is the list of row numbers. Use this with `train_test_split` to get out randomized training rows and testing rows for each iteration.
5. Subset your X and y into training and testing
6. Train your model on the training X and training y
7. Predict values of y using the testing X
8. Add the predicted values of y to the dataframe tracking y predictions - the predicted y values should be insert in the correct row so that they match the true value of y in the first column. You can index using the test indices you got out of train_test_split to do this. (The rest of the rows that were part of the training set can be nan for that iteration).


In [127]:

def multiple_pred(model, x, y ,random_splits_num ):
    all_y_pred = pd.DataFrame({'ytrue':y})

    #row indicies
    rows = range(len(y))

    for i in range(random_splits_num):
        train_rows_ind, test_rows_ind = train_test_split(rows, test_size=0.3)

        x_train = x.iloc[train_rows_ind,:]
        y_train = y.iloc[train_rows_ind]
        x_test = x.iloc[test_rows_ind,:]
        y_test = y.iloc[test_rows_ind]
        model.fit(x_train, y_train)
        y_hat = model.predict(x_test)

        all_y_pred['sample'+str(i)]  = np.nan
        all_y_pred.iloc[test_rows_ind, -1]  = y_hat

    return all_y_pred

print multiple_pred(lm,x,y,500)

     ytrue    sample0    sample1    sample2    sample3    sample4    sample5  \
0       60        NaN        NaN        NaN        NaN        NaN  56.751501   
1       56  58.885975        NaN  60.482685        NaN  62.233643        NaN   
2       54        NaN  68.043857  66.868737  68.381487        NaN        NaN   
3       59  78.587419        NaN  75.319267  78.104097  77.447515        NaN   
4       15        NaN        NaN  52.679777        NaN        NaN        NaN   
5       55        NaN  68.494207        NaN        NaN        NaN  65.151815   
6       63        NaN        NaN  71.793961        NaN  69.703579  70.338381   
7       73  78.965543        NaN        NaN  77.564717  77.690302        NaN   
8       80  72.172789  74.337398        NaN        NaN  73.171224        NaN   
9       52        NaN  61.792698  57.738831  60.097717        NaN  56.837028   
10      73        NaN  81.061370  79.683956        NaN        NaN        NaN   
11      76        NaN  83.149737        

---

### Create different predictor datasets

To see what happens to bias and variance as the predictors change, create a few versions of X that have different numbers of predictors in them.

For example, one could have all the other variables, and another one could be predicting only using private vs. public.

In [123]:
rn = Random()

print len(x.columns), x.columns
#random.sample(xrange(len(mylist)), sample_size)
col1 = rn.sample(xrange(len(x.columns)), rn.randint(2,len(x.columns)))
col2 = rn.sample(xrange(len(x.columns)), rn.randint(2,len(x.columns)))
col3 = rn.sample(xrange(len(x.columns)), rn.randint(2,len(x.columns)))
col4 = rn.sample(xrange(len(x.columns)), rn.randint(2,len(x.columns)))
print col1, col2,col3,col4
x1 = x.iloc[:,col1].copy()
x2 = x.iloc[:,col2].copy()
x3 = x.iloc[:,col3].copy()
x4 = x.iloc[:,col4].copy()

print 'x1:',x1.columns
print 'x2:',x2.columns
print 'x3:',x3.columns
print 'x4:',x4.columns


17 Index([u'Private', u'Apps', u'Accept', u'Enroll', u'Top10perc', u'Top25perc',
       u'F.Undergrad', u'P.Undergrad', u'Outstate', u'Room.Board', u'Books',
       u'Personal', u'PhD', u'Terminal', u'S.F.Ratio', u'perc.alumni',
       u'Expend'],
      dtype='object')
[1, 6, 14, 13, 8, 11, 10, 12, 9] [15, 16, 0, 11, 6, 8, 4, 14, 12, 7, 3, 2, 5, 9] [3, 8, 15, 7, 5, 9, 16, 6, 13, 2, 4] [13, 0, 10, 11, 16, 14, 3]
x1: Index([u'Apps', u'F.Undergrad', u'S.F.Ratio', u'Terminal', u'Outstate',
       u'Personal', u'Books', u'PhD', u'Room.Board'],
      dtype='object')
x2: Index([u'perc.alumni', u'Expend', u'Private', u'Personal', u'F.Undergrad',
       u'Outstate', u'Top10perc', u'S.F.Ratio', u'PhD', u'P.Undergrad',
       u'Enroll', u'Accept', u'Top25perc', u'Room.Board'],
      dtype='object')
x3: Index([u'Enroll', u'Outstate', u'perc.alumni', u'P.Undergrad', u'Top25perc',
       u'Room.Board', u'Expend', u'F.Undergrad', u'Terminal', u'Accept',
       u'Top10perc'],
      dtype='object')
x4:

---

### Use the predict function you wrote above to get the predicted values for each version of the data

Run each of your X through the function with the y target vector. As you recall the output of your function has the true values of y in a column and then predicted values of y in other columns for the different train-test splits

In [124]:
yhat1 = multiple_pred(lm,x1,y,150)
yhat2 = multiple_pred(lm,x2,y,150)
yhat3 = multiple_pred(lm,x3,y,150)
yhat4 = multiple_pred(lm,x4,y,150)



---

### Calculate bias and variance 

I've given you two functions below to calculate bias and variance if they are given the dataframe that has the first column as the true y values and the other column the predicted y values at each train/test split iteration.

You can use these to calculate the bias and variance of your different predictor variables. If you have more predictors variance of prediction should generally go up and bias goes down. Likewise, if you have few predictors variance should go down and bias goes up.

If you have an insanely bad model, they both might go up a lot!

In [109]:
def calculate_bias_sq(yhats_df):
    # Take out the true values of y that are in the first column:
    ytrue = yhats_df.iloc[:,0].values
    
    # Calculate the mean of the predictions, averaged across the columns.
    # So, all of the predictions for the true y at row 0 would be averaged together
    # and so on for all the rows.
    yhat_means = yhats_df.iloc[:,1:].mean(axis=1).values
    
    # Subtract the true value of y from the mean of the predicted values, and square it.
    elementwise_bias_sq = (yhat_means - ytrue)**2
    
    # Take the mean of those squared bias values (across all y)
    mean_bias_sq = np.mean(elementwise_bias_sq)
    return mean_bias_sq


def calculate_variance(yhats_df):
    # Calculate the mean of the predicted y's across the columns (mean of yhat for each row)
    yhats_means = yhats_df.iloc[:,1:].mean(axis=1)
    
    # subtract the mean of the yhats from the original yhat values (for each row)
    # and square the result. 
    yhats_devsq = yhats_df.iloc[:,1:].subtract(yhats_means, axis=0)**2
    
    # Take the mean of the squared deviations from the mean, then 
    # take the mean of those to get the overall variance across the y observations
    yhats_devsq_means = yhats_devsq.mean(axis=1).values
    return np.mean(yhats_devsq_means)


In [125]:
from pprint import pprint

print '\n',calculate_bias_sq(yhat1), calculate_variance(yhat1), len(x1.columns)
print '\n',calculate_bias_sq(yhat2), calculate_variance(yhat2), len(x2.columns)
print '\n',calculate_bias_sq(yhat3), calculate_variance(yhat3), len(x3.columns)
print '\n',calculate_bias_sq(yhat4), calculate_variance(yhat4), len(x4.columns)


191.354776293 1.28799586649 9

169.290422598 1.91336443027 14

168.877598962 1.47244525248 11

213.915816147 1.21850226523 7


---

### How does regularization affect bias and variance?

Use Lasso and/or Ridge on your dataset with all the predictor variables. You can feed the lasso or ridge model into the function you wrote earlier to get the predictions using regularization instead of just ordinary least squares regression.

How does using regularization affect bias and variance?

In [140]:
ridgeCVml = RidgeCV(alphas = (0.1,1,10,100,1000,10000,100000,1000000), cv = 8)
modelrm = ridgeCVml.fit(x,y)
print modelrm.score(x,y)
optimal_alpha = modelrm.alpha_
print optimal_alpha
#row indicies

ridgeml = Ridge(alpha = optimal_alpha)

yhat1_ridge = multiple_pred(ridgeml, x1, y ,150 )
yhat2_ridge = multiple_pred(ridgeml, x2, y ,150 )
yhat3_ridge = multiple_pred(ridgeml, x3, y ,150 )
yhat4_ridge = multiple_pred(ridgeml, x4, y ,150 )


print '\n',calculate_bias_sq(yhat1_ridge), calculate_variance(yhat1_ridge), len(x1.columns)
print '\n',calculate_bias_sq(yhat2_ridge), calculate_variance(yhat2_ridge), len(x2.columns)
print '\n',calculate_bias_sq(yhat3_ridge), calculate_variance(yhat3_ridge), len(x3.columns)
print '\n',calculate_bias_sq(yhat4_ridge), calculate_variance(yhat4_ridge), len(x4.columns)




0.456305952059
100

192.043515262 0.685230567819 9

167.225972157 0.744256072665 14

168.364827798 0.540687085884 11

214.519297194 0.731155007372 7


In [None]:
print multiple_pred_ridge