In this exercise we are going to use Boston Data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [9]:
# Read the Boston housing CSV and discard the 'Unnamed: 0' column that comes with the file.
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/Boston.csv"
BostonData = pd.read_csv(url).drop('Unnamed: 0', axis=1)
BostonData.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [10]:
# Response is 'medv'; the predictors are every other column of the frame.
y = BostonData['medv']
X = BostonData.drop('medv', axis=1)

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest - Regression Problems

#### For documentation of RandomForestRegressor please refer to:
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [12]:
# Regression forest. If max_features were left unset, the model would be bagging.
RF = RandomForestRegressor(
    n_estimators=100,      # number of trees - the more the better
    max_features=4,        # how many features to randomly choose at each node
    min_samples_leaf=5,    # minimum number of observations at each terminal node
    oob_score=True,        # record the out-of-bag score
)

RF.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=4, max_leaf_nodes=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [13]:
# (importance, feature name) pairs, smallest importance first
sorted(zip(RF.feature_importances_, X.columns))

[(0.0017553561260160028, 'chas'),
 (0.0028028909611928056, 'zn'),
 (0.0075010214467878455, 'rad'),
 (0.013860830130932977, 'black'),
 (0.017600113373567049, 'age'),
 (0.02211963333669479, 'tax'),
 (0.045951209938868844, 'crim'),
 (0.05201391460903354, 'dis'),
 (0.069872961261913946, 'indus'),
 (0.069906878705695966, 'nox'),
 (0.074804807981179694, 'ptratio'),
 (0.31041606800687283, 'lstat'),
 (0.31139431412124391, 'rm')]

In [14]:
RF.oob_score_  # out-of-bag score of the fitted forest; read it like R^2 (the larger the better)
# NOTE(review): the original note here was cut off mid-sentence ("R^2 is only good for a ...") - complete it or remove it.

0.83897469883790032

#### A time-consuming way to tune your model: cross-validating over m, the number of features tried at each node (`max_features`)

In [16]:
from sklearn.cross_validation import cross_val_score

# Tune max_features with 10-fold cross-validation: one forest per candidate value,
# scored by the RMSE averaged over the folds.
# Remember: in our previous lecture we could only achieve an error of 5.2.
# Here we reduced it to 4.1.
Features = list(range(1, 14))   # materialised so the values go into the DataFrame as a plain list
score = []
for i in Features:
    # oob_score is left off here: only the cross-validated scores are used in this
    # cell, so an out-of-bag estimate would be computed on every fit and never read.
    RF = RandomForestRegressor(n_estimators = 100,    # number of trees - the more the better
                               max_features = i,      # how many features to randomly choose at each node
                               min_samples_leaf = 5)  # minimum number of observations at each terminal node
    scores = cross_val_score(RF, X, y, cv=10, scoring='mean_squared_error')
    # the scores are negated before the square root to turn them into per-fold RMSE
    score.append(np.mean(np.sqrt(-scores)))

Depth_Choice_df = pd.DataFrame({'cv_scores': score ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'cv_scores' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1120a0ad0>

#### Let's check whether using more than 1000 trees improves the model. (This may take a few minutes to run)

In [17]:
from sklearn.cross_validation import cross_val_score

# Compare forest sizes with 10-fold cross-validation, keeping max_features fixed at 7.
NumberOfTrees = [1000,1500,2000]
score = []
for i in NumberOfTrees:
    # oob_score is left off here: only the cross-validated scores are used in this
    # cell, so an out-of-bag estimate would be computed on every fit and never read.
    RF = RandomForestRegressor(n_estimators = i,      # number of trees - the more the better
                               max_features = 7,      # how many features to randomly choose at each node
                               min_samples_leaf = 5)  # minimum number of observations at each terminal node
    scores = cross_val_score(RF, X, y, cv=10, scoring='mean_squared_error')
    # the scores are negated before the square root to turn them into per-fold RMSE
    score.append(np.mean(np.sqrt(-scores)))

Depth_Choice_df = pd.DataFrame({'cv_scores': score ,'Number of Trees': NumberOfTrees })
Depth_Choice_df.plot(x ='Number of Trees',y = 'cv_scores' )
# Look at the scale of the CV error axis - the changes are so small that we
# don't see much improvement after 1000 trees.

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1127799d0>

#### Using Out of Bag error to tune our models. This is so much faster than CV!

In [18]:
from sklearn.cross_validation import cross_val_score

# Same search over max_features, scored with the out-of-bag estimate instead of CV:
# every candidate forest is fitted once on the full data.
Features = range(1,14)
oob_score_RF = []
for i in Features:
    RF = RandomForestRegressor(
        n_estimators=1000,     # number of trees - the more the better
        max_features=i,        # how many features to randomly choose at each node
        min_samples_leaf=5,    # minimum number of observations at each terminal node
        oob_score=True,
    ).fit(X, y)
    oob_score_RF += [RF.oob_score_]

Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x='Number of Features', y='Out of Bag Score')

# The out-of-bag score is R^2, so larger is better. Again the model works best
# with roughly 6 features - almost the same story as CV, in much less time.

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1157b2b90>

# Random Forest - Categorical Variables

#### Let's first categorize medv into 4 groups.
Lowest 20% medv is categorized as Level 1

next 30% medv is categorized as Level 2

next 30% medv is categorized as Level 3

Top 20% medv is categorized as Level 4

In [19]:
# Bucket medv into four levels stored as the strings '1'..'4', using its
# 20th, 50th and 80th percentiles as the cut points.
medv = BostonData['medv']
q20, q50, q80 = (medv.quantile(q) for q in (.2, .5, .8))

mask_cat2 = (medv > q20) & (medv <= q50)
mask_cat3 = (medv > q50) & (medv <= q80)
mask_cat4 = medv > q80

# Every row starts at level '1'; rows matching a mask are then moved up.
BostonData['medvCategory'] = '1'
for level, mask in (('2', mask_cat2), ('3', mask_cat3), ('4', mask_cat4)):
    BostonData.loc[mask, 'medvCategory'] = level

In [20]:
y = BostonData['medvCategory']  # the target is now the four-level medv category (strings '1'..'4')

Documentation for RandomForestClassifier http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Classification forest on the medv categories, scored two ways:
# the out-of-bag score and the mean score over 10 cross-validation folds.
RFClass = RandomForestClassifier(
    n_estimators=1000,
    max_features='sqrt',   # can be a number or 'sqrt', 'log2', etc.
    min_samples_leaf=5,
    oob_score=True,
).fit(X, y)
print(RFClass.oob_score_)

scores = cross_val_score(RFClass, X, y, cv=10)
print(np.mean(scores))
# out-of-bag error = 25%, CV error is 35%

0.752964426877
0.644275786367


In [23]:
from sklearn.cross_validation import cross_val_score

# Out-of-bag search over max_features for the classification forest.
Features = range(1,14)
oob_score_RF = []
for i in Features:
    RFClass = RandomForestClassifier(
        n_estimators=1000,     # number of trees - the more the better
        max_features=i,        # how many features to randomly choose at each node
        min_samples_leaf=5,    # minimum number of observations at each terminal node
        oob_score=True,
    ).fit(X, y)
    oob_score_RF += [RFClass.oob_score_]

Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x='Number of Features', y='Out of Bag Score')

# It looks like choosing max_features = 5 is our best bet.

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x115fbab50>

In [24]:
# Predicted categories for the same X the forest was fitted on (in-sample predictions).
# NOTE(review): RFClass is whichever forest the tuning loop fitted last
# (max_features = 13), not the max_features = 5 choice - confirm this is intended.
y_hat = RFClass.predict(X)
#print(y_hat)

# Boosting Algorithm - Regression

For full documentation, please refer to: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor

In [25]:
y = BostonData['medv']  # back to the numeric medv response for the regression examples


In [26]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline boosted regression trees, scored by the RMSE averaged over 10 CV folds.
GBR_Tree = GradientBoostingRegressor(
    learning_rate=0.01,    # lambda, a tuning parameter, usually between 0.01 and 0.1
    n_estimators=1000,     # B, a tuning parameter; a large B can cause overfitting
    max_depth=2,           # d, another tuning parameter, usually max_depth < 5
    min_samples_leaf=5,
)
scores = cross_val_score(GBR_Tree, X, y, cv=10, scoring='mean_squared_error')
# The author's note recorded an error of 4.1 here; the saved output shows about 3.91.
print(np.sqrt(-scores).mean())

3.91443148442


#### Tuning 3 parameters simultaneously is a very time-consuming task. Let's tune them step by step via a semi-greedy approach

In [27]:
# Step 1 of the greedy search: vary max_depth with the learning rate and the
# number of trees held fixed, scoring each setting by its mean CV RMSE.
Depth = range(1,5)
Score = []
for i in Depth:
    GBR_Tree = GradientBoostingRegressor(
        learning_rate=0.01,    # lambda, a tuning parameter, usually between 0.01 and 0.1
        n_estimators=1000,     # B, a tuning parameter; a large B can cause overfitting
        max_depth=i,           # d, another tuning parameter, usually max_depth < 5
        min_samples_leaf=5,
    )
    scores = cross_val_score(GBR_Tree, X, y, cv=10, scoring='mean_squared_error')
    Score += [np.sqrt(-scores).mean()]

Depth_Choice_df = pd.DataFrame({'CV_Error': Score ,'Max_Depth': Depth})
Depth_Choice_df.plot(x='Max_Depth', y='CV_Error')

# It seems like Max_Depth = 2 gives us the lowest CV error.

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11687a450>

In [28]:
# Step 2 of the greedy search: keep max_depth = 2 and vary the number of trees.
NumberOfTrees = [1000, 1500, 2000]
Score = []
for i in NumberOfTrees:
    GBR_Tree = GradientBoostingRegressor(
        learning_rate=0.01,    # lambda, a tuning parameter, usually between 0.01 and 0.1
        n_estimators=i,        # B, a tuning parameter; a large B can cause overfitting
        max_depth=2,           # d, another tuning parameter, usually max_depth < 5
        min_samples_leaf=5,
    )
    scores = cross_val_score(GBR_Tree, X, y, cv=10, scoring='mean_squared_error')
    Score += [np.sqrt(-scores).mean()]

Depth_Choice_df = pd.DataFrame({'CV_Error': Score ,'Number Of Trees': NumberOfTrees})
Depth_Choice_df.plot(x='Number Of Trees', y='CV_Error')

# It seems like at 1000 trees we have reduced our error to 3.8, which is great!
# You can localize your search even further, for example by trying sizes from
# 500 to 5000 in increments of 100.
# Remember that the learning rate and the number of trees should be tuned
# together: lower learning rates require more trees.

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x116bb73d0>

# Boosting Algorithm - Classification

Please refer to the following link for its documentation: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [29]:
y = BostonData['medvCategory']  # switch the target back to the four-level medv category for classification

In [32]:
from sklearn.ensemble import GradientBoostingClassifier


# Accuracy is measured below with an explicit 10-fold loop over shuffled folds.
# NOTE(review): the original note said cross_val_score cannot be used because it
# returns log-loss. With scoring left at its default it presumably reports the
# classifier's accuracy, as the RandomForestClassifier cell in this notebook
# relies on - confirm before keeping that claim.

from sklearn import cross_validation
def get_cross_val(X, y, GBC_Tree):
    """Return the mean 10-fold cross-validated accuracy of ``GBC_Tree``.

    Parameters
    ----------
    X : DataFrame of predictors; rows are selected by position with ``.iloc``.
    y : Series of class labels aligned row-for-row with ``X``.
    GBC_Tree : classifier exposing ``fit`` and ``predict``. It is refitted on
        every training fold, so it comes back fitted on the last fold.

    Returns
    -------
    The average over the folds of the fraction of held-out rows whose
    predicted label equals the true label.
    """
    # Size the folds from the data that was passed in, not from the global BostonData frame.
    kf = cross_validation.KFold(len(X), n_folds = 10, shuffle = True) #10 fold CV
    scores = []
    for train_index, test_index in kf:
        GBC_Tree.fit(X.iloc[train_index], y.iloc[train_index])
        y_hat_test = GBC_Tree.predict(X.iloc[test_index])
        # fraction of correctly predicted labels in this held-out fold
        scores.append(np.mean(y_hat_test == y.iloc[test_index].values))
    return np.mean(scores)

# The mean score only exists inside get_cross_val, so build a model and call the
# function before printing. The original settings are not shown in the notebook;
# this baseline mirrors the ones used for the boosted regressor (assumed).
GBC_Tree = GradientBoostingClassifier(learning_rate = 0.01,
                                      n_estimators = 1000,
                                      max_depth = 2,
                                      min_samples_leaf = 5)
Score_GBC_CV = get_cross_val(X, y, GBC_Tree)
print(Score_GBC_CV) # fraction of correct predictions; the saved run shows about 75%, and it varies between runs because the folds are shuffled without a seed
# in order to improve this result we need to tune our algorithm
# in order to improve this result we need to tune our algorithm

0.753137254902


# Tune your algorithm to get better predictability

In [34]:
import math

# Tune max_depth for the boosted classifier. get_cross_val returns the mean
# fraction of correct predictions, so the curve is an accuracy (higher is
# better) and is labelled as such rather than as an error.
Depth = list(range(1, 5))   # materialised so the values go into the DataFrame as a plain list
Score = []
for i in Depth:
    GBC_Tree = GradientBoostingClassifier(learning_rate = 0.01,
                                      n_estimators = 1000,
                                      max_depth = i,
                                      min_samples_leaf = 5)
    Score.append(get_cross_val(X,y, GBC_Tree))

Depth_Choice_df = pd.DataFrame({'CV_Accuracy': Score ,'Max_Depth': Depth})
Depth_Choice_df.plot(x ='Max_Depth',y = 'CV_Accuracy' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x118573b90>