In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
# machine learning
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
#Azure Usage
from azureml import Workspace



In [2]:
#Acquite Data
ws = Workspace()
trainds = ws.datasets['Boston_Train.csv']
testds  = ws.datasets['Boston_Test.csv']
trainall = trainds.to_dataframe()
#The last 450 rows of data are split off into a validation dataset
traindf = trainall.head(300)
trainvl = trainall.tail(33)
traindf

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
5,11,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,15.0
6,12,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.90,13.27,18.9
7,13,0.09378,12.5,7.87,0,0.524,5.889,39.0,5.4509,5,311,15.2,390.50,15.71,21.7
8,14,0.62976,0.0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21.0,396.90,8.26,20.4
9,15,0.63796,0.0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21.0,380.02,10.26,18.2


In [3]:
#Analyze Data Column Headers
print(traindf.columns.values)
traindf.info()

['ID' 'crim' 'zn' 'indus' 'chas' 'nox' 'rm' 'age' 'dis' 'rad' 'tax'
 'ptratio' 'black' 'lstat' 'medv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 15 columns):
ID         300 non-null int64
crim       300 non-null float64
zn         300 non-null float64
indus      300 non-null float64
chas       300 non-null int64
nox        300 non-null float64
rm         300 non-null float64
age        300 non-null float64
dis        300 non-null float64
rad        300 non-null int64
tax        300 non-null int64
ptratio    300 non-null float64
black      300 non-null float64
lstat      300 non-null float64
medv       300 non-null float64
dtypes: float64(11), int64(4)
memory usage: 35.2 KB


In [4]:
#Prepare training data
train_data = pd.get_dummies(traindf)
train_data

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
5,11,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,15.0
6,12,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.90,13.27,18.9
7,13,0.09378,12.5,7.87,0,0.524,5.889,39.0,5.4509,5,311,15.2,390.50,15.71,21.7
8,14,0.62976,0.0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21.0,396.90,8.26,20.4
9,15,0.63796,0.0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21.0,380.02,10.26,18.2


In [5]:
#Identify target/label variable.  In this case it is the 'medv' parameter.
y = targets = labels = train_data["medv"].values
#Columns that we are considering for training purpose
#columns = ["ID", "crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat", "medv"]
columns = ["ID", "crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat"]
features = train_data[list(columns)].values
features
# TODO: Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.18, random_state=1)

In [6]:
#Instantiate and Fit the Decision Tree Regressor model
regressor = DecisionTreeRegressor(max_depth=32, random_state=0)
regressor.fit(X_train, y_train)
#Scoring the Decision Tree Regressor model
regressor.score(X_test, y_test)

0.77152717185183195

In [7]:
#Instantiate and Fit the Random Forest Regressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
#Scoring the Decision Tree Regressor model
rfr.score(X_test, y_test)

0.82706521966378554

In [8]:
#The impact of the various predictors on the predicted variable
list(zip(columns, rfr.feature_importances_))

[('ID', 0.037597171153596634),
 ('crim', 0.040727357846154395),
 ('zn', 0.0011889403665965108),
 ('indus', 0.0030695115024377533),
 ('chas', 0.0013553447264079638),
 ('nox', 0.0097463548074437252),
 ('rm', 0.40745416114360722),
 ('age', 0.026661181172964576),
 ('dis', 0.037803022231686294),
 ('rad', 0.0026664375861707485),
 ('tax', 0.0065492432227689645),
 ('ptratio', 0.005251316923417842),
 ('black', 0.0093378425640004512),
 ('lstat', 0.41059211475274698)]

In [54]:
#Prepare validation dataset
trainvl = pd.get_dummies(trainvl)

In [9]:
#columns2 = ["ID", "crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat", "medv"]
columns2 = ["ID", "crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat"]
features_validate = trainvl[list(columns2)].values
features_validate

array([[  4.62000000e+02,   3.69311000e+00,   0.00000000e+00,
          1.81000000e+01,   0.00000000e+00,   7.13000000e-01,
          6.37600000e+00,   8.84000000e+01,   2.56710000e+00,
          2.40000000e+01,   6.66000000e+02,   2.02000000e+01,
          3.91430000e+02,   1.46500000e+01],
       [  4.63000000e+02,   6.65492000e+00,   0.00000000e+00,
          1.81000000e+01,   0.00000000e+00,   7.13000000e-01,
          6.31700000e+00,   8.30000000e+01,   2.73440000e+00,
          2.40000000e+01,   6.66000000e+02,   2.02000000e+01,
          3.96900000e+02,   1.39900000e+01],
       [  4.64000000e+02,   5.82115000e+00,   0.00000000e+00,
          1.81000000e+01,   0.00000000e+00,   7.13000000e-01,
          6.51300000e+00,   8.99000000e+01,   2.80160000e+00,
          2.40000000e+01,   6.66000000e+02,   2.02000000e+01,
          3.93820000e+02,   1.02900000e+01],
       [  4.65000000e+02,   7.83932000e+00,   0.00000000e+00,
          1.81000000e+01,   0.00000000e+00,   6.55000000e-0

In [10]:
pred_vals = rfr.predict(features_validate)
trainvl['Predicted'] = pred_vals
trainvl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,Predicted
300,462,3.69311,0.0,18.1,0,0.713,6.376,88.4,2.5671,24,666,20.2,391.43,14.65,17.7,21.06
301,463,6.65492,0.0,18.1,0,0.713,6.317,83.0,2.7344,24,666,20.2,396.9,13.99,19.5,20.11
302,464,5.82115,0.0,18.1,0,0.713,6.513,89.9,2.8016,24,666,20.2,393.82,10.29,20.2,24.01
303,465,7.83932,0.0,18.1,0,0.655,6.209,65.4,2.9634,24,666,20.2,396.9,13.22,21.4,18.2
304,466,3.1636,0.0,18.1,0,0.655,5.759,48.2,3.0665,24,666,20.2,334.4,14.13,19.9,20.74
305,467,3.77498,0.0,18.1,0,0.655,5.952,84.7,2.8715,24,666,20.2,22.01,17.15,19.0,14.61
306,468,4.42228,0.0,18.1,0,0.584,6.003,94.5,2.5403,24,666,20.2,331.29,21.32,19.1,15.32
307,469,15.5757,0.0,18.1,0,0.58,5.926,71.0,2.9084,24,666,20.2,368.74,18.13,19.1,14.06
308,470,13.0751,0.0,18.1,0,0.58,5.713,56.7,2.8237,24,666,20.2,396.9,14.76,20.1,18.16
309,472,4.03841,0.0,18.1,0,0.532,6.229,90.7,3.0993,24,666,20.2,395.33,12.87,19.6,20.65


In [11]:
#Prepare Test Dataset and add an empty medv column
testdf = testds.to_dataframe()
#testdf['medv'] = 0
print(testdf.columns.values)

['ID' 'crim' 'zn' 'indus' 'chas' 'nox' 'rm' 'age' 'dis' 'rad' 'tax'
 'ptratio' 'black' 'lstat']


In [12]:
#Set columns that would be tested
#columns3 = ["ID", "crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat","medv"]
columns3 = ["ID", "crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat"]
features_test = testdf[list(columns3)].values
features_test

array([[  3.00000000e+00,   2.72900000e-02,   0.00000000e+00, ...,
          1.78000000e+01,   3.92830000e+02,   4.03000000e+00],
       [  6.00000000e+00,   2.98500000e-02,   0.00000000e+00, ...,
          1.87000000e+01,   3.94120000e+02,   5.21000000e+00],
       [  8.00000000e+00,   1.44550000e-01,   1.25000000e+01, ...,
          1.52000000e+01,   3.96900000e+02,   1.91500000e+01],
       ..., 
       [  4.99000000e+02,   2.39120000e-01,   0.00000000e+00, ...,
          1.92000000e+01,   3.96900000e+02,   1.29200000e+01],
       [  5.01000000e+02,   2.24380000e-01,   0.00000000e+00, ...,
          1.92000000e+01,   3.96900000e+02,   1.43300000e+01],
       [  5.05000000e+02,   1.09590000e-01,   0.00000000e+00, ...,
          2.10000000e+01,   3.93450000e+02,   6.48000000e+00]])

In [13]:
#Run the model on the test data
test_vals = rfr.predict(features_test)
test_vals

array([ 37.8 ,  23.98,  16.83,  16.59,  19.04,  19.47,  19.51,  16.5 ,
        15.66,  18.45,  18.29,  23.26,  19.12,  15.26,  20.94,  21.02,
        21.77,  30.5 ,  18.63,  25.09,  21.01,  23.96,  21.76,  21.17,
        20.9 ,  21.26,  23.58,  22.87,  23.79,  27.42,  44.16,  44.22,
        30.74,  18.9 ,  18.84,  19.99,  20.38,  16.21,  19.72,  20.67,
        15.58,  18.91,  18.94,  16.63,  16.7 ,  13.66,  13.78,  15.26,
        19.72,  19.66,  20.02,  35.98,  47.93,  20.45,  22.29,  21.79,
        30.8 ,  44.37,  34.85,  31.91,  33.84,  43.79,  20.54,  16.91,
        19.31,  23.65,  24.25,  25.44,  16.8 ,  22.24,  31.06,  43.58,
        33.42,  34.07,  23.76,  29.17,  21.02,  21.23,  22.91,  36.7 ,
        22.81,  21.38,  42.43,  40.01,  35.27,  30.01,  40.64,  22.08,
        20.83,  32.31,  36.82,  31.13,  24.28,  44.58,  23.24,  25.34,
        35.66,  21.97,  27.03,  24.74,  33.77,  29.54,  29.18,  23.47,
        24.6 ,  19.59,  24.44,  22.18,  20.14,  24.89,  23.84,  19.42,
      

In [14]:
#Add the predicted column and delete the redundant initialized medv column
testdf['PredictedMedv'] = test_vals
#del testdf['medv']

In [15]:
testdf

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,PredictedMedv
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,37.80
1,6,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,23.98
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15,16.83
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.59
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10,19.04
5,18,0.78420,0.0,8.14,0,0.538,5.990,81.7,4.2579,4,307,21.0,386.75,14.67,19.47
6,20,0.72580,0.0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21.0,390.95,11.28,19.51
7,25,0.75026,0.0,8.14,0,0.538,5.924,94.1,4.3996,4,307,21.0,394.33,16.30,16.50
8,26,0.84054,0.0,8.14,0,0.538,5.599,85.7,4.4546,4,307,21.0,303.42,16.51,15.66
9,27,0.67191,0.0,8.14,0,0.538,5.813,90.3,4.6820,4,307,21.0,376.88,14.81,18.45
