## Random Forest Model to Predict Heart Disease

#### Read in our data

In [100]:
import numpy as np
import pandas as pd

In [101]:
df = pd.read_csv('heart.csv')

In [102]:
df.shape

(303, 14)

In [103]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


#### Check our baseline model accuracy

In [105]:
df.target.value_counts(normalize = True)

1    0.544554
0    0.455446
Name: target, dtype: float64

### Import packages to train_test_split and model

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

#### Create X matrix from features

In [107]:
X = df.loc[:, 'age':'thal']

#### Create Y target vector

In [108]:
y = df['target']

#### Train / test split - important to stratify y and use a random state for reproducibility

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 15, stratify = y, test_size = .5)

In [110]:
# Double check that our y_test has same baseline accuracy

y_test.value_counts(normalize=True)

1    0.546053
0    0.453947
Name: target, dtype: float64

#### Instantiate our Random Forest Model

In [111]:
rf = RandomForestClassifier(n_estimators = 25, random_state = 15)

In [112]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=None,
            oob_score=False, random_state=15, verbose=0, warm_start=False)

#### Overfitting is a common problem with tree-based algorithms - let's check our score on X_train and y_train

In [113]:
rf.score(X_train, y_train)

1.0

#### vs. our test score

In [114]:
rf.score(X_test, y_test)

0.8289473684210527

#### Generate predictions with our model

In [115]:
predictions = rf.predict(X_test)

In [116]:
y_test

143    1
160    1
39     1
105    1
182    0
48     1
214    0
180    0
132    1
172    0
77     1
198    0
114    1
127    1
6      1
197    0
267    0
34     1
250    0
242    0
90     1
150    1
78     1
113    1
93     1
158    1
72     1
194    0
259    0
184    0
      ..
199    0
260    0
135    1
231    0
284    0
283    0
101    1
169    0
247    0
40     1
0      1
157    1
261    0
153    1
115    1
43     1
128    1
202    0
37     1
108    1
45     1
187    0
147    1
241    0
268    0
88     1
183    0
57     1
210    0
145    1
Name: target, Length: 152, dtype: int64

#### Throw predictions into a dataframe with our actual y_test values

In [117]:
pred_df = pd.DataFrame([predictions, y_test]).T

pred_df.columns = ['predictions', 'actual']

In [118]:
pred_df

Unnamed: 0,predictions,actual
0,0,1
1,1,1
2,1,1
3,1,1
4,1,0
5,1,1
6,0,0
7,0,0
8,1,1
9,1,0


#### Evaluate classification model with a confusion matrix

In [119]:
from sklearn.metrics import classification_report, confusion_matrix

In [120]:
print(classification_report(pred_df['actual'], pred_df['predictions']))

              precision    recall  f1-score   support

           0       0.85      0.75      0.80        69
           1       0.81      0.89      0.85        83

   micro avg       0.83      0.83      0.83       152
   macro avg       0.83      0.82      0.83       152
weighted avg       0.83      0.83      0.83       152



In [121]:
conf_df = pd.DataFrame(confusion_matrix(pred_df['actual'], pred_df['predictions']))

conf_df.columns = ['predicted 0', 'predicted 1']

conf_df.index = ['actual 0', 'actual 1']

#### In this case we have lots more False Positives than False Negatives. That's a good thing for heart disease screening! Let's see if we can improve our model with a gridsearch. OR improve it by pushing even more FPs to FNs

In [122]:
conf_df

Unnamed: 0,predicted 0,predicted 1
actual 0,52,17
actual 1,9,74


In [24]:
rf.predict_proba(X_test)

array([[0.52, 0.48],
       [0.28, 0.72],
       [0.24, 0.76],
       [0.12, 0.88],
       [0.16, 0.84],
       [0.24, 0.76],
       [0.96, 0.04],
       [1.  , 0.  ],
       [0.12, 0.88],
       [0.44, 0.56],
       [0.24, 0.76],
       [0.88, 0.12],
       [0.08, 0.92],
       [0.28, 0.72],
       [0.24, 0.76],
       [0.52, 0.48],
       [0.36, 0.64],
       [0.52, 0.48],
       [0.92, 0.08],
       [0.68, 0.32],
       [0.24, 0.76],
       [0.4 , 0.6 ],
       [0.08, 0.92],
       [0.36, 0.64],
       [0.6 , 0.4 ],
       [0.52, 0.48],
       [0.04, 0.96],
       [0.28, 0.72],
       [0.28, 0.72],
       [0.72, 0.28],
       [0.  , 1.  ],
       [0.36, 0.64],
       [0.88, 0.12],
       [0.32, 0.68],
       [0.48, 0.52],
       [0.  , 1.  ],
       [0.36, 0.64],
       [0.36, 0.64],
       [0.24, 0.76],
       [0.16, 0.84],
       [0.36, 0.64],
       [0.92, 0.08],
       [0.88, 0.12],
       [0.72, 0.28],
       [0.48, 0.52],
       [0.4 , 0.6 ],
       [0.36, 0.64],
       [0.96,

#### Feature Importance

In [25]:
rf.feature_importances_

array([0.12920876, 0.02454464, 0.1341542 , 0.06625545, 0.07539416,
       0.01460252, 0.0262667 , 0.13451526, 0.05878402, 0.1003495 ,
       0.02706086, 0.14687549, 0.06198844])

In [26]:
pd.DataFrame(rf.feature_importances_, X.columns)

Unnamed: 0,0
age,0.129209
sex,0.024545
cp,0.134154
trestbps,0.066255
chol,0.075394
fbs,0.014603
restecg,0.026267
thalach,0.134515
exang,0.058784
oldpeak,0.100349


#### Manually moving along auc/roc. I don't mind false positives as much!

![aucroc](aucroc.png)

#### TPR = TP / TP + FN
#### FPR = FP / FP + TN

In [123]:
rf_weighted = RandomForestClassifier(n_estimators = 25, random_state = 15) 

In [124]:
rf_weighted.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=None,
            oob_score=False, random_state=15, verbose=0, warm_start=False)

In [125]:
rf_weighted.score(X_train, y_train)

1.0

In [126]:
rf_weighted.score(X_test, y_test)

0.8289473684210527

#### Generate Predictions

In [127]:
rf_weighted_pred = rf_weighted.predict(X_test)

#### Create confusion matrix

In [128]:
rf_weighted_df = pd.DataFrame([rf_weighted_pred, y_test]).T

rf_weighted_df.columns = ['predictions', 'actual']

conf_w_df = pd.DataFrame(confusion_matrix(rf_weighted_df['actual'], rf_weighted_df['predictions']))

conf_w_df.columns = ['predicted 0', 'predicted 1']

conf_w_df.index = ['actual 0', 'actual 1']

conf_w_df

Unnamed: 0,predicted 0,predicted 1
actual 0,52,17
actual 1,9,74


#### Require a threshold of only 32% probability to fall into class 1, instead of the default 50%

In [129]:
predicts = []
for item in rf_weighted.predict_proba(X_test):
    if item[0] <= .68:
        predicts.append(1)
    else:
        predicts.append(0)

#### Pretty good results! Only a small loss in accuracy for a significant decrease in false negatives

In [130]:
pd.DataFrame(confusion_matrix(y_test, predicts), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])

Unnamed: 0,predicted 0,predicted 1
actual 0,38,31
actual 1,1,82


In [131]:
(38+82) / (38 + 82 + 31 + 1)

0.7894736842105263

#### class_weight parameter

In [147]:
rf_weighted_2 = RandomForestClassifier(n_estimators = 25, random_state = 15, class_weight = {0 : 1, 1 : 10000}) 

In [148]:
rf_weighted_2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 10000},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=None, oob_score=False, random_state=15,
            verbose=0, warm_start=False)

In [149]:
rf_weighted_2.score(X_test, y_test)

0.7631578947368421

#### predict and generate confusion matrix

In [150]:
rf_weighted_pred_2 = rf_weighted_2.predict(X_test)

In [151]:
rf_weighted_df2 = pd.DataFrame([rf_weighted_pred_2, y_test]).T

rf_weighted_df2.columns = ['predictions', 'actual']

conf_w_df2 = pd.DataFrame(confusion_matrix(rf_weighted_df2['actual'], rf_weighted_df2['predictions']))

conf_w_df2.columns = ['predicted 0', 'predicted 1']

conf_w_df2.index = ['actual 0', 'actual 1']

conf_w_df2

Unnamed: 0,predicted 0,predicted 1
actual 0,50,19
actual 1,17,66


## Grid Searching!

In [152]:
from sklearn.model_selection import GridSearchCV

In [42]:
parameters = {'n_estimators' : [25, 75],
             'max_depth' : [2, 4],
             'max_leaf_nodes' : [200, 800],
             'min_samples_split' : [2,6],
             'random_state' : [15],
             'class_weight' : ["balanced", "balanced_subsample"],
              'verbose' : [1]
             }

scores = 'accuracy'

gs = GridSearchCV(RandomForestClassifier(n_jobs = 3), param_grid=parameters, scoring=scores, cv = 3)

In [43]:
gs.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.7s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out 

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent worker

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent worker

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  75 out 

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out 

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [25, 75], 'max_depth': [2, 4], 'max_leaf_nodes': [200, 800], 'min_samples_split': [2, 6], 'random_state': [15], 'class_weight': ['balanced', 'balanced_subsample'], 'verbose': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [44]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=2, max_features='auto',
            max_leaf_nodes=200, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=75, n_jobs=3, oob_score=False, random_state=15,
            verbose=1, warm_start=False)

In [45]:
gs.best_score_

0.847682119205298

In [46]:
gs.score(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished


0.9271523178807947

In [47]:
gs.score(X_test, y_test)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed:    0.0s finished


0.8223684210526315

In [48]:
rf_grid_result = RandomForestClassifier(max_depth=2, max_leaf_nodes=200, min_samples_split=2, 
                                        n_estimators=75, random_state = 15, class_weight='balanced')

In [49]:
rf_grid_result.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=2, max_features='auto',
            max_leaf_nodes=200, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=75, n_jobs=None, oob_score=False, random_state=15,
            verbose=0, warm_start=False)

In [50]:
rf_grid_result.score(X_train, y_train)

0.9271523178807947

In [51]:
rf_grid_result.score(X_test,y_test)

0.8223684210526315

In [52]:
predictions_grid = rf_grid_result.predict(X_test)

grid_pred_df = pd.DataFrame([predictions_grid, y_test]).T

grid_pred_df.columns = ['predictions', 'actual']

grid_conf_df = pd.DataFrame(confusion_matrix(grid_pred_df['actual'], grid_pred_df['predictions']))

grid_conf_df

Unnamed: 0,0,1
0,50,19
1,8,75


## Random Forest Regressor - on your favorite dataset

In [157]:
from sklearn.ensemble import RandomForestRegressor

In [158]:
kc_data = pd.read_csv('kc_house_data.csv')

In [159]:
kc_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


### Drop cols

In [160]:
kc_data = kc_data.drop(columns = ['id', 'date', 'waterfront', 'view', 'yr_renovated', 'lat', 'long'])

In [161]:
rf_reg = RandomForestRegressor(n_estimators = 30, criterion = 'mae', random_state = 15)

In [162]:
kc_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,3,7,1180,0.0,1955,98178,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,3,7,2170,400.0,1951,98125,1690,7639
2,180000.0,2,1.0,770,10000,1.0,3,6,770,0.0,1933,98028,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,5,7,1050,910.0,1965,98136,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,3,8,1680,0.0,1987,98074,1800,7503


### Select X and y and train_test_split

In [163]:
X = kc_data.loc[:, 'bedrooms' : 'sqft_lot15']

In [164]:
y = kc_data['price']

In [165]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 15)

### Dummy and fix some nulls

In [166]:
X_train = X_train.replace('?', np.nan)

X_test = X_test.replace('?', np.nan)

X_train.drop(columns = 'sqft_basement', inplace=True)

X_test.drop(columns = 'sqft_basement', inplace=True)

X_train = pd.get_dummies(X_train, columns = ['zipcode'])

X_test = pd.get_dummies(X_test, columns = ['zipcode'])

### Run model and evaluate

In [69]:
rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
           oob_score=False, random_state=15, verbose=0, warm_start=False)

In [70]:
rf_reg.score(X_train, y_train)

0.9719250983704143

In [71]:
rf_reg.score(X_test, y_test)

0.8117051709811356

In [72]:
rf_reg_pred = rf_reg.predict(X_test)

In [73]:
pd.DataFrame([rf_reg_pred, y_test]).T

Unnamed: 0,0,1
0,8.154600e+05,630000.0
1,2.309750e+05,223000.0
2,5.716967e+05,415000.0
3,2.384292e+05,384500.0
4,4.515181e+05,550000.0
5,1.244148e+06,1710000.0
6,2.836250e+05,263500.0
7,6.126583e+05,750000.0
8,4.139391e+05,375000.0
9,1.246767e+06,1140000.0


In [74]:
from sklearn.metrics import mean_absolute_error, median_absolute_error

In [75]:
median_absolute_error(y_test, rf_reg_pred)

48760.74999999997

In [76]:
mean_absolute_error(y_test, rf_reg_pred)

86961.2204382716

## Pipelines - series of transformers with an estimator at the end

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

In [167]:
ss = StandardScaler()
lasso = LassoCV(cv = 5)

steps = [('scaler', ss), ('reg', lasso)]

my_pipeline = Pipeline(steps)

In [168]:
my_pipeline.fit(X = X_train, y = y_train)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reg', LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False))])

In [169]:
my_pipeline.score(X_train,y_train)

  Xt = transform.transform(Xt)


0.7645785479793556

In [170]:
my_pipeline.score(X_test,y_test)

  Xt = transform.transform(Xt)


0.7591893619198097

In [98]:
my_pipeline.set_params(reg__eps=.1)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reg', LassoCV(alphas=None, copy_X=True, cv=5, eps=0.1, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False))])

In [99]:
my_pipeline.get_params()

{'memory': None,
 'steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('reg', LassoCV(alphas=None, copy_X=True, cv=5, eps=0.1, fit_intercept=True,
       max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=False))],
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'reg': LassoCV(alphas=None, copy_X=True, cv=5, eps=0.1, fit_intercept=True,
     max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
     positive=False, precompute='auto', random_state=None,
     selection='cyclic', tol=0.0001, verbose=False),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'reg__alphas': None,
 'reg__copy_X': True,
 'reg__cv': 5,
 'reg__eps': 0.1,
 'reg__fit_intercept': True,
 'reg__max_iter': 1000,
 'reg__n_alphas': 100,
 'reg__n_jobs': None,
 'reg__normalize': False,
 'reg__positive': False,
 'reg__pre

## Boosting with Decision Trees (AdaBoost)