In [28]:
# Libraries
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import math

import xgboost as xgb

# Homework

The goal of this homework is to create a regression model for predicting housing prices (column 'median_house_value').

In this homework we'll again use the California Housing Prices dataset - the same one we used in homework 2 and 3.

You can take it from Kaggle or download using wget link mentioned below:

wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

Note: sometimes your answer doesn't match one of the options exactly. That's fine. Select the option that's closest to your solution.

# Loading the data

Use only the following columns:

- 'latitude',

- 'longitude',

- 'housing_median_age',

- 'total_rooms',

- 'total_bedrooms',

- 'population',

- 'households',

- 'median_income',

- 'median_house_value',

- 'ocean_proximity'

- Fill NAs with 0.

- Apply the log tranform to median_house_value.

- Do train/validation/test split with 60%/20%/20% distribution.

- Use the train_test_split function and set the random_state parameter to 1.

- Use DictVectorizer to turn the dataframe into matrices.

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

In [3]:
!wget $data

--2022-10-17 15:00:20--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv.2’


2022-10-17 15:00:24 (577 KB/s) - ‘housing.csv.2’ saved [1423529/1423529]



In [4]:
df = pd.read_csv(data)

In [5]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
df['ocean_proximity'] = df['ocean_proximity'].str.lower().str.replace(' ', '_')

In [7]:
# Apply log1p transformation to median_house_value

df['median_house_value'] = np.log1p(df['median_house_value'])
df['median_house_value']

0        13.022766
1        12.789687
2        12.771673
3        12.740520
4        12.743154
           ...    
20635    11.265758
20636    11.252872
20637    11.432810
20638    11.346883
20639    11.400887
Name: median_house_value, Length: 20640, dtype: float64

In [8]:
# Fill NAs with zero
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [9]:
df = df.fillna(0)

In [10]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [11]:
# Do train/validation/test split with 60%/20%/20% distribution.

# Splitting the data into train/val/test dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Resetting the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [12]:
# we make it our y vector
y_train = df_train['median_house_value'].values
y_val = df_val['median_house_value'].values
y_test = df_test['median_house_value'].values

In [13]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [14]:
# Use DictVectorizer to turn the dataframe into matrices.
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

# Question 1

Let's train a decision tree regressor to predict the median_house_value variable.

Train a model with max_depth=1.

Which feature is used for splitting the data?

- ocean_proximity=INLAND
- total_rooms
- latitude
- population

In [15]:
# Decision tree
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=1)

In [16]:
# Looking the rules of the tree
print(export_text(dt, feature_names=dv.get_feature_names()))

|--- ocean_proximity=inland <= 0.50
|   |--- value: [12.31]
|--- ocean_proximity=inland >  0.50
|   |--- value: [11.61]





# Question 2

Train a random forest model with these parameters:

- n_estimators=10
- random_state=1
- n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on validation?

- 0.05
- 0.25
- 0.55
- 0.85

In [17]:
# RandomForest Model
rf = RandomForestRegressor(n_estimators = 10, random_state = 1, n_jobs = -1)
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=1)

In [18]:
y_pred = rf.predict(X_val)

In [19]:
mse = mean_squared_error(y_val, y_pred)

In [20]:
rmse = round(math.sqrt(mse), 2)
rmse

0.25

# Question 3

Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.

Set random_state to 1.

Evaluate the model on the validation dataset.

After which value of n_estimators does RMSE stop improving?

- 10
- 50
- 70
- 150

In [21]:
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = round(math.sqrt(mse), 2)
    scores.append((n, rmse))

In [22]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'rmse'])
df_scores

Unnamed: 0,n_estimators,rmse
0,10,0.25
1,20,0.24
2,30,0.24
3,40,0.23
4,50,0.23
5,60,0.23
6,70,0.23
7,80,0.23
8,90,0.23
9,100,0.23


n_estimators 50

# Question 4

Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]

For each of these values, try different values of n_estimators from 10 till 200 (with step 10)

Fix the random seed: random_state=1

What's the best max_depth:

- 10
- 15
- 20
- 25

In [23]:
scores = []

for d in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1)
        rf.fit(X_train, y_train)
    
        y_pred = rf.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        rmse = round(math.sqrt(mse), 2)
        scores.append((d, n, rmse))

In [29]:
df_scores = pd.DataFrame(scores, columns=['max_depth','n_estimators', 'rmse'])


In [30]:
pd.set_option('display.max_rows', None)


In [31]:
df_scores

Unnamed: 0,max_depth,n_estimators,rmse
0,10,10,0.25
1,10,20,0.25
2,10,30,0.25
3,10,40,0.25
4,10,50,0.25
5,10,60,0.25
6,10,70,0.25
7,10,80,0.25
8,10,90,0.25
9,10,100,0.25


The best max_depth is 15

# Question 5

We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorith, it finds the best split. When doint it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the imporatant features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parametes:

- n_estimators=10,
- max_depth=20,
- random_state=1,
- n_jobs=-1 (optional)

Get the feature importance information from this model

What's the most important feature?

- total_rooms
- median_income
- total_bedrooms
- longitude

In [32]:
# Using randomforest
rf = RandomForestRegressor(n_estimators=10, max_depth=20, n_jobs=-1, random_state=1)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=20, n_estimators=10, n_jobs=-1, random_state=1)

In [36]:
dv.get_feature_names()



['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1h_ocean',
 'ocean_proximity=inland',
 'ocean_proximity=island',
 'ocean_proximity=near_bay',
 'ocean_proximity=near_ocean',
 'population',
 'total_bedrooms',
 'total_rooms']

In [37]:
# let's create a dictionary of features and their importance values
feat_dict= {}
for col, val in sorted(zip(dv.get_feature_names(), rf.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val

In [38]:
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

In [39]:
feat_df

Unnamed: 0,Feature,Importance
0,median_income,0.363224
1,ocean_proximity=inland,0.310901
2,latitude,0.101333
3,longitude,0.096341
4,housing_median_age,0.033197
5,population,0.030999
6,total_rooms,0.020465
7,total_bedrooms,0.019373
8,households,0.016141
9,ocean_proximity=near_ocean,0.004452


median_income is the most important information

# Question 6

Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

Install XGBoost

Create DMatrix for train and validation

Create a watchlist

Train a model with these parameters for 100 rounds:

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

- 0.3
- 0.1
- Both gives same

In [44]:
features = dv.get_feature_names()
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)


In [45]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [46]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [47]:
pred = model.predict(dval)

In [49]:
mse = mean_squared_error(y_val, pred)
rmse = round(math.sqrt(mse), 2)
rmse

0.23

In [50]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)
pred = model.predict(dval)

mse = mean_squared_error(y_val, pred)
rmse = round(math.sqrt(mse), 2)
rmse

0.23

Both gives same