In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # for train test split
from sklearn.feature_extraction import DictVectorizer # for one-hot encoding
from sklearn.tree import DecisionTreeRegressor # decision tree regressor (for q1)
from sklearn.tree import export_text
#import matplotlib as mpl
#import seaborn as sb

In [74]:
df = pd.read_csv("housing.csv")

In [79]:
# 0 Preliminary tasks
# 0.1 Use only the following columns:
#     'latitude', 'longitude', 'housing_median_age', 'total_rooms',
#     'total_bedrooms', 'population', 'households', 'median_income',
#     'median_house_value', 'ocean_proximity'
df = df[['latitude', 'longitude', 'housing_median_age', 'total_rooms',
         'total_bedrooms', 'population', 'households', 'median_income',
         'median_house_value','ocean_proximity']].copy()
# 0.2 Fill NAs with 0.
# df.isna().sum()[df.isna().sum()>0] # --> 207 NAs in column 'total_bedrooms'
df['total_bedrooms'] = df['total_bedrooms'].fillna(0, inplace=False)

# 0.3 Apply the log tranform to median_house_value.
df['log_median_house_value'] = np.log1p(df['median_house_value'])

# 0.4 Do train/validation/test split with 60%/20%/20% distribution.
#     Use the train_test_split function and set the random_state parameter to 1.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    df.drop(['median_house_value', 'log_median_house_value'], axis = 1),
    df[['log_median_house_value']],
    train_size = 0.8,
    test_size = 0.2,
    random_state = 1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    train_size = 0.75,
    test_size = 0.25,
    random_state = 1)

# extract column names to use as labels later
X_names = X_train.columns.values.tolist()

# 0.5 Use DictVectorizer to turn the dataframe into matrices.
# train_dict = X_train.to_dict(orient='records')
# dv_train = DictVectorizer(sparse=False)
# dv_train.fit(train_dict)
# X_train = dv.transform(train_dict)

# valid_dict = X_valid.to_dict(orient='records')
# dv_valid = DictVectorizer(sparse=False)
# dv_valid.fit(valid_dict)
# X_valid = dv.transform(valid_dict)

# test_dict = X_test.to_dict(orient='records')
# dv_test = DictVectorizer(sparse=False)
# dv_test.fit(test_dict)
# X_test = dv.transform(test_dict)

# TO DO
# concatenate X_train, X_valid, X_test
# run DictVectorizer over everything to catch all possible category values
# disassemble resulting array into X_train, X_valid, X_test



In [106]:
print(dv_train.get_feature_names_out(), "\n",
      dv_valid.get_feature_names_out(), "\n", 
      dv_test.get_feature_names_out())
print(X_train.shape, X_valid.shape, X_test.shape)

['households' 'housing_median_age' 'latitude' 'longitude' 'median_income'
 'ocean_proximity=<1H OCEAN' 'ocean_proximity=INLAND'
 'ocean_proximity=ISLAND' 'ocean_proximity=NEAR BAY'
 'ocean_proximity=NEAR OCEAN' 'population' 'total_bedrooms' 'total_rooms'] 
 ['households' 'housing_median_age' 'latitude' 'longitude' 'median_income'
 'ocean_proximity=<1H OCEAN' 'ocean_proximity=INLAND'
 'ocean_proximity=ISLAND' 'ocean_proximity=NEAR BAY'
 'ocean_proximity=NEAR OCEAN' 'population' 'total_bedrooms' 'total_rooms'] 
 ['households' 'housing_median_age' 'latitude' 'longitude' 'median_income'
 'ocean_proximity=<1H OCEAN' 'ocean_proximity=INLAND'
 'ocean_proximity=NEAR BAY' 'ocean_proximity=NEAR OCEAN' 'population'
 'total_bedrooms' 'total_rooms']
(12384, 12) (4128, 12) (4128, 12)


In [99]:
# Question 1

# Let's train a decision tree regressor to predict the median_house_value variable.
regressor = DecisionTreeRegressor(max_depth = 1)
# Train a model with max_depth=1.
regressor.fit(X_train, 
              y_train)

# Which feature is used for splitting the data?
#print(export_text(regressor))
export_text(regressor, feature_names=dv_train.get_feature_names_out())
#print("X_train feature at index position 6 is \n", dv_train.get_feature_names_out()[6])
# dv_train.get_feature_names_out()

# ocean_proximity=INLAND
# total_rooms
# latitude
# population

ValueError: feature_names must contain 12 elements, got 13

In [92]:
dv_train.get_feature_names_out()

array(['households', 'housing_median_age', 'latitude', 'longitude',
       'median_income', 'ocean_proximity=<1H OCEAN',
       'ocean_proximity=INLAND', 'ocean_proximity=ISLAND',
       'ocean_proximity=NEAR BAY', 'ocean_proximity=NEAR OCEAN',
       'population', 'total_bedrooms', 'total_rooms'], dtype=object)

array([1., 1., 0., ..., 0., 1., 1.])

In [7]:
# Question 2

# Train a random forest model with these parameters:

# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on validation?

# 0.05
# 0.25
# 0.55
# 0.85

In [8]:
# Question 3

# Now let's experiment with the n_estimators parameter

# Try different values of this parameter from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate the model on the validation dataset.
# After which value of n_estimators does RMSE stop improving?

# 10
# 55
# 75
# 150

In [9]:
# Question 4

# Let's select the best max_depth:

# Try different values of max_depth: [10, 15, 20, 25]
# For each of these values, try different values of n_estimators from 10 till 200 (with step 10)
# Fix the random seed: random_state=1
# What's the best max_depth:

# 10
# 15
# 20
# 25

In [10]:
# Question 5

# We can extract feature importance information from tree-based models.

# At each step of the decision tree learning algorith, it finds the best split. When doint it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the imporatant features for tree-based models.

# In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

# For this homework question, we'll find the most important feature:

# Train the model with these parametes:
# n_estimators=10,
# max_depth=20,
# random_state=1,
# n_jobs=-1 (optional)
# Get the feature importance information from this model
# What's the most important feature?

# total_rooms
# median_income
# total_bedrooms
# longitude


In [None]:
# Question 6

# Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

# Install XGBoost
# Create DMatrix for train and validation
# Create a watchlist
# Train a model with these parameters for 100 rounds:
# xgb_params = {
#     'eta': 0.3, 
#     'max_depth': 6,
#     'min_child_weight': 1,
#     
#     'objective': 'reg:squarederror',
#     'nthread': 8,
#     
#     'seed': 1,
#     'verbosity': 1,
# }
# Now change eta first to 0.1 and then to 0.01

# Which eta leads to the best RMSE score on the validation dataset?

# 0.3
# 0.1
# Both gives same