In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import numpy as np

# Step 1: Load the data
data = pd.read_csv('housing.csv')

# Step 2: Filter the data
data = data[data['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

# Step 3: Handle missing values and apply log transform
data = data.fillna(0)
data['median_house_value'] = np.log1p(data['median_house_value'])

# Step 4: Split the data
df_train, df_test = train_test_split(data, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=1)

# Step 5: Convert dataframes to matrices
train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
y_train = df_train['median_house_value']

# Step 6: Initialize and train the model
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

# Step 7: Investigate the trained model
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))


|--- median_house_value <= 11.95
|   |--- value: [11.49]
|--- median_house_value >  11.95
|   |--- value: [12.41]

