In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Step 1: Load the California housing dataset straight from GitHub
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
data = pd.read_csv(url)

# Steps 2-3: keep only the '<1H OCEAN' / 'INLAND' rows, then replace every
# missing value with 0
is_kept_area = data['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])
data = data[is_kept_area].fillna(0)

# The target is modelled on the log(1 + x) scale
data['median_house_value'] = np.log1p(data['median_house_value'])

# Step 4: 60% / 20% / 20% train / validation / test split.
# First hold out 20% for test; 25% of the remaining 80% is the 20% validation part.
df_train_full, df_test = train_test_split(data, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

# Pull the target out of each split as a NumPy array ...
y_train, y_val, y_test = [
    part['median_house_value'].values for part in (df_train, df_val, df_test)
]

# ... and remove it from the feature frames so it cannot leak into the model
df_train, df_val, df_test = [
    part.drop(columns=['median_house_value']) for part in (df_train, df_val, df_test)
]

# Step 5: turn the row dicts into sparse feature matrices.
# The vectorizer is fitted on the training rows only and then reused as-is
# for the validation and test rows.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val, X_test = [
    dv.transform(part.to_dict(orient='records')) for part in (df_val, df_test)
]

# X_train / X_val / X_test and y_train / y_val / y_test are now ready for modeling


In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, export_text

# Relies on X_train, y_train and the fitted DictVectorizer `dv` produced by the
# data-preparation cell.

# A depth-1 tree makes exactly one split, which shows the feature the model
# finds most useful on its own.
# random_state is pinned (as for the random forests in this notebook) so that
# ties between equally good splits are always resolved the same way and the
# printed rules are reproducible across runs.
tree_model = DecisionTreeRegressor(max_depth=1, random_state=1)
tree_model.fit(X_train, y_train)

# Render the learned split as text, labelled with the vectorizer's feature names
tree_rules = export_text(tree_model, feature_names=list(dv.get_feature_names_out()))

print(tree_rules)


|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



### 2.

In [4]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees, fixed seed, n_jobs=-1.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score the model on the held-out validation split
y_pred = rf_model.predict(X_val)

# RMSE is the square root of the mean squared error (target is on the log1p scale)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(rmse)


0.24518772479229903


### 3.

In [6]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Lowest validation RMSE seen so far and the forest size that produced it
best_rmse = float('inf')
best_n = None

# warm_start=True keeps the trees that are already fitted, so each pass of the
# loop only trains the 10 additional trees instead of rebuilding the whole
# forest from scratch (200 tree fits in total rather than 10 + 20 + ... + 200).
# NOTE(review): with a fixed integer random_state the grown forest is expected
# to match a from-scratch fit of the same size -- confirm the printed RMSEs
# are unchanged after the first re-run.
rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, warm_start=True)

# Sweep n_estimators from 10 to 200 in steps of 10
for n in range(10, 201, 10):
    # Grow the existing forest up to n trees
    rf_model.set_params(n_estimators=n)
    rf_model.fit(X_train, y_train)

    # Evaluate on the validation split
    y_pred = rf_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    print(f'n_estimators: {n}, RMSE: {rmse}')

    # Strict '<' keeps the smallest n when two sizes give the same RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_n = n

print(f'Best n_estimators: {best_n}, Best RMSE: {best_rmse}')


n_estimators: 10, RMSE: 0.24518772479229908
n_estimators: 20, RMSE: 0.23867351078447516
n_estimators: 30, RMSE: 0.23688677870552852
n_estimators: 40, RMSE: 0.23526544363339155
n_estimators: 50, RMSE: 0.23486891959219142
n_estimators: 60, RMSE: 0.23441399445846697
n_estimators: 70, RMSE: 0.234331236384555
n_estimators: 80, RMSE: 0.23447831103515132
n_estimators: 90, RMSE: 0.2343177360969172
n_estimators: 100, RMSE: 0.23419468428037377
n_estimators: 110, RMSE: 0.23414103115617693
n_estimators: 120, RMSE: 0.23396413774464705
n_estimators: 130, RMSE: 0.23378469210996636
n_estimators: 140, RMSE: 0.23360640922635745
n_estimators: 150, RMSE: 0.2334964135185024
n_estimators: 160, RMSE: 0.2333384875091416
n_estimators: 170, RMSE: 0.23329859841692557
n_estimators: 180, RMSE: 0.2334700171311343
n_estimators: 190, RMSE: 0.23371183202257467
n_estimators: 200, RMSE: 0.23365273265963638
Best n_estimators: 170, Best RMSE: 0.23329859841692557


### 4.

In [7]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Candidate tree depths to compare
max_depths = [10, 15, 20, 25]

# Lowest mean validation RMSE seen so far and the depth that produced it
best_mean_rmse = float('inf')
best_depth = None

for depth in max_depths:
    rmses = []

    # One forest per depth, grown incrementally: warm_start=True keeps the trees
    # already fitted, so every step below only trains the 10 extra trees rather
    # than refitting the whole forest.
    # NOTE(review): with a fixed integer random_state the grown forest is
    # expected to match a from-scratch fit of the same size -- confirm the
    # printed mean RMSEs are unchanged after the first re-run.
    rf_model = RandomForestRegressor(
        n_estimators=10, max_depth=depth, random_state=1, n_jobs=-1, warm_start=True
    )

    # Sweep n_estimators from 10 to 200 in steps of 10
    for n in range(10, 201, 10):
        # Grow the existing forest up to n trees
        rf_model.set_params(n_estimators=n)
        rf_model.fit(X_train, y_train)

        # Evaluate on the validation split
        y_pred = rf_model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmses.append(rmse)

    # Average over all forest sizes so a depth is judged across the whole sweep
    mean_rmse = np.mean(rmses)
    print(f'max_depth: {depth}, Mean RMSE: {mean_rmse}')

    # Strict '<' keeps the shallowest depth when two depths tie
    if mean_rmse < best_mean_rmse:
        best_mean_rmse = mean_rmse
        best_depth = depth

print(f'Best max_depth: {best_depth}, Best Mean RMSE: {best_mean_rmse}')


max_depth: 10, Mean RMSE: 0.24545499602392812
max_depth: 15, Mean RMSE: 0.2359206336413971
max_depth: 20, Mean RMSE: 0.23515134491947065
max_depth: 25, Mean RMSE: 0.2348258660390293
Best max_depth: 25, Best Mean RMSE: 0.2348258660390293


### 5.

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 10-tree forest capped at depth 20 (same seed as the other forests);
# fit() returns the estimator, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
).fit(X_train, y_train)

# Column names from the vectorizer and the forest's importance score per column;
# both follow the column order of X_train, so they can be paired positionally.
feature_names = dv.get_feature_names_out()
feature_importances = rf_model.feature_importances_

# Lookup table: feature name -> importance score
importance_dict = {name: score for name, score in zip(feature_names, feature_importances)}

# Report only the four features of interest
for feature in ('total_rooms', 'median_income', 'total_bedrooms', 'longitude'):
    print(f'{feature}: {importance_dict[feature]}')


total_rooms: 0.02151782701191468
median_income: 0.33559170042109715
total_bedrooms: 0.015894018592137876
longitude: 0.08627637578575545


In [4]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Wrap the vectorized train/validation matrices and their targets for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets whose RMSE is reported while boosting
watchlist = [(dtrain, 'train'), (dval, 'val')]


def _train_and_score(params):
    """Train a booster with `params` and score it on the validation set.

    Trains for 100 boosting rounds on `dtrain`, logging the watchlist metrics
    every 5 rounds, then predicts on `dval`.

    Parameters
    ----------
    params : dict
        XGBoost training parameters passed to `xgb.train`.

    Returns
    -------
    tuple
        (trained booster, validation predictions, validation RMSE).
    """
    booster = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=5)
    val_pred = booster.predict(dval)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    return booster, val_pred, val_rmse


# Model with eta=0.3
xgb_params_03 = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model_03, y_pred_03, rmse_03 = _train_and_score(xgb_params_03)
print('RMSE for eta=0.3:', rmse_03)

# Model with eta=0.1: identical settings, only the learning rate differs
xgb_params_01 = xgb_params_03.copy()
xgb_params_01['eta'] = 0.1

model_01, y_pred_01, rmse_01 = _train_and_score(xgb_params_01)
print('RMSE for eta=0.1:', rmse_01)

# Report which learning rate gives the lower validation RMSE
if rmse_03 < rmse_01:
    print('0.3 leads to the best RMSE')
elif rmse_01 < rmse_03:
    print('0.1 leads to the best RMSE')
else:
    print('Both give equal value')

    
    

[0]	train-rmse:8.07362	val-rmse:8.07348
[5]	train-rmse:1.38983	val-rmse:1.38852
[10]	train-rmse:0.33195	val-rmse:0.34802
[15]	train-rmse:0.22197	val-rmse:0.25614
[20]	train-rmse:0.20036	val-rmse:0.24508
[25]	train-rmse:0.18991	val-rmse:0.24021
[30]	train-rmse:0.18204	val-rmse:0.23833
[35]	train-rmse:0.17091	val-rmse:0.23452
[40]	train-rmse:0.16422	val-rmse:0.23379
[45]	train-rmse:0.15756	val-rmse:0.23302
[50]	train-rmse:0.15210	val-rmse:0.23262
[55]	train-rmse:0.14633	val-rmse:0.23167
[60]	train-rmse:0.14218	val-rmse:0.23160
[65]	train-rmse:0.13898	val-rmse:0.23169
[70]	train-rmse:0.13471	val-rmse:0.23108
[75]	train-rmse:0.13072	val-rmse:0.23036
[80]	train-rmse:0.12835	val-rmse:0.23045
[85]	train-rmse:0.12534	val-rmse:0.23010
[90]	train-rmse:0.12174	val-rmse:0.22957
[95]	train-rmse:0.11894	val-rmse:0.22906
[99]	train-rmse:0.11656	val-rmse:0.22897
RMSE for eta=0.3: 0.22897404244864047
[0]	train-rmse:10.37456	val-rmse:10.37545
[5]	train-rmse:6.13433	val-rmse:6.13236
[10]	train-rmse:3.632