Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Read in Data

In [None]:

# Load the Boston housing data; the path is relative to the notebook's
# working directory.
boston = pd.read_csv("boston.csv")

# Keep the preview as the cell's last expression so Jupyter renders it as a
# table; print() emits plain text that wraps wide frames across lines.
boston.head()


      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

    black  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


Linear Regression

In [None]:

# Features are every column except the response; the response is medv.
X = boston.drop(columns="medv")
y = boston["medv"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8555,
)

# Ordinary least squares fit on the training rows (fit() returns the estimator).
lin_model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows and report the test mean squared error.
y_pred_lin = lin_model.predict(X_test)
mse_lin = mean_squared_error(y_test, y_pred_lin)
print(f"Linear Regression MSE: {mse_lin:.4f}")


Linear Regression MSE: 22.7640


Logistic Regression

In [None]:

# Binary target: 1 when medv is strictly above the sample median, else 0.
y_binary = (boston["medv"] > boston["medv"].median()).astype(int)

X = boston.drop("medv", axis=1)

# NOTE: this overwrites X_train/X_test/y_train/y_test with the classification
# split; y_train/y_test now hold 0/1 labels rather than medv values.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=8555
)

# Standardise the features before the logistic fit. On the raw columns the
# solver stopped at max_iter without converging. Putting the scaler in a
# pipeline means it is fit on the training rows only and then applied to the
# test rows at predict time, so no test information leaks into the scaling.
log_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_model.fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred_log = log_model.predict(X_test)

# Fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_test, y_pred_log)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")


Logistic Regression Accuracy: 0.8922


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Boosted Model

In [None]:

# Rebuild the regression inputs: predictors are all columns but medv,
# and the continuous medv column is the target.
y = boston["medv"]
X = boston.drop(columns=["medv"])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=8555
)

# Gradient-boosted regression trees, fit on the training rows.
boost_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=8555,
)
boost_model.fit(X_train, y_train)

# Test-set predictions and their mean squared error.
y_pred_boost = boost_model.predict(X_test)
mse_boost = mean_squared_error(y_test, y_pred_boost)
print(f"Boosting MSE: {mse_boost:.4f}")


Boosting MSE: 6.2044


Bagged Model

In [None]:

# Bagging ensemble of 100 estimators with a fixed seed.
# This cell does not re-split: it uses the X_train/X_test/y_train/y_test
# already in the kernel from an earlier cell.
bag_model = BaggingRegressor(n_estimators=100, random_state=8555).fit(X_train, y_train)

# Predict the held-out rows, then report the test mean squared error.
y_pred_bag = bag_model.predict(X_test)
mse_bag = mean_squared_error(y_test, y_pred_bag)

print(f"Bagging MSE: {mse_bag:.4f}")

Bagging MSE: 7.5906


Random Forest Model

In [None]:

# Random forest of 100 trees with a fixed seed, trained on the split that is
# already in scope (this cell does not create its own).
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=8555,
)
rf_model.fit(X_train, y_train)

# Held-out predictions followed by the test mean squared error.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf:.4f}")


Random Forest MSE: 7.9702


BART Model:

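NOTE: I was unable to get BART working in Python, so I fit the BART model in R instead; it produced a test MSE of 10.81. The R code draws its own 80/20 split with `sample()`, which does not reproduce the rows chosen by scikit-learn's `train_test_split`, so this MSE is measured on a different test set and is only roughly comparable to the Python results above.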

R CODE BELOW

library(BayesTree)

library(BART)

boston<-read.csv("boston.csv")


\# Seeded 80/20 split: sample 80% of the row indices for training; the remaining rows form the test set

set.seed(8555)

sample_index <- sample(1:nrow(boston), 0.8 * nrow(boston))

train_data <- boston[sample_index, ]

test_data <- boston[-sample_index, ]


X_train <- train_data[, !names(train_data) %in% c("medv")]

y_train <- train_data$medv

X_test <- test_data[, !names(test_data) %in% c("medv")]

y_test <- test_data$medv

\# Fit BART on the training rows; passing x.test makes the same call produce predictions for the held-out rows

bart_model <- bart(x.train = X_train,
                   y.train = y_train,
                   x.test = X_test,
                   verbose = FALSE)

\# Test-row predictions, taken from the yhat.test.mean component of the fitted object

y_pred <- bart_model$yhat.test.mean

\# Mean squared error on the held-out rows (printed by the final line)

mse <- mean((y_test - y_pred)^2)

mse