# Case 1 - Trees

### Table of Contents

1. **Importing Libraries**

2. **Loading Data**

3. **Random Forest**

4. **AdaBoosting**

## 1. Importing Libraries

In [20]:
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # Set searborn as default

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import RandomizedSearchCV

# Set seed for reproducibility
import random
random.seed(42)

## 2. Loading Data

In [21]:
# Loading the data into numpy arrays
X_train = np.loadtxt('../data/case1Data_Xtrain.csv', delimiter=',')
X_test = np.loadtxt('../data/case1Data_Xtest.csv', delimiter=',')
y_train = np.loadtxt('../data/case1Data_ytrain.csv', delimiter=',')
y_test = np.loadtxt('../data/case1Data_ytest.csv', delimiter=',')

### Summary of the data

This should be similar to the summary in case_1_data_wrangling.ipynb.

In [22]:
# Printing the shape of the data
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

# Size of the training and test data
n_train = X_train.shape[0]
n_test = X_test.shape[0]
p = X_train.shape[1]

# Printing the size of the training and test data
print("n_train: ", n_train) # number of training samples
print("n_test: ", n_test) # number of test samples
print("p: ", p) # number of features/variables/columns/parameters

# Checking for missing values in the wrangled data
missing_values_X_train = np.isnan(X_train)
print("Number of missing values in X_train: ", np.sum(missing_values_X_train))
missing_values_X_test = np.isnan(X_test)
print("Number of missing values in X_test: ", np.sum(missing_values_X_test))
missing_values_y_train = np.isnan(y_train)
print("Number of missing values in y_train: ", np.sum(missing_values_y_train))
missing_values_y_test = np.isnan(y_test)
print("Number of missing values in y_test: ", np.sum(missing_values_y_test))

X_train:  (80, 116)
X_test:  (20, 116)
y_train:  (80,)
y_test:  (20,)
n_train:  80
n_test:  20
p:  116
Number of missing values in X_train:  0
Number of missing values in X_test:  0
Number of missing values in y_train:  0
Number of missing values in y_test:  0


# Models

## 3. Random Forest

In [None]:
# Initializing RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=42)

# Performing cross-validation
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Training the model
rf.fit(X_train, y_train)

# Making predictions
y_hat = rf.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
print(f'RMSE for random forest: {rmse:.4f}')

RMSE for random forest: 0.8210


## 4. AdaBoosting

In [None]:
# Define parameter distributions
param_dist = {
    'n_estimators': [10, 50, 100, 150, 200],
    'estimator__max_depth': [1, 3, 5, 10, 15],
    'learning_rate': np.logspace(-2, 1, 50),
}

# Create AdaBoostRegressor with DecisionTreeRegressor as base estimator
boost = AdaBoostRegressor(estimator=DecisionTreeRegressor())

# Use RandomizedSearchCV for efficiency
boost_search = RandomizedSearchCV(
    boost,
    param_distributions=param_dist,
    n_iter=300,  # Adjust based on computational resources
    cv=3,
    verbose=1,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the model
boost_search.fit(X_train, y_train)

# Get best parameters and score
best_params = boost_search.best_params_
best_score = -boost_search.best_score_  # Converting back to positive MSE

# Use the best model to make predictions
y_hat = boost_search.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
print(f'Test RMSE for AdaBoost: {rmse:.4f}')


Fitting 3 folds for each of 300 candidates, totalling 900 fits
Test RMSE for AdaBoost: 0.6350


In [19]:
### AdaBoost only using the continuous features

# Define parameter distributions
param_dist = {
    'n_estimators': [10, 50, 100, 150, 200],
    'estimator__max_depth': [1, 3, 5, 10, 15],
    'learning_rate': np.logspace(-2, 1, 50),
}

# Create AdaBoostRegressor with DecisionTreeRegressor as base estimator
boost = AdaBoostRegressor(estimator=DecisionTreeRegressor())

# Use RandomizedSearchCV for efficiency
boost_search = RandomizedSearchCV(
    boost,
    param_distributions=param_dist,
    n_iter=10,  # Adjust based on computational resources
    cv=3,
    verbose=1,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the model by only using the continuous features (first 95 columns)
boost_search.fit(X_train[:, :95], y_train)

# Get best parameters and score
best_params = boost_search.best_params_
best_score = -boost_search.best_score_  # Converting back to positive MSE

# Use the best model to make predictions
y_hat = boost_search.predict(X_test[:, :95])

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
print(f'Test RMSE for AdaBoost: {rmse:.4f}')


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Test RMSE for AdaBoost: 0.7477


## Regression Tree

In [10]:
# create a decisiontreeregressor/classifier
dtree = DecisionTreeRegressor()

# Fit the tree regressor/classifier
dtree.fit(X_train, y_train)

# Predict the target variable
y_pred = dtree.predict(X_test)

# Create a Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, max_features='sqrt')

# Fit the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Calculating the root mean squared error (RMSE)
mse = mean_squared_error(y_test, y_pred)
print(f'RMSE: {rmse}')

RMSE: 0.7436675716918414
