# Case 1 - Trees

### Table of Contents

1. **Importing Libraries**

2. **Random Forest**

3. **AdaBoosting**

4. **Regression Tree**

## 1. Importing Libraries

In [3]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # Set searborn as default

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LinearRegression

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import KFold

import warnings

# Set seed for reproducibility
import random
random.seed(42)

## 2. Random Forest

In [4]:
# Loading the data into numpy arrays
X = np.loadtxt('../data/case1Data_X.csv', delimiter=',')
y = np.loadtxt('../data/case1Data_y.csv', delimiter=',')

# Function to solve the OLS
def ols_solver(X, y):
    betas, res, rnk, s = lng.lstsq(X, y)
    return betas, res, rnk, s

# Outer 5-fold cross-validation
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

#### Outer loop ####
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    #### Data Preprocessing ####

    # Convert the numpy arrays to pandas DataFrames
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Using StandardScaler from scikit-learn to standardize the data
    scaler = StandardScaler()

    # Standardizing the numerical features (all columns exept the last five)
    X_train.iloc[:, :95] = scaler.fit_transform(X_train.iloc[:, :95])
    X_test.iloc[:, :95] = scaler.transform(X_test.iloc[:, :95])

    # class sklearn.impute.KNNImputer(*, missing_values=nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False, keep_empty_features=False)
    continuous_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

    # Fitting the imputer on the training data and transforming the training and test data
    X_train.iloc[:, :95] = pd.DataFrame(continuous_imputer.fit_transform(X_train.iloc[:, :95]))
    X_test.iloc[:, :95] = pd.DataFrame(continuous_imputer.transform(X_test.iloc[:, :95]))

    # Mode Imputation: Using SimpleImputer from scikit-learn to impute the missing values in the data (for categorical variables) with the most frequent value
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    # Fitting the imputer on the training data and transforming the training and test data
    X_train.iloc[:, -5:] = categorical_imputer.fit_transform(X_train.iloc[:, -5:])
    X_test.iloc[:, -5:] = categorical_imputer.transform(X_test.iloc[:, -5:])
    
    # One-hot encoding the categorical variables using get_dummies from pandas library (for the last five columns)
    X_train = pd.get_dummies(X_train, columns=X_train.columns[-5:])
    X_test = pd.get_dummies(X_test, columns=X_test.columns[-5:])

    # Align the columns of the training and test data
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Convert the pandas DataFrames back to numpy arrays
    X_train = np.asarray(X_train, dtype=np.float64)
    X_test = np.asarray(X_test, dtype=np.float64)

    #### Inner loop ####

    with warnings.catch_warnings():  # Suppress convergence warnings
        warnings.simplefilter("ignore")
        # Initializing RandomForestRegressor
        rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=42)

        # Performing cross-validation
        cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        
        # Training the model
        rf.fit(X_train, y_train)

    #### Evaluation ####

    # Predicting the target values on the test set using the optimal model
    y_pred = rf.predict(X_test)

    # Calculating the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Saving the optimal model
    if rmse == min(rmse_scores):
        best_model = rf

    # Printing the RMSE for each fold
    print(f'Fold RMSE: {rmse:.4f}')

# Final performance
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
# Standard deviation of the RMSE tells us how much the RMSE varies between the folds (i.e., how stable the model is)
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')

Fold RMSE: 69.4093
Fold RMSE: 48.6544
Fold RMSE: 37.0148
Fold RMSE: 74.0206
Fold RMSE: 64.0553
Average RMSE across outer folds: 58.6309
Standard deviation of RMSE: 13.7784


## 3. AdaBoosting

In [5]:
# Loading the data into numpy arrays
X = np.loadtxt('../data/case1Data_X.csv', delimiter=',')
y = np.loadtxt('../data/case1Data_y.csv', delimiter=',')

# Outer 5-fold cross-validation
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

#### Outer loop ####
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    #### Data Preprocessing ####

    # Convert the numpy arrays to pandas DataFrames
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Using StandardScaler from scikit-learn to standardize the data
    scaler = StandardScaler()

    # Standardizing the numerical features (all columns exept the last five)
    X_train.iloc[:, :95] = scaler.fit_transform(X_train.iloc[:, :95])
    X_test.iloc[:, :95] = scaler.transform(X_test.iloc[:, :95])

    # class sklearn.impute.KNNImputer(*, missing_values=nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False, keep_empty_features=False)
    continuous_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

    # Fitting the imputer on the training data and transforming the training and test data
    X_train.iloc[:, :95] = pd.DataFrame(continuous_imputer.fit_transform(X_train.iloc[:, :95]))
    X_test.iloc[:, :95] = pd.DataFrame(continuous_imputer.transform(X_test.iloc[:, :95]))

    # Mode Imputation: Using SimpleImputer from scikit-learn to impute the missing values in the data (for categorical variables) with the most frequent value
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    # Fitting the imputer on the training data and transforming the training and test data
    X_train.iloc[:, -5:] = categorical_imputer.fit_transform(X_train.iloc[:, -5:])
    X_test.iloc[:, -5:] = categorical_imputer.transform(X_test.iloc[:, -5:])
    
    # One-hot encoding the categorical variables using get_dummies from pandas library (for the last five columns)
    X_train = pd.get_dummies(X_train, columns=X_train.columns[-5:])
    X_test = pd.get_dummies(X_test, columns=X_test.columns[-5:])

    # Align the columns of the training and test data
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Convert the pandas DataFrames back to numpy arrays
    X_train = np.asarray(X_train, dtype=np.float64)
    X_test = np.asarray(X_test, dtype=np.float64)

    #### Inner loop ####

    with warnings.catch_warnings():  # Suppress convergence warnings
        warnings.simplefilter("ignore")
        # Define parameter distributions
        param_dist = {
            'n_estimators': [10, 50, 100, 150, 200],
            'estimator__max_depth': [1, 3, 5, 10, 15],
            'learning_rate': np.logspace(-2, 1, 50),
        }
        
        # Creating AdaBoostRegressor with DecisionTreeRegressor as base estimator
        boost = AdaBoostRegressor(estimator=DecisionTreeRegressor())
        
        # Use RandomizedSearchCV for efficiency
        boost_search = RandomizedSearchCV(
            boost,
            param_distributions=param_dist,
            n_iter=300,  # Adjust based on computational resources
            cv=5,
            verbose=1,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
            random_state=42
        )
        
        # Fitting the model
        boost_search.fit(X_train, y_train)

        # Getting best parameters and score
        best_params = boost_search.best_params_
        best_score = -boost_search.best_score_  # Converting back to positive MSE


    #### Evaluation ####

    # Predicting the target values on the test set using the optimal model
    y_pred = boost_search.predict(X_test)

    # Calculating the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Saving the optimal model
    if rmse == min(rmse_scores):
        best_model = boost_search

    # Printing the RMSE for each fold
    print(f'Fold RMSE: {rmse:.4f}')

# Final performance
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
# Standard deviation of the RMSE tells us how much the RMSE varies between the folds (i.e., how stable the model is)
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Fold RMSE: 55.7436
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Fold RMSE: 37.1720
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Fold RMSE: 27.8826
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Fold RMSE: 60.3156
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Fold RMSE: 36.7883
Average RMSE across outer folds: 43.5804
Standard deviation of RMSE: 12.3421


## 4. Regression Tree

In [6]:
# Loading the data into numpy arrays
X = np.loadtxt('../data/case1Data_X.csv', delimiter=',')
y = np.loadtxt('../data/case1Data_y.csv', delimiter=',')

# Outer 5-fold cross-validation
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

#### Outer loop ####
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    #### Data Preprocessing ####

    # Convert the numpy arrays to pandas DataFrames
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Using StandardScaler from scikit-learn to standardize the data
    scaler = StandardScaler()

    # Standardizing the numerical features (all columns exept the last five)
    X_train.iloc[:, :95] = scaler.fit_transform(X_train.iloc[:, :95])
    X_test.iloc[:, :95] = scaler.transform(X_test.iloc[:, :95])

    # class sklearn.impute.KNNImputer(*, missing_values=nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False, keep_empty_features=False)
    continuous_imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

    # Fitting the imputer on the training data and transforming the training and test data
    X_train.iloc[:, :95] = pd.DataFrame(continuous_imputer.fit_transform(X_train.iloc[:, :95]))
    X_test.iloc[:, :95] = pd.DataFrame(continuous_imputer.transform(X_test.iloc[:, :95]))

    # Mode Imputation: Using SimpleImputer from scikit-learn to impute the missing values in the data (for categorical variables) with the most frequent value
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    # Fitting the imputer on the training data and transforming the training and test data
    X_train.iloc[:, -5:] = categorical_imputer.fit_transform(X_train.iloc[:, -5:])
    X_test.iloc[:, -5:] = categorical_imputer.transform(X_test.iloc[:, -5:])
    
    # One-hot encoding the categorical variables using get_dummies from pandas library (for the last five columns)
    X_train = pd.get_dummies(X_train, columns=X_train.columns[-5:])
    X_test = pd.get_dummies(X_test, columns=X_test.columns[-5:])

    # Align the columns of the training and test data
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Convert the pandas DataFrames back to numpy arrays
    X_train = np.asarray(X_train, dtype=np.float64)
    X_test = np.asarray(X_test, dtype=np.float64)

    #### Inner loop ####

    with warnings.catch_warnings():  # Suppress convergence warnings
        warnings.simplefilter("ignore")
        # Creating a decisiontreeregressor/classifier
        dtree = DecisionTreeRegressor()

        # Fitting the tree regressor/classifier
        dtree.fit(X_train, y_train)

    #### Evaluation ####

    # Predicting the target values on the test set using the optimal model
    y_pred = dtree.predict(X_test)

    # Calculating the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Saving the optimal model
    if rmse == min(rmse_scores):
        best_model = dtree

    # Print the results of the inner loop
    print(f'Fold RMSE: {rmse:.4f}')

# Final performance
print(f'Average RMSE across outer folds: {np.mean(rmse_scores):.4f}')
# Standard deviation of the RMSE tells us how much the RMSE varies between the folds (i.e., how stable the model is)
print(f'Standard deviation of RMSE: {np.std(rmse_scores):.4f}')

Fold RMSE: 58.7276
Fold RMSE: 64.4635
Fold RMSE: 68.2987
Fold RMSE: 64.3112
Fold RMSE: 63.6584
Average RMSE across outer folds: 63.8919
Standard deviation of RMSE: 3.0544
