# Capstone 2: Modeling

## Step 1: Load the dataset and import libraries

In [29]:
import pandas as pd

from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

# Load the dataset
file_path = '../data/modeling_data/with_outliers.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,74401000000.0,-0.0713,39030000000.0,35371000000.0,0.0,21461000000.0,21461000000.0,13910000000.0,709000000.0,14494000000.0,...,0,0,1,0,0,0,0,0,0,0
1,3734148000.0,1.1737,2805625000.0,928522600.0,108330300.0,344141400.0,793926700.0,134595900.0,12148690.0,175382300.0,...,0,0,1,0,0,0,0,0,0,0
2,98375000000.0,0.0182,78138000000.0,20237000000.0,0.0,15196000000.0,17512000000.0,2725000000.0,443000000.0,2270000000.0,...,0,0,1,0,0,0,0,0,0,0
3,25526410000.0,0.0053,18202680000.0,7323734000.0,0.0,6561162000.0,6586482000.0,737252000.0,424591000.0,250218000.0,...,0,0,1,0,0,0,0,0,0,0
4,17909600000.0,0.0076,11539800000.0,6369800000.0,0.0,3474300000.0,3412400000.0,2957400000.0,302400000.0,2707700000.0,...,0,0,1,0,0,0,0,0,0,0


## Step 2: Preprocess the Data
We'll create functions to drop unnecessary columns, standardize the features, and split the dataset.

In [30]:
def preprocess_data(df, target_variable):
    # Drop unnecessary dependent variables and Symbol column
    drop_columns = ['PRICE VAR [%]', 'Alpha', 'Alpha_gt_3', 'Alpha_gt_5', 'Alpha_gt_10', 'Symbol']
    df = df.drop(columns=[col for col in drop_columns if col != target_variable])
    
    # Drop non-numeric columns
    non_numeric_columns = df.select_dtypes(include=['object']).columns
    df = df.drop(columns=non_numeric_columns)
    
    # Separate features and target variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y

## Step 3: Split Dataset

In [31]:
def split_dataset(X, y, test_size=0.2, val_size=0.1, random_state=42):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=random_state)
    val_split = val_size / (1 - test_size)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=val_split, random_state=random_state)
    
    return X_train, X_val, X_test, y_train, y_val, y_test


## Step 4: Perform PCA

In [32]:
def perform_pca(X, n_components=None, variance_ratio=None):
    if variance_ratio is not None:
        pca = PCA(n_components=variance_ratio)
    else:
        pca = PCA(n_components=n_components)
        
    X_pca = pca.fit_transform(X)
    return X_pca, pca

## Step 5: Model Data

In [33]:
def model_data(model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression'):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation and test sets
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    if problem_type == 'regression':
        # Calculate metrics for regression problems
        val_mse = mean_squared_error(y_val, y_val_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        val_rmse = val_mse ** 0.5
        test_rmse = test_mse ** 0.5
        val_mae = mean_absolute_error(y_val, y_val_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        val_r2 = r2_score(y_val, y_val_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        return model, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2
        
    elif problem_type == 'classification':
        # Calculate accuracy for classification problems
        val_metric = accuracy_score(y_val, y_val_pred)
        test_metric = accuracy_score(y_test, y_test_pred)
        
        return model, val_metric, test_metric
    else:
        raise ValueError("Invalid problem_type. Use 'regression' or 'classification'.")

# Hands on modeling!

## Prediction Models (dependent variable: Alphas over S&P500)

###  **1. All features**

In [34]:
target_variable = 'Alpha'

# Preprocess the dataset without PCA
X_scaled, y = preprocess_data(df, target_variable)

# Split the dataset
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X_scaled, y)

### Model: LinearRegression()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 424868590.8620736**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 20612.340741945674**
  - Average prediction error is approximately 20612.34 units.
- **Validation Mean Absolute Error (MAE): 690.3685437225114**
  - Average absolute difference between predicted and actual values is 690.37 units.
- **Validation R-squared (R²): 0.08903082875369961**
  - Only 8.9% of the variance in the dependent variable is predictable from the independent variables.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 117946528.76904885**
  - Lower than the validation MSE, indicating slightly better performance on the test set.
- **Test Root Mean Squared Error (RMSE): 10860.318999414743**
  - Average prediction error on the test set is 10860.32 units.
- **Test Mean Absolute Error (MAE): 931.2547378517871**
  - Average prediction error on the test set is 931.25 units, higher than the validation MAE.
- **Test R-squared (R²): -0.12495956822412402**
  - Negative value indicates the model performs worse than a horizontal line (mean of the target variable), suggesting poor generalization to unseen data.


In [35]:
# Linear Regression for regression using all features
linear_reg_model = LinearRegression(random_state=42)
model_ls, val_mse_ls, test_mse_ls, val_rmse_ls, test_rmse_ls, val_mae_ls, test_mae_ls, val_r2_ls, test_r2_ls = model_data(linear_reg_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

print(f"Model: {model_ls}")
print(f"Validation Mean Squared Error (MSE): {val_mse_ls}")
print(f"Test Mean Squared Error (MSE): {test_mse_ls} - ")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_ls}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_ls}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_ls}")
print(f"Test Mean Absolute Error (MAE): {test_mae_ls}")
print(f"Validation R-squared (R2): {val_r2_ls}")
print(f"Test R-squared (R2): {test_r2_ls}")

Model: LinearRegression()
Validation Mean Squared Error (MSE): 424868590.8620736
Test Mean Squared Error (MSE): 117946528.76904885 - 
Validation Root Mean Squared Error (RMSE): 20612.340741945674
Test Root Mean Squared Error (RMSE): 10860.318999414743
Validation Mean Absolute Error (MAE): 690.3685437225114
Test Mean Absolute Error (MAE): 931.2547378517871
Validation R-squared (R2): 0.08903082875369961
Test R-squared (R2): -0.12495956822412402


### Model: DecisionTreeRegressor()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 422634579.64744693**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 20558.0782090021**
  - Average prediction error is approximately 20558.08 units.
- **Validation Mean Absolute Error (MAE): 521.9184764343948**
  - Average absolute difference between predicted and actual values is 521.92 units.
- **Validation R-squared (R²): 0.09382081650170915**
  - Only 9.38% of the variance in the dependent variable is predictable from the independent variables.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 31725652.61656887**
  - Lower than the validation MSE, indicating better performance on the test set.
- **Test Root Mean Squared Error (RMSE): 5632.552939526523**
  - Average prediction error on the test set is 5632.55 units.
- **Test Mean Absolute Error (MAE): 372.1737044296878**
  - Average prediction error on the test set is 372.17 units, lower than the validation MAE.
- **Test R-squared (R²): 0.6974046049371354**
  - Indicates that 69.74% of the variance in the dependent variable is predictable from the independent variables on the test set, showing good generalization to unseen data.


In [36]:
# Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
model_dt, val_mse_dt, test_mse_dt, val_rmse_dt, test_rmse_dt, val_mae_dt, test_mae_dt, val_r2_dt, test_r2_dt = model_data(dt_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

# Printing formatted descriptions of each metric
print(f"Model: {model_dt}")
print(f"Validation Mean Squared Error (MSE): {val_mse_dt}")
print(f"Test Mean Squared Error (MSE): {test_mse_dt}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_dt}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_dt}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_dt}")
print(f"Test Mean Absolute Error (MAE): {test_mae_dt}")
print(f"Validation R-squared (R2): {val_r2_dt}")
print(f"Test R-squared (R2): {test_r2_dt}")

Model: DecisionTreeRegressor(random_state=42)
Validation Mean Squared Error (MSE): 422634579.64744693
Test Mean Squared Error (MSE): 31725652.61656887
Validation Root Mean Squared Error (RMSE): 20558.0782090021
Test Root Mean Squared Error (RMSE): 5632.552939526523
Validation Mean Absolute Error (MAE): 521.9184764343948
Test Mean Absolute Error (MAE): 372.1737044296878
Validation R-squared (R2): 0.09382081650170915
Test R-squared (R2): 0.6974046049371354


### Model: RandomForestRegressor()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 423518093.06988245**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 20579.555220409464**
  - Average prediction error is approximately 20579.56 units.
- **Validation Mean Absolute Error (MAE): 482.3488209959373**
  - Average absolute difference between predicted and actual values is 482.35 units.
- **Validation R-squared (R²): 0.09192645785168985**
  - Only 9.19% of the variance in the dependent variable is predictable from the independent variables.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 53276205.37256798**
  - Lower than the validation MSE, indicating better performance on the test set.
- **Test Root Mean Squared Error (RMSE): 7299.05510135168**
  - Average prediction error on the test set is 7299.06 units.
- **Test Mean Absolute Error (MAE): 484.5341503893868**
  - Average prediction error on the test set is 484.53 units, slightly higher than the validation MAE.
- **Test R-squared (R²): 0.49185806807506993**
  - Indicates that 49.19% of the variance in the dependent variable is predictable from the independent variables on the test set, showing moderate generalization to unseen data.


In [37]:
# Random Forest for regression using all features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf, val_mse_rf, test_mse_rf, val_rmse_rf, test_rmse_rf, val_mae_rf, test_mae_rf, val_r2_rf, test_r2_rf = model_data(rf_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

print(f"Model: {model_rf}")
print(f"Validation Mean Squared Error (MSE): {val_mse_rf}")
print(f"Test Mean Squared Error (MSE): {test_mse_rf}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_rf}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_rf}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_rf}")
print(f"Test Mean Absolute Error (MAE): {test_mae_rf}")
print(f"Validation R-squared (R2): {val_r2_rf}")
print(f"Test R-squared (R2): {test_r2_rf}")

Model: RandomForestRegressor(random_state=42)
Validation Mean Squared Error (MSE): 423518093.06988245
Test Mean Squared Error (MSE): 53276205.37256798
Validation Root Mean Squared Error (RMSE): 20579.555220409464
Test Root Mean Squared Error (RMSE): 7299.05510135168
Validation Mean Absolute Error (MAE): 482.3488209959373
Test Mean Absolute Error (MAE): 484.5341503893868
Validation R-squared (R2): 0.09192645785168985
Test R-squared (R2): 0.49185806807506993


### Model: GradientBoostingRegressor()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 436359227.3918143**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 20889.213182688673**
  - Average prediction error is approximately 20889.21 units.
- **Validation Mean Absolute Error (MAE): 466.10759370344084**
  - Average absolute difference between predicted and actual values is 466.11 units.
- **Validation R-squared (R²): 0.06439352709920176**
  - Only 6.44% of the variance in the dependent variable is predictable from the independent variables.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 32930608.52063153**
  - Lower than the validation MSE, indicating better performance on the test set.
- **Test Root Mean Squared Error (RMSE): 5738.519715103498**
  - Average prediction error on the test set is 5738.52 units.
- **Test Mean Absolute Error (MAE): 376.4730851737857**
  - Average prediction error on the test set is 376.47 units, lower than the validation MAE.
- **Test R-squared (R²): 0.6859118828730746**
  - Indicates that 68.59% of the variance in the dependent variable is predictable from the independent variables on the test set, showing good generalization to unseen data.


In [38]:
gbr_model = GradientBoostingRegressor(random_state=42)
model_gbr, val_mse_gbr, test_mse_gbr, val_rmse_gbr, test_rmse_gbr, val_mae_gbr, test_mae_gbr, val_r2_gbr, test_r2_gbr = model_data(gbr_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

# Printing formatted descriptions of each metric
print(f"Model: {model_gbr}")
print(f"Validation Mean Squared Error (MSE): {val_mse_gbr}")
print(f"Test Mean Squared Error (MSE): {test_mse_gbr}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_gbr}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_gbr}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_gbr}")
print(f"Test Mean Absolute Error (MAE): {test_mae_gbr}")
print(f"Validation R-squared (R2): {val_r2_gbr}")
print(f"Test R-squared (R2): {test_r2_gbr}")

Model: GradientBoostingRegressor(random_state=42)
Validation Mean Squared Error (MSE): 436359227.3918143
Test Mean Squared Error (MSE): 32930608.52063153
Validation Root Mean Squared Error (RMSE): 20889.213182688673
Test Root Mean Squared Error (RMSE): 5738.519715103498
Validation Mean Absolute Error (MAE): 466.10759370344084
Test Mean Absolute Error (MAE): 376.4730851737857
Validation R-squared (R2): 0.06439352709920176
Test R-squared (R2): 0.6859118828730746


### Model: AdaBoostRegressor(random_state=42)

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 480781268.27375156**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 21926.724978294216**
  - Average prediction error is approximately 21926.72 units.
- **Validation Mean Absolute Error (MAE): 4172.772963213562**
  - Average absolute difference between predicted and actual values is 4172.77 units.
- **Validation R-squared (R²): -0.030852651690288058**
  - Negative value indicates that the model performs worse than a horizontal line (mean of the target variable), suggesting poor fit.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 121406799.91783379**
  - Lower than the validation MSE, indicating better performance on the test set, but still high.
- **Test Root Mean Squared Error (RMSE): 11018.475389900083**
  - Average prediction error on the test set is 11018.48 units.
- **Test Mean Absolute Error (MAE): 4338.59938769091**
  - Average prediction error on the test set is 4338.60 units, slightly higher than the validation MAE.
- **Test R-squared (R²): -0.15796321129951885**
  - Negative value indicates that the model performs worse than a horizontal line (mean of the target variable), suggesting very poor generalization to unseen data.


In [39]:
# AdaBoost Regressor
ada_model = AdaBoostRegressor(random_state=42)
model_ada, val_mse_ada, test_mse_ada, val_rmse_ada, test_rmse_ada, val_mae_ada, test_mae_ada, val_r2_ada, test_r2_ada = model_data(ada_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

# Printing formatted descriptions of each metric
print(f"Model: {model_ada}")
print(f"Validation Mean Squared Error (MSE): {val_mse_ada}")
print(f"Test Mean Squared Error (MSE): {test_mse_ada}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_ada}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_ada}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_ada}")
print(f"Test Mean Absolute Error (MAE): {test_mae_ada}")
print(f"Validation R-squared (R2): {val_r2_ada}")
print(f"Test R-squared (R2): {test_r2_ada}")

Model: AdaBoostRegressor(random_state=42)
Validation Mean Squared Error (MSE): 480781268.27375156
Test Mean Squared Error (MSE): 121406799.91783379
Validation Root Mean Squared Error (RMSE): 21926.724978294216
Test Root Mean Squared Error (RMSE): 11018.475389900083
Validation Mean Absolute Error (MAE): 4172.772963213562
Test Mean Absolute Error (MAE): 4338.59938769091
Validation R-squared (R2): -0.030852651690288058
Test R-squared (R2): -0.15796321129951885


### Model: KNeighborsRegressor()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 472800689.7506276**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 21743.980540614626**
  - Average prediction error is approximately 21743.98 units.
- **Validation Mean Absolute Error (MAE): 488.05026590111294**
  - Average absolute difference between predicted and actual values is 488.05 units.
- **Validation R-squared (R²): -0.013741335015819311**
  - Negative value indicates that the model performs worse than a horizontal line (mean of the target variable), suggesting poor fit.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 103910312.58790064**
  - Lower than the validation MSE, indicating better performance on the test set, but still high.
- **Test Root Mean Squared Error (RMSE): 10193.640791586715**
  - Average prediction error on the test set is 10193.64 units.
- **Test Mean Absolute Error (MAE): 619.2002796620392**
  - Average prediction error on the test set is 619.20 units, higher than the validation MAE.
- **Test R-squared (R²): 0.00891614528300011**
  - Very low positive value indicates that only 0.89% of the variance in the dependent variable is predictable from the independent variables on the test set, showing very poor generalization to unseen data.


In [40]:
# K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()
model_knn, val_mse_knn, test_mse_knn, val_rmse_knn, test_rmse_knn, val_mae_knn, test_mae_knn, val_r2_knn, test_r2_knn = model_data(knn_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')
# Printing formatted descriptions of each metric
print(f"Model: {model_knn}")
print(f"Validation Mean Squared Error (MSE): {val_mse_knn}")
print(f"Test Mean Squared Error (MSE): {test_mse_knn}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_knn}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_knn}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_knn}")
print(f"Test Mean Absolute Error (MAE): {test_mae_knn}")
print(f"Validation R-squared (R2): {val_r2_knn}")
print(f"Test R-squared (R2): {test_r2_knn}")

Model: KNeighborsRegressor()
Validation Mean Squared Error (MSE): 472800689.7506276
Test Mean Squared Error (MSE): 103910312.58790064
Validation Root Mean Squared Error (RMSE): 21743.980540614626
Test Root Mean Squared Error (RMSE): 10193.640791586715
Validation Mean Absolute Error (MAE): 488.05026590111294
Test Mean Absolute Error (MAE): 619.2002796620392
Validation R-squared (R2): -0.013741335015819311
Test R-squared (R2): 0.00891614528300011


### Model: Ridge()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 424779490.73995525**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 20610.179299073436**
  - Average prediction error is approximately 20610.18 units.
- **Validation Mean Absolute Error (MAE): 688.009724814428**
  - Average absolute difference between predicted and actual values is 688.01 units.
- **Validation R-squared (R²): 0.08922187009248017**
  - Indicates that only 8.92% of the variance in the dependent variable is predictable from the independent variables.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 117983257.0892654**
  - Lower than the validation MSE, indicating better performance on the test set.
- **Test Root Mean Squared Error (RMSE): 10862.009808928797**
  - Average prediction error on the test set is 10862.01 units.
- **Test Mean Absolute Error (MAE): 931.8132296909181**
  - Average prediction error on the test set is 931.81 units, higher than the validation MAE.
- **Test R-squared (R²): -0.12530987845100072**
  - Negative value indicates that the model performs worse than a horizontal line (mean of the target variable), suggesting poor generalization to unseen data.


In [41]:
# Ridge Regressor
ridge_model = Ridge()
model_ridge, val_mse_ridge, test_mse_ridge, val_rmse_ridge, test_rmse_ridge, val_mae_ridge, test_mae_ridge, val_r2_ridge, test_r2_ridge = model_data(ridge_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

# Printing formatted descriptions of each metric
print(f"Model: {model_ridge}")
print(f"Validation Mean Squared Error (MSE): {val_mse_ridge}")
print(f"Test Mean Squared Error (MSE): {test_mse_ridge}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_ridge}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_ridge}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_ridge}")
print(f"Test Mean Absolute Error (MAE): {test_mae_ridge}")
print(f"Validation R-squared (R2): {val_r2_ridge}")
print(f"Test R-squared (R2): {test_r2_ridge}")

Model: Ridge()
Validation Mean Squared Error (MSE): 424779490.73995525
Test Mean Squared Error (MSE): 117983257.0892654
Validation Root Mean Squared Error (RMSE): 20610.179299073436
Test Root Mean Squared Error (RMSE): 10862.009808928797
Validation Mean Absolute Error (MAE): 688.009724814428
Test Mean Absolute Error (MAE): 931.8132296909181
Validation R-squared (R2): 0.08922187009248017
Test R-squared (R2): -0.12530987845100072


### Model: Lasso()

#### Validation Metrics:
- **Validation Mean Squared Error (MSE): 424544348.06936824**
  - Indicates significant average squared error, suggesting predictions are far off from actual values.
- **Validation Root Mean Squared Error (RMSE): 20604.4739818654**
  - Average prediction error is approximately 20604.47 units.
- **Validation Mean Absolute Error (MAE): 681.7016201854344**
  - Average absolute difference between predicted and actual values is 681.70 units.
- **Validation R-squared (R²): 0.08972604415560559**
  - Indicates that only 8.97% of the variance in the dependent variable is predictable from the independent variables.

#### Test Metrics:
- **Test Mean Squared Error (MSE): 117932980.72551265**
  - Lower than the validation MSE, indicating better performance on the test set.
- **Test Root Mean Squared Error (RMSE): 10859.695240913194**
  - Average prediction error on the test set is 10859.70 units.
- **Test Mean Absolute Error (MAE): 914.5247471184288**
  - Average prediction error on the test set is 914.52 units, higher than the validation MAE.
- **Test R-squared (R²): -0.1248303486416078**
  - Negative value indicates that the model performs worse than a horizontal line (mean of the target variable), suggesting poor generalization to unseen data.


In [42]:
# Lasso Regressor
lasso_model = Lasso()
model_lasso, val_mse_lasso, test_mse_lasso, val_rmse_lasso, test_rmse_lasso, val_mae_lasso, test_mae_lasso, val_r2_lasso, test_r2_lasso = model_data(lasso_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression')

# Printing formatted descriptions of each metric
print(f"Model: {model_lasso}")
print(f"Validation Mean Squared Error (MSE): {val_mse_lasso}")
print(f"Test Mean Squared Error (MSE): {test_mse_lasso}")
print(f"Validation Root Mean Squared Error (RMSE): {val_rmse_lasso}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse_lasso}")
print(f"Validation Mean Absolute Error (MAE): {val_mae_lasso}")
print(f"Test Mean Absolute Error (MAE): {test_mae_lasso}")
print(f"Validation R-squared (R2): {val_r2_lasso}")
print(f"Test R-squared (R2): {test_r2_lasso}")

Model: Lasso()
Validation Mean Squared Error (MSE): 424544348.06936824
Test Mean Squared Error (MSE): 117932980.72551265
Validation Root Mean Squared Error (RMSE): 20604.4739818654
Test Root Mean Squared Error (RMSE): 10859.695240913194
Validation Mean Absolute Error (MAE): 681.7016201854344
Test Mean Absolute Error (MAE): 914.5247471184288
Validation R-squared (R2): 0.08972604415560559
Test R-squared (R2): -0.1248303486416078


### Analisys

#### Model Performance Summary:
- The validation and test metrics across all models indicate varying degrees of prediction error, with all models showing room for improvement in terms of accuracy and generalization.

#### Key Observations:
1. **High Validation and Test Errors**:
   - All models have high Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) values, indicating substantial prediction errors. The RMSE values for most models are in the range of thousands, showing that predictions are far from actual values.
   - Mean Absolute Error (MAE) values are also high, indicating significant average differences between predicted and actual values.

2. **R-squared (R²) Values**:
   - Most models have low R² values on the validation set, indicating that they explain a small percentage of the variance in the dependent variable.
   - Negative R² values on the test set for Ridge, Lasso, and AdaBoost models suggest poor generalization, with the models performing worse than a simple mean prediction.

3. **Model Comparison**:
   - **DecisionTreeRegressor** and **GradientBoostingRegressor** show relatively better R² values on the test set (0.697 and 0.686 respectively), indicating better generalization compared to other models.
   - **AdaBoostRegressor** and **KNeighborsRegressor** have particularly poor performance, with negative R² values on both validation and test sets.

## 2. PCA 70% explained variance

In [44]:
# Preprocess the dataset
target_variable = 'Alpha'
X_scaled, y = preprocess_data(df, target_variable)

# Split the dataset
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X_scaled, y)

# Perform PCA to explain 70% variance
X_train_pca_70, pca_70 = perform_pca(X_train, variance_ratio=0.70)
X_val_pca_70 = pca_70.transform(X_val)
X_test_pca_70 = pca_70.transform(X_test)

# Define models to test
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet()
}

# Dictionary to store results
results_pca_70 = {}

# Evaluate each model
for name, model in models.items():
    model_fitted, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2 = model_data(
        model, X_train_pca_70, y_train, X_val_pca_70, y_val, X_test_pca_70, y_test, problem_type='regression')
    
    # Store results
    results_pca_70[name] = {
        "Validation MSE": val_mse,
        "Test MSE": test_mse,
        "Validation RMSE": val_rmse,
        "Test RMSE": test_rmse,
        "Validation MAE": val_mae,
        "Test MAE": test_mae,
        "Validation R^2": val_r2,
        "Test R^2": test_r2
    }

# Print results
for name, result in results_pca_70.items():
    print(f"Model: {name}")
    for metric, value in result.items():
        print(f"  {metric}: {value}")
    print("\n")

Model: Linear Regression
  Validation MSE: 450430666.54793316
  Test MSE: 186369689.78265843
  Validation RMSE: 21223.351915942334
  Test RMSE: 13651.728454033153
  Validation MAE: 3721.02556373154
  Test MAE: 4331.200633827988
  Validation R^2: 0.03422267535352941
  Test R^2: -0.7775713107970772


Model: Random Forest
  Validation MSE: 483456474.62453485
  Test MSE: 128353024.7671503
  Validation RMSE: 21987.64368058876
  Test RMSE: 11329.299394364609
  Validation MAE: 580.9085372180151
  Test MAE: 851.3119686621785
  Validation R^2: -0.03658861467908192
  Test R^2: -0.2242154544882593


Model: Decision Tree
  Validation MSE: 477544751.58349013
  Test MSE: 98802987.43546648
  Validation RMSE: 21852.797340008674
  Test RMSE: 9939.969186846933
  Validation MAE: 541.261524441502
  Test MAE: 629.7607221208797
  Validation R^2: -0.023913172071260647
  Test R^2: 0.0576291880340325


Model: Gradient Boosting
  Validation MSE: 472530384.10588205
  Test MSE: 101402317.93315727
  Validation RMS