# Capstone 2: Modeling

1. Functions to choose best models
2. model testing and selection
3. hyperparameter tunning

# 1. Create the functions to choose the model

## Step 1: Load the dataset and import libraries

In [28]:
import pandas as pd

from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
# Models
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

# Load the dataset
outliers = '../data/modeling_data/with_outliers.csv'

df = pd.read_csv(outliers)

df.head()

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,74401000000.0,-0.0713,39030000000.0,35371000000.0,0.0,21461000000.0,21461000000.0,13910000000.0,709000000.0,14494000000.0,...,0,0,1,0,0,0,0,0,0,0
1,3734148000.0,1.1737,2805625000.0,928522600.0,108330300.0,344141400.0,793926700.0,134595900.0,12148690.0,175382300.0,...,0,0,1,0,0,0,0,0,0,0
2,98375000000.0,0.0182,78138000000.0,20237000000.0,0.0,15196000000.0,17512000000.0,2725000000.0,443000000.0,2270000000.0,...,0,0,1,0,0,0,0,0,0,0
3,25526410000.0,0.0053,18202680000.0,7323734000.0,0.0,6561162000.0,6586482000.0,737252000.0,424591000.0,250218000.0,...,0,0,1,0,0,0,0,0,0,0
4,17909600000.0,0.0076,11539800000.0,6369800000.0,0.0,3474300000.0,3412400000.0,2957400000.0,302400000.0,2707700000.0,...,0,0,1,0,0,0,0,0,0,0


## Step 2: Preprocess the Data
We'll create functions to drop unnecessary columns, standardize the features, and split the dataset.

In [29]:
def preprocess_data(df, target_variable):
    # Drop unnecessary dependent variables and Symbol column
    drop_columns = ['PRICE VAR [%]', 'Alpha', 'Alpha_gt_3', 'Alpha_gt_5', 'Alpha_gt_10', 'Symbol']
    df = df.drop(columns=[col for col in drop_columns if col != target_variable])
    
    # Drop non-numeric columns
    non_numeric_columns = df.select_dtypes(include=['object']).columns
    df = df.drop(columns=non_numeric_columns)
    
    # Separate features and target variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y

## Step 3: Split Dataset

In [30]:
def split_dataset(X, y, test_size=0.2, val_size=0.1, random_state=42):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=random_state)
    val_split = val_size / (1 - test_size)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=val_split, random_state=random_state)
    
    return X_train, X_val, X_test, y_train, y_val, y_test


## Step 4: Perform PCA

In [31]:
def perform_pca(X, n_components=None, variance_ratio=None):
    if variance_ratio is not None:
        pca = PCA(n_components=variance_ratio)
    else:
        pca = PCA(n_components=n_components)
        
    X_pca = pca.fit_transform(X)
    return X_pca, pca

## Step 5: Model Data

In [32]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def model_data(model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type='regression'):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation and test sets
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    if problem_type == 'regression':
        # Calculate metrics for regression problems
        val_mse = mean_squared_error(y_val, y_val_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        val_rmse = val_mse ** 0.5
        test_rmse = test_mse ** 0.5
        val_mae = mean_absolute_error(y_val, y_val_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        val_r2 = r2_score(y_val, y_val_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        return model, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2
        
    elif problem_type == 'classification':
        # Calculate metrics for classification problems
        val_accuracy = accuracy_score(y_val, y_val_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        val_precision = precision_score(y_val, y_val_pred)
        test_precision = precision_score(y_test, y_test_pred)
        val_recall = recall_score(y_val, y_val_pred)
        test_recall = recall_score(y_test, y_test_pred)
        val_f1 = f1_score(y_val, y_val_pred)
        test_f1 = f1_score(y_test, y_test_pred)
        val_auc = roc_auc_score(y_val, y_val_pred)
        test_auc = roc_auc_score(y_test, y_test_pred)
        
        return model, val_accuracy, test_accuracy, val_precision, test_precision, val_recall, test_recall, val_f1, test_f1, val_auc, test_auc
    else:
        raise ValueError("Invalid problem_type. Use 'regression' or 'classification'.")


# 2. Model testting and selection

Hands on modeling!

## Prediction Models (dependent variable: Alphas over S&P500)

###  **1.1 Modeling with all features including outliers**

In [33]:
target_variable = 'Alpha_gt_3'
X_scaled, y = preprocess_data(df, target_variable)

# Split the dataset
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X_scaled, y)

In [34]:
# Define models to test with their corresponding problem types
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000, random_state=42), "classification"),  # Increased max_iter
    "Random Forest Classifier": (RandomForestClassifier(n_estimators=100, random_state=42), "classification"),
    "Decision Tree Classifier": (DecisionTreeClassifier(random_state=42), "classification"),
    "Gradient Boosting Classifier": (GradientBoostingClassifier(random_state=42), "classification"),
    "AdaBoost Classifier": (AdaBoostClassifier(random_state=42), "classification"),
    "Support Vector Classifier": (SVC(), "classification"),
    "K-Nearest Neighbors Classifier": (KNeighborsClassifier(), "classification"),
    "Ridge Classifier": (RidgeClassifier(), "classification"),
    "Random Forest Regressor": (RandomForestRegressor(n_estimators=100, random_state=42), "regression"),
    "Decision Tree Regressor": (DecisionTreeRegressor(random_state=42), "regression"),
    "Gradient Boosting Regressor": (GradientBoostingRegressor(random_state=42), "regression"),
    "AdaBoost Regressor": (AdaBoostRegressor(random_state=42), "regression"),
}

# Dictionary to store results
results = {}

# Evaluate each model
for name, (model, problem_type) in models.items():
    if problem_type == "classification":
        model_fitted, val_accuracy, test_accuracy, val_precision, test_precision, val_recall, test_recall, val_f1, test_f1, val_auc, test_auc = model_data(
            model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
        
        # Store classification results
        results[name] = {
            "Validation Accuracy": val_accuracy,
            "Test Accuracy": test_accuracy,
            "Validation Precision": val_precision,
            "Test Precision": test_precision,
            "Validation Recall": val_recall,
            "Test Recall": test_recall,
            "Validation F1 Score": val_f1,
            "Test F1 Score": test_f1,
            "Validation AUC": val_auc,
            "Test AUC": test_auc
        }
    elif problem_type == "regression":
        model_fitted, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2 = model_data(
            model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
        
        # Store regression results
        results[name] = {
            "Validation MSE": val_mse,
            "Test MSE": test_mse,
            "Validation RMSE": val_rmse,
            "Test RMSE": test_rmse,
            "Validation MAE": val_mae,
            "Test MAE": test_mae,
            "Validation R^2": val_r2,
            "Test R^2": test_r2
        }

# Print results
for name, result in results.items():
    print(f"Model: {name}")
    for metric, value in result.items():
        print(f"  {metric}: {value}")
    print("\n")




Model: Logistic Regression
  Validation Accuracy: 0.6120600414078675
  Test Accuracy: 0.5670289855072463
  Validation Precision: 0.4720670391061452
  Test Precision: 0.3888888888888889
  Validation Recall: 0.11426639621365788
  Test Recall: 0.09251101321585903
  Validation F1 Score: 0.1839956450734894
  Test F1 Score: 0.1494661921708185
  Validation AUC: 0.517510556597395
  Test AUC: 0.49548627583869875


Model: Random Forest Classifier
  Validation Accuracy: 0.6493271221532091
  Test Accuracy: 0.6485507246376812
  Validation Precision: 0.5895953757225434
  Test Precision: 0.6571428571428571
  Validation Recall: 0.27586206896551724
  Test Recall: 0.3039647577092511
  Validation F1 Score: 0.375863657300783
  Test F1 Score: 0.41566265060240964
  Validation AUC: 0.5783922504156727
  Test AUC: 0.5965977634700101


Model: Decision Tree Classifier
  Validation Accuracy: 0.5913561076604554
  Test Accuracy: 0.5597826086956522
  Validation Precision: 0.46630727762803237
  Test Precision: 0.4636

### Classification Model Performance Comparison

In this analysis, we compare the performance of different classification models using the `alpha_gt_3` target variable. The models evaluated include Logistic Regression, Random Forest, Decision Tree, Gradient Boosting, AdaBoost, Support Vector Classifier, K-Nearest Neighbors, and Ridge Classifier.

#### Summary of Key Metrics

1. **Accuracy:** Reflects the proportion of correct predictions among the total predictions. 
   - **Highest Accuracy:** **Gradient Boosting Classifier** 
     - Validation Accuracy: **66.41%**
     - Test Accuracy: **63.95%**

2. **Precision:** Measures the accuracy of positive predictions (i.e., the proportion of true positives among the predicted positives).
   - **Highest Precision:** **Support Vector Classifier**
     - Validation Precision: **75.56%**
     - Test Precision: **80.00%**

3. **Recall:** Indicates how many of the actual positives were correctly identified (i.e., true positive rate).
   - **Highest Recall:** **Decision Tree Classifier**
     - Validation Recall: **46.79%**
     - Test Recall: **44.93%**

4. **F1 Score:** The harmonic mean of Precision and Recall, providing a balance between the two.
   - **Highest F1 Score:** **Random Forest Classifier**
     - Validation F1 Score: **37.59%**
     - Test F1 Score: **41.57%**

5. **AUC (Area Under the Curve):** Measures the ability of the classifier to distinguish between classes.
   - **Highest AUC:** **Random Forest Classifier**
     - Validation AUC: **57.84%**
     - Test AUC: **59.66%**

#### Observations

- **Random Forest Classifier** generally performed well across most metrics, particularly in terms of F1 Score and AUC, which are critical for imbalanced datasets.
- **Gradient Boosting Classifier** also showed strong overall performance, especially in Accuracy.
- **Support Vector Classifier** achieved high Precision but had very low Recall, indicating that it was highly conservative in predicting the positive class.
- **K-Nearest Neighbors** and **Decision Tree Classifier** provided a balance between Precision and Recall but did not excel in any specific area.

#### Recommendations

- **Model Selection:** Based on these results, **Random Forest Classifier** and **Gradient Boosting Classifier** are strong candidates for further tuning and potential deployment, as they provide a good balance of accuracy and robustness across different metrics.
- **Hyperparameter Tuning:** Consider performing hyperparameter tuning on Random Forest and Gradient Boosting to potentially improve their performance further.


###  **1.2 Modeling with all features dropping outliers**

In [35]:
drop_outliers = '../data/modeling_data/drop_outliers.csv'
df_drop = pd.read_csv(drop_outliers)

target_variable = 'Alpha_gt_3'
X_scaled, y = preprocess_data(df_drop, target_variable)

# Split the dataset
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X_scaled, y)

In [36]:
# Define models to test with their corresponding problem types
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000, random_state=42), "classification"),  # Increased max_iter
    "Random Forest Classifier": (RandomForestClassifier(n_estimators=100, random_state=42), "classification"),
    "Decision Tree Classifier": (DecisionTreeClassifier(random_state=42), "classification"),
    "Gradient Boosting Classifier": (GradientBoostingClassifier(random_state=42), "classification"),
    "AdaBoost Classifier": (AdaBoostClassifier(random_state=42), "classification"),
    "Support Vector Classifier": (SVC(), "classification"),
    "K-Nearest Neighbors Classifier": (KNeighborsClassifier(), "classification"),
    "Ridge Classifier": (RidgeClassifier(), "classification"),
    "Random Forest Regressor": (RandomForestRegressor(n_estimators=100, random_state=42), "regression"),
    "Decision Tree Regressor": (DecisionTreeRegressor(random_state=42), "regression"),
    "Gradient Boosting Regressor": (GradientBoostingRegressor(random_state=42), "regression"),
    "AdaBoost Regressor": (AdaBoostRegressor(random_state=42), "regression"),
}

# Dictionary to store results
results = {}

# Evaluate each model
for name, (model, problem_type) in models.items():
    if problem_type == "classification":
        model_fitted, val_accuracy, test_accuracy, val_precision, test_precision, val_recall, test_recall, val_f1, test_f1, val_auc, test_auc = model_data(
            model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
        
        # Store classification results
        results[name] = {
            "Validation Accuracy": val_accuracy,
            "Test Accuracy": test_accuracy,
            "Validation Precision": val_precision,
            "Test Precision": test_precision,
            "Validation Recall": val_recall,
            "Test Recall": test_recall,
            "Validation F1 Score": val_f1,
            "Test F1 Score": test_f1,
            "Validation AUC": val_auc,
            "Test AUC": test_auc
        }
    elif problem_type == "regression":
        model_fitted, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2 = model_data(
            model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
        
        # Store regression results
        results[name] = {
            "Validation MSE": val_mse,
            "Test MSE": test_mse,
            "Validation RMSE": val_rmse,
            "Test RMSE": test_rmse,
            "Validation MAE": val_mae,
            "Test MAE": test_mae,
            "Validation R^2": val_r2,
            "Test R^2": test_r2
        }

# Print results
for name, result in results.items():
    print(f"Model: {name}")
    for metric, value in result.items():
        print(f"  {metric}: {value}")
    print("\n")



Model: Logistic Regression
  Validation Accuracy: 0.6120600414078675
  Test Accuracy: 0.5670289855072463
  Validation Precision: 0.4720670391061452
  Test Precision: 0.3888888888888889
  Validation Recall: 0.11426639621365788
  Test Recall: 0.09251101321585903
  Validation F1 Score: 0.1839956450734894
  Test F1 Score: 0.1494661921708185
  Validation AUC: 0.517510556597395
  Test AUC: 0.49548627583869875


Model: Random Forest Classifier
  Validation Accuracy: 0.6493271221532091
  Test Accuracy: 0.6485507246376812
  Validation Precision: 0.5895953757225434
  Test Precision: 0.6571428571428571
  Validation Recall: 0.27586206896551724
  Test Recall: 0.3039647577092511
  Validation F1 Score: 0.375863657300783
  Test F1 Score: 0.41566265060240964
  Validation AUC: 0.5783922504156727
  Test AUC: 0.5965977634700101


Model: Decision Tree Classifier
  Validation Accuracy: 0.5913561076604554
  Test Accuracy: 0.5597826086956522
  Validation Precision: 0.46630727762803237
  Test Precision: 0.4636

### Classification Model Performance Comparison After Removing Outliers

In this analysis, we compare the performance of different classification models using the `alpha_gt_3` target variable. The dataset was preprocessed to remove outliers using the 1st and 99th quantiles (`quantile_1=0.01`, `quantile_3=0.99`). The models evaluated include Logistic Regression, Random Forest, Decision Tree, Gradient Boosting, AdaBoost, Support Vector Classifier, K-Nearest Neighbors, and Ridge Classifier.

#### Observations

- **Random Forest Classifier** continues to perform well across most metrics, particularly in terms of F1 Score and AUC, which are critical for imbalanced datasets.
- **Gradient Boosting Classifier** also shows strong overall performance, especially in Accuracy.
- **Support Vector Classifier** achieved high Precision but had very low Recall, indicating that it was highly conservative in predicting the positive class.
- **K-Nearest Neighbors** and **Decision Tree Classifier** provided a balance between Precision and Recall but did not excel in any specific area.

#### Recommendations

- **Model Selection:** Based on these results, **Random Forest Classifier** and **Gradient Boosting Classifier** remain strong candidates for further tuning and potential deployment, as they provide a good balance of accuracy and robustness across different metrics.
- **Hyperparameter Tuning:** Consider performing hyperparameter tuning on Random Forest and Gradient Boosting to potentially improve their performance further.

###  **1.3 Modeling with all features capping outliers**

In [37]:
cap_outliers = '../data/modeling_data/cap_outliers.csv'
df_cap = pd.read_csv(cap_outliers)

target_variable = 'Alpha_gt_3'
X_scaled, y = preprocess_data(df_cap, target_variable)

# Split the dataset
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X_scaled, y)

In [38]:
# Define models to test with their corresponding problem types
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000, random_state=42), "classification"),  # Increased max_iter
    "Random Forest Classifier": (RandomForestClassifier(n_estimators=100, random_state=42), "classification"),
    "Decision Tree Classifier": (DecisionTreeClassifier(random_state=42), "classification"),
    "Gradient Boosting Classifier": (GradientBoostingClassifier(random_state=42), "classification"),
    "AdaBoost Classifier": (AdaBoostClassifier(random_state=42), "classification"),
    "Support Vector Classifier": (SVC(), "classification"),
    "K-Nearest Neighbors Classifier": (KNeighborsClassifier(), "classification"),
    "Ridge Classifier": (RidgeClassifier(), "classification"),
    "Random Forest Regressor": (RandomForestRegressor(n_estimators=100, random_state=42), "regression"),
    "Decision Tree Regressor": (DecisionTreeRegressor(random_state=42), "regression"),
    "Gradient Boosting Regressor": (GradientBoostingRegressor(random_state=42), "regression"),
    "AdaBoost Regressor": (AdaBoostRegressor(random_state=42), "regression"),
}

# Dictionary to store results
results = {}

# Evaluate each model
for name, (model, problem_type) in models.items():
    if problem_type == "classification":
        model_fitted, val_accuracy, test_accuracy, val_precision, test_precision, val_recall, test_recall, val_f1, test_f1, val_auc, test_auc = model_data(
            model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
        
        # Store classification results
        results[name] = {
            "Validation Accuracy": val_accuracy,
            "Test Accuracy": test_accuracy,
            "Validation Precision": val_precision,
            "Test Precision": test_precision,
            "Validation Recall": val_recall,
            "Test Recall": test_recall,
            "Validation F1 Score": val_f1,
            "Test F1 Score": test_f1,
            "Validation AUC": val_auc,
            "Test AUC": test_auc
        }
    elif problem_type == "regression":
        model_fitted, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2 = model_data(
            model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
        
        # Store regression results
        results[name] = {
            "Validation MSE": val_mse,
            "Test MSE": test_mse,
            "Validation RMSE": val_rmse,
            "Test RMSE": test_rmse,
            "Validation MAE": val_mae,
            "Test MAE": test_mae,
            "Validation R^2": val_r2,
            "Test R^2": test_r2
        }

# Print results
for name, result in results.items():
    print(f"Model: {name}")
    for metric, value in result.items():
        print(f"  {metric}: {value}")
    print("\n")



Model: Logistic Regression
  Validation Accuracy: 0.6120600414078675
  Test Accuracy: 0.5670289855072463
  Validation Precision: 0.4720670391061452
  Test Precision: 0.3888888888888889
  Validation Recall: 0.11426639621365788
  Test Recall: 0.09251101321585903
  Validation F1 Score: 0.1839956450734894
  Test F1 Score: 0.1494661921708185
  Validation AUC: 0.517510556597395
  Test AUC: 0.49548627583869875


Model: Random Forest Classifier
  Validation Accuracy: 0.6493271221532091
  Test Accuracy: 0.6485507246376812
  Validation Precision: 0.5895953757225434
  Test Precision: 0.6571428571428571
  Validation Recall: 0.27586206896551724
  Test Recall: 0.3039647577092511
  Validation F1 Score: 0.375863657300783
  Test F1 Score: 0.41566265060240964
  Validation AUC: 0.5783922504156727
  Test AUC: 0.5965977634700101


Model: Decision Tree Classifier
  Validation Accuracy: 0.5913561076604554
  Test Accuracy: 0.5597826086956522
  Validation Precision: 0.46630727762803237
  Test Precision: 0.4636

### Classification Model Performance Comparison After Capping Outliers

In this analysis, we compare the performance of different classification models using the `alpha_gt_3` target variable. The dataset was preprocessed to cap outliers using the 1st and 99th quantiles (`quantile_1=0.01`, `quantile_3=0.99`). The models evaluated include Logistic Regression, Random Forest, Decision Tree, Gradient Boosting, AdaBoost, Support Vector Classifier, K-Nearest Neighbors, and Ridge Classifier.

#### Observations

- **Random Forest Classifier** remains a strong performer across most metrics, especially in terms of F1 Score and AUC, which are critical for imbalanced datasets.
- **Gradient Boosting Classifier** continues to show strong overall performance, particularly in Accuracy.
- **Support Vector Classifier** shows high Precision but very low Recall, indicating that it is highly conservative in predicting the positive class.
- **K-Nearest Neighbors** and **Decision Tree Classifier** offer a balance between Precision and Recall, though they do not excel in any specific area.

#### Recommendations

- **Model Selection:** Based on these results, **Random Forest Classifier** and **Gradient Boosting Classifier** remain strong candidates for further tuning and potential deployment, as they offer a good balance of accuracy and robustness across different metrics.
- **Hyperparameter Tuning:** We should consider hyperparameter tuning for Random Forest and Gradient Boosting to potentially improve their performance further.


### Conclusion: Dataset Selection for Classification Models

#### Overview

We compared the performance of several classification models using three different approaches to handling outliers in the dataset:
1. **With Outliers:** The original dataset without any modifications to outliers.
2. **Capping Outliers:** The dataset where outliers were capped at the 1st and 99th percentiles.
3. **Dropping Outliers:** The dataset where outliers beyond the 1st and 99th percentiles were removed.

#### Key Findings

##### 1. **With Outliers**
   - Models struggled with outliers, showing lower F1 scores and inconsistent AUC values across both the validation and test sets.
   - The presence of extreme values made it difficult for models to generalize, leading to overfitting in some cases.

##### 2. **Capping Outliers**
   - **Random Forest Classifier** and **Gradient Boosting Classifier** continued to perform well with capped outliers, maintaining high F1 scores and AUC.
   - **Support Vector Classifier** achieved high precision but very low recall, indicating that it was conservative in predicting the positive class.
   - Capping outliers generally led to more consistent model performance, particularly in metrics like F1 Score and AUC, which are crucial for imbalanced datasets.

##### 3. **Dropping Outliers**
   - Removing outliers improved the overall model performance, particularly for models like **Random Forest** and **Gradient Boosting**, which showed higher accuracy and AUC compared to the capped and uncapped datasets.
   - However, dropping outliers reduced the sample size, which could potentially limit the model's ability to generalize.

#### Recommendations

- **Use the Dataset with Capped Outliers:** Based on the results, the dataset with capped outliers at the 1st and 99th percentiles offers the best balance between model performance and data integrity. 
  - **Random Forest Classifier** and **Gradient Boosting Classifier** showed strong, consistent performance across key metrics such as F1 Score and AUC, making this dataset a reliable choice for further model tuning and deployment.
  
- **Avoid Dropping Outliers:** Although dropping outliers led to slightly better performance in some models, the reduced sample size could pose a risk of overfitting. Therefore, it is recommended to avoid dropping outliers unless the outliers are extremely impactful.

- **Further Steps:** 
  - Consider hyperparameter tuning, particularly for the Random Forest and Gradient Boosting models, to further enhance their performance.
  - Explore feature engineering and dimensionality reduction techniques (e.g., PCA) to see if they can improve model accuracy and robustness.

#### Final Decision

The dataset with capped outliers is going to be used for future model development and deployment, given its balance of performance and sample integrity.


# 3. Hyper Parameter tunning

Let's create a function for hyper parameter tunning

In [39]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def tune_and_evaluate(models, param_grids, X_train, y_train, X_val, y_val, X_test, y_test, search_method='grid', n_iter=10):
    """
    Tunes hyperparameters, trains models, and evaluates them.

    Parameters:
    - models: dict of models with their corresponding problem types ('classification' or 'regression')
    - param_grids: dict of hyperparameter grids for each model
    - X_train, y_train: Training data
    - X_val, y_val: Validation data
    - X_test, y_test: Test data
    - search_method: 'grid' for GridSearchCV, 'random' for RandomizedSearchCV
    - n_iter: Number of iterations for RandomizedSearchCV (ignored if using GridSearchCV)

    Returns:
    - best_models: dict of best models and their best hyperparameters
    - results: dict of evaluation metrics for each model
    """
    
    best_models = {}
    results = {}
    
    for name, (model, problem_type) in models.items():
        print(f"Tuning and evaluating model: {name}")
        
        if name in param_grids:
            if search_method == 'grid':
                search = GridSearchCV(estimator=model, param_grid=param_grids[name], 
                                      cv=5, scoring='f1' if problem_type == 'classification' else 'neg_mean_squared_error')
            elif search_method == 'random':
                search = RandomizedSearchCV(estimator=model, param_distributions=param_grids[name], 
                                            n_iter=n_iter, cv=5, random_state=42, scoring='f1' if problem_type == 'classification' else 'neg_mean_squared_error')
            else:
                raise ValueError("search_method must be 'grid' or 'random'")
            
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_
        else:
            best_model = model
            best_params = {}

        # Store the best model and its parameters
        best_models[name] = (best_model, best_params)

        # Evaluate the model
        if problem_type == "classification":
            model_fitted, val_accuracy, test_accuracy, val_precision, test_precision, val_recall, test_recall, val_f1, test_f1, val_auc, test_auc = model_data(
                best_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
            
            # Store classification results
            results[name] = {
                "Validation Accuracy": val_accuracy,
                "Test Accuracy": test_accuracy,
                "Validation Precision": val_precision,
                "Test Precision": test_precision,
                "Validation Recall": val_recall,
                "Test Recall": test_recall,
                "Validation F1 Score": val_f1,
                "Test F1 Score": test_f1,
                "Validation AUC": val_auc,
                "Test AUC": test_auc
            }
        
        elif problem_type == "regression":
            model_fitted, val_mse, test_mse, val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2 = model_data(
                best_model, X_train, y_train, X_val, y_val, X_test, y_test, problem_type=problem_type)
            
            # Store regression results
            results[name] = {
                "Validation MSE": val_mse,
                "Test MSE": test_mse,
                "Validation RMSE": val_rmse,
                "Test RMSE": test_rmse,
                "Validation MAE": val_mae,
                "Test MAE": test_mae,
                "Validation R^2": val_r2,
                "Test R^2": test_r2
            }

        print(f"Best Parameters for {name}: {best_params}\n")

    return best_models, results

Let's execute the hyperparameter tunning

In [None]:
models = {
    "Random Forest Classifier": (RandomForestClassifier(random_state=42), "classification"),
    "Gradient Boosting Classifier": (GradientBoostingClassifier(random_state=42), "classification")
}

param_grids = {
    "Random Forest Classifier": {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    "Gradient Boosting Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
}

# Call the function with 'grid' search
#best_models_grid, results_grid = tune_and_evaluate(models, param_grids, X_train, y_train, X_val, y_val, X_test, y_test, search_method='grid')

# Call the function with 'random' search
best_models_random, results_random = tune_and_evaluate(models, param_grids, X_train, y_train, X_val, y_val, X_test, y_test, search_method='random', n_iter=20)

# Print results for grid search
for name, result in best_models_grid.items():
    print(f"Best Model (Grid Search): {name}")
    print(f"  Best Parameters: {result[1]}")
    print("\n")

for name, result in results_grid.items():
    print(f"Model (Grid Search): {name}")
    for metric, value in result.items():
        print(f"  {metric}: {value}")
    print("\n")

# Print results for random search
for name, result in best_models_random.items():
    print(f"Best Model (Random Search): {name}")
    print(f"  Best Parameters: {result[1]}")
    print("\n")

for name, result in results_random.items():
    print(f"Model (Random Search): {name}")
    for metric, value in result.items():
        print(f"  {metric}: {value}")
    print("\n")

Tuning and evaluating model: Random Forest Classifier
Best Parameters for Random Forest Classifier: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Tuning and evaluating model: Gradient Boosting Classifier
