<a href="https://colab.research.google.com/github/ydblank/ModelSelectionExercise/blob/main/Model_Selection_and_Hypertuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Selection and Hyperparameter Tuning





## Load the necessary packages

In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import PassiveAggressiveRegressor

## Load the data

In [2]:
# Read the advertising dataset (path is relative to the notebook's working
# directory) and preview the first rows.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError, so the
# CSV must be placed in the working directory before running.
df = pd.read_csv(filepath_or_buffer='Advertising.csv')
df.head(n=5)

FileNotFoundError: ignored

In [None]:
# Separate the 'Sales' target from the predictors.
# NOTE(review): every column other than 'Sales' is used as a feature — confirm
# the CSV carries no index/ID column that should be excluded.
y = df['Sales']
X = df.drop(columns='Sales')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32
)

## Apply model selection

In [None]:
# Candidate regression models to compare, keyed by the name shown in the results.
# PassiveAggressiveRegressor is seeded so that repeated runs of the notebook
# produce the same scores.
models = {
    'LinearRegression': LinearRegression(),
    'HuberRegression': HuberRegressor(),
    'PassiveAggressiveRegression': PassiveAggressiveRegressor(random_state=32)
}

# One dictionary of metrics per model is appended to this list.
results = []

# Evaluate each model: hold-out RMSE plus 5-fold cross-validated RMSE.
for name, model in models.items():
    start_time = time.perf_counter()  # Monotonic clock, suited to measuring durations
    model.fit(X_train, y_train)  # Fit on the training split only

    # RMSE on the held-out test split
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Cross-validated RMSE, computed on the training split only so the test rows
    # stay unseen. The scorer returns negated MSE, hence the sign flip.
    cv_scores = cross_val_score(model, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=5)
    rmse_cv = np.mean(np.sqrt(-cv_scores))

    # 'Run Time' is kept numeric (minutes, 2 decimals) so it sorts by value
    # rather than lexicographically as a string would.
    results.append({
        'Model': name,
        'Run Time': round((time.perf_counter() - start_time) / 60, 2),
        'RMSE': rmse,
        'RMSE_CV': rmse_cv})

In [None]:
# Creating a DataFrame to display model evaluation results
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Selecting the best performing model based on criteria
best_model_name = results_df.sort_values(by=['Run Time', 'RMSE', 'RMSE_CV']).iloc[0]['Model']
best_model = models[best_model_name]

In [None]:
# Candidate values for every hyperparameter considered across the compared
# models. Not every estimator accepts every key (e.g. 'positive' and 'copy_X'
# belong to LinearRegression, while 'tol' is used by the iterative regressors).
candidate_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [-1, 1, 2, 4],
    'positive': [True, False],
    'tol': [1e-3, 1e-4, 1e-5]
}

# GridSearchCV rejects parameters the estimator does not define, so keep only
# the keys the selected model actually exposes. The estimator is looked up by
# name so this cell gives the same result even if `best_model` is rebound later.
supported_params = models[best_model_name].get_params()
param_grid = {
    param: values
    for param, values in candidate_grid.items()
    if param in supported_params
}

In [None]:
# Performing grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=best_model,
                           param_grid=param_grid,
                           scoring='neg_root_mean_squared_error',
                           cv=10)

In [None]:
# Fitting the best model found through grid search on the training data
best_model = grid_search.fit(X_train, y_train)
print('Optimum parameters', best_model.best_params_)

In [None]:
# Initializing the best model with the obtained best parameters
best_model = LinearRegression(copy_X=True,
                              fit_intercept=True,
                              n_jobs=-1,
                              positive=True)
best_model.fit(X_train, y_train)  # Fitting the best model on the training data
y_pred = best_model.predict(X_test)  # Predicting using the best model

In [None]:
# Calculating RMSE for the best model
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best Model RMSE: {rmse_best}")

In [None]:
# Plotting the predicted vs actual sales for the best model
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title(f'Actual vs Predicted Sales for {best_model_name}')
plt.show()