## 1: Regularization:

- Use the `diabetes` dataset from `sklearn.datasets`.
- Compare the performance (Mean Squared Error) of `LinearRegression`, `Ridge`, and `Lasso` models.
- Tune the `alpha` parameter for `Ridge` and `Lasso` using `GridSearchCV` with cross-validation to find the optimal regularization strength.

```python
from sklearn.datasets import load_diabetes

# Load the diabetes dataset
diabetes = load_diabetes()
```

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
# Load the diabetes dataset
diabetes = load_diabetes()

# Wrap the feature matrix and the target in pandas objects so the column
# names travel with the data; `df` combines both for the printouts below.
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target, name="target")
df = pd.concat([X, y], axis=1)
print("Dataset loaded successfully.")
print("Shape of dataset:", df.shape)

# Preview data
print("\nFirst 5 rows:")
print(df.head())

# Dataset structure
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
print("\nDiabetes Dataset information:")
df.info()

# Missing values
print("\nMissing values per column:")
print(df.isna().sum())

# Statistical summary
print("\nStatistical summary:")
print(df.describe())

Dataset loaded successfully.
Shape of dataset: (442, 11)

First 5 rows:
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  

Diabetes Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     44

In [5]:
# Split data: 80% for training, 20% for testing.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear Regression baseline (no regularization).
# Variables follow the <model>_reg / <model>_preds / <model>_mse pattern used
# by the Ridge and Lasso cells; the estimator is a plain LinearRegression,
# not a Pipeline, so it is not called `pipeline`.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)  # fit on the training split
lin_preds = lin_reg.predict(X_test)  # predict on the held-out test split
lin_mse = mean_squared_error(y_test, lin_preds)  # Mean Squared Error on the test split

print("\nLinear Regression Mean Squared Error:", lin_mse)



Linear Regression Mean Squared Error: 3424.259334298692


In [6]:
# Ridge regression with a fixed, untuned regularization strength (alpha = 1.0).

# fit() returns the estimator itself, so construction and training are chained.
ridge_reg = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the held-out test split with Mean Squared Error (MSE).
ridge_preds = ridge_reg.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_preds)

print(f'Ridge MSE: {ridge_mse}')

Ridge MSE: 3379.4063076042657


In [8]:
# Lasso regression with a fixed, untuned regularization strength (alpha = 0.1).

# fit() returns the estimator itself, so construction and training are chained.
lasso_reg = Lasso(alpha=0.1).fit(X_train, y_train)

# Score the held-out test split with Mean Squared Error (MSE).
lasso_preds = lasso_reg.predict(X_test)
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_preds)

print(f'Lasso MSE: {lasso_mse}')

Lasso MSE: 3383.5084900141464


In [10]:
# Lasso: tune alpha with 5-fold cross-validated grid search.

# Standardize the features, then fit the Lasso; max_iter is set explicitly
# to 10000 for the solver.
lasso_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lasso', Lasso(max_iter=10000)),
])

# Candidate alpha values. The 'lasso__' prefix addresses the pipeline step
# named 'lasso'. (A finer alternative grid would be np.logspace(-3, 1, 20).)
param_grid = {'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Candidates are ranked by negated MSE, so a higher score means a lower error.
lasso_grid = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_grid.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out test split.
best_lasso = lasso_grid.best_estimator_
y_pred_lasso = best_lasso.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print("Best Lasso alpha:", lasso_grid.best_params_['lasso__alpha'])
print("Lasso Regression MSE:", mse_lasso)



Best Lasso alpha: 1
Lasso Regression MSE: 3412.664193575005


In [11]:
# Ridge: tune alpha with 5-fold cross-validated grid search.

# Standardize the features, then fit the Ridge model.
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Candidate alpha values. The 'ridge__' prefix addresses the pipeline step
# named 'ridge'. (A finer alternative grid would be np.logspace(-3, 3, 20).)
# Kept under its own name so it does not overwrite the Lasso cell's param_grid.
ridge_param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Candidates are ranked by negated MSE, so a higher score means a lower error.
ridge_grid = GridSearchCV(
    ridge_pipeline,
    ridge_param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)

ridge_grid.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out test split.
best_ridge = ridge_grid.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print("Best Ridge alpha:", ridge_grid.best_params_['ridge__alpha'])
print("Ridge Regression MSE:", mse_ridge)


Best Ridge alpha: 10
Ridge Regression MSE: 3429.2362283874895


## 2: Ensemble Methods:

- Use the `breast_cancer` dataset from `sklearn.datasets`.
- Compare the performance (F1 Score and AUC) of `DecisionTreeClassifier`, `RandomForestClassifier`, and `GradientBoostingClassifier`.
- Tune the hyperparameters of each classifier using `GridSearchCV` with cross-validation.

```python
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
breast_cancer = load_breast_cancer()
```

In [14]:
# Load the breast cancer dataset as a feature matrix and a label vector.
# NOTE: X, y and the train/test names are rebound here, replacing the
# diabetes data used in the regularization section.
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# 80/20 train-test split, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [15]:
# Decision Tree: hyperparameter grid and pipeline.

# Grid keys use the '<step>__<param>' form so the search can address the
# pipeline step named 'model'.
dt_params = dict(
    model__max_depth=[None, 3, 5, 10],
    model__min_samples_split=[2, 5, 10],
)

# Scaler followed by the classifier; the seed is fixed at 42.
dt_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=42)),
])


In [16]:
# Random Forest: hyperparameter grid and pipeline.

# Grid keys use the '<step>__<param>' form so the search can address the
# pipeline step named 'model'.
rf_params = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 5, 10],
    model__min_samples_split=[2, 5],
)

# Scaler followed by the classifier; the seed is fixed at 42.
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])


In [17]:
# Gradient Boosting: hyperparameter grid and pipeline.

# Grid keys use the '<step>__<param>' form so the search can address the
# pipeline step named 'model'.
gb_params = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[3, 5],
)

# Scaler followed by the classifier; the seed is fixed at 42.
gb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', GradientBoostingClassifier(random_state=42)),
])


In [18]:
# Hyperparameter tuning with GridSearchCV, followed by test-set evaluation.

# Display name -> (pipeline, parameter grid) for every classifier compared.
models = {
    'Decision Tree': (dt_pipeline, dt_params),
    'Random Forest': (rf_pipeline, rf_params),
    'Gradient Boosting': (gb_pipeline, gb_params)
}

# One record per model; the comparison cell turns this list into a table.
results = []

# Loop variables at cell level are notebook globals, so they carry distinct
# names (model_pipeline / model_params) instead of reusing `pipeline`.
for name, (model_pipeline, model_params) in models.items():
    # 5-fold cross-validated search that selects the candidate with the best F1.
    grid = GridSearchCV(
        model_pipeline,
        model_params,
        cv=5,
        scoring='f1',
        n_jobs=-1
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_

    # Predictions: hard labels for F1, and column 1 of predict_proba (the
    # score for the second class) as the ranking input for ROC AUC.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    f1 = f1_score(y_test, y_pred)
    # Stored as `roc_auc`, not `auc`, so the `auc` function imported from
    # sklearn.metrics is not overwritten by a float.
    roc_auc = roc_auc_score(y_test, y_proba)

    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })



In [None]:
# Compare model performance: one row per tuned classifier.
results_df = pd.DataFrame(results)

# to_string() renders the full frame as text, so the wide 'Best Params'
# column is printed without truncation.
print("\nModel Performance Comparison:", results_df.to_string(), sep="\n")


Model Performance Comparison:
               Model                                                                            Best Params  F1 Score   ROC AUC
0      Decision Tree                                 {'model__max_depth': 5, 'model__min_samples_split': 5}  0.936170  0.916336
1      Random Forest  {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}  0.965517  0.993221
2  Gradient Boosting       {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}  0.965986  0.989749
