# 1 Regularization:
- Use the diabetes dataset from `sklearn.datasets`.
- Compare the performance (Mean Squared Error) of `LinearRegression`, `Ridge`, and `Lasso` models.
- Tune the `alpha` parameter for `Ridge` and `Lasso` using `GridSearchCV` with cross-validation to find the optimal regularization strength.

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

In [2]:
# Load the diabetes regression data: feature matrix X and target vector y.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Combine features and target into one labelled table for a quick visual check.
df = pd.DataFrame(X, columns=diabetes.feature_names).assign(target=y)
print(df.head())

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [3]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit an unregularized linear model on the training split.
# NOTE(review): `linear_reg` is not referenced again in the visible notebook; the
# reported Linear Regression MSE comes from the separately fitted `lr` model.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_reg  # fit() returns the estimator itself, so the cell still displays its repr

In [5]:
# Baseline: ordinary least squares without regularization, scored on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_preds)
print(f"Linear Regression MSE: {lr_mse:.2f}")

Linear Regression MSE: 2900.19


In [6]:
# Ridge regression with a fixed, untuned alpha as a first reference point.
ridge_reg = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_reg  # fit() returns the estimator itself, so the cell still displays its repr

In [7]:
# Test-set error of the fixed-alpha Ridge model fitted in the previous cell.
ridge_preds = ridge_reg.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_preds)
print(f"Ridge_MSE : {ridge_mse:.2f}")

Ridge_MSE : 3077.42


In [8]:
# Lasso regression with a fixed, untuned alpha, scored on the held-out split.
lasso_reg = Lasso(alpha=0.1).fit(X_train, y_train)
lasso_preds = lasso_reg.predict(X_test)
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_preds)
print(f'Lasso MSE: {lasso_mse:.2F}')


Lasso MSE: 2798.19


In [9]:
# Tune Ridge's alpha over a log-spaced grid using 5-fold CV on the training split.
# The scorer is *negative* MSE because GridSearchCV always maximizes its score.
ridge = Ridge()
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, y_train)

# Score the winning model on the test split. This overwrites the ridge_preds /
# ridge_mse values produced earlier by the fixed-alpha Ridge model.
ridge_best = ridge_grid.best_estimator_
ridge_preds = ridge_best.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_preds)
print(f"Ridge_MSE : {ridge_mse:.2f}")

Ridge_MSE : 2856.49


In [10]:
# Tune Lasso's alpha over the same log-spaced grid with 5-fold CV.
# max_iter is raised to 10000 on the base estimator used for every grid point.
lasso = Lasso(max_iter=10000)
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_grid = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_params,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, y_train)

# Score the winning model on the test split. This overwrites the lasso_preds /
# lasso_mse values produced earlier by the fixed-alpha Lasso model.
lasso_best = lasso_grid.best_estimator_
lasso_preds = lasso_best.predict(X_test)
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_preds)
print(f'Lasso MSE: {lasso_mse:.2F}')

Lasso MSE: 2798.19


In [11]:
# Side-by-side summary: test MSE per model plus the alpha picked by grid search.
# ridge_mse / lasso_mse hold whatever was assigned last, i.e. the tuned-model
# values when the notebook is executed top to bottom.
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge Regression', 'Lasso Regression'],
    'MSE': [lr_mse, ridge_mse, lasso_mse],
    # Plain least squares has no alpha, hence the 'N/A' placeholder.
    'Best Alpha': ['N/A'] + [
        search.best_params_['alpha'] for search in (ridge_grid, lasso_grid)
    ],
})
print(results)

               Model          MSE Best Alpha
0  Linear Regression  2900.193628        N/A
1   Ridge Regression  2856.486888        0.1
2   Lasso Regression  2798.193485        0.1


# 2 Ensemble Methods:

- Use the breast_cancer dataset from `sklearn.datasets`.
- Compare the performance (F1 Score and AUC) of `DecisionTreeClassifier`, `RandomForestClassifier`, and `GradientBoostingClassifier`.
- Tune the hyperparameters of each classifier using `GridSearchCV` with cross-validation.

In [12]:
# Re-import required modules after code reset
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score
import pandas as pd

In [14]:
# Load the breast cancer classification data: feature matrix X and label vector y.
# Note: X, y and df are rebound here, replacing the diabetes data used earlier.
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Combine features and labels into one labelled table for a quick visual check.
df = pd.DataFrame(X, columns=breast_cancer.feature_names).assign(target=y)
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [15]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Decision Tree: tune max_depth with 5-fold CV (selecting on F1), then evaluate
# the best tree on the held-out split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_params = {'max_depth': [3, 5, 10, None]}
dt_grid = GridSearchCV(estimator=dt_model, param_grid=dt_params, scoring='f1', cv=5)
dt_grid.fit(X_train, y_train)

dt_best = dt_grid.best_estimator_
# ROC AUC needs a continuous score, so take the predicted probability in column 1.
dt_proba = dt_best.predict_proba(X_test)[:, 1]
dt_pred = dt_best.predict(X_test)

print("\n🔹 Decision Tree")
print("Best Params:", dt_grid.best_params_)
print("F1 Score:", f1_score(y_true=y_test, y_pred=dt_pred))
print("ROC AUC Score:", roc_auc_score(y_true=y_test, y_score=dt_proba))


🔹 Decision Tree
Best Params: {'max_depth': 3}
F1 Score: 0.9583333333333334
ROC AUC Score: 0.9574189321978381


In [25]:
# Random Forest: tune tree count and depth with 5-fold CV (selecting on F1), then
# evaluate the best forest on the held-out split.
rf_model = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [50, 100], 'max_depth': [3, 5, None]}
rf_grid = GridSearchCV(estimator=rf_model, param_grid=rf_params, scoring='f1', cv=5)
rf_grid.fit(X_train, y_train)

rf_best = rf_grid.best_estimator_
# ROC AUC needs a continuous score, so take the predicted probability in column 1.
rf_proba = rf_best.predict_proba(X_test)[:, 1]
rf_pred = rf_best.predict(X_test)

print("\n🔹 Random Forest")
print("Best Params:", rf_grid.best_params_)
print("F1 Score:", f1_score(y_true=y_test, y_pred=rf_pred))
print("ROC AUC Score:", roc_auc_score(y_true=y_test, y_score=rf_proba))


🔹 Random Forest
Best Params: {'max_depth': 5, 'n_estimators': 100}
F1 Score: 0.9722222222222222
ROC AUC Score: 0.9963969865705863


In [23]:
# Gradient Boosting: tune stage count, learning rate and depth with 5-fold CV
# (selecting on F1), then evaluate the best model on the held-out split.
gb_model = GradientBoostingClassifier(random_state=42)
gb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
}
gb_grid = GridSearchCV(estimator=gb_model, param_grid=gb_params, scoring='f1', cv=5)
gb_grid.fit(X_train, y_train)

gb_best = gb_grid.best_estimator_
# ROC AUC needs a continuous score, so take the predicted probability in column 1.
gb_proba = gb_best.predict_proba(X_test)[:, 1]
gb_pred = gb_best.predict(X_test)

print("\n🔹 Gradient Boosting")
print("Best Params:", gb_grid.best_params_)
print("F1 Score:", f1_score(y_true=y_test, y_pred=gb_pred))
print("ROC AUC Score:", roc_auc_score(y_true=y_test, y_score=gb_proba))


🔹 Gradient Boosting
Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
F1 Score: 0.965034965034965
ROC AUC Score: 0.9950867998689813


In [32]:
# Candidate classifiers and the hyperparameter grid to search for each one.
# The mapping is defined in this cell so the comparison loop runs on a fresh
# kernel (Restart & Run All) instead of depending on a variable from another cell.
# Estimators and grids match the per-model cells for Decision Tree, Random Forest
# and Gradient Boosting.
models_and_params = {
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'max_depth': [3, 5, 10, None]},
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [50, 100], 'max_depth': [3, 5, None]},
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5],
        },
    },
}

# One record per model: test-set F1, test-set ROC AUC and the winning parameters.
results = []

for name, mp in models_and_params.items():
    print(f"\n🔹 {name}")

    # 5-fold CV on the training split, selecting the parameter set with the best F1.
    grid = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='f1')
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    # ROC AUC needs a continuous score, so take the predicted probability in column 1.
    y_proba = best_model.predict_proba(X_test)[:, 1]

    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    results.append({
        'Model': name,
        'F1 Score': f1,
        'ROC AUC Score': auc,
        'Best Params': grid.best_params_
    })

    print("Best Params:", grid.best_params_)
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {auc:.4f}")



🔹 Decision Tree
Best Params: {'max_depth': 3}
F1 Score: 0.9583
ROC AUC Score: 0.9574

🔹 Random Forest
Best Params: {'max_depth': 5, 'n_estimators': 100}
F1 Score: 0.9722
ROC AUC Score: 0.9964

🔹 Gradient Boosting
Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
F1 Score: 0.9650
ROC AUC Score: 0.9951
