# Assignment

1.  **Regularization**:

    - Use the `diabetes` dataset from `sklearn.datasets`.
    - Compare the performance (Mean Squared Error) of `LinearRegression`, `Ridge`, and `Lasso` models.
    - Tune the `alpha` parameter for `Ridge` and `Lasso` using `GridSearchCV` with cross-validation to find the optimal regularization strength.

    ```python
    from sklearn.datasets import load_diabetes

    # Load the diabetes dataset
    diabetes = load_diabetes()
    ```


In [1]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Fetch scikit-learn's bundled diabetes data. The returned object is read
# below and in later cells through .data, .target, .feature_names and .DESCR.
diabetes = load_diabetes()

# Emit the banner, the sub-heading and the dataset's own description with a
# single call; sep="\n" places each argument on its own line.
print(
    "\n--- Dataset Description ---",
    "Full dataset description:",
    diabetes.DESCR,
    sep="\n",
)


--- Dataset Description ---
Full dataset description:
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      

In [2]:
# Pull the predictor matrix and the response vector out of the loaded dataset.
X, y = diabetes.data, diabetes.target

# Hold out 20% of the rows for testing; the integer random_state makes the
# shuffle, and therefore the partition, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Everything printed below describes the full X / y, not just the training part.
print("\n=== DATASET EXPLORATION ===")

# Response variable: container type, shape and summary statistics.
print("\n--- Target Variable ---")
print("Target represents: Disease progression after one year")
print("Target type:", type(y))
print("Target shape:", y.shape)
print("Target statistics:")
print(f"  Min: {y.min():.2f}")
print(f"  Max: {y.max():.2f}")
print(f"  Mean: {y.mean():.2f}")
print(f"  Std: {y.std():.2f}")
print(f"  Median: {np.median(y):.2f}")

# Predictors: names, count, and the shape/type of the raw matrix.
print("\n--- Features ---")
print("Number of features:", len(diabetes.feature_names))
print("Feature names:", diabetes.feature_names)
print("Feature matrix shape:", X.shape)
print("Feature matrix type:", type(X))

# Wrap the matrix in a labelled DataFrame so the summaries print with column names.
features_df = pd.DataFrame(data=X, columns=diabetes.feature_names)
feature_summary = features_df.describe().round(3)
print("\nFeature statistics:")
print(feature_summary)

print("\nFirst 5 rows of features:")
print(features_df.head(5))


=== DATASET EXPLORATION ===

--- Target Variable ---
Target represents: Disease progression after one year
Target type: <class 'numpy.ndarray'>
Target shape: (442,)
Target statistics:
  Min: 25.00
  Max: 346.00
  Mean: 152.13
  Std: 77.01
  Median: 140.50

--- Features ---
Number of features: 10
Feature names: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
Feature matrix shape: (442, 10)
Feature matrix type: <class 'numpy.ndarray'>

Feature statistics:
           age      sex      bmi       bp       s1       s2       s3       s4  \
count  442.000  442.000  442.000  442.000  442.000  442.000  442.000  442.000   
mean    -0.000    0.000   -0.000   -0.000   -0.000    0.000   -0.000   -0.000   
std      0.048    0.048    0.048    0.048    0.048    0.048    0.048    0.048   
min     -0.107   -0.045   -0.090   -0.112   -0.127   -0.116   -0.102   -0.076   
25%     -0.037   -0.045   -0.034   -0.037   -0.034   -0.030   -0.035   -0.039   
50%      0.005   -0.045   -0.007   -0.0

In [3]:
# Baseline: ordinary least squares with no regularization.
# fit() returns the estimator itself, so construction and training are chained.
linear_reg = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out test split.
linear_preds = linear_reg.predict(X_test)
linear_mse = mean_squared_error(y_true=y_test, y_pred=linear_preds)

print(f'Linear Regression MSE: {linear_mse:.2f}')

Linear Regression MSE: 3424.26


In [4]:
# Ridge regression: pick the regularization strength (alpha) by 5-fold
# cross-validation over the candidates below.
# NOTE(review): the recorded run of this cell selects alpha=0.1, the smallest
# candidate, so the optimum may lie below the grid; consider adding smaller values.
ridge_param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]}

# scikit-learn scorers are "higher is better", hence the negated MSE; the sign
# is flipped back when the CV score is printed. fit() returns the search object.
ridge_grid_search = GridSearchCV(
    Ridge(random_state=0),
    ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # run the candidate fits on all available cores
).fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on all of X_train; score it on the test split.
best_ridge = ridge_grid_search.best_estimator_
ridge_preds = best_ridge.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_preds)

print(f"Ridge - Best alpha: {ridge_grid_search.best_params_['alpha']}")
print(f"Ridge - Best CV score: {-ridge_grid_search.best_score_:.2f}")
print(f'Ridge Regression MSE: {ridge_mse:.2f}')

Ridge - Best alpha: 0.1
Ridge - Best CV score: 2886.90
Ridge Regression MSE: 3372.61


In [5]:
# Lasso regression: pick the regularization strength (alpha) by 5-fold
# cross-validation over the candidates below.
# NOTE(review): the recorded run of this cell selects alpha=0.01, the smallest
# candidate, so the optimum may lie below the grid; consider adding smaller values.
lasso_param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0]}

# scikit-learn scorers are "higher is better", hence the negated MSE; the sign
# is flipped back when the CV score is printed. fit() returns the search object.
lasso_grid_search = GridSearchCV(
    Lasso(random_state=0),
    lasso_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # run the candidate fits on all available cores
).fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on all of X_train; score it on the test split.
best_lasso = lasso_grid_search.best_estimator_
lasso_preds = best_lasso.predict(X_test)
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_preds)

print(f"Lasso - Best alpha: {lasso_grid_search.best_params_['alpha']}")
print(f"Lasso - Best CV score: {-lasso_grid_search.best_score_:.2f}")
print(f'Lasso Regression MSE: {lasso_mse:.2f}')

Lasso - Best alpha: 0.01
Lasso - Best CV score: 2872.37
Lasso Regression MSE: 3445.81


In [6]:
# Side-by-side comparison of the three regressors: test-set MSE plus the alpha
# chosen by cross-validation ('N/A' for the unregularized baseline, which has none).
results_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge Regression', 'Lasso Regression'],
    'MSE': [linear_mse, ridge_mse, lasso_mse],
    'Best_Alpha': ['N/A', ridge_grid_search.best_params_['alpha'], lasso_grid_search.best_params_['alpha']]
})

print("=== Regularization Results Comparison ===")
print(results_df.to_string(index=False))

# Report how many coefficients the tuned Lasso model left at exactly zero.
# np.count_nonzero counts in one vectorized pass instead of iterating the
# array element by element with the builtin sum(); the zero count follows
# from the total, so the array is scanned only once.
lasso_coefs = best_lasso.coef_
n_nonzero = np.count_nonzero(lasso_coefs)
n_zeroed = lasso_coefs.size - n_nonzero

print("\n=== Lasso Feature Selection ===")
print(f"Number of non-zero coefficients in Lasso: {n_nonzero} out of {lasso_coefs.size}")
print(f"Number of features zeroed out: {n_zeroed}")

=== Regularization Results Comparison ===
            Model         MSE Best_Alpha
Linear Regression 3424.259334        N/A
 Ridge Regression 3372.612250        0.1
 Lasso Regression 3445.806809       0.01

=== Lasso Feature Selection ===
Number of non-zero coefficients in Lasso: 9 out of 10
Number of features zeroed out: 1


2.  **Ensemble Methods**:

    - Use the `breast_cancer` dataset from `sklearn.datasets`.
    - Compare the performance (F1 Score and AUC) of `DecisionTreeClassifier`, `RandomForestClassifier`, and `GradientBoostingClassifier`.
    - Tune the hyperparameters of each classifier using `GridSearchCV` with cross-validation.

    ```python
    from sklearn.datasets import load_breast_cancer

    # Load the breast cancer dataset
    breast_cancer = load_breast_cancer()
    ```

In [7]:
# Question 2: Ensemble Methods
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, roc_curve, auc, accuracy_score, precision_score, recall_score
import pandas as pd

# Fetch scikit-learn's bundled breast-cancer data. The returned object is read
# below and in later cells through .data, .target, .feature_names,
# .target_names and .DESCR.
breast_cancer = load_breast_cancer()

# Emit the banner, the sub-heading and the dataset's own description with a
# single call; sep="\n" places each argument on its own line.
print(
    "\n--- Dataset Description ---",
    "Full dataset description:",
    breast_cancer.DESCR,
    sep="\n",
)


--- Dataset Description ---
Full dataset description:
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        result

In [8]:
# Pull the predictor matrix and the class labels out of the loaded dataset.
# NOTE: this rebinds X, y, X_train, X_test, y_train and y_test, which the
# diabetes cells earlier in the notebook also use. Re-running those earlier
# model cells after this one would silently use the breast-cancer data.
X, y = breast_cancer.data, breast_cancer.target

# Hold out 20% of the rows for testing; the integer random_state makes the
# shuffle, and therefore the partition, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Everything printed below describes the full X / y, not just the training part.
print("\n=== DATASET EXPLORATION ===")

# Class labels: human-readable names (when present), encoding and balance.
print("\n--- Target Variable ---")
if not hasattr(breast_cancer, 'target_names'):
    print("No target_names attribute. Check DESCR for details.")
else:
    print("Target names:", breast_cancer.target_names)
    print("Target classes and their encoded values:")
    for idx, name in enumerate(breast_cancer.target_names):
        print(f"  {idx}: {name}")
print("Target type:", type(y))
print("Target shape:", y.shape)
print("Target sample values:", y[:10])
# Count rows per encoded class, ordered by class code.
class_counts = pd.Series(y).value_counts().sort_index()
print("Class distribution:", class_counts.to_dict())

# Predictors: names, count, and the shape/type of the raw matrix.
print("\n--- Features ---")
print("Number of features:", len(breast_cancer.feature_names))
print("Feature names:", breast_cancer.feature_names)
print("Feature matrix shape:", X.shape)
print("Feature matrix type:", type(X))

# Wrap the matrix in a labelled DataFrame so the summaries print with column names.
features_df = pd.DataFrame(data=X, columns=breast_cancer.feature_names)
feature_summary = features_df.describe().round(3)
print("\nFeature statistics:")
print(feature_summary)

print("\nFirst 5 rows of features:")
print(features_df.head(5))



=== DATASET EXPLORATION ===

--- Target Variable ---
Target names: ['malignant' 'benign']
Target classes and their encoded values:
  0: malignant
  1: benign
Target type: <class 'numpy.ndarray'>
Target shape: (569,)
Target sample values: [0 0 0 0 0 0 0 0 0 0]
Class distribution: {0: 212, 1: 357}

--- Features ---
Number of features: 30
Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Feature matrix shape: (569, 30)
Feature matrix type: <class 'numpy.ndarray'>

Feature stati

In [9]:
# 2.1 Decision Tree Classifier with hyperparameter tuning
# 5 x 3 x 3 = 45 candidate settings, each scored by 5-fold cross-validated F1.
dt_param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# fit() returns the search object, so construction and the search are chained.
dt_grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    dt_param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,  # run the candidate fits on all available cores
).fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on all of X_train.
best_dt = dt_grid_search.best_estimator_
dt_preds = best_dt.predict(X_test)
# Keep only the second probability column (the score for class label 1);
# it is the ranking score fed to the ROC curve.
dt_proba = best_dt.predict_proba(X_test)[:, 1]

# Test-set F1 from the hard predictions; AUC as the area under the ROC curve
# traced from the class-1 scores.
dt_f1 = f1_score(y_test, dt_preds)
dt_fpr, dt_tpr, _ = roc_curve(y_test, dt_proba)
dt_auc = auc(dt_fpr, dt_tpr)

print(f"Decision Tree - Best params: {dt_grid_search.best_params_}")
print(f"Decision Tree - Best CV F1: {dt_grid_search.best_score_:.4f}")
print(f'Decision Tree - Test F1: {dt_f1:.4f}')
print(f'Decision Tree - Test AUC: {dt_auc:.4f}')

Decision Tree - Best params: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Decision Tree - Best CV F1: 0.9438
Decision Tree - Test F1: 0.9697
Decision Tree - Test AUC: 0.9725


In [10]:
# 2.2 Random Forest Classifier with hyperparameter tuning
# 3 x 4 x 3 x 3 = 108 candidate settings, each scored by 5-fold cross-validated F1.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# fit() returns the search object, so construction and the search are chained.
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    rf_param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,  # run the candidate fits on all available cores
).fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on all of X_train.
best_rf = rf_grid_search.best_estimator_
rf_preds = best_rf.predict(X_test)
# Keep only the second probability column (the score for class label 1);
# it is the ranking score fed to the ROC curve.
rf_proba = best_rf.predict_proba(X_test)[:, 1]

# Test-set F1 from the hard predictions; AUC as the area under the ROC curve
# traced from the class-1 scores.
rf_f1 = f1_score(y_test, rf_preds)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_proba)
rf_auc = auc(rf_fpr, rf_tpr)

print(f"Random Forest - Best params: {rf_grid_search.best_params_}")
print(f"Random Forest - Best CV F1: {rf_grid_search.best_score_:.4f}")
print(f'Random Forest - Test F1: {rf_f1:.4f}')
print(f'Random Forest - Test AUC: {rf_auc:.4f}')

Random Forest - Best params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest - Best CV F1: 0.9663
Random Forest - Test F1: 0.9697
Random Forest - Test AUC: 0.9965


In [11]:
# 2.3 Gradient Boosting Classifier with hyperparameter tuning
# 3^5 = 243 candidate settings x 5 folds = 1215 model fits (plus the final
# refit), so this is by far the slowest cell in the notebook.
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# fit() returns the search object, so construction and the search are chained.
gb_grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    gb_param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,  # run the candidate fits on all available cores
).fit(X_train, y_train)

# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on all of X_train.
best_gb = gb_grid_search.best_estimator_
gb_preds = best_gb.predict(X_test)
# Keep only the second probability column (the score for class label 1);
# it is the ranking score fed to the ROC curve.
gb_proba = best_gb.predict_proba(X_test)[:, 1]

# Test-set F1 from the hard predictions; AUC as the area under the ROC curve
# traced from the class-1 scores.
gb_f1 = f1_score(y_test, gb_preds)
gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_proba)
gb_auc = auc(gb_fpr, gb_tpr)

print(f"Gradient Boosting - Best params: {gb_grid_search.best_params_}")
print(f"Gradient Boosting - Best CV F1: {gb_grid_search.best_score_:.4f}")
print(f'Gradient Boosting - Test F1: {gb_f1:.4f}')
print(f'Gradient Boosting - Test AUC: {gb_auc:.4f}')

Gradient Boosting - Best params: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Gradient Boosting - Best CV F1: 0.9781
Gradient Boosting - Test F1: 0.9706
Gradient Boosting - Test AUC: 0.9975


In [12]:
# 2.4 Compare ensemble methods results
# One row per classifier: test-set F1 and AUC next to the mean
# cross-validated F1 of the configuration the grid search selected.
ensemble_results_df = pd.DataFrame(
    {
        'Model': ['Decision Tree', 'Random Forest', 'Gradient Boosting'],
        'Test_F1_Score': [dt_f1, rf_f1, gb_f1],
        'Test_AUC': [dt_auc, rf_auc, gb_auc],
        'CV_F1_Score': [
            dt_grid_search.best_score_,
            rf_grid_search.best_score_,
            gb_grid_search.best_score_,
        ],
    }
)

print("=== Ensemble Methods Results Comparison ===")
print(ensemble_results_df.to_string(index=False))

# Additional metrics for completeness: accuracy, precision and recall of each
# model's hard predictions on the test split.
print("\n=== Additional Metrics ===")
test_predictions = (
    ('Decision Tree', dt_preds),
    ('Random Forest', rf_preds),
    ('Gradient Boosting', gb_preds),
)
for name, preds in test_predictions:
    # Evaluate the three metrics in the same order as before and unpack them.
    accuracy, precision, recall = (
        metric(y_test, preds)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print(f"{name}: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}")

=== Ensemble Methods Results Comparison ===
            Model  Test_F1_Score  Test_AUC  CV_F1_Score
    Decision Tree       0.969697  0.972531     0.943763
    Random Forest       0.969697  0.996507     0.966261
Gradient Boosting       0.970588  0.997460     0.978064

=== Additional Metrics ===
Decision Tree: Accuracy=0.9649, Precision=0.9846, Recall=0.9552
Random Forest: Accuracy=0.9649, Precision=0.9846, Recall=0.9552
Gradient Boosting: Accuracy=0.9649, Precision=0.9565, Recall=0.9851
