<a href="https://colab.research.google.com/github/kushum-coder/2501460_kushum/blob/main/Worksheet7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

----------------------------------------------------------------------------------------------
Part - 1: Regression Task California Housing

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [11]:
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(url)

X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]


In [12]:
# Column types
num_features = X.drop("ocean_proximity", axis=1).columns
cat_features = ["ocean_proximity"]

# Numerical pipeline (IMPUTE NaN)
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical pipeline
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first"))
])

# Combine
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features)
    ]
)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [14]:
lin_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

lin_model.fit(X_train, y_train)

train_pred = lin_model.predict(X_train)
test_pred = lin_model.predict(X_test)

print("Linear Regression Train MSE:", mean_squared_error(y_train, train_pred))
print("Linear Regression Test MSE:", mean_squared_error(y_test, test_pred))


Linear Regression Train MSE: 4683203783.504253
Linear Regression Test MSE: 4908290571.346397


In [15]:
ridge_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", Ridge())
])

param_grid_ridge = {
    "model__alpha": [0.01, 0.1, 1, 10, 100]
}

ridge_cv = GridSearchCV(
    ridge_pipe,
    param_grid_ridge,
    cv=5,
    scoring="neg_mean_squared_error"
)

ridge_cv.fit(X_train, y_train)

print("Best Ridge alpha:", ridge_cv.best_params_)


Best Ridge alpha: {'model__alpha': 1}


In [16]:
lasso_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", Lasso(max_iter=10000))
])

param_grid_lasso = {
    "model__alpha": [0.001, 0.01, 0.1, 1, 10]
}

lasso_cv = GridSearchCV(
    lasso_pipe,
    param_grid_lasso,
    cv=5,
    scoring="neg_mean_squared_error"
)

lasso_cv.fit(X_train, y_train)

print("Best Lasso alpha:", lasso_cv.best_params_)


Best Lasso alpha: {'model__alpha': 0.001}


In [17]:
best_ridge = ridge_cv.best_estimator_
best_lasso = lasso_cv.best_estimator_

print("Ridge Test MSE:", mean_squared_error(y_test, best_ridge.predict(X_test)))
print("Lasso Test MSE:", mean_squared_error(y_test, best_lasso.predict(X_test)))


Ridge Test MSE: 4909851273.941725
Lasso Test MSE: 4908290765.821696


----------------------------------------------------------------------------------------------
Part-2: Classification Task Breast Cancer
----------------------------------------------------------------------------------------------

In [18]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [19]:
X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [20]:
log_model = LogisticRegression(max_iter=10000)
log_model.fit(X_train, y_train)

print("Train Accuracy:", accuracy_score(y_train, log_model.predict(X_train)))
print("Test Accuracy:", accuracy_score(y_test, log_model.predict(X_test)))


Train Accuracy: 0.9582417582417583
Test Accuracy: 0.956140350877193


In [21]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"]
}

log_cv = GridSearchCV(
    LogisticRegression(solver="liblinear", max_iter=10000),
    param_grid,
    cv=5,
    scoring="accuracy"
)

log_cv.fit(X_train, y_train)

print("Best parameters:", log_cv.best_params_)


Best parameters: {'C': 100, 'penalty': 'l1'}


In [22]:
best_log = log_cv.best_estimator_

print("Train Accuracy:", accuracy_score(y_train, best_log.predict(X_train)))
print("Test Accuracy:", accuracy_score(y_test, best_log.predict(X_test)))


Train Accuracy: 0.989010989010989
Test Accuracy: 0.9824561403508771
