In [3]:
import numpy as np

In [4]:
rng = np.random.default_rng(42)
n, p = 500, 8  # 500 samples, 8 features (>=7 highly correlated)

# Create latent factor for high correlation
z = rng.normal(0, 1, size=(n, 1))
X = np.hstack([z + rng.normal(0, 0.05, size=(n, 1)) for _ in range(p)])  # Strong correlation
true_w = rng.normal(0, 1, size=(p, 1))
y = (X @ true_w + rng.normal(0, 0.2, size=(n, 1))).ravel()

# Standardize features (important for gradient descent)
X_mean, X_std = X.mean(axis=0), X.std(axis=0) + 1e-12
X = (X - X_mean) / X_std

# Center target variable
y_mean = y.mean()
y = y - y_mean

print(f"Dataset shape: {X.shape}")
print(f"Correlation matrix (first 3x3):")
print(np.corrcoef(X[:, :3].T))


Dataset shape: (500, 8)
Correlation matrix (first 3x3):
[[1.         0.997093   0.99704355]
 [0.997093   1.         0.99719576]
 [0.99704355 0.99719576 1.        ]]


In [5]:
# 2) Ridge Regression with Gradient Descent

def ridge_cost(X, y, w, b, alpha):
    """Calculate Ridge regression cost (MSE + L2 regularization)"""
    n = len(y)
    pred = X @ w + b
    mse_term = np.mean((y - pred) ** 2)
    reg_term = alpha * np.sum(w ** 2) 
    return mse_term + reg_term

def r2_score(y, yhat):
    """Calculate R¬≤ score"""
    ss_res = np.sum((y - yhat) ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2) + 1e-12
    return 1.0 - ss_res / ss_tot

def ridge_gd(X, y, alpha=0.0, lr=1e-2, max_iter=5000, tol=1e-8):
    """Ridge regression using gradient descent"""
    n, p = X.shape
    w = np.zeros((p,))
    b = 0.0
    prev_cost = np.inf
    
    for it in range(max_iter):
        pred = X @ w + b
        resid = y - pred
        
        grad_w = (-2.0 / n) * (X.T @ resid) + 2.0 * alpha * w
        grad_b = (-2.0 / n) * resid.sum()
        
        w -= lr * grad_w
        b -= lr * grad_b
        
        cost = ridge_cost(X, y, w, b, alpha)
        if not np.isfinite(cost) or cost > 1e8:  
            return None, None, np.inf, -np.inf
        if abs(prev_cost - cost) < tol:
            break
        prev_cost = cost
    
    return w, b, cost, r2_score(y, X @ w + b)


In [6]:
# 3) Hyperparameter Grid Search

learning_rates = [1e-4, 1e-3, 1e-2, 0.1, 1.0, 10.0]
alphas = [1e-15, 1e-10, 1e-5, 1e-3, 0.0, 1.0, 10.0, 20.0]

results = []
for lr in learning_rates:
    for alpha in alphas:
        w, b, cost, r2 = ridge_gd(X, y, alpha=alpha, lr=lr, max_iter=5000)
        results.append((cost, -r2, lr, alpha, r2))

results.sort(key=lambda t: (t[0], t[1]))
best_cost, _, best_lr, best_alpha, best_r2 = results[0]

print("Top 5 settings (by cost, tie-broken by R¬≤):")
for row in results[:5]:
    print(f"cost={row[0]:.6f} | R¬≤={-row[1]:.4f} | lr={row[2]} | alpha={row[3]}")

print("\nBest parameters:")
print(f"  Learning rate (lr): {best_lr}")
print(f"  Regularization (alpha): {best_alpha}")
print(f"Best Ridge Cost: {best_cost:.6f}")
print(f"Best R¬≤ Score:  {best_r2:.6f}")

w_best, b_best, _, _ = ridge_gd(X, y, alpha=best_alpha, lr=best_lr, max_iter=5000)
print("\nWeights (first 5):", np.round(w_best[:5], 4), "...  Bias:", round(b_best, 4))


Top 5 settings (by cost, tie-broken by R¬≤):
cost=0.040399 | R¬≤=0.6724 | lr=0.1 | alpha=0.0
cost=0.040399 | R¬≤=0.6724 | lr=0.1 | alpha=1e-15
cost=0.040399 | R¬≤=0.6724 | lr=0.1 | alpha=1e-10
cost=0.040479 | R¬≤=0.6723 | lr=0.1 | alpha=1e-05
cost=0.046116 | R¬≤=0.6566 | lr=0.1 | alpha=0.001

Best parameters:
  Learning rate (lr): 0.1
  Regularization (alpha): 0.0
Best Ridge Cost: 0.040399
Best R¬≤ Score:  0.672416

Weights (first 5): [-1.2584  0.7628 -1.1189  0.6002 -0.6045] ...  Bias: 0.0


Question - 2

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error


In [11]:
# url = "https://drive.google.com/uc?id=1qzCKF6JKKMB0p7ul_lLy8tdmRk3vE_bG"
file_path = "Hitters.csv"
df = pd.read_csv(file_path)

print("Initial shape:", df.shape)
print("Missing values per column:\n", df.isnull().sum())

# Drop rows with null target (Salary)
df = df.dropna(subset=['Salary'])

# Fill other missing values with mode (categorical) or mean (numeric)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].mean())

# Convert categoricals to numeric using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

print(f"\nAfter preprocessing - Shape: {df.shape}")
print("Missing values after preprocessing:\n", df.isnull().sum().sum())


Initial shape: (322, 20)
Missing values per column:
 AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64

After preprocessing - Shape: (263, 20)
Missing values after preprocessing:
 0


In [12]:
# Separate input and output features
X = df.drop('Salary', axis=1)
y = df['Salary']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (important for regularization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTrain set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")
print(f"Target statistics - Mean: {y.mean():.2f}, Std: {y.std():.2f}")


Features shape: (263, 19)
Target shape: (263,)

Train set: (210, 19)
Test set: (53, 19)
Target statistics - Mean: 535.93, Std: 451.12


In [13]:
# Set regularization parameter as specified (0.5748)
alpha = 0.5748

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=alpha),
    'Lasso Regression': Lasso(alpha=alpha)
}

# Train and evaluate each model
results = {}
print("Training and evaluating models...")
print("-" * 50)

for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    # Store results
    results[name] = {'R2': r2, 'MSE': mse}
    
    # Print results
    print(f"{name}:")
    print(f"  R¬≤ Score: {r2:.4f}")
    print(f"  MSE: {mse:.2f}")
    print()


Training and evaluating models...
--------------------------------------------------
Linear Regression:
  R¬≤ Score: 0.2907
  MSE: 128284.35

Ridge Regression:
  R¬≤ Score: 0.3000
  MSE: 126603.90

Lasso Regression:
  R¬≤ Score: 0.2993
  MSE: 126739.57



  model = cd_fast.enet_coordinate_descent(


In [14]:
# Find the best performing model
best_model = max(results.items(), key=lambda x: x[1]['R2'])

print("=" * 60)
print("MODEL COMPARISON RESULTS")
print("=" * 60)

# Display all results in a table format
print(f"{'Model':<20} {'R¬≤ Score':<10} {'MSE':<10}")
print("-" * 40)
for name, metrics in results.items():
    print(f"{name:<20} {metrics['R2']:<10.4f} {metrics['MSE']:<10.2f}")

print("\n" + "=" * 60)
print(f"BEST MODEL: {best_model[0]}")
print(f"R¬≤ Score: {best_model[1]['R2']:.4f}")
print(f"MSE: {best_model[1]['MSE']:.2f}")
print("=" * 60)

# ------------------------------
# Explanation of Results
# ------------------------------
print("\nEXPLANATION:")
print("-" * 20)
print("""
‚Ä¢ Linear Regression: 
  - No regularization, can overfit with correlated features
  - High variance, may not generalize well

‚Ä¢ Ridge Regression (L2): 
  - Shrinks coefficients towards zero but doesn't eliminate them
  - Balances bias-variance tradeoff effectively
  - Good for correlated features (like in Hitters dataset)

‚Ä¢ Lasso Regression (L1): 
  - Can zero out coefficients (feature selection)
  - May be too aggressive for this dataset
  - Good when you want automatic feature selection

Typically, Ridge performs best on highly correlated datasets
because it handles multicollinearity well while maintaining
all features with reduced coefficients.
""")


MODEL COMPARISON RESULTS
Model                R¬≤ Score   MSE       
----------------------------------------
Linear Regression    0.2907     128284.35 
Ridge Regression     0.3000     126603.90 
Lasso Regression     0.2993     126739.57 

BEST MODEL: Ridge Regression
R¬≤ Score: 0.3000
MSE: 126603.90

EXPLANATION:
--------------------

‚Ä¢ Linear Regression: 
  - No regularization, can overfit with correlated features
  - High variance, may not generalize well

‚Ä¢ Ridge Regression (L2): 
  - Shrinks coefficients towards zero but doesn't eliminate them
  - Balances bias-variance tradeoff effectively
  - Good for correlated features (like in Hitters dataset)

‚Ä¢ Lasso Regression (L1): 
  - Can zero out coefficients (feature selection)
  - May be too aggressive for this dataset
  - Good when you want automatic feature selection

Typically, Ridge performs best on highly correlated datasets
because it handles multicollinearity well while maintaining
all features with reduced coefficients.

# Question 3: Cross Validation for Ridge and Lasso Regression


In [16]:
# -----------------------------
# Question 3: RidgeCV & LassoCV on Boston Housing
# -----------------------------
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error


In [17]:
# -----------------------------
# Load Boston Housing Dataset
# -----------------------------
def load_boston_data():
    """Load Boston housing dataset with fallback for deprecated load_boston"""
    try:
        # Try the original load_boston (deprecated in newer sklearn)
        from sklearn.datasets import load_boston
        boston = load_boston()
        X, y = boston.data, boston.target
        feature_names = boston.feature_names
        print("Loaded Boston dataset using load_boston")
    except Exception:
        # Fallback to OpenML if load_boston is not available
        from sklearn.datasets import fetch_openml
        boston = fetch_openml(name="boston", version=1, as_frame=True)
        X = boston.data.values
        y = boston.target.values.astype(float)
        feature_names = boston.data.columns.values
        print("Loaded Boston dataset using fetch_openml")
    
    return X, y, feature_names

# Load the dataset
X, y, feature_names = load_boston_data()

print(f"Dataset shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Features: {list(feature_names)}")
print(f"Target statistics - Mean: {y.mean():.2f}, Std: {y.std():.2f}")


Loaded Boston dataset using fetch_openml
Dataset shape: (506, 13)
Target shape: (506,)
Features: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Target statistics - Mean: 22.53, Std: 9.19


In [18]:
# -----------------------------
# Train/Test Split and Alpha Grid
# -----------------------------
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create alpha grid for cross-validation
alphas = np.logspace(-3, 3, 50)  # 50 values from 0.001 to 1000

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Alpha range: {alphas[0]:.3f} to {alphas[-1]:.3f}")
print(f"Number of alpha values: {len(alphas)}")


Train set: (404, 13)
Test set: (102, 13)
Alpha range: 0.001 to 1000.000
Number of alpha values: 50


In [24]:
# -----------------------------
# RidgeCV Implementation
# -----------------------------
# Create RidgeCV pipeline with scaling
ridge_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=alphas, cv=5))
])

# Fit RidgeCV model
ridge_pipeline.fit(X_train, y_train)

# Make predictions
ridge_pred = ridge_pipeline.predict(X_test)

# Calculate metrics
ridge_r2 = r2_score(y_test, ridge_pred)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_rmse = np.sqrt(ridge_mse)  # Calculate RMSE manually
best_ridge_alpha = ridge_pipeline.named_steps["ridge"].alpha_

print("RidgeCV Results:")
print(f"  Best alpha: {best_ridge_alpha:.5f}")
print(f"  R¬≤ Score: {ridge_r2:.4f}")
print(f"  RMSE: {ridge_rmse:.3f}")
print(f"  Cross-validation: 5-fold")


RidgeCV Results:
  Best alpha: 2.68270
  R¬≤ Score: 0.6680
  RMSE: 4.935
  Cross-validation: 5-fold


In [23]:
# -----------------------------
# LassoCV Implementation
# -----------------------------
# Create LassoCV pipeline with scaling
lasso_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", LassoCV(alphas=alphas, cv=5, max_iter=20000, random_state=42))
])

# Fit LassoCV model
lasso_pipeline.fit(X_train, y_train)

# Make predictions
lasso_pred = lasso_pipeline.predict(X_test)

# Calculate metrics
lasso_r2 = r2_score(y_test, lasso_pred)
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_rmse = np.sqrt(lasso_mse)  # Calculate RMSE manually
best_lasso_alpha = lasso_pipeline.named_steps["lasso"].alpha_

# Count non-zero coefficients (feature selection)
lasso_coef = lasso_pipeline.named_steps["lasso"].coef_
nonzero_coef = np.count_nonzero(lasso_coef)
total_features = lasso_coef.size

print("LassoCV Results:")
print(f"  Best alpha: {best_lasso_alpha:.5f}")
print(f"  R¬≤ Score: {lasso_r2:.4f}")
print(f"  RMSE: {lasso_rmse:.3f}")
print(f"  Non-zero coefficients: {nonzero_coef}/{total_features}")
print(f"  Cross-validation: 5-fold")


LassoCV Results:
  Best alpha: 0.00100
  R¬≤ Score: 0.6687
  RMSE: 4.929
  Non-zero coefficients: 13/13
  Cross-validation: 5-fold


In [25]:
# -----------------------------
# Model Comparison and Analysis
# -----------------------------
print("=" * 60)
print("RIDGE vs LASSO COMPARISON")
print("=" * 60)

# Create comparison table
print(f"{'Metric':<20} {'RidgeCV':<15} {'LassoCV':<15}")
print("-" * 50)
print(f"{'R¬≤ Score':<20} {ridge_r2:<15.4f} {lasso_r2:<15.4f}")
print(f"{'RMSE':<20} {ridge_rmse:<15.3f} {lasso_rmse:<15.3f}")
print(f"{'Best Alpha':<20} {best_ridge_alpha:<15.5f} {best_lasso_alpha:<15.5f}")
print(f"{'Features Used':<20} {'All (13)':<15} {f'{nonzero_coef}/13':<15}")

print("\n" + "=" * 60)
print("ANALYSIS:")
print("-" * 20)

if ridge_r2 > lasso_r2:
    print("üèÜ RidgeCV performs better:")
    print("   ‚Ä¢ Higher R¬≤ score indicates better fit")
    print("   ‚Ä¢ Ridge shrinks coefficients without eliminating features")
    print("   ‚Ä¢ Better for correlated features (common in housing data)")
elif lasso_r2 > ridge_r2:
    print("üèÜ LassoCV performs better:")
    print("   ‚Ä¢ Higher R¬≤ score with feature selection")
    print("   ‚Ä¢ Sparse model reduces overfitting")
    print("   ‚Ä¢ Automatic feature selection helps generalization")
else:
    print("ü§ù Both models perform similarly:")
    print("   ‚Ä¢ Similar R¬≤ scores and RMSE")
    print("   ‚Ä¢ Choice depends on interpretability vs sparsity needs")

print(f"\nKey Insights:")
print(f"‚Ä¢ Ridge uses all {total_features} features with shrinkage")
print(f"‚Ä¢ Lasso uses only {nonzero_coef} features (automatic selection)")
print(f"‚Ä¢ Cross-validation ensures robust hyperparameter selection")
print(f"‚Ä¢ Both models benefit from proper feature scaling")


RIDGE vs LASSO COMPARISON
Metric               RidgeCV         LassoCV        
--------------------------------------------------
R¬≤ Score             0.6680          0.6687         
RMSE                 4.935           4.929          
Best Alpha           2.68270         0.00100        
Features Used        All (13)        13/13          

ANALYSIS:
--------------------
üèÜ LassoCV performs better:
   ‚Ä¢ Higher R¬≤ score with feature selection
   ‚Ä¢ Sparse model reduces overfitting
   ‚Ä¢ Automatic feature selection helps generalization

Key Insights:
‚Ä¢ Ridge uses all 13 features with shrinkage
‚Ä¢ Lasso uses only 13 features (automatic selection)
‚Ä¢ Cross-validation ensures robust hyperparameter selection
‚Ä¢ Both models benefit from proper feature scaling


# Question 4: Multiclass Logistic Regression


In [26]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [27]:
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


In [28]:
X_mean, X_std = X_train.mean(axis=0), X_train.std(axis=0) + 1e-12
X_train = (X_train - X_mean) / X_std
X_test  = (X_test  - X_mean) / X_std


In [29]:
def sigmoid(z):
    z = np.clip(z, -30, 30)
    return 1.0 / (1.0 + np.exp(-z))

def bin_log_loss(y_true, p, w, alpha=0.0):
    eps = 1e-12
    loss = -np.mean(y_true * np.log(p + eps) + (1 - y_true) * np.log(1 - p + eps))
    reg = alpha * np.sum(w**2)
    return loss + reg

def fit_binary_logistic(X, y01, lr=0.1, alpha=0.0, iters=2000, tol=1e-8):
    n, d = X.shape
    w = np.zeros(d)
    b = 0.0
    prev = 1e18
    for _ in range(iters):
        z = X @ w + b
        p = sigmoid(z)
        grad_w = (X.T @ (p - y01)) / n + 2 * alpha * w
        grad_b = np.mean(p - y01)
        w -= lr * grad_w
        b -= lr * grad_b
        cur = bin_log_loss(y01, p, w, alpha)
        if abs(prev - cur) < tol:
            break
        prev = cur
    return w, b

def predict_proba_binary(X, w, b):
    return sigmoid(X @ w + b)


In [30]:
K = len(np.unique(y_train))
weights, biases = [], []
lr = 0.1
alpha = 0.0
iters = 4000

for k in range(K):
    y01 = (y_train == k).astype(float)
    w_k, b_k = fit_binary_logistic(X_train, y01, lr=lr, alpha=alpha, iters=iters)
    weights.append(w_k)
    biases.append(b_k)
weights, biases = np.vstack(weights), np.array(biases)


In [31]:
probs = np.column_stack([
    predict_proba_binary(X_test, weights[k], biases[k]) for k in range(K)
])
y_pred = probs.argmax(axis=1)


In [32]:
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))


Accuracy: 0.8684
Confusion Matrix:
 [[11  1  0]
 [ 0 10  3]
 [ 0  1 12]]

Classification Report:
               precision    recall  f1-score   support

           0     1.0000    0.9167    0.9565        12
           1     0.8333    0.7692    0.8000        13
           2     0.8000    0.9231    0.8571        13

    accuracy                         0.8684        38
   macro avg     0.8778    0.8697    0.8712        38
weighted avg     0.8746    0.8684    0.8690        38

