# Building your First Machine Learning Model: Using Lasso as a Feature Selection Technique

In [None]:
# Install the necessary libraries
%pip install numpy pandas scikit-learn statsmodels faraway

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

import faraway.datasets.ozone as ozone

In [8]:
# Load the ozone dataset
ozone_data = ozone.load()

# Inspect the first few rows to understand the structure
print(ozone_data.head())

   O3    vh  wind  humidity  temp   ibh  dpg  ibt  vis  doy
0   3  5710     4        28    40  2693  -25   87  250   33
1   5  5700     3        37    45   590  -24  128  100   34
2   5  5760     3        51    54  1450   25  139   60   35
3   6  5720     4        69    35  1568   15  121   60   36
4   4  5790     6        19    45  2631  -33  123  100   37


In [9]:
# Define the features (X) and target (y)
X = ozone_data.drop(columns=['O3'])
y = ozone_data['O3']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=123)

## Base Model

In [10]:
# Initialize the linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_base_model = model.predict(X_test)

# Calculate metrics
r_squared = r2_score(y_test, y_pred_base_model)
n = len(y_test)
p = X_test.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

# Create a DataFrame to display results
data = {
    'R-squared': [r_squared],
    'Adjusted R-squared': [adjusted_r_squared],
    'MSE': [mean_squared_error(y_test, y_pred_base_model)]
}
df_base_model = pd.DataFrame(data, index=['Base Model'])

# Display the DataFrame
display(df_base_model)

Unnamed: 0,R-squared,Adjusted R-squared,MSE
Base Model,0.722092,0.677429,17.670801


## Lasso Feature Selection

In [11]:

# Set up the Lasso model with cross-validation to tune alpha
lasso = Lasso()

# Set up GridSearchCV to tune alpha
param_grid = {'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]}
grid_search = GridSearchCV(Lasso(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['alpha']

print(f"Best alpha: {best_alpha}")

Best alpha: 0.1


In [12]:
# Refit the Lasso model with the best alpha
lasso = Lasso(alpha=best_alpha)
lasso.fit(X_train, y_train)

# Get the coefficients of the features
coefficients = pd.Series(lasso.coef_, index=ozone_data.drop(columns=['O3']).columns)

# Filter out features with zero coefficients
selected_features = coefficients[coefficients != 0].index
print(f"Selected features: \n{selected_features}")


Selected features: 
Index(['humidity', 'temp', 'ibh', 'ibt', 'vis', 'doy'], dtype='object')


In [14]:
# Train a classification model using the selected features
X_train_selected = X_train[:, coefficients != 0]
X_test_selected = X_test[:, coefficients != 0]

# Logistic Regression for classification
log_reg = LinearRegression()
log_reg.fit(X_train_selected, y_train)

# Predict on the test set
y_pred_lasso = log_reg.predict(X_test_selected)

# Calculate metrics
r_squared = r2_score(y_test, y_pred_lasso)
n = len(y_test)
p = X_test.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

# Create a DataFrame to display results
data = {
    'R-squared': [r_squared],
    'Adjusted R-squared': [adjusted_r_squared],
    'MSE': [mean_squared_error(y_test, y_pred_lasso)]
}
lasso_metrics_df = pd.DataFrame(data, index=['Lasso Feature Selection'])

# Display the DataFrame
display(lasso_metrics_df)


Unnamed: 0,R-squared,Adjusted R-squared,MSE
Lasso Feature Selection,0.724843,0.680622,17.495881


In [15]:
df = pd.concat([df_base_model, lasso_metrics_df])

display(df)

Unnamed: 0,R-squared,Adjusted R-squared,MSE
Base Model,0.722092,0.677429,17.670801
Lasso Feature Selection,0.724843,0.680622,17.495881
