## Linear Regression

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from dotenv import load_dotenv
import os


# Notebook-wide pandas display settings: wide console width and generous
# row/column limits so large tables are printed without truncation.
pd.set_option(
    "display.width", 1000,
    "display.max_rows", 500,
    "display.max_columns", 500,
)

In [None]:
def read_data(path):
    """
    Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, read with ``pandas.read_csv`` defaults.
    """
    frame = pd.read_csv(path)
    return frame
# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# The training CSV location is configured through the TRAINING_DATA
# environment variable rather than hard-coded in the notebook.
data_path = os.getenv("TRAINING_DATA")
if not data_path:
    # os.getenv returns None when the variable is missing; passing that on to
    # pandas produces an opaque "invalid file path" error, so fail early with
    # a message that says what to configure.
    raise ValueError(
        "Environment variable TRAINING_DATA is not set. "
        "Define it in the environment or in a .env file so it points to the training CSV."
    )

df = read_data(data_path)

In [4]:
"""
Build the feature matrix and target vector, then create a hold-out split.

- X: every column of df except the target and the two 'Unnamed' columns
- y: the target column ('HATSURESI')
- test_size=0.1 keeps 10% of the rows for testing, the rest for training
- random_state=42 makes the shuffle reproducible
"""

# Features: remove the target plus the 'Unnamed: 0' / 'Unnamed: 0.1' columns
# (presumably index columns left over from earlier CSV exports - confirm).
X = df.drop(columns=["HATSURESI", "Unnamed: 0", "Unnamed: 0.1"])

# Target to predict
y = df["HATSURESI"]

# NOTE(review): the features include HATSURESI_LAG_* columns, which suggests
# time-ordered rows; a shuffled split may be optimistic for such data -
# confirm whether a chronological split is more appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Show the shapes of the four resulting pieces as the cell output
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((174207, 31), (19357, 31), (174207,), (19357,))

In [5]:
# Learn the per-column mean/std from the training rows only, so no statistics
# from the test set influence the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Fit ordinary least squares on the standardized training features.
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Display the fitted estimator as the cell output.
lr_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:
"""
Score the fitted regression model on the held-out test set.

Reported metrics:
- RMSE (Root Mean Squared Error)
- MAE (Mean Absolute Error)
- R² Score (Coefficient of Determination)
- MAPE (Mean Absolute Percentage Error)
"""

# Predictions for the standardized test features
y_pred = lr_model.predict(X_test_scaled)

# Squared-error metrics: RMSE is the square root of the MSE
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Absolute-error and goodness-of-fit metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# MAPE: mean of |error / actual|, expressed as a percentage.
# NOTE(review): this divides by y_test, so it is only meaningful when the
# target contains no zeros - confirm for this dataset.
mape = ((y_test - y_pred) / y_test).abs().mean() * 100

print(
    f"Root Mean Square Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R² Score: {r2:.2f}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
    sep="\n",
)


Root Mean Square Error (RMSE): 7.16
Mean Absolute Error (MAE): 4.84
R² Score: 0.63
Mean Absolute Percentage Error (MAPE): 8.11%


In [8]:
"""
Show the intercept and the per-feature coefficients of the fitted linear model.
"""

intercept = lr_model.intercept_
coefficients = lr_model.coef_

# Attach the training column names to the coefficients. The model was fitted on
# standardized features, so each value is the change in the prediction per one
# standard deviation of that feature.
feature_importance = pd.Series(data=coefficients, index=X_train.columns)

print("Intercept (β₀):", intercept)
# Coefficients are listed from the largest (most positive) to the smallest.
print(
    "Coefficients (β):",
    feature_importance.sort_values(ascending=False),
    sep="\n",
)


Intercept (β₀): 60.0685236528957
Coefficients (β):
HATSURESI_LAG_1                       6.803572
HATSURESI_LAG_2                       1.709320
HATSURESI_LAG_5                       0.960757
HATSURESI_LAG_4                       0.960258
weather_temp                          0.466020
SCHOOL_STATUS_School Open             0.150514
HOLIDAY_CATEGORY_Normal               0.133128
weather_description_Cloudy            0.112428
weather_description_Low Visibility    0.064190
weather_description_Precipitation     0.048946
MONTH_3                               0.048764
weather_description_Storm             0.037839
DAY_OF_WEEK_3                         0.033198
DAY_OF_WEEK_4                         0.029210
MONTH_2                               0.013343
DAY_OF_WEEK_1                         0.009416
MONTH_4                               0.009115
MONTH_10                              0.009067
MONTH_12                              0.005061
DAY_OF_WEEK_2                        -0.015130
PANDEMIC_

#### VIF Analysis

In [9]:
def calculate_vif(X):
    """
    Calculate the Variance Inflation Factor (VIF) for each feature in a DataFrame.

    VIF quantifies the severity of multicollinearity in a set of regression
    variables. A higher VIF indicates a stronger linear relationship between a
    given feature and the others.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix with one column per feature. Every column must be
        convertible to float (numeric or boolean).

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns 'Feature' and 'VIF', sorted from
        the highest to the lowest VIF. The row index is the feature's original
        column position in X.
    """
    # Convert to a float ndarray once, outside the loop. Using X.values per
    # feature rebuilds the array on every iteration, and for frames that mix
    # boolean dummy columns with floats it yields an object-dtype array that
    # the underlying regression may reject.
    design_matrix = X.to_numpy(dtype=float)

    vif_values = [
        variance_inflation_factor(design_matrix, column_idx)
        for column_idx in range(design_matrix.shape[1])
    ]

    vif_data = pd.DataFrame({'Feature': X.columns, 'VIF': vif_values})
    return vif_data.sort_values(by='VIF', ascending=False)

In [10]:
"""
Standardize the full feature set and calculate the Variance Inflation Factor
(VIF) of every feature to assess multicollinearity among the regressors.
"""

# Standardize all rows of X (mean=0, std=1) for the diagnostic only.
# A throwaway scaler is used on purpose: rebinding the notebook-level `scaler`
# here would replace the scaler fitted on the training split, which is the one
# that matches `lr_model` and must be used for any later transform.
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Calculate VIF for the scaled features
vif_df = calculate_vif(X_scaled)

# Display the VIF values for all features (up to 500 rows)
print(vif_df.head(500))


                               Feature       VIF
0                         weather_temp  4.023935
15                             MONTH_8  3.764123
14                             MONTH_7  3.754099
28                     HATSURESI_LAG_3  3.636850
27                     HATSURESI_LAG_2  3.586412
29                     HATSURESI_LAG_4  3.586115
13                             MONTH_6  2.824279
21           SCHOOL_STATUS_School Open  2.764097
26                     HATSURESI_LAG_1  2.689508
30                     HATSURESI_LAG_5  2.678429
12                             MONTH_5  2.640846
16                             MONTH_9  2.615390
11                             MONTH_4  2.152262
17                            MONTH_10  2.019845
10                             MONTH_3  1.905228
9                              MONTH_2  1.780710
19                            MONTH_12  1.764059
18                            MONTH_11  1.756523
4                        DAY_OF_WEEK_3  1.731041
3                   