# TASK 3 :  Regression Techniques 


In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots

In [2]:
import statsmodels.api as sm

In [3]:
from statsmodels.stats.outliers_influence \
     import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm

In [4]:
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Multiple Regression 

In [6]:
# Chargement des données
df = pd.read_csv('data/training_data_imputed.csv')

In [7]:
# Exploration initiale
print(f"Dimensions du dataset: {df.shape}")
print(f"\nInformations sur les colonnes:")
print(df.info())

Dimensions du dataset: (70792, 18)

Informations sur les colonnes:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70792 entries, 0 to 70791
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SO2               70792 non-null  float64
 1   CO                70792 non-null  float64
 2   O3                70792 non-null  float64
 3   O3_8hr            70792 non-null  int64  
 4   PM10              70792 non-null  int64  
 5   PM2.5             70792 non-null  int64  
 6   NO2               70792 non-null  float64
 7   NOx               70792 non-null  float64
 8   NO                70792 non-null  float64
 9   WindSpeed         70792 non-null  float64
 10  WindDirec         70792 non-null  object 
 11  DataCreationDate  70792 non-null  object 
 12  CO_8hr            70792 non-null  float64
 13  PM2.5_AVG         70792 non-null  int64  
 14  PM10_AVG          70792 non-null  int64  
 15  SO2_AVG           70

In [8]:
numeric_features = ['SO2','CO','O3','PM10','PM2.5','NO2','NOx','NO',
                    'WindSpeed','CO_8hr','PM2.5_AVG','PM10_AVG','SO2_AVG','AQI', 'Pollutant']
categorical_features = ['WindDirec']

In [9]:
df = df[numeric_features]

In [10]:
# Cible
y = df["AQI"]

In [11]:
# Colonnes explicatives
X = df.drop(columns=["AQI"])

In [12]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
# Standardisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
X_train_scaled = pd.DataFrame(
    X_train_scaled,
    columns=X_train.columns,
    index=X_train.index   # <-- important !
)

X_test_scaled = pd.DataFrame(
    X_test_scaled,
    columns=X_test.columns,
    index=X_test.index
)


In [19]:
# Ajout d’une constante pour statsmodels
X_train_const = sm.add_constant(X_train_scaled)

In [21]:
multiple_regression = sm.OLS(y_train, X_train_const).fit()
print(multiple_regression.summary())


                            OLS Regression Results                            
Dep. Variable:                    AQI   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                 6.036e+04
Date:              jeu., 04 déc. 2025   Prob (F-statistic):               0.00
Time:                        12:37:07   Log-Likelihood:            -2.0061e+05
No. Observations:               56633   AIC:                         4.012e+05
Df Residuals:                   56618   BIC:                         4.014e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         64.7269      0.035   1842.604      0.0

In [22]:
multiple_regression .summary()

0,1,2,3
Dep. Variable:,AQI,R-squared:,0.937
Model:,OLS,Adj. R-squared:,0.937
Method:,Least Squares,F-statistic:,60360.0
Date:,"jeu., 04 déc. 2025",Prob (F-statistic):,0.0
Time:,12:37:25,Log-Likelihood:,-200610.0
No. Observations:,56633,AIC:,401200.0
Df Residuals:,56618,BIC:,401400.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,64.7269,0.035,1842.604,0.000,64.658,64.796
SO2,-0.1879,0.047,-4.034,0.000,-0.279,-0.097
CO,0.6741,0.080,8.453,0.000,0.518,0.830
O3,2.6821,0.043,62.469,0.000,2.598,2.766
PM10,-0.2724,0.100,-2.716,0.007,-0.469,-0.076
PM2.5,-0.1427,0.103,-1.380,0.168,-0.345,0.060
NO2,-0.0647,0.956,-0.068,0.946,-1.939,1.809
NOx,0.6352,1.384,0.459,0.646,-2.077,3.348
NO,-0.2655,0.635,-0.418,0.676,-1.511,0.980

0,1,2,3
Omnibus:,35322.952,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3037690.611
Skew:,2.18,Prob(JB):,0.0
Kurtosis:,38.613,Cond. No.,121.0


In [None]:
summarize(multiple_regression )


In [23]:
def forward_selection(X, y):
    remaining_vars = list(X.columns)
    selected_vars = []
    best_aic = np.inf

    while remaining_vars:
        aic_with_candidates = []
        for var in remaining_vars:
            model_vars = selected_vars + [var]
            model = sm.OLS(y, sm.add_constant(X[model_vars])).fit()
            aic_with_candidates.append((model.aic, var))

        aic_with_candidates.sort()
        best_new_aic, best_var = aic_with_candidates[0]

        if best_new_aic < best_aic:
            best_aic = best_new_aic
            selected_vars.append(best_var)
            remaining_vars.remove(best_var)
        else:
            break

    return selected_vars


forward_vars = forward_selection(X_train_scaled, y_train)
print("\nVariables sélectionnées (Forward AIC):")
print(forward_vars)

forward_model = sm.OLS(y_train, sm.add_constant(X_train_scaled[forward_vars])).fit()
print("\n=== Forward Selection Model Summary ===")
print(forward_model.summary())


Variables sélectionnées (Forward AIC):
['PM2.5_AVG', 'PM10_AVG', 'O3', 'Pollutant', 'CO', 'WindSpeed', 'CO_8hr', 'NO2', 'PM10', 'SO2', 'SO2_AVG']

=== Forward Selection Model Summary ===
                            OLS Regression Results                            
Dep. Variable:                    AQI   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                 7.683e+04
Date:              jeu., 04 déc. 2025   Prob (F-statistic):               0.00
Time:                        12:46:16   Log-Likelihood:            -2.0061e+05
No. Observations:               56633   AIC:                         4.012e+05
Df Residuals:                   56621   BIC:                         4.013e+05
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef 

In [24]:
summarize(forward_model)

Unnamed: 0,coef,std err,t,P>|t|
const,64.7269,0.035,1842.613,0.0
PM2.5_AVG,24.8577,0.068,365.2,0.0
PM10_AVG,5.8896,0.099,59.561,0.0
O3,2.6754,0.042,63.508,0.0
Pollutant,2.8487,0.047,60.224,0.0
CO,0.6845,0.069,9.937,0.0
WindSpeed,-0.2941,0.039,-7.549,0.0
CO_8hr,-0.374,0.059,-6.373,0.0
NO2,0.3664,0.064,5.73,0.0
PM10,-0.3424,0.087,-3.929,0.0


In [25]:
def backward_selection(X, y):
    selected_vars = list(X.columns)
    best_aic = sm.OLS(y, sm.add_constant(X[selected_vars])).fit().aic

    improved = True
    while improved and len(selected_vars) > 1:
        aic_with_candidates = []
        for var in selected_vars:
            vars_subset = list(selected_vars)
            vars_subset.remove(var)
            model = sm.OLS(y, sm.add_constant(X[vars_subset])).fit()
            aic_with_candidates.append((model.aic, var, vars_subset))

        aic_with_candidates.sort()
        best_new_aic, var_removed, best_subset = aic_with_candidates[0]

        if best_new_aic < best_aic:
            best_aic = best_new_aic
            selected_vars = best_subset
        else:
            improved = False

    return selected_vars


backward_vars = backward_selection(X_train_scaled, y_train)
print("\nVariables sélectionnées (Backward AIC):")
print(backward_vars)

backward_model = sm.OLS(y_train, sm.add_constant(X_train_scaled[backward_vars])).fit()
print("\n=== Backward Selection Model Summary ===")
print(backward_model.summary())


Variables sélectionnées (Backward AIC):
['SO2', 'CO', 'O3', 'PM10', 'NOx', 'NO', 'WindSpeed', 'CO_8hr', 'PM2.5_AVG', 'PM10_AVG', 'SO2_AVG', 'Pollutant']

=== Backward Selection Model Summary ===
                            OLS Regression Results                            
Dep. Variable:                    AQI   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                 7.042e+04
Date:              jeu., 04 déc. 2025   Prob (F-statistic):               0.00
Time:                        12:49:42   Log-Likelihood:            -2.0061e+05
No. Observations:               56633   AIC:                         4.012e+05
Df Residuals:                   56620   BIC:                         4.014e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
              

In [26]:
def stepwise_selection(X, y):
    included = []
    improved = True

    while improved:
        improved = False

        # Forward step
        excluded = list(set(X.columns) - set(included))
        best_pval = 1.0
        for var in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [var]])).fit()
            pval = model.pvalues[var]
            if pval < best_pval:
                best_pval = pval
                best_var = var

        if best_pval < 0.05:  # seuil classique
            included.append(best_var)
            improved = True

        # Backward step
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        worst_pval = model.pvalues.drop("const").max()
        if worst_pval > 0.05:
            worst_var = model.pvalues.drop("const").idxmax()
            included.remove(worst_var)
            improved = True

    return included


stepwise_vars = stepwise_selection(X_train_scaled, y_train)
print("\nVariables sélectionnées (Stepwise):")
print(stepwise_vars)

stepwise_model = sm.OLS(y_train, sm.add_constant(X_train_scaled[stepwise_vars])).fit()
print("\n=== Stepwise Model Summary ===")
print(stepwise_model.summary())


Variables sélectionnées (Stepwise):
['NOx', 'PM10', 'PM10_AVG', 'O3', 'PM2.5_AVG', 'Pollutant', 'WindSpeed', 'CO', 'CO_8hr', 'NO', 'SO2', 'SO2_AVG']

=== Stepwise Model Summary ===
                            OLS Regression Results                            
Dep. Variable:                    AQI   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                 7.042e+04
Date:              jeu., 04 déc. 2025   Prob (F-statistic):               0.00
Time:                        12:51:19   Log-Likelihood:            -2.0061e+05
No. Observations:               56633   AIC:                         4.012e+05
Df Residuals:                   56620   BIC:                         4.014e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std

## Interpretation : 
Those variables are significant to perform Linear Regression 