In [10]:
import pandas as pd

# Toy dataset: ten students described by hours studied and previous exam
# score, together with a binary outcome column.
data = {
    'StudyHours': list(range(1, 11)),
    'PrevExamScore': [30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    'Pass': [0] * 5 + [1] * 5,  # 0 = Fail, 1 = Pass
}

df = pd.DataFrame(data)

# Predictors are every column except the outcome; the outcome is 'Pass'
y = df['Pass']
X = df.drop(columns='Pass')

In [11]:
import statsmodels.api as sm

# Prepend a 'const' column of ones so the regression estimates an intercept.
# The arguments below are add_constant's defaults, spelled out: the column
# goes first, and an already-present constant column is left alone.
X = sm.add_constant(X, prepend=True, has_constant='skip')

In [12]:
# Ordinary least squares: regress the target y on the design matrix X
model = sm.OLS(endog=y, exog=X).fit()

# Print the regression table (coefficients, standard errors, p-values)
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                     10.94
Date:                Sun, 12 Oct 2025   Prob (F-statistic):            0.00701
Time:                        09:41:09   Log-Likelihood:               -0.17258
No. Observations:                  10   AIC:                             6.345
Df Residuals:                       7   BIC:                             7.253
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.3333      1.464     -0.228

In [13]:
# Terms whose p-value exceeds this threshold are candidates for removal
significance_level = 0.05

# Backward elimination: refit, find the least significant term, drop it,
# and repeat until every remaining term passes the threshold.
# Every column of X is eligible here, including 'const'.
while True:
    model = sm.OLS(y, X).fit()

    # Largest p-value among the terms currently in the model
    max_p_value = model.pvalues.max()

    # Stop as soon as the worst term is no longer above the threshold
    if not (max_p_value > significance_level):
        break

    # Otherwise look up which term that p-value belongs to and discard it
    feature_to_remove = model.pvalues.idxmax()
    print(f"Removing feature: {feature_to_remove} with p-value: {max_p_value}")
    X = X.drop(columns=[feature_to_remove])

# Report the model from the last fit of the loop
print(model.summary())

Removing feature: PrevExamScore with p-value: 0.9999999999999999
Removing feature: const with p-value: 0.11419580126842226
                                 OLS Regression Results                                
Dep. Variable:                   Pass   R-squared (uncentered):                   0.831
Model:                            OLS   Adj. R-squared (uncentered):              0.812
Method:                 Least Squares   F-statistic:                              44.31
Date:                Sun, 12 Oct 2025   Prob (F-statistic):                    9.31e-05
Time:                        09:41:09   Log-Likelihood:                         -1.8294
No. Observations:                  10   AIC:                                      5.659
Df Residuals:                       9   BIC:                                      5.961
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# -------------------------------------------------------------------
# 1) Sample data
#    Ten students: hours studied, previous exam score, and a binary
#    outcome (0 = Fail, 1 = Pass).
#    NOTE: OLS on a 0/1 target is for demo only; Logit is more appropriate.
# -------------------------------------------------------------------
data = {
    "StudyHours":     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "PrevExamScore":  [30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    "Pass":           [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
}
df = pd.DataFrame(data)

# -------------------------------------------------------------------
# 2) Features/target + STANDARDIZATION
#    Why standardize here?
#    - Makes coefficients comparable (per 1 SD increase).
#    - Improves numerical conditioning of the design matrix.
#    - With an intercept in the model, standardization does NOT change the
#      fit quality or the slope t-statistics/p-values; it only rescales the
#      slopes and changes the intercept (estimate and p-value) via centering.
# -------------------------------------------------------------------
X_raw = df[["StudyHours", "PrevExamScore"]]
y = df["Pass"]

scaler = StandardScaler()
# Each feature gets mean 0 and unit variance (StandardScaler uses the
# population standard deviation, i.e. ddof=0).
X_scaled = scaler.fit_transform(X_raw)
# fit_transform returns a bare array; restore column names and row index
X = pd.DataFrame(X_scaled, columns=X_raw.columns, index=df.index)

# Add intercept AFTER scaling (do NOT scale the constant)
X = sm.add_constant(X, has_constant="add")

# -------------------------------------------------------------------
# 3) Backward elimination by p-values (keep the intercept!)
#    - Each pass refits OLS on the current columns, so no separate
#      initial fit is needed before the loop.
#    - At each step, drop the non-constant feature with the largest p-value
#      if it exceeds the significance threshold.
# -------------------------------------------------------------------
alpha = 0.05  # significance level

while True:
    model = sm.OLS(y, X).fit()

    # Consider only feature p-values (exclude intercept/const from elimination)
    pvals = model.pvalues.drop("const", errors="ignore")

    # No removable features left: `model` is the intercept-only fit
    if pvals.empty:
        break

    worst_feat = pvals.idxmax()
    worst_p = pvals.max()

    # All remaining features are significant: keep the current model
    if not (worst_p > alpha):
        break

    print(f"Removing feature: {worst_feat} (p-value = {worst_p:.4f})")
    X = X.drop(columns=[worst_feat])

# -------------------------------------------------------------------
# 4) Final model summary
# -------------------------------------------------------------------
print(model.summary())

# --------------------------
# Notes:
# - Coefficients now reflect the change in y per 1 standard deviation
#   increase in each feature (since X was standardized).
# - The intercept corresponds to the prediction when features are at
#   their means (which is 0 after standardization).
# - For a binary target like 'Pass', consider statsmodels.Logit for
#   a proper classification model.
# --------------------------


Removing feature: PrevExamScore (p-value = 1.0000)
                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.727
Method:                 Least Squares   F-statistic:                     25.00
Date:                Sun, 12 Oct 2025   Prob (F-statistic):            0.00105
Time:                        09:41:09   Log-Likelihood:               -0.17258
No. Observations:                  10   AIC:                             4.345
Df Residuals:                       8   BIC:                             4.950
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c

## Comparison: OLS with backward elimination — unstandardized vs standardized

### What your two outputs show (essentials)
- **Unstandardized**
  - Dropped: `PrevExamScore` (p ≈ 1.0) **and `const`** (p ≈ 0.114).
  - Remaining term: `StudyHours` with `coef ≈ 0.1039` (per **1 hour**).
  - Reported **R² (uncentered) = 0.831** (no intercept in the model).

- **Standardized**
  - Dropped: `PrevExamScore` (p = 1.0).
  - Kept: `const` (p ≈ 0.000) and `StudyHours`.
  - `const ≈ 0.5`, `StudyHours coef ≈ 0.4352` (per **1 SD** of StudyHours).
  - Reported **R² = 0.758** (with intercept; regular centered R²).

---

### Why the results differ

1) **Intercept handling (const)**
   - Without scaling, backward elimination removed `const` → OLS reports **uncentered R²**, which tends to be **inflated** and is **not comparable** to centered R².
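   - In the standardized run, the elimination loop explicitly excludes `const` from removal, so the intercept is always retained. In addition, because the features are mean-centered (0), the intercept equals the **mean of y** (0.5 for this 0/1 target) and is highly significant, so it would also have survived a p-value test.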

2) **Coefficient meaning (scale vs SD)**
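   - Unstandardized `0.1039` = change in y per **1 hour** of study, in a model with **no intercept** (the fit is forced through the origin because `const` was dropped).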
   - Standardized `0.4352` = change in y per **1 standard deviation** increase in StudyHours.
   - These magnitudes are **not directly comparable** because the units differ.

3) **Strong collinearity drives variable removal**
   - `StudyHours` and `PrevExamScore` are highly correlated (≈ **0.994**).  
     Under OLS + backward elimination, such collinearity often leads to keeping **one** predictor and assigning a huge p-value (~1.0) to the other.

4) **R² comparability**
   - Do **not** compare uncentered R² (no intercept) with centered R² (with intercept).  
     Prefer centered R² for model comparison.

---

### How to interpret / which to use

- If you want **natural-unit effects** (“per 1 hour”), use **unstandardi**
