In [7]:
### IMPORTS

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

In [8]:
### FUNCTIONS

## Min-Max Normalization
# X: array to be normalized
def normalize(X):
    """Min-max scale ``X`` along axis 0.

    Parameters
    ----------
    X : array-like
        Values to scale. The minimum and maximum are taken along axis 0,
        i.e. separately for every column of a 2-D array.

    Returns
    -------
    X_norm
        ``(X - min) / (max - min)`` computed column by column. A column
        whose minimum equals its maximum is mapped to 0 rather than NaN.
    """
    col_min = np.min(X, axis=0)
    col_range = np.max(X, axis=0) - col_min
    # A constant column has a range of zero; dividing by it would produce
    # 0/0 = NaN (and a RuntimeWarning). Divide by 1 instead so that such a
    # column simply becomes all zeros.
    safe_range = np.where(col_range == 0, 1, col_range)
    X_norm = (X - col_min) / safe_range
    return X_norm

In [9]:
### PREPROCESSING I

# load the raw table from disk
# NOTE(review): absolute, user-specific path - this cell only runs on a
# machine where this exact file exists
data = pd.read_csv(r"C:\Users\kingh\Downloads\Concrete_Data.csv")

# features = first eight columns; rows 501..630 are held out as the test
# set, the rows before and after them are stacked into the training set
X = data.iloc[:, :8].to_numpy()
X_test = X[501:631]
X_train = np.vstack([X[:501], X[631:]])

# target = last column, reshaped to a single-column 2-D array and split on
# the same row boundaries as the features
y = data.iloc[:, -1].to_numpy().reshape(-1, 1)
y_test = y[501:631]
y_train = np.vstack([y[:501], y[631:]])

In [10]:
### PREPROCESSING II

# create normalized X
X_train_norm = normalize(X_train)

# Scale the test rows with the TRAINING minimum and range. Normalizing the
# test set by its own min/max would put train and test features on different
# scales, so coefficients fitted on X_train_norm would not apply to it.
train_min = np.min(X_train, axis=0)
train_range = np.max(X_train, axis=0) - train_min
# a feature that is constant in the training data has zero range; divide by 1
# there instead of producing inf/NaN
train_range = np.where(train_range == 0, 1, train_range)
X_test_norm = (X_test - train_min) / train_range

In [11]:
# statsmodels' OLS does not add an intercept on its own, so the constant
# column is attached to the design matrix explicitly
X_train_sm = sm.add_constant(X_train)

# ordinary least squares of the training target on the raw features
model = sm.OLS(endog=y_train, exog=X_train_sm).fit()

# full regression report
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.624
Model:                            OLS   Adj. R-squared:                  0.621
Method:                 Least Squares   F-statistic:                     184.9
Date:                Thu, 02 Oct 2025   Prob (F-statistic):          1.80e-183
Time:                        18:50:20   Log-Likelihood:                -3367.7
No. Observations:                 900   AIC:                             6753.
Df Residuals:                     891   BIC:                             6797.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -52.0353     28.768     -1.809      0.0

In [12]:
# the test design matrix gets the constant column added the same way as the
# training one
X_test_sm = sm.add_constant(X_test)

# --- training split: predictions, MSE, and 1 - MSE/variance ---
y_pred_train = model.predict(X_train_sm)
train_MSE = mean_squared_error(y_train, y_pred_train)
var_train = np.var(y_train)
train_VE = 1 - train_MSE / var_train

# --- test split: same quantities on the held-out rows ---
y_pred_test = model.predict(X_test_sm)
test_MSE = mean_squared_error(y_test, y_pred_test)
var_test = np.var(y_test)
test_VE = 1 - test_MSE / var_test

# report variance explained (VE) and mean squared error for both splits
print("Test VE:", test_VE)
print("Train VE:", train_VE)
print("Test MSE:", test_MSE)
print("Train MSE:", train_MSE)

Test VE: 0.35125143823506577
Train VE: 0.6241001089604974
Test MSE: 141.25799330049725
Train MSE: 104.1544639349233


In [14]:
# p-values of the coefficients of whichever fit `model` currently refers to
p_vals = model.pvalues
print(p_vals)

[7.08196536e-02 6.97096932e-41 4.14518989e-26 1.89457127e-13
 1.91841837e-03 2.46111666e-01 5.44324111e-03 2.73812239e-03
 9.51621857e-74]


In [16]:
# design matrix built from the min-max normalized training features, with the
# constant column attached (statsmodels' OLS does not add an intercept itself)
# NOTE(review): X_train_sm, model and p_vals are rebound here, replacing the
# objects earlier cells created under the same names
X_train_sm = sm.add_constant(X_train_norm)

# refit ordinary least squares on the normalized features
model = sm.OLS(endog=y_train, exog=X_train_sm).fit()

# only the coefficient p-values are shown for this fit, not the full summary
p_vals = model.pvalues
print(p_vals)

[2.06573362e-01 6.97096932e-41 4.14518989e-26 1.89457127e-13
 1.91841837e-03 2.46111666e-01 5.44324111e-03 2.73812239e-03
 9.51621857e-74]


In [19]:
# log-transform the raw training features; log1p(x) evaluates log(1 + x)
# directly, so zero entries map to 0 without forming the intermediate X + 1
X_train_log = np.log1p(X_train)

# add intercept term (statsmodels doesn't add it automatically)
X_train_sm = sm.add_constant(X_train_log)

# fit OLS on the log-transformed features
model = sm.OLS(y_train, X_train_sm).fit()

# p-values of the fitted coefficients (not the full summary)
p_vals = model.pvalues
print(p_vals)

[2.93770453e-001 1.29860458e-086 4.51879000e-036 2.60534687e-001
 1.86258617e-020 1.03969702e-005 4.07894287e-001 9.47686639e-002
 1.74699065e-192]
