In [1]:
### IMPORTS

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

In [2]:
### FUNCTIONS

### Min-Max Normalization
## INPUTS
# X: array to be normalized (statistics are taken along axis 0)
## OUTPUTS
# X_norm: normalized array, (X - min) / (max - min) per axis-0 slice;
#         slices whose max equals their min come out as all zeros
def normalize(X):
    """Min-max scale ``X`` using the minimum and maximum taken along axis 0.

    Parameters
    ----------
    X : array-like
        Values to rescale. For a 2-D array every column is scaled
        independently with its own minimum and maximum.

    Returns
    -------
    X_norm : same shape as ``X``
        ``(X - min) / (max - min)``. Where ``max == min`` (a constant
        column) the result is 0 rather than the NaN that 0/0 would produce.
    """
    col_min = np.min(X, axis=0)
    col_range = np.max(X, axis=0) - col_min
    # A constant column has zero range; dividing by it gives 0/0 = NaN, which
    # would then poison any model fitted on the result. The numerator is
    # already 0 there, so dividing by 1 instead yields a clean 0.
    safe_range = np.where(col_range == 0, 1, col_range)
    X_norm = (X - col_min) / safe_range
    return X_norm

### Log Transform
## INPUTS
# X: array to be transformed
## OUTPUTS
# X_log: log-transformed array, log(1 + X) element-wise
def logtransform(X):
    """Return the element-wise natural logarithm of ``1 + X``.

    Parameters
    ----------
    X : array-like
        Values to transform.

    Returns
    -------
    X_log : same shape as ``X``
        ``log(1 + X)`` for every element.
    """
    # np.log1p computes log(1 + X) without forming 1 + X first. Forming the
    # sum explicitly rounds it to exactly 1.0 for very small X, which would
    # make the result 0 instead of approximately X.
    X_log = np.log1p(X)
    return X_log

### Training + Analysis of Models
## INPUTS
# X_train: training data set
# X_test: testing data set
# full_results: boolean, displays all results if True
# y_train: (optional) training targets; defaults to the notebook-level y_train
# y_test: (optional) testing targets; defaults to the notebook-level y_test
## OUTPUTS
# p_values: t-test p-values of model
def trainanalyze(X_train, X_test, full_results=False, y_train=None, y_test=None):
    """Fit an OLS model with intercept and report fit quality and p-values.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for held-out evaluation.
    full_results : bool, default False
        When True, print train/test explained variance, train/test MSE,
        the p-values and the statsmodels summary.
    y_train, y_test : array-like, optional
        Targets matching ``X_train`` / ``X_test``. When omitted, the
        notebook-level variables of the same name are used, which keeps
        existing two-argument calls working.

    Returns
    -------
    p_values
        ``model.pvalues`` of the fitted model. The constant column is
        prepended, so the first entry belongs to the intercept.

    Raises
    ------
    NameError
        If a target is neither passed in nor defined at notebook level.
    """
    # Prefer explicitly passed targets; only fall back to notebook globals so
    # that the result does not silently depend on hidden kernel state.
    notebook_ns = globals()
    try:
        if y_train is None:
            y_train = notebook_ns["y_train"]
        if y_test is None:
            y_test = notebook_ns["y_test"]
    except KeyError as missing:
        raise NameError(
            "name %r is not defined; pass it explicitly or define it "
            "before calling trainanalyze" % missing.args[0]
        ) from None

    # add intercept term (statsmodels doesn't add it automatically).
    # has_constant="add" forces the column to be added even when the data
    # already contains a constant column; with the default ("skip") a split
    # that happens to hold a constant feature would get no intercept column,
    # leaving the train and test designs with different widths.
    X_train_sm = sm.add_constant(X_train, prepend=True, has_constant="add")
    X_test_sm = sm.add_constant(X_test, prepend=True, has_constant="add")

    # fit model
    model = sm.OLS(y_train, X_train_sm).fit()

    # make predictions
    y_pred_train = model.predict(X_train_sm)
    y_pred_test = model.predict(X_test_sm)

    # evaluate performance metrics
    train_MSE = mean_squared_error(y_train, y_pred_train)
    test_MSE = mean_squared_error(y_test, y_pred_test)
    # explained variance: 1 - MSE / variance of the corresponding targets
    train_VE = 1 - (train_MSE / np.var(y_train))
    test_VE = 1 - (test_MSE / np.var(y_test))

    # t-statistics analysis
    p_values = model.pvalues

    # display performance metrics, p-values, and results summary
    if full_results:
        print("Test VE:", test_VE)
        print("Train VE:", train_VE)
        print("Test MSE:", test_MSE)
        print("Train MSE:", train_MSE)
        print('P-Values:', p_values)
        print(model.summary())
    return p_values

In [3]:
### PREPROCESSING I

# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this file exists; consider a path relative to a data directory.
DATA_PATH = r"C:\Users\kingh\Downloads\Concrete_Data.csv"
# the first N_FEATURES columns are used as features
N_FEATURES = 8
# rows TEST_START .. TEST_STOP-1 are held out for testing; all others train
TEST_START, TEST_STOP = 501, 631

# read data set
data = pd.read_csv(DATA_PATH)
predictors = data.columns.to_numpy()

# extract feature variables and split them into test / train rows
X = data.iloc[:, :N_FEATURES].to_numpy()
X_test = X[TEST_START:TEST_STOP, :]
X_train = np.concatenate((X[:TEST_START, :], X[TEST_STOP:, :]), axis=0)

# extract target variable (last column, kept as a column vector) and apply
# the same row split as for the features
y = data.iloc[:, -1].to_numpy()[:, np.newaxis]
y_test = y[TEST_START:TEST_STOP]
y_train = np.concatenate((y[:TEST_START], y[TEST_STOP:]), axis=0)

In [4]:
### PREPROCESSING II

# create normalized X
# The test set must be scaled with the TRAINING minimum and range, not its
# own: the model is fitted on train-scaled features, so scaling the test set
# with different statistics would feed it features on a different scale and
# distort the test metrics.
X_train_norm = normalize(X_train)
train_min = np.min(X_train, axis=0)
train_range = np.max(X_train, axis=0) - train_min
# avoid 0/0 for features that are constant in the training set
train_range_safe = np.where(train_range == 0, 1, train_range)
X_test_norm = (X_test - train_min) / train_range_safe

# create log-transformed X (element-wise, so no shared statistics are needed)
X_train_log = logtransform(X_train)
X_test_log = logtransform(X_test)

In [5]:
### TRAINING + ANALYSIS FOR TRANSFORMED+RAW DATA
# matching train/test pairs, in the order: raw, normalized, log-transformed
X_trains = [X_train, X_train_norm, X_train_log]
X_tests = [X_test, X_test_norm, X_test_log]

# fit one model per pair and collect its p-values
P_Values = [
    trainanalyze(train_set, test_set)
    for train_set, test_set in zip(X_trains, X_tests)
]

# omit p-values for b (intercept) term by dropping the first entry of each
# model, then transpose so each column holds one model's p-values
P_Values = np.asarray(P_Values)[:, 1:].T

In [6]:
### EXPORT RESULTS TO CSV
# one column per fitted model, labelled in the order the models were trained
PVALUE_COLUMNS = ['Raw', 'Normalized', 'Log-Transformed']
P_Values_df = pd.DataFrame(data=P_Values, columns=PVALUE_COLUMNS)
# written relative to the current working directory
P_Values_df.to_csv(path_or_buf="pvalues.csv")