In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import math
import scipy.stats
import bisect

In [93]:
r""" forward_selection_v2 differs from v1 in several ways.
1. It is assumed that X contains a constant column in position 0; that column is always included in the model.
2. It selects the next regressor by comparing ssr values of the models with the
same number of regressors. The one with the smallest ssr value is selected for the next step
3. Once the next regressor is selected, if the p-value for the t-statistic is smaller than alpha, then the regressor is
added to the model. 
4. The loop ends when the best remaining regressor does not meet the condition outlined in (3), or when no regressors
remain. This means that the returned result may contain only the constant
5. It returns results instead of a model"""

def forward_section_v2(Y, X, alpha):
    """Forward stepwise selection of OLS regressors.

    Column 0 of ``X`` is always included (it is assumed to be the constant).
    At every step each remaining column is tried as the next regressor and
    the one whose model has the smallest ``ssr`` is the candidate. The
    candidate is kept only if the p-value of its coefficient (the last
    coefficient of that model) is strictly smaller than ``alpha``; otherwise
    the search stops.

    Note: the name reads like a typo of "forward_selection_v2"; it is kept
    unchanged so existing callers keep working.

    Parameters
    ----------
    Y : array-like
        Response passed unchanged to ``sm.OLS`` as ``endog``.
    X : array-like, 2-D
        Design matrix. It is converted with ``np.asarray`` so that a pandas
        DataFrame can be passed as well as an ndarray; columns are addressed
        by position.
    alpha : float
        Significance threshold for accepting the candidate regressor.

    Returns
    -------
    included_regressors : list of int
        Column indices of ``X`` in the selected model, in the order in which
        they were added (always starts with 0).
    final_results : fitted ``sm.OLS`` results
        Results of the last accepted model; the constant-only fit when no
        regressor was accepted.
    """
    # Positional column selection below needs an ndarray; this is a no-op
    # for ndarray input and makes DataFrame input work too.
    X = np.asarray(X)

    # the constant is already included
    included_regressors = [0]
    # columns that have not been added to the model yet
    remaining_regressors = list(range(1, np.shape(X)[1]))

    # start from the model that contains just the constant
    final_results = sm.OLS(Y, X[:, included_regressors]).fit()

    while remaining_regressors:
        # Among all models with one more regressor, find the smallest ssr.
        # Strict "<" keeps the earliest candidate on ties.
        best_regressor = None
        best_candidate_results = None
        for candidate in remaining_regressors:
            candidate_results = sm.OLS(Y, X[:, included_regressors + [candidate]]).fit()
            if (best_candidate_results is None
                    or candidate_results.ssr < best_candidate_results.ssr):
                best_regressor = candidate
                best_candidate_results = candidate_results

        # The candidate is the last column of its model, so its p-value is the
        # last entry. np.asarray makes the lookup positional even when the
        # results object exposes p-values as a labelled pandas Series.
        candidate_pvalue = np.asarray(best_candidate_results.pvalues)[-1]

        # Stop at the first candidate that does not meet the threshold.
        if not candidate_pvalue < alpha:
            break

        included_regressors.append(best_regressor)
        remaining_regressors.remove(best_regressor)
        final_results = best_candidate_results

    return included_regressors, final_results
    
    

In [98]:
# Read Crime_R.xlsx and convert it to a plain array for positional slicing
df = pd.read_excel('Crime_R.xlsx')
df_n = df.to_numpy()
# Column 0 is used as the response; the list index keeps it 2-D
Y = df_n[:, [0]]
# Every other column is a candidate regressor; add a column of 1's to the
# front to account for the constant
X = sm.add_constant(df_n[:, 1:])


In [100]:
import time

# Longley data set with a constant added; only used by the alternative
# (commented-out) call below
data = sm.datasets.longley.load()
data.exog = sm.add_constant(data.exog)

# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter()
#regressors, my_results= forward_section_v2(data.endog, data.exog, 0.05)
regressors, my_results = forward_section_v2(Y, X, 0.05)
end = time.perf_counter()


# elapsed time of the selection run, in seconds
print(end - start)

0.020906925201416016


In [101]:
# Print the summary of the results returned by forward_section_v2
print(my_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     3700.
Date:                Mon, 08 Nov 2021   Prob (F-statistic):           9.45e-50
Time:                        17:09:12   Log-Likelihood:                -103.69
No. Observations:                  47   AIC:                             213.4
Df Residuals:                      44   BIC:                             218.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         15.2343      5.052      3.016      0.0