In [None]:
from itertools import combinations
import statsmodels.api as sm
import pandas as pd
import seaborn as sns

Here we apply the best subset selection approach to the Hitters data. We wish to predict a baseball player’s Salary on the basis of various statistics associated with performance in the previous year. First of all, we note that the Salary variable is missing for some of the players. The is.na() function can be used to identify the missing observations. It returns a vector of the same length as the input vector, with a TRUE for any elements that are missing, and a FALSE for non-missing elements. The sum() function can then be used to count all of the missing elements. (In pandas, the equivalents used below are `Series.isna()` and `.sum()`.)

In [None]:
# Load the ISLR Hitters data and one-hot encode its categorical columns
# (drop_first=True drops the first level of each encoded column).
# dtype=int pins the indicator columns to a numeric dtype: pandas >= 2.0
# defaults to bool, and a frame mixing bool with numeric columns is converted
# to an object array, which statsmodels' OLS refuses to fit.
df = (
    sm.datasets.get_rdataset("Hitters", "ISLR", cache=True)
    .data
    .pipe(
        pd.get_dummies,
        columns=["League", "Division", "NewLeague"],
        drop_first=True,
        dtype=int,
    )
)

In [None]:
# Peek at the first five rows of the loaded frame.
df.head(5)

In [None]:
# Count the rows whose Salary is missing.
df["Salary"].isna().sum()

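Hence we see that Salary is missing for 59 players. The na.omit() function removes all of the rows that have missing values in any variable. In pandas, `dropna()` plays the same role; below, `dropna(subset=["Salary"])` drops the rows whose Salary is missing.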

In [None]:
# Keep only the rows that have a recorded Salary.
df = df[df["Salary"].notna()]

In [None]:
# Re-count missing Salary values after dropping the incomplete rows.
df["Salary"].isna().sum()

The regsubsets() function (part of the leaps library) performs best subset selection by identifying the best model that contains a given number of predictors, where best is quantified using RSS. The syntax is the same as for lm(). The summary() command outputs the best set of variables for each model size.

Fun times: it doesn't look like Python has an equivalent library, so I guess I'm coding this by hand.

In [None]:
# Split the frame into the predictor matrix and the response (Salary).
X = df.drop(columns="Salary")
y = df["Salary"]

In [None]:
# It's too slow to do all the way up to 8, let's just do it for 3. I'll get the point
def modrsquared(coltuple):
    """Return the R^2 of an OLS fit of ``y`` on the given columns of ``X``.

    ``coltuple`` is an iterable of column names from the notebook-level
    predictor frame ``X``. An intercept column is added before the model is
    fitted against the notebook-level response ``y``.
    """
    design = sm.add_constant(X[list(coltuple)])
    fitted = sm.OLS(y, design).fit()
    return fitted.rsquared

# For each model size, score every combination of predictors and keep the
# column tuple with the highest R^2.
models = {}
for n_predictors in range(1, 4):
    r2_by_subset = {
        subset: modrsquared(subset)
        for subset in combinations(X.columns, n_predictors)
    }
    models[n_predictors] = max(r2_by_subset, key=r2_by_subset.get)
models

The summary() function also returns $R^2$, RSS, adjusted $R^2$, $C_p$, and BIC. We can examine these to try to select the best overall model. For instance, we see that the $R^2$ statistic increases from 32%, when only one variable is included in the model, to almost 55%, when all variables are included. As expected, the $R^2$ statistic increases monotonically as more variables are included. Plotting RSS, adjusted $R^2$, $C_p$, and BIC for all of the models at once will help us decide which model to select. Note the type="l" option tells R to connect the plotted points with lines.

In [None]:
# Statsmodels has AIC but not C_p and since they're equivalent for OLS I'll just use AIC
# NOTE: this rebinds ``df`` from the Hitters data to the per-size summary table.
summary_rows = []
for i in models.keys():
    lm = sm.OLS(y, sm.add_constant(X[list(models[i])])).fit()
    summary_rows.append(
        {
            "R_square": lm.rsquared,
            "adj_R_square": lm.rsquared_adj,
            # ssr is the residual sum of squares; mse_resid would be the RSS
            # divided by the residual degrees of freedom, not the RSS itself.
            "RSS": lm.ssr,
            "AIC": lm.aic,
            "BIC": lm.bic,
        }
    )
# Build the table once from the collected rows, indexed by model size.
df = pd.DataFrame(summary_rows, index=list(models.keys()))
df

In [None]:
# Reshape to long form (one row per model size and statistic) so that every
# statistic is drawn in its own facet with an independent y-axis.
cdf = df.reset_index().melt(id_vars="index")
sns.relplot(
    data=cdf,
    x="index",
    y="value",
    col="variable",
    kind="line",
    facet_kws={"sharey": False},
);

## 6.5.2 Forward and Backward Stepwise Selection
We can also use the ```regsubsets()``` function to perform forward stepwise or backward stepwise selection, using the argument ```method="forward"``` or ```method="backward"```.

Sweet, we don't have this in Python either.
I'll base my implementation on [this](https://planspace.org/20150423-forward_selection_with_statsmodels/)

In [None]:
def forward_selected(x, y, maxvars):
    """Linear model designed by forward selection.

    Starting with no predictors, repeatedly add the single remaining column
    of ``x`` whose inclusion gives the highest adjusted R^2, recording the
    selected columns after every addition. Selection stops when ``maxvars``
    columns have been chosen, when no columns remain, or when the best
    candidate does not improve the adjusted R^2.

    Parameters:
    -----------
    x: DataFrame, potential exogenous variables
    y: Series, variable to predict
    maxvars: int, maximum number of variables to select

    Returns:
    --------
    dict mapping the number of selected variables to the list of selected
    column names, in the order they were added
    """
    remaining = set(x.columns)
    selected = []
    models = {}
    # 0.0 is the adjusted R^2 of the intercept-only model, so the first
    # variable is only accepted if it beats that baseline.
    current_score = 0.0
    # Strict "<" so that at most ``maxvars`` variables are ever selected.
    while remaining and len(selected) < maxvars:
        scores_with_candidates = [
            (
                sm.OLS(y, sm.add_constant(x[selected + [candidate]])).fit().rsquared_adj,
                candidate,
            )
            for candidate in remaining
        ]
        # Tuples compare by score first, then by column name on ties.
        best_new_score, best_candidate = max(scores_with_candidates)
        # Stop on no strict improvement (ties included) so the loop always ends.
        if not best_new_score > current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
        models[len(selected)] = selected[:]
    return models

In [None]:
# Run forward selection over every predictor.
# NOTE(review): 20 presumably exceeds the number of columns in X so the cap never binds; confirm.
forward_models = forward_selected(x=X, y=y, maxvars=20)


In [None]:
def backward_selected(x, y):
    """Linear model designed by backward selection.

    Starting from all columns of ``x``, repeatedly remove the single column
    whose removal leaves the model with the highest adjusted R^2 (i.e. the
    least useful predictor), recording the surviving columns after every
    removal until one column is left. The full model itself is not recorded.

    Parameters:
    -----------
    x: DataFrame, potential exogenous variables
    y: Series, variable to predict

    Returns:
    --------
    dict mapping the number of remaining variables to the list of remaining
    column names
    """
    selected = list(x.columns)
    models = {}
    while len(selected) > 1:
        scores_with_candidates = []
        for candidate in selected:
            # Score the model that results from leaving ``candidate`` out.
            kept = [col for col in selected if col != candidate]
            score = sm.OLS(y, sm.add_constant(x[kept])).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        # The highest-scoring reduced model identifies the column that is
        # cheapest to drop; taking the lowest score would discard the most
        # useful predictor instead.
        _, least_useful = max(scores_with_candidates)
        selected.remove(least_useful)
        models[len(selected)] = selected[:]
    return models

In [None]:
# Run backward selection starting from every predictor in X.
backward_models = backward_selected(x=X, y=y)


In [None]:
# Show the column list that backward_selected recorded for each model size.
backward_models

In [None]:
# The one-variable model recorded by backward_selected.
backward_models[1]

In [None]:
# The one-variable model recorded by forward_selected, for comparison.
forward_models[1]