In [54]:
#### import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score
import itertools
import time
import statsmodels.formula.api as sm

train = pd.read_csv('ncaa_train.csv')
test = pd.read_csv('ncaa_test.csv')
train.head()

Unnamed: 0,Player,GP,GS,MIN,PTS,FGM,FGA,FGP,Three_Pmade,Three_PA,...,FTP,OFF,DEF,TRB,AST,STL,BLK,TOV,PF,NBA_PTS
0,Cade Cunningham,27,26,35.4,20.15,6.48,14.81,0.44,2.3,5.74,...,0.85,0.7,5.48,6.19,3.48,1.59,0.78,4.04,2.48,17.41
1,Jalen Suggs,30,30,28.9,14.37,5.17,10.27,0.5,1.17,3.47,...,0.76,0.6,4.73,5.33,4.53,1.9,0.33,2.93,2.6,11.75
2,Franz Wagner,28,26,31.8,12.46,4.43,9.29,0.48,1.25,3.64,...,0.84,0.71,5.82,6.54,3.0,1.25,1.04,1.29,2.21,15.15
3,Davion Mitchell,30,30,32.4,14.0,5.27,10.3,0.51,2.1,4.7,...,0.64,0.47,2.2,2.67,5.5,1.9,0.37,2.43,2.37,11.48
4,James Bouknight,15,14,31.6,18.73,6.53,14.6,0.45,1.47,5.0,...,0.78,1.53,4.13,5.67,1.8,1.13,0.27,2.8,1.87,4.58


In [55]:
y = np.log(train.NBA_PTS)
X = train.iloc[:,1:21]; 

#Defining a scaler object
scaler = StandardScaler()

#The scaler object will contain the mean and variance of each column (predictor) of X. 
#These values will be useful to scale test data based on the same mean and variance as obtained on train data
scaler.fit(X)

#Using the scaler object (or the values of mean and variance stored in it) to standardize X (or train data)
Xstd = scaler.transform(X)

alphas = 10**np.linspace(10,-2,200)*0.5

In [56]:
coefs = []
for a in alphas:        
    ridge = Ridge(alpha = a)
    ridge.fit(Xstd, y)
    coefs.append(ridge.coef_)
    
alphas = 10**np.linspace(1.5,-3,200)*0.5
ridgecv = RidgeCV(alphas = alphas,store_cv_values=True)
ridgecv.fit(Xstd, y)

#Optimal value of the tuning parameter - lambda
ridgecv.alpha_

  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)


15.811388300841896

In [57]:
Xtest = test.iloc[:,1:21]

#Standardizing test data
Xtest_std = scaler.transform(Xtest)

#Using the developed ridge regression model to predict on test data
ridge = Ridge(alpha = ridgecv.alpha_)
ridge.fit(Xstd, y)
pred=ridge.predict(Xtest_std)

#RMSE on test data
np.sqrt(((np.exp(pred)-test.NBA_PTS)**2).mean())

5.151579825810686

In [58]:
# find the ridge regression estimates for  𝜆=10−10
#almost zero regularization). Treat these estimates as OLS estimates and find the predictors for which these estimates have shrunk the most as compared to the model developed in E.2.3.

old = ridge.coef_

X_2test = test.iloc[:,1:21]

#Standardizing test data
X_2test_std = scaler.transform(X_2test)

#Using the developed ridge regression model to predict on test data
ridge_2 = Ridge(alpha = 10**-10)
ridge_2.fit(Xstd, y)
pred_s=ridge_2.predict(X_2test_std)

new = ridge_2.coef_

array1 = abs(new - old)
array1.argmax()

idx = (-array1).argsort()[:4]

for i in idx:
    print(test.columns[i])


MIN
PTS
Three_PP
FGP


In [None]:
X = train[['GP','GS','MIN','PTS','FGM','FGA','FGP','Three_Pmade','Three_PA','Three_PP','FTM','FTA','FTP','OFF','DEF','TRB','AST','STL','BLK','TOV','PF']]

def processSubset(predictor_subset):
    model = sm.ols('np.log(NBA_PTS)~' + '+'.join(predictor_subset),data = train).fit()
    Rsquared = model.rsquared
    return {"model":model, "Rsquared":Rsquared}

def getBest_model(k):
    tic = time.time()
    results = []
    for combo in itertools.combinations(X.columns, k):
        results.append(processSubset((list(combo))))

    models = pd.DataFrame(results)

    best_model = models.loc[models['Rsquared'].argmax()]
    
    toc = time.time()
#     print("Processed", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")
    return best_model

models_best = pd.DataFrame(columns=["Rsquared", "model"])

tic = time.time()
for i in range(1,1+X.shape[1]):
    models_best.loc[i] = getBest_model(i)

toc = time.time()

def best_sub_plots():
    plt.figure(figsize=(20,10))
    plt.rcParams.update({'font.size': 18, 'lines.markersize': 10})

   
    plt.subplot(2, 2, 1)

 
    plt.plot(models_best["Rsquared"])
    plt.xlabel('# Predictors')
    plt.ylabel('Rsquared')

    rsquared_adj = models_best.apply(lambda row: row[1].rsquared_adj, axis=1)

    plt.subplot(2, 2, 2)
    plt.plot(rsquared_adj)
    plt.plot(1+rsquared_adj.argmax(), rsquared_adj.max(), "or")
    plt.xlabel('# Predictors')
    plt.ylabel('adjusted rsquared')

    aic = models_best.apply(lambda row: row[1].aic, axis=1)

    plt.subplot(2, 2, 3)
    plt.plot(aic)
    plt.plot(1+aic.argmin(), aic.min(), "or")
    plt.xlabel('# Predictors')
    plt.ylabel('AIC')

    bic = models_best.apply(lambda row: row[1].bic, axis=1)

    plt.subplot(2, 2, 4)
    plt.plot(bic)
    plt.plot(1+bic.argmin(), bic.min(), "or")
    plt.xlabel('# Predictors')
    plt.ylabel('BIC')
best_sub_plots()

best_subset_model = models_best.loc[4,'model']
models_best.loc[4,'model'].summary()