# 2 - Decision Trees & Random_Forests

# Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV
import numpy as np

# Candidate penalties: 100 values, log-spaced from 1e-4 to 1e2.
alphas = np.logspace(-4, 2, num=100)

# Choose the penalty by 5-fold cross-validation on the training data.
ridge = RidgeCV(alphas=alphas, cv=5)
ridge.fit(X_train, y_train)
print(ridge.alpha_)
print(ridge.coef_)

# Residual sum of squares of the selected model on the test data.
y_pred = ridge.predict(X_test)
residuals = y_test - y_pred
print('{:.6E}'.format(sum(residuals ** 2)))


# Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV
import numpy as np

# Candidate penalties: 100 values, log-spaced from 1e-4 to 1e2.
alphas = np.logspace(-4, 2, num=100)

# Choose the penalty by 5-fold cross-validation on the training data.
lasso = LassoCV(alphas=alphas, cv=5)
lasso.fit(X_train, y_train)
print(lasso.alpha_)
print(lasso.coef_)

# Residual sum of squares of the selected model on the test data.
y_pred = lasso.predict(X_test)
residuals = y_test - y_pred
print('{:.6E}'.format(sum(residuals ** 2)))

## Partial Dependence Plot (PDP)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.inspection import plot_partial_dependence
import matplotlib.pyplot as plt 

# Predictors for which a partial-dependence curve is drawn.
feature_names = [
    'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad', 'P.Undergrad',
    'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD', 'Terminal',
    'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate', 'Private',
]

# Plain least-squares fit on the training data.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Pass X_test instead of X_train to get the PDP for the test set.
# NOTE(review): plot_partial_dependence was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version, or migrate to
# PartialDependenceDisplay.from_estimator.
disp = plot_partial_dependence(lr, X=X_train, features=feature_names)
plt.tight_layout()

# Widen the vertical gap between subplots so the feature names stay readable.
plt.subplots_adjust(
    left=0.125, right=0.9,
    bottom=0.1, top=0.9,
    wspace=0.2, hspace=0.8,
)
plt.show()

## Best subset selection 
Systematically try out all model combinations.

In [None]:
import operator
import time
import itertools
import statsmodels.api as sm
import pandas as pd

def processSubset(feature_set, X, y):
    """Fit an OLS model on the chosen columns of X and report its fit scores.

    Parameters
    ----------
    feature_set : iterable
        Column labels of ``X`` to use as regressors.
    X : table indexable by a list of column labels
        Presumably a pandas DataFrame -- TODO confirm against the caller.
    y : response values, passed to ``sm.OLS`` as the dependent variable.

    Returns
    -------
    dict
        ``'model'``: the fitted results object, ``'RSS'``: its ``ssr``,
        ``'AIC'``: its ``aic``.
    """
    # The design matrix is exactly the selected columns; no constant is added here.
    columns = list(feature_set)
    fitted = sm.OLS(y, X[columns]).fit()
    return {'model': fitted, 'RSS': fitted.ssr, 'AIC': fitted.aic}

def getBest(X, y, k):
    """Return the lowest-AIC model among all k-column subsets of X.

    Every combination of ``k`` columns of ``X`` is fitted with
    ``processSubset``; the results are collected into a DataFrame and the
    row with the smallest ``'AIC'`` is returned. A one-line timing summary
    is printed as a side effect.

    Parameters
    ----------
    X : table exposing ``.columns`` (presumably a pandas DataFrame -- TODO confirm)
    y : response values forwarded unchanged to ``processSubset``.
    k : int
        Number of predictors in each candidate model.

    Returns
    -------
    pandas.Series
        The winning row, with entries ``'model'``, ``'RSS'`` and ``'AIC'``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    tic = time.perf_counter()

    # One fitted model per k-sized combination of columns.
    results = [
        processSubset(combo, X, y)
        for combo in itertools.combinations(X.columns, k)
    ]

    # Wrap everything up in a dataframe (one row per candidate model).
    models = pd.DataFrame(results)

    # Choose the model with the lowest AIC. idxmin() yields an index *label*,
    # which is what the label-based .loc expects (argmin() is positional).
    best_model = models.loc[models['AIC'].idxmin()]

    toc = time.perf_counter()
    print(
        "Processed", models.shape[0], "models on", k,
        "predictors in", (toc-tic), "seconds."
        )

    # Return the best model, along with its RSS and AIC.
    return best_model

# One row per subset size, holding the winning model of that size and its scores.
models_best = pd.DataFrame(columns=['RSS', 'AIC', 'model'])

start = time.time()
for i in range(1, 18):
    # Search every i-predictor subset of the training columns.
    best_of_size = getBest(X_train, y_train, i)
    models_best.loc[i] = best_of_size
end = time.time()

elapsed = end - start
print("Total elapsed time:", elapsed, "seconds.")
print(models_best)


# Take the winning 11-predictor model from the best-subset table.
best_model_res = models_best.loc[11, 'model']

# Read the regressors from the fitted model itself, in the order it was fit
# on, rather than from a hand-maintained list that can drift from the subset
# (or column order) the search actually selected.
# NOTE(review): assumes predict() consumes the columns positionally, so the
# order must match the fit -- confirm against the statsmodels docs.
selected_features = list(best_model_res.model.exog_names)
y_pred = best_model_res.predict(X_test[selected_features])

# Residual sum of squares on the test data.
print('{:.6E}'.format(sum((y_test - y_pred) ** 2)))
