In [1]:
from ISLP import load_data
df = load_data("College")
df.columns

Index(['Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
       'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
       'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend',
       'Grad.Rate'],
      dtype='object')

In [2]:
df

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,No,2197,1515,543,4,26,3089,2029,6797,3900,500,1200,60,60,21.0,14,4469,40
773,Yes,1959,1805,695,24,47,2849,1107,11520,4960,600,1250,73,75,13.3,31,9189,83
774,Yes,2097,1915,695,34,61,2793,166,6900,4200,617,781,67,75,14.4,20,8323,49
775,Yes,10705,2453,1317,95,99,5217,83,19840,6510,630,2115,96,96,5.8,49,40386,99


In [3]:
df["Private"] = df.Private.apply(lambda x: 1 if x=="Yes" else 0)
df["Accept.Rate"] = df["Accept"] / df["Apps"]
data = df.drop(columns=["Accept", "Apps"])

In [4]:
data

Unnamed: 0,Private,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate,Accept.Rate
0,1,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60,0.742169
1,1,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56,0.880146
2,1,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54,0.768207
3,1,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59,0.836930
4,1,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15,0.756477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,0,543,4,26,3089,2029,6797,3900,500,1200,60,60,21.0,14,4469,40,0.689577
773,1,695,24,47,2849,1107,11520,4960,600,1250,73,75,13.3,31,9189,83,0.921388
774,1,695,34,61,2793,166,6900,4200,617,781,67,75,14.4,20,8323,49,0.913209
775,1,1317,95,99,5217,83,19840,6510,630,2115,96,96,5.8,49,40386,99,0.229145


In [5]:
X_cols = ['Private', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad',
       'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD',
       'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate']

# a) Train/test Split

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data[X_cols], data["Accept.Rate"], test_size=0.2, random_state=42)

# b) Fit a regression tree to the training set. Plot the tree and interpret the results. What test MSE do you obtain?

In [7]:
from sklearn.tree import DecisionTreeRegressor, export_graphviz
tree = DecisionTreeRegressor()
tree.fit(x_train, y_train)

In [8]:
export_graphviz(tree, out_file="graph.dot", feature_names=X_cols, rounded=True, filled=True)

In [9]:
!dot -Tpng graph.dot -o tree.png

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.789148 to fit


In [10]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, tree.predict(x_test))

0.024081509749819124

# c) Cross - validation

In [11]:
from sklearn.model_selection import GridSearchCV
import numpy as np
param_grid = {
    'max_depth': np.arange(1, 20),
    'min_samples_split': np.arange(2, 20),
    'min_samples_leaf': np.arange(1, 20)
}

tree = DecisionTreeRegressor()
tree_cv = GridSearchCV(tree, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
tree_cv.fit(x_train, y_train)

# Print the optimal parameters
print(tree_cv.best_params_)

{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 19}


In [12]:
y_pred = tree_cv.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse}")

Test MSE: 0.019901016088138403


In [13]:
best_tree = tree_cv.best_estimator_
export_graphviz(best_tree, out_file="graph2.dot", feature_names=X_cols, rounded=True, filled=True)

In [14]:
!dot -Tpng graph2.dot -o tree2.png

# d) Bagging Classifier

In [15]:
from sklearn.ensemble import BaggingRegressor

bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=100, random_state=42)
bag_reg.fit(x_train, y_train)
y_pred = bag_reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse}")
all_imp = np.array([tree.feature_importances_ for tree in bag_reg.estimators_])
mean_imp = all_imp.mean(axis=0)

Test MSE: 0.015801828443889276


In [16]:
bag_reg.fit(x_train, y_train)

In [17]:
y_pred = bag_reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse}")

Test MSE: 0.015801828443889276


In [18]:
all_imp = np.array([tree.feature_importances_ for tree in bag_reg.estimators_])
mean_imp = all_imp.mean(axis=0)

In [19]:
mean_imp

array([0.00269038, 0.02923165, 0.16971266, 0.19493624, 0.07055561,
       0.03993146, 0.09586227, 0.0666787 , 0.04455362, 0.02926273,
       0.02504309, 0.02563531, 0.03316534, 0.02598107, 0.03432809,
       0.11243179])

# e) Random Forests

In [23]:
from sklearn.ensemble import RandomForestRegressor
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500, 1000],
    'max_features' : ['auto', 'sqrt', 'log2', 0.3, 0.5, 0.7]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(x_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jay/opt/anaconda3/envs/ml_a1/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jay/opt/anaconda3/envs/ml_a1/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/jay/opt/anaconda3/envs/ml_a1/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/jay/opt/anaconda3/envs/ml_a1/lib/python3.10/site-packages/sklearn/utils/_pa

In [24]:
print(f"Optimal number of trees: {grid_search.best_params_}")
print(f"Best Cross-Validation MSE: {-grid_search.best_score_}")

Optimal number of trees: {'max_features': 0.5, 'n_estimators': 100}
Best Cross-Validation MSE: 0.011490486026603184


In [25]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE with optimal number of trees: {mse}")

Test MSE with optimal number of trees: 0.016096983982072773


In [26]:
all_imp = np.array([tree.feature_importances_ for tree in best_rf.estimators_])
mean_imp = all_imp.mean(axis=0)

In [27]:
mean_imp

array([0.00539189, 0.04376945, 0.16418941, 0.164423  , 0.06537704,
       0.03922446, 0.09778385, 0.06400249, 0.0439498 , 0.03336755,
       0.02835735, 0.03042268, 0.03430672, 0.02949897, 0.05089466,
       0.10504069])