In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
data = pd.read_csv("../Data/Hitters.csv")
df = data.copy()
df = df.dropna()
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [10]:
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dms.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [4]:
X_ = df.drop(["Salary","League","Division","NewLeague"],axis = 1).astype("float64")

In [5]:
y = df["Salary"]

In [6]:
X_.head()


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0


In [7]:
X = pd.concat([X_,dms[['League_N','Division_W','NewLeague_N']]],axis=1)
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [11]:
bag_model = BaggingRegressor(bootstrap_features = True)
bag_model.fit(X_train, y_train)

BaggingRegressor(bootstrap_features=True)

In [12]:
bag_model.n_estimators

10

In [13]:
bag_model.estimators_

[DecisionTreeRegressor(random_state=747204345),
 DecisionTreeRegressor(random_state=579199047),
 DecisionTreeRegressor(random_state=654420098),
 DecisionTreeRegressor(random_state=1085027728),
 DecisionTreeRegressor(random_state=1661067084),
 DecisionTreeRegressor(random_state=881750537),
 DecisionTreeRegressor(random_state=724557789),
 DecisionTreeRegressor(random_state=1337507396),
 DecisionTreeRegressor(random_state=795042298),
 DecisionTreeRegressor(random_state=84133211)]

In [14]:
bag_model.estimators_samples_

[array([176, 105, 113,  14, 185, 140,  84, 102,  42,  79, 119, 112, 189,
        116,  74,  15,  29,   6, 123,  68,  72,  65, 162,  56, 148, 154,
        178,  57,  56,  23,  23, 106,  89,  54,  94, 136,  18, 116,  60,
         23,  35, 126, 146,  82, 184,  88, 127, 191, 174,  23, 177,  57,
        169, 176,  99, 119,  25, 158,  28, 108,  94, 187,  53,  81, 155,
         65, 149,  13, 123, 127, 168,  48, 192, 100,  64, 169, 118,  71,
        152, 111,  16, 154, 191, 163,  93, 120,  67,  71,  28,  31,  64,
        145,  87,  47,  28,  42, 134, 133,  22,  79, 115,  79,   3, 120,
         39,  78, 187,  29,  86, 117, 139, 179,   4,  19, 119,  42,  82,
        135,  12,  74,  30,  55, 156,   3, 163,  60, 178,  45,  48, 140,
         37, 172, 103, 113, 193, 118,  75, 132, 190,  32, 196,  86, 185,
         71,  67, 104,   3,  84, 106, 161, 163,  79, 118,  27, 186, 189,
        156,   4,  72, 185, 181,  69, 180, 136,   6, 179,  57,   1,  52,
         39, 133,  19,  21,  88, 175,  12,  12,  54

In [15]:
bag_model.estimators_features_

[array([ 7,  3,  2, 15, 14,  7,  1,  4,  2,  1,  8,  0, 11,  2, 14,  3, 12,
         9, 10]),
 array([ 7,  7,  7, 10, 18, 17,  3, 11,  3,  4, 16,  0,  5,  0,  2,  1,  5,
        16, 11]),
 array([ 2, 11, 18, 12,  0,  4,  4,  3, 17, 12, 14,  7, 15, 17, 12, 11, 15,
         8,  2]),
 array([16, 17, 18,  6, 10, 11, 10,  4,  2,  3,  5,  4, 18,  1, 16,  8,  5,
        16,  5]),
 array([12, 17,  9, 15, 18, 15, 11, 10,  5,  0,  5, 15,  6, 17,  1, 18,  3,
         8,  7]),
 array([ 9,  1,  2, 11, 12, 11, 13,  8, 14,  8, 14,  4,  1,  1,  7, 15, 15,
        13, 13]),
 array([ 8,  7, 14,  2,  4,  5,  0, 18,  6,  5,  4,  2,  2,  6, 15,  0,  6,
        17,  1]),
 array([ 4,  2,  1,  5, 16,  2,  4, 10,  8,  8, 10, 12, 15,  5, 11, 12,  7,
        11,  0]),
 array([10,  2,  8, 17,  4, 16, 15,  5, 17,  5,  8,  6,  2,  8, 17, 12, 15,
        11,  0]),
 array([ 6, 15, 15,  2, 17,  1,  2,  5, 12,  7, 13, 18, 18, 13,  4,  7,  1,
         6, 12])]

In [16]:
bag_model.estimators_[1]

DecisionTreeRegressor(random_state=579199047)

In [17]:
y_pred = bag_model.predict(X_test)

In [18]:
np.sqrt(mean_squared_error(y_test, y_pred))

329.3563753891228

In [19]:
bag_params = {"n_estimators": range(2,20)}

In [20]:
bag_cv_model = GridSearchCV(bag_model, bag_params, cv = 10)

In [25]:
bag_cv_model.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=BaggingRegressor(bootstrap_features=True),
             param_grid={'n_estimators': range(2, 20)})

In [26]:
bag_cv_model.best_params_

{'n_estimators': 19}

In [27]:
bag_tuned = BaggingRegressor( n_estimators = 14, random_state = 45)

In [28]:
bag_tuned.fit(X_train, y_train)

BaggingRegressor(n_estimators=14, random_state=45)

In [29]:
y_pred = bag_tuned.predict(X_test)

In [30]:
np.sqrt(mean_squared_error(y_test, y_pred))

346.457987188104