### Exercise 1

In [31]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier as CART
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [40]:
# Load the train/test splits (file names suggest the features are already
# scaled -- TODO confirm against whatever produced the .npz files).
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open; using it as a context manager releases the handle as soon as the
# arrays have been read into memory.
with np.load('../data/iris_train_scaled.npz') as arrays:
  X_train = arrays['X']
  y_train = arrays['y']
with np.load('../data/iris_test_scaled.npz') as arrays:
  X_test = arrays['X']
  y_test = arrays['y']

# Keep only the first two feature columns for the experiments below.
X_train = X_train[:, [0,1]]
X_test = X_test[:, [0,1]]

In [18]:
# Hyperparameter grid iterated over by the Exercise 1 ensemble cells.
n_estimators = [5, 20]       # number of base estimators per ensemble
min_samples_leaf = [2, 10]   # min_samples_leaf passed to each tree

#### Bagging

In [19]:
# Bagging: evaluate every (min_samples_leaf, n_estimators) combination.
# BaggingClassifier resamples the training set at random, so a fixed
# random_state is required for the printed accuracies to be reproducible
# after a kernel restart. The seed is also propagated to the cloned trees.
for msl in min_samples_leaf:
  # Template tree; BaggingClassifier clones it for every ensemble member.
  cart = CART(min_samples_leaf=msl)
  for n_est in n_estimators:
    bagging = BaggingClassifier(estimator=cart, n_estimators=n_est, random_state=42)
    bagging.fit(X_train, y_train)
    # .score() returns mean accuracy on the held-out test split.
    score = bagging.score(X_test, y_test)
    print('The accuracy for min_samples_leaf={}, n_estimators={} on the test data is {:.3f}'.format(msl, n_est, score))

The accuracy for min_samples_leaf=2, n_estimators=5 on the test data is 0.733
The accuracy for min_samples_leaf=2, n_estimators=20 on the test data is 0.767
The accuracy for min_samples_leaf=10, n_estimators=5 on the test data is 0.633
The accuracy for min_samples_leaf=10, n_estimators=20 on the test data is 0.633


#### Random Forest

In [20]:
# Random forest: evaluate every (min_samples_leaf, n_estimators) combination.
# The forest draws bootstrap samples and random feature subsets, so pin
# random_state to make the printed accuracies reproducible across runs.
for msl in min_samples_leaf:
  for n_est in n_estimators:
    rf = RandomForestClassifier(min_samples_leaf=msl, n_estimators=n_est, random_state=42)
    rf.fit(X_train, y_train)
    # .score() returns mean accuracy on the held-out test split.
    score = rf.score(X_test, y_test)
    print('The accuracy for min_samples_leaf={}, n_estimators={} on the test data is {:.3f}'.format(msl, n_est, score))

The accuracy for min_samples_leaf=2, n_estimators=5 on the test data is 0.700
The accuracy for min_samples_leaf=2, n_estimators=20 on the test data is 0.767
The accuracy for min_samples_leaf=10, n_estimators=5 on the test data is 0.667
The accuracy for min_samples_leaf=10, n_estimators=20 on the test data is 0.700


#### Pasting

In [21]:
# Pasting: like bagging, but bootstrap=False means each tree is fit on a
# subset drawn WITHOUT replacement (here half of the training set via
# max_samples=0.5). The subset draw is random, so fix random_state to make
# the printed accuracies reproducible across runs.
for msl in min_samples_leaf:
  # Template tree; BaggingClassifier clones it for every ensemble member.
  cart = CART(min_samples_leaf=msl)
  for n_est in n_estimators:
    bagging = BaggingClassifier(estimator=cart, n_estimators=n_est, bootstrap=False, max_samples=0.5, random_state=42)
    bagging.fit(X_train, y_train)
    # .score() returns mean accuracy on the held-out test split.
    score = bagging.score(X_test, y_test)
    print('The accuracy for min_samples_leaf={}, n_estimators={} on the test data is {:.3f}'.format(msl, n_est, score))

The accuracy for min_samples_leaf=2, n_estimators=5 on the test data is 0.633
The accuracy for min_samples_leaf=2, n_estimators=20 on the test data is 0.767
The accuracy for min_samples_leaf=10, n_estimators=5 on the test data is 0.633
The accuracy for min_samples_leaf=10, n_estimators=20 on the test data is 0.633


#### AdaBoost

In [24]:
# AdaBoost (SAMME) over CART base learners: evaluate every
# (min_samples_leaf, n_estimators) combination.
# random_state seeds the base estimators at each boosting iteration; without
# it the trees' internal randomness makes the printed accuracies vary
# between runs.
for msl in min_samples_leaf:
  # Template tree; AdaBoostClassifier clones it for every boosting round.
  cart = CART(min_samples_leaf=msl)
  for n_est in n_estimators:
    boosting = AdaBoostClassifier(estimator=cart, n_estimators=n_est, algorithm='SAMME', random_state=42)
    boosting.fit(X_train, y_train)
    # .score() returns mean accuracy on the held-out test split.
    score = boosting.score(X_test, y_test)
    print('The accuracy for min_samples_leaf={}, n_estimators={} on the test data is {:.3f}'.format(msl, n_est, score))

The accuracy for min_samples_leaf=2, n_estimators=5 on the test data is 0.667
The accuracy for min_samples_leaf=2, n_estimators=20 on the test data is 0.767
The accuracy for min_samples_leaf=10, n_estimators=5 on the test data is 0.633
The accuracy for min_samples_leaf=10, n_estimators=20 on the test data is 0.767


### Exercise 5

In [25]:
import xgboost as xgb

In [29]:
# Hyperparameter grid for the XGBoost experiment.
# NOTE(review): this rebinds `n_estimators`, which the Exercise 1 cells also
# read -- re-running those cells after this one will use the longer list.
n_estimators = [5, 20, 100]
max_depth = [5, 10]

# Wrap the feature matrices and their labels in XGBoost's DMatrix container,
# which is the input type expected by xgb.train / Booster.predict.
dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)
dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)

In [37]:
# Train one booster per (max_depth, number of boosting rounds) combination
# and report its accuracy on the test DMatrix.
for md in max_depth:
  for n_est in n_estimators:
    # Only 'max_depth' varies across the grid; the remaining settings are fixed.
    params = {
        # Multiclass objective that makes predict() return class labels.
        'objective': 'multi:softmax',
        # Number of classes in the dataset.
        'num_class': 3,
        # Maximum depth of the trees.
        'max_depth': md,
        # Learning rate.
        'eta': 0.1,
        # Minimum loss reduction required to make a further partition.
        'gamma': 0.1,
        # L2 regularization term on weights.
        'lambda': 1.0,
        # Evaluation metric.
        'eval_metric': 'mlogloss',
    }

    # n_est is the number of boosting rounds (one round of trees per step).
    bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=n_est)
    preds = bst.predict(dtest)
    accuracy = accuracy_score(y_true=y_test, y_pred=preds)
    print(
        'The accuracy for max_depth={}, n_estimators={} on the test data is {:.3f}'
        .format(md, n_est, accuracy)
    )

The accuracy for max_depth=5, n_estimators=5 on the test data is 0.767
The accuracy for max_depth=5, n_estimators=20 on the test data is 0.767
The accuracy for max_depth=5, n_estimators=100 on the test data is 0.767
The accuracy for max_depth=10, n_estimators=5 on the test data is 0.767
The accuracy for max_depth=10, n_estimators=20 on the test data is 0.767
The accuracy for max_depth=10, n_estimators=100 on the test data is 0.733
