#Feature selection

In [1]:
from sklearn.datasets import load_digits

# Load the digits dataset
X, y = load_digits(return_X_y=True)

In [2]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
X_new = selector.fit_transform(X, y)

# Print the indices of the selected features
print(selector.get_support())
print(selector.get_support(indices=True))

[False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True False  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 33 34 35 36 37 38 40 41 42 43 44 45 46 47 48 49 50
 51 52 53 54 55 56 57 58 59 60 61 62 63]


In [3]:
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(chi2, k=10)
X_new = selector.fit_transform(X, y)

# Print the indices of the selected features
print(selector.get_support())
print(selector.get_support(indices=True))

[False False False False False False False False False False False False
 False False False False False False False False  True  True False False
 False False  True False False False  True False False  True  True False
 False False False False False False  True  True False False False False
 False False False False False False  True False False False False False
 False False  True False]
[20 21 26 30 33 34 42 43 54 62]


In [4]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

selector = RFE(DecisionTreeClassifier(), n_features_to_select=10)
X_new = selector.fit_transform(X, y)

# Print the indices of the selected features
print(selector.get_support())
print(selector.get_support(indices=True))

[False False False False False  True False False False False False False
 False False False False False False False False  True  True False False
 False False False  True False  True False False False  True False False
  True False False False False False  True  True False False False False
 False False False False False False False False False False False False
  True False False False]
[ 5 20 21 27 29 33 36 42 43 60]


In [5]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X, y)
print(clf.feature_importances_)
print(clf.feature_importances_.argsort()[::-1][:10])

[0.         0.         0.01054414 0.01604358 0.00029958 0.06715065
 0.         0.         0.         0.00380605 0.01662014 0.00420758
 0.01803892 0.01501789 0.00177254 0.         0.0012193  0.00445009
 0.01367017 0.01966672 0.04314203 0.08725831 0.         0.
 0.00122789 0.         0.0204186  0.05191891 0.05221603 0.04531884
 0.00243406 0.         0.         0.05573629 0.03255261 0.011133
 0.07259762 0.01779433 0.00731744 0.         0.         0.00802049
 0.08182843 0.05244982 0.01688078 0.00678821 0.00450956 0.
 0.         0.00061833 0.01132792 0.00528825 0.00267943 0.01382453
 0.02227425 0.         0.         0.00109925 0.00838184 0.00275304
 0.05939731 0.00429297 0.00116795 0.00284431]
[21 42 36  5 60 33 43 28 27 29]


#Ensemble learning

In [8]:
from sklearn.datasets import load_digits

# Load the digits dataset
X, y = load_digits(return_X_y=True)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))

clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10)
clf.fit(X_train,y_train)

print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        44
           1       0.72      0.76      0.74        55
           2       0.80      0.78      0.79        58
           3       0.77      0.80      0.79        69
           4       0.75      0.88      0.81        52
           5       0.92      0.81      0.86        57
           6       0.95      0.92      0.93        60
           7       0.89      0.83      0.86        48
           8       0.74      0.70      0.72        46
           9       0.72      0.76      0.74        51

    accuracy                           0.82       540
   macro avg       0.82      0.82      0.82       540
weighted avg       0.82      0.82      0.82       540

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        44
           1       0.89      0.93      0.91        55
           2       0.95      0.91      0.93        58
           3       0.92 



##Workshop - Bagging

In [11]:
import numpy as np

#Initialization
n_estimators = 10
index = []
for i in range(n_estimators):
    index.append(np.random.randint(0,X_train.shape[0],(X_train.shape[0],)))

In [12]:
#Training
model = [DecisionTreeClassifier() for _ in range(n_estimators)]
for i in range(n_estimators):
    model[i].fit(X_train[index[i]],y_train[index[i]])

In [13]:
#Prediction
result = 1/n_estimators*model[0].predict_proba(X_test)
for i in range(1,n_estimators):
    result = result + 1/n_estimators*model[i].predict_proba(X_test)
print(result.argmax(axis=1))

[8 4 6 0 9 8 7 1 4 9 4 2 4 8 3 3 8 8 1 2 3 3 4 9 8 5 1 6 9 0 5 4 7 8 6 9 2
 4 1 7 2 0 2 0 1 3 4 3 5 3 0 9 2 2 5 1 1 2 2 2 7 4 6 9 2 2 8 2 0 1 0 3 9 9
 7 2 5 9 5 1 3 6 4 6 7 2 1 8 7 7 6 6 5 3 4 6 0 3 7 6 8 1 1 0 0 5 2 6 1 6 3
 3 1 7 1 4 6 4 8 6 5 1 0 6 3 3 6 9 7 5 3 6 7 5 8 6 7 3 4 0 1 9 0 2 4 5 4 0
 4 7 6 5 0 3 5 8 6 8 8 0 9 5 4 6 9 4 2 3 3 6 1 1 2 5 8 3 3 6 8 5 2 1 3 4 3
 1 7 1 9 2 6 4 4 6 6 5 7 5 1 8 6 2 0 8 6 5 3 3 8 9 2 7 3 7 2 1 8 4 0 0 3 7
 1 4 9 9 3 4 7 3 6 8 6 4 1 6 5 6 1 7 4 0 4 0 8 6 9 6 2 4 4 4 9 1 7 0 0 7 5
 9 2 1 5 3 8 9 0 3 3 2 5 0 5 3 5 9 8 2 5 1 1 1 9 2 4 9 1 0 0 8 9 7 7 4 6 0
 2 1 6 2 7 9 0 3 4 2 2 8 9 9 8 2 3 2 9 2 6 4 2 6 6 2 1 2 3 1 0 2 5 5 6 6 1
 7 9 4 9 4 7 7 7 3 6 1 1 4 8 4 8 8 0 2 1 0 9 5 9 1 8 3 0 9 7 5 1 3 3 6 6 2
 6 8 7 4 6 6 7 4 5 0 9 4 5 0 7 2 8 2 9 4 5 9 2 4 9 5 7 4 3 3 0 4 0 5 7 0 6
 6 6 0 8 1 3 1 8 8 5 9 1 1 5 5 5 3 7 4 3 8 1 3 9 1 2 1 9 0 1 3 7 4 5 3 1 2
 6 6 8 5 9 7 3 2 3 7 8 9 0 3 6 2 6 3 2 5 8 3 2 8 8 9 9 8 0 7 1 3 4 2 2 6 1
 6 6 1 9 2 5 2 9 3 2 0 9 

##Workshop - Boosting (Binary classification)

In [14]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

class AdaBoost:
    def __init__(self, n_classifiers):
        self.n_classifiers = n_classifiers
        self.classifiers = []
        self.alphas = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        weights = np.ones(n_samples) / n_samples

        for _ in range(self.n_classifiers):
            clf = DecisionTreeClassifier(max_depth=1)
            clf.fit(X, y, sample_weight=weights)

            predictions = clf.predict(X)

            misclassified = predictions != y
            error = np.sum(weights*misclassified) / np.sum(weights)

            alpha = 0.5*np.log((1-error)/error)

            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            self.classifiers.append(clf)
            self.alphas.append(alpha)

    def predict(self, X):
        clf_preds = np.array([alpha * clf.predict(X) for clf, alpha in zip(self.classifiers, self.alphas)])
        return np.sign(np.sum(clf_preds, axis=0))

# Load the dataset
from sklearn.datasets import load_digits
digits = load_digits()
X = digits.data
y = digits.target
y = np.where(y < 5, -1, 1)  # Convert to a binary classification problem

adaboost = AdaBoost(n_classifiers=50)
adaboost.fit(X, y)

y_pred = adaboost.predict(X)

accuracy = accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9021
