In [None]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.stats import mode


In [None]:
class MyRandomForestClassifier():
    """Minimal random forest: decision trees combined by majority vote.

    Every tree is a ``DecisionTreeClassifier`` built with the given
    ``max_features`` and trained on its own random subsample of the rows.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of trees in the forest.
    max_features : default='sqrt'
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    random_state : int or None, default=None
        Seed for the row sub-sampling performed by ``fit``. With ``None`` a
        different set of subsamples is drawn on every call. The trees
        themselves are not seeded by this value.
    """

    # Fraction of the training rows (drawn without replacement) shown to each tree.
    _SAMPLE_FRAC = 0.623

    def __init__(self, n_estimators=10, max_features='sqrt', random_state=None):
        self.random_state = random_state
        self.forest = [DecisionTreeClassifier(max_features=max_features) for _ in range(n_estimators)]

    def fit(self, X, y):
        """Train each tree on a different random subsample of ``(X, y)``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features (``DataFrame.sample`` is used to draw rows).
        y : label container indexable by ``X``'s index labels
            Typically a ``pandas.Series`` sharing ``X``'s index.
        """
        rng = np.random.default_rng(self.random_state)
        for tree in self.forest:
            # One fresh seed per tree: reusing a single seed would hand every
            # tree exactly the same rows and remove the ensemble's diversity.
            seed = int(rng.integers(0, 2**32 - 1))
            X_new = X.sample(frac=self._SAMPLE_FRAC, random_state=seed)
            # Labels are looked up by index label, so y must be aligned with X.
            tree.fit(X_new, y[X_new.index])

    def predict(self, X):
        """Predict a label for each row of ``X`` by majority vote of the trees.

        Returns
        -------
        pandas.DataFrame
            Column-wise mode of the per-tree predictions, one column per row
            of ``X``. Row 0 holds a most-frequent label for every sample
            (callers take ``.iloc[0]``); extra rows appear only when some
            sample has tied votes.
        """
        predictions = [tree.predict(X) for tree in self.forest]
        # Let pandas size the columns from the data so any number of rows works.
        return pd.DataFrame(predictions).mode(axis=0)

    def compute_feat_imp(self):
        """Return the trees' summed feature importances, normalised to sum to 1.

        If every importance is zero, an all-zero result is returned instead of
        dividing by zero.
        """
        fi_trees = np.array([tree.feature_importances_ for tree in self.forest])
        summed = fi_trees.sum(axis=0)
        total = fi_trees.sum()
        if total == 0:
            return np.zeros_like(summed, dtype=float)
        return summed / total




In [None]:
# Reading MNIST dataset
# version and as_frame are pinned explicitly: the rest of the notebook calls
# DataFrame-only APIs on X (X.sample, X.columns), so the result must not depend
# on the installed scikit-learn's defaults.
dataset = fetch_openml("mnist_784", version=1, as_frame=True)
X = dataset["data"]
y = dataset["target"]

In [None]:
# Train on 60000 rows and evaluate on the remainder; the split is seeded so
# that re-running the notebook scores the models on the same held-out rows.
n_trees = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=60000, random_state=42)
my_clf = MyRandomForestClassifier(n_trees)
my_clf.fit(X_train, y_train)
# predict() returns the column-wise mode as a DataFrame; row 0 is the majority vote.
y_pred_n = my_clf.predict(X_test).iloc[0]
print(f"Accuracy with {n_trees} trees: {accuracy_score(y_test, y_pred_n)}")

In [None]:
# Retrain the forest with 10, 20, ..., 100 trees and report test accuracy for each.
# After the loop, my_clf stays bound to the last (100-tree) forest.
for i in range(10,101,10):
    my_clf = MyRandomForestClassifier(i)
    my_clf.fit(X_train, y_train)
    # Row 0 of the mode frame is the majority vote for every test sample.
    y_pred = my_clf.predict(X_test).iloc[0]
    print(f"Accuracy with {i} trees: {accuracy_score(y_test, y_pred)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline: scikit-learn's random forest with 100 trees on the same split.
# Seeded so the reference accuracy is reproducible across runs.
skl_clf = RandomForestClassifier(n_estimators=100, random_state=42)
skl_clf.fit(X_train, y_train)
y_s_predict = skl_clf.predict(X_test)
print(accuracy_score(y_test, y_s_predict))

In [None]:
# Display the column labels of the feature table X loaded from the dataset.
X.columns

In [None]:
# Normalised feature importances of whichever forest my_clf currently refers to
# (the most recently executed cell that assigned my_clf).
feat_imp = my_clf.compute_feat_imp()
# Number of importance values returned.
len(feat_imp)

In [None]:
import seaborn as sns
# This is the result from the previous exercise
feature_importances = feat_imp
# Lay the flat importance vector out as a 28x28 grid and draw it as a heatmap.
# The title distinguishes this figure from the scikit-learn heatmap.
ax = sns.heatmap(np.reshape(feature_importances, (28,28)), cmap='binary')
ax.set_title("Feature importances: MyRandomForestClassifier");

In [None]:
# Same 28x28 heatmap for the scikit-learn forest's importances, titled so the
# two figures can be told apart when compared side by side.
s_feat_imp = skl_clf.feature_importances_
ax = sns.heatmap(np.reshape(s_feat_imp, (28,28)), cmap='binary')
ax.set_title("Feature importances: sklearn RandomForestClassifier");