### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import accuracy_score

### Download Sonar Dataset

In [2]:
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
names = ['col_'+str(i) for i in np.arange(61)]
data = pd.read_csv(path, names = names)
data.fillna(method='ffill', inplace=True)

#TODO train, test split
X = data[data.columns[0 : -1]]
Y = data[data.columns[-1]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [3]:
from sklearn.tree import DecisionTreeClassifier as DecisionTree
class RandomForest:
    def __init__(self, 
                 max_depth=4,
                 subsample_size=70,
                 tree_count=5,
                 criterion="gini"):
        self.trees = []
        for i in range(tree_count):
            self.trees.append(DecisionTree(criterion=criterion, max_depth=max_depth))
    
    def fit(self, X, Y):
        #np.random.seed(64)
        idx = np.random.choice(X.shape[0], size = 70)
        X_part = X.values[idx]
        Y_part = Y.values[idx]       
        for index, tree in enumerate(self.trees):
            #TODO bootstrap X_part and Y_part for each tree
            self.trees[index].fit(X_part, Y_part)
        
    def predict(self, X):
        predictions = []
        for index, tree in enumerate(self.trees):
            predictions.append(self.trees[index].predict(X))
        #TODO combine and return y_pred
        y_pred = stats.mode(np.array(predictions))
        return y_pred.mode.reshape(X.shape[0],1)

### Try on the sonar dataset

In [4]:
# TODO fit and predict on sonar dataset
# TODO compare results with Y_test and print the accuracy
model_a = RandomForest()
model_a.fit(X_train, Y_train)
model_a.predict(X_test)
print(accuracy_score(Y_test, model_a.predict(X_test)))

0.7142857142857143


### Try the Random Forest Implementation form Sklearn and compare results

In [5]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=4)
model.fit(X_train, Y_train)
model.predict(X_test)
print(accuracy_score(Y_test, model.predict(X_test)))

0.7936507936507936
