In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier

from load_data import load_heart_data, load_tweet_data

In [2]:
train_X, test_X, train_y, test_y = load_heart_data("heart_dataset.csv")

In [3]:
print("Shape of Train X: ", train_X.shape)
print("Shape of Test X: ", test_X.shape)
print("Shape of Train y: ", train_y.shape)
print("Shape of Test y: ", test_y.shape)

Shape of Train X:  (820, 13)
Shape of Test X:  (205, 13)
Shape of Train y:  (820,)
Shape of Test y:  (205,)


# Baseline Performance

Before testing out the new method explored in the project regarding the locally weighted trees, we will first test out the existing method we aim to improve upon, the random forest model.

In [4]:
# To find the best random forrest model, we need to tune the hyperparameters. Will do 
# so by performing a grid search 

grid_search_parameters = {'max_depth':[5, 10, 15, 30, 50, None], 
                          'n_estimators':[10, 25, 50, 100, 250, 500, 1000], 
                          'max_samples': [0.3, 0.5, 0.7, 0.8, 0.9, 1]}


In [9]:
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier_GS = GridSearchCV(random_forest_classifier, grid_search_parameters, cv=10, verbose=2)
random_forest_classifier_GS.fit(train_X, train_y)


Fitting 10 folds for each of 252 candidates, totalling 2520 fits
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=10; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3, n_estimators=25; total time=   0.0s
[CV] END ......max_depth=5, max_samples=0.3,

In [10]:
print("Best Parameters: ", random_forest_classifier_GS.best_params_)
print("Best Score: ", random_forest_classifier_GS.best_score_)

Best Parameters:  {'max_depth': 10, 'max_samples': 0.9, 'n_estimators': 50}
Best Score:  0.9902439024390244


In [5]:
# Train the model with the best parameters:
random_forest_classifier = RandomForestClassifier(random_state=0, n_estimators=50, max_samples=0.9, max_depth=10)
random_forest_classifier.fit(train_X, train_y)
predictions = random_forest_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



# Locally Weighted Random Forest

In [49]:
class LocallyWeightedRandomForest:
    
    def __init__(self, 
                 n_estimators=100, 
                 criterion="gini", 
                 max_depth=None, 
                 max_samples=None):

        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth

        self.max_samples = max_samples
        if self.max_samples is None:
            self.max_samples = 1.0

    def fit(self, X, y):
        self.estimators_ = []
        self.estimator_datasets = {}

        total_samples = y.shape[0]
        samples_to_draw = int(total_samples * self.max_samples)

        for _ in range(self.n_estimators):
            # First we sub-sample the dataset 
            # TODO should we sample with or without replacement for each individual tree itself
            sampled_X, sampled_y = resample(X, y, n_samples=samples_to_draw)
            # print(sampled_X.shape)
            # print(sampled_y.shape)

            _decision_tree = DecisionTreeClassifier(max_depth=self.max_depth, criterion=self.criterion)
            _decision_tree.fit(sampled_X, sampled_y)
            self.estimators_.append(_decision_tree)
            self.estimator_datasets[_decision_tree] = (sampled_X, sampled_y)
    
    def predict(self, test_X, distance_function = lambda point, x, y: 1):
        predictions = np.zeros(test_X.shape[0])
        
        for index, test_point in enumerate(test_X):
            estimator_predictions = {0:0, 1:0}
            for _estimator in self.estimators_:
                sampled_dataset = self.estimator_datasets[_estimator]
                sampled_X = sampled_dataset[0]
                sampled_y = sampled_dataset[1]
                
                # print(sampled_X.shape)
                # print(sampled_y.shape)

                est_prediction = _estimator.predict([test_point])[0]
                dist_to_tree = distance_function(test_point, sampled_X, sampled_y)
                estimator_predictions[est_prediction] += 1


            predictions[index] = 1 if estimator_predictions[1] > estimator_predictions[0]  else 0


        return predictions
        


In [50]:
lwrf = LocallyWeightedRandomForest(n_estimators=50, max_samples=0.9, max_depth=10)

In [51]:
lwrf.fit(train_X, train_y)

In [52]:
pred = lwrf.predict(test_X)
print(classification_report(test_y, pred, digits=5))

KeyError: 1

In [53]:
for row in test_X:
    print(row)

[ 58.    1.    0.  114.  318.    0.    2.  140.    0.    4.4   0.    3.
   1. ]
[ 77.   1.   0. 125. 304.   0.   0. 162.   1.   0.   2.   3.   2.]
[ 57.    1.    2.  150.  126.    1.    1.  173.    0.    0.2   2.    1.
   3. ]
[ 45.    1.    3.  110.  264.    0.    1.  132.    0.    1.2   1.    0.
   3. ]
[ 65.    1.    0.  135.  254.    0.    0.  127.    0.    2.8   1.    1.
   3. ]
[ 42.   0.   2. 120. 209.   0.   1. 173.   0.   0.   1.   0.   2.]
[ 42.    1.    0.  136.  315.    0.    1.  125.    1.    1.8   1.    0.
   1. ]
[ 45.   1.   0. 104. 208.   0.   0. 148.   1.   3.   1.   0.   2.]
[ 60.    1.    0.  140.  293.    0.    0.  170.    0.    1.2   1.    2.
   3. ]
[ 57.   1.   0. 110. 335.   0.   1. 143.   1.   3.   1.   1.   3.]
[ 54.    1.    2.  150.  232.    0.    0.  165.    0.    1.6   2.    0.
   3. ]
[ 54.   0.   1. 132. 288.   1.   0. 159.   1.   0.   2.   1.   2.]
[ 54.   1.   0. 110. 206.   0.   0. 108.   1.   0.   1.   1.   2.]
[ 43.    1.    0.  120.  177.    0.   