In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier

from load_data import load_heart_data, load_tweet_data

In [2]:
# Load the heart dataset via the project-local `load_data.load_heart_data` helper.
# The call returns four objects, unpacked here as train/test features and
# train/test targets; their shapes are printed in the following cell.
train_X, test_X, train_y, test_y = load_heart_data("heart_dataset.csv")

In [3]:
# Sanity-check the split: report the shape of every feature/target partition.
split_shape_report = (
    ("Shape of Train X: ", train_X),
    ("Shape of Test X: ", test_X),
    ("Shape of Train y: ", train_y),
    ("Shape of Test y: ", test_y),
)
for caption, partition in split_shape_report:
    print(caption, partition.shape)

Shape of Train X:  (820, 13)
Shape of Test X:  (205, 13)
Shape of Train y:  (820,)
Shape of Test y:  (205,)


# Baseline Performance

Before evaluating the locally weighted random forest proposed in this project, we first establish a baseline with the existing method we aim to improve upon: a standard random forest classifier.

In [4]:
# To find the best random forest model, we need to tune the hyperparameters. We do
# so by performing a grid search.
#
# NOTE: every `max_samples` entry is a float on purpose. scikit-learn interprets a
# float as a *fraction* of the training set but an int as an *absolute number* of
# samples, so the integer `1` would train each tree on a single row rather than on
# 100% of the data. Use `1.0` for "draw as many samples as the training set has".
grid_search_parameters = {'max_depth':[5, 10, 15, 30, 50, None], 
                          'n_estimators':[10, 25, 50, 100, 250, 500, 1000], 
                          'max_samples': [0.3, 0.5, 0.7, 0.8, 0.9, 1.0]}


In [None]:
# Exhaustive 10-fold cross-validated search over `grid_search_parameters`,
# using a seeded random forest as the base estimator.
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier_GS = GridSearchCV(
    estimator=random_forest_classifier,
    param_grid=grid_search_parameters,
    cv=10,
    verbose=0,
)
random_forest_classifier_GS.fit(train_X, train_y)


In [7]:
# Report the winning hyperparameter combination and its mean cross-validated score.
for caption, value in (
    ("Best Parameters: ", random_forest_classifier_GS.best_params_),
    ("Best Score: ", random_forest_classifier_GS.best_score_),
):
    print(caption, value)

Best Parameters:  {'max_depth': 10, 'max_samples': 0.9, 'n_estimators': 50}
Best Score:  0.9902439024390244


In [8]:
# Train the model with the best parameters:
random_forest_classifier = RandomForestClassifier(random_state=0, n_estimators=50, max_samples=0.9, max_depth=10)
random_forest_classifier.fit(train_X, train_y)
predictions = random_forest_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



# Locally Weighted Random Forest

In [45]:
class LocallyWeightedRandomForest:
    '''
    Bagged ensemble of decision trees whose votes are re-weighted per test point.

    Every tree is trained on its own resample of the training data and that
    resample is remembered. At prediction time a user-supplied distance function
    measures how far the test point is from each tree's resample; the distances
    are turned into weights with a softmax so that trees trained on "closer"
    data get a larger say in the weighted vote.
    '''

    def __init__(self, 
                 n_estimators=100, 
                 criterion="gini", 
                 max_depth=None, 
                 max_samples=None,
                 random_state=None):
        '''
        Constructor for the model class
        Input: 
            n_estimators - number of estimators in the ensemble
            criterion - splitting criteria when training the individual trees
            max_depth - the max depth for each individual tree
            max_samples - the portion of the dataset subsampled for each tree,
                used as a multiplier on the number of training rows
                (None is treated as 1.0).
            random_state - None (default, use numpy's global random state exactly
                as before), an int seed, or a numpy RandomState instance. Set it
                to make the resampling and the trees reproducible.
        '''
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_samples = 1.0 if max_samples is None else max_samples
        self.random_state = random_state

    def fit(self, X, y):
        '''
        Fit the dataset to an ensemble of decision trees. Each decision tree
        is trained on a subsample of the dataset determined by the "max_samples" values
        of the model. 

        Input: X - the dataset feature values
               y - the target values corresponding to the dataset
        '''
        # Work on plain arrays so the stored resamples can be indexed row-wise
        # and fed to the distance function regardless of the input container.
        X = np.asarray(X)
        y = np.asarray(y)

        self.estimators_ = []
        self.estimator_datasets = {}

        total_samples = y.shape[0]
        samples_to_draw = int(total_samples * self.max_samples)

        # A single shared generator is advanced by every draw, so each tree gets
        # a different resample while the whole fit stays reproducible for a seed.
        if self.random_state is None or isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        for _ in range(self.n_estimators):
            # Sub-sample the dataset. `resample` draws WITH replacement by default.
            # TODO should we sample with or without replacement for each individual tree itself
            sampled_X, sampled_y = resample(X, y, n_samples=samples_to_draw, random_state=rng)

            _decision_tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                                    criterion=self.criterion,
                                                    random_state=rng)
            _decision_tree.fit(sampled_X, sampled_y)
            self.estimators_.append(_decision_tree)
            # Keep the resample: predict() measures distances against it.
            self.estimator_datasets[_decision_tree] = (sampled_X, sampled_y)

    def predict(self, test_X, distance_function = lambda point, x: 1, temperature=1.0):
        '''
        Calculate the predictions given the distance function and the temperature value for 
        aggregating the distance values

        Input: test_X - the data to calculate the predictions with 
               distance_function - a function that takes in two parameters 
                    * point - a single point to predict on
                    * X - the dataset used to train the classifier
                    and returns a single number. This function is meant to allow for a
                    flexible calculation of distances which will get aggregated afterwards.
                    The default returns a constant, which gives every tree the same weight.
               temperature - input to the distance softmax calculation

        Output: predictions numpy array (float dtype, so class labels must be numeric)
        '''
        test_X = np.asarray(test_X)
        n_points = test_X.shape[0]
        predictions = np.zeros(n_points)
        if n_points == 0:
            return predictions

        n_trees = len(self.estimators_)

        # One batched predict call per tree instead of one call per (point, tree)
        # pair; the votes are identical, only the call overhead disappears.
        tree_votes = [_estimator.predict(test_X) for _estimator in self.estimators_]
        tree_samples = [self.estimator_datasets[_estimator][0] for _estimator in self.estimators_]

        for index, test_point in enumerate(test_X):
            # Distance from this point to the resample each tree was trained on
            estimator_distances = np.zeros(n_trees)
            for i, sampled_X in enumerate(tree_samples):
                estimator_distances[i] = distance_function(test_point, sampled_X)

            # Calculate the weights. Now all the weights should add to 1. 
            prediction_weights = self.calculate_weights(estimator_distances, temperature)

            # Sum the weights of the trees voting for each class
            estimator_predictions = {}
            for i, votes in enumerate(tree_votes):
                est_prediction = votes[index]
                estimator_predictions[est_prediction] = (
                    estimator_predictions.get(est_prediction, 0) + prediction_weights[i]
                )

            # The final prediction is the class with the largest summed weight
            # (ties go to the class that was voted for first in tree order).
            predictions[index] = max(estimator_predictions, key=estimator_predictions.get)

        return predictions

    def calculate_weights(self, estimator_distances, temperature):
        '''
        Calculate the weights of the trees using the distances.
        The weights are the softmax output of the negated, temperature-scaled
        distances: weight_i is proportional to exp(-distance_i / (2 * temperature**2)),
        so a smaller distance yields a larger weight.

        Input: - estimator_distances: list of the distances of the point to each tree in the ensemble
               - temperature - hyperparameter for the softmax function (must be non-zero). 

        Output: numpy array of the weight values, the sum should be equal to 1. 

        Raises: ValueError if temperature is zero.
        '''
        if temperature == 0:
            raise ValueError("temperature must be non-zero")

        distances = np.asarray(estimator_distances, dtype=float)
        if distances.size == 0:
            return distances

        logits = -distances / (2 * temperature ** 2)
        # Softmax is invariant to adding a constant to every logit. Shifting so the
        # largest logit is 0 keeps at least one exp() equal to 1; without the shift,
        # large distances underflow every term to 0 and the result is 0/0 = NaN.
        logits = logits - np.max(logits)
        unnormalised = np.exp(logits)
        return unnormalised / unnormalised.sum()

In [91]:
# Alternative configuration that reuses the hyperparameters chosen for the
# baseline random forest above (kept for reference, not executed):
# lwrf = LocallyWeightedRandomForest(n_estimators=50, max_samples=0.9, max_depth=10)
# Configuration used for the results below: 1000 depth-6 trees, each fit on a
# resample half the size of the training set.
lwrf = LocallyWeightedRandomForest(n_estimators=1000, max_samples=0.5, max_depth=6)

In [92]:
def average_euclidean_distance(point, dataset):
    '''
    Mean *squared* Euclidean (L2) distance between a test point and every row
    of a dataset.

    Note that no square root is taken: the value returned is the average of
    ||row - point||^2 over the rows of `dataset`, reduced to a single number.

    Input: point - a single test point: 1d array-like of length n_features
                   (a (1, n_features) array is accepted as well)
           dataset - 2d array-like of shape (n_samples, n_features)
    Output: scalar, the mean of the squared distances between the test point
            and each row of the dataset
    '''
    # Source: CSC2515 Homework 3 Starter Code. 

    # Float arrays: accepts lists/DataFrames and avoids integer overflow when squaring.
    dataset = np.asarray(dataset, dtype=float)
    dataset_norm = (dataset**2).sum(axis=1).reshape(-1,1)

    # Process test point shape. atleast_2d (rather than a plain reshape of 1d
    # input) also covers a single-feature point, which squeeze() collapses to 0-d.
    point = np.atleast_2d(np.squeeze(np.asarray(point, dtype=float)))
    assert point.shape[1] == dataset.shape[1], "point and dataset must have the same number of features"

    # Compute squared distance via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    test_norm = (point**2).sum(axis=1).reshape(1,-1)
    dist = dataset_norm + test_norm - 2*dataset.dot(point.transpose())

    return np.mean(np.squeeze(dist))

In [93]:
# Train the ensemble: every tree is fit on its own resample of the training data,
# and the resample is stored so distances to it can be computed at prediction time.
lwrf.fit(train_X, train_y)

In [96]:
# Predict on the held-out split, weighting each tree by the distance between the
# test point and the data the tree was trained on, then report the metrics.
pred = lwrf.predict(
    test_X,
    distance_function=average_euclidean_distance,
    temperature=2,
)
print(classification_report(y_true=test_y, y_pred=pred, digits=5))

              precision    recall  f1-score   support

           0    0.98039   0.92593   0.95238       108
           1    0.92233   0.97938   0.95000        97

    accuracy                        0.95122       205
   macro avg    0.95136   0.95265   0.95119       205
weighted avg    0.95292   0.95122   0.95125       205

