In [20]:
import numpy as np
import pandas as pd
import statistics



from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from load_data import load_heart_data, load_tweet_data

In [21]:
train_X, test_X, train_y, test_y = load_heart_data("heart_dataset.csv")

In [22]:
print("Shape of Train X: ", train_X.shape, ", Type: ", type(train_X))
print("Shape of Test X: ", test_X.shape, ", Type: ", type(test_X))
print("Shape of Train y: ", train_y.shape, ", Type: ", type(train_y))
print("Shape of Test y: ", test_y.shape, ", Type: ", type(test_y))

Shape of Train X:  (820, 13) , Type:  <class 'numpy.ndarray'>
Shape of Test X:  (205, 13) , Type:  <class 'numpy.ndarray'>
Shape of Train y:  (820,) , Type:  <class 'numpy.ndarray'>
Shape of Test y:  (205,) , Type:  <class 'numpy.ndarray'>


# Baseline Performance

Before testing out the new method explored in the project regarding the locally weighted trees, we will first test out the existing method we aim to improve upon, the random forest model.

In [4]:
# To find the best random forrest model, we need to tune the hyperparameters. Will do 
# so by performing a grid search 

grid_search_parameters = {'max_depth':[5, 10, 15, 30, 50, None], 
                          'n_estimators':[10, 25, 50, 100, 250, 500, 1000], 
                          'max_samples': [0.3, 0.5, 0.7, 0.8, 0.9, 1]}


In [None]:
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier_GS = GridSearchCV(random_forest_classifier, grid_search_parameters, cv=10, verbose=0)
random_forest_classifier_GS.fit(train_X, train_y)


In [None]:
print("Best Parameters: ", random_forest_classifier_GS.best_params_)
print("Best Score: ", random_forest_classifier_GS.best_score_)

Best Parameters:  {'max_depth': 10, 'max_samples': 0.9, 'n_estimators': 50}
Best Score:  0.9902439024390244


In [None]:
# Train the model with the best parameters:
random_forest_classifier = RandomForestClassifier(random_state=0, n_estimators=50, max_samples=0.9, max_depth=10)
random_forest_classifier.fit(train_X, train_y)
predictions = random_forest_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



# Locally Weighted Random Forest

In [23]:
from locallyWeightedRandomForest import LocallyWeightedRandomForest
from sklearn.utils.estimator_checks import check_estimator

# Distance functions

TODO move to a separate file, here for easier testing (for now) 

In [24]:
def euclidean_distance(x_1:np.ndarray, x_2:np.ndarray) -> float:
    return np.linalg.norm(x_1 - x_2)

def mean_distance(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1)-> float:
    distance_sum = 0
    for p in dataset:
        distance_sum += distance_function(point, p)
    return distance_sum/len(dataset)

def distance_to_dataset_mean(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1)-> float:
    mean_dataset_point = np.mean(dataset, axis=0)
    return distance_function(point, mean_dataset_point)

def median_distance(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1)-> float:
    distances = []
    for p in dataset:
        distances.append(distance_function(point, p))
    return statistics.median(distances)

def nearest_k_distance_mean(k:int) -> callable:
    def _smallest_k_distances_mean(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1) -> float:
        distances = np.zeros(len(dataset))
        for i in range(len(dataset)):
            distances[i] = distance_function(point, dataset[i])
        smallest_k_distaces = np.sort(distances)[:k]
        return np.mean(smallest_k_distaces, axis=0)

    return _smallest_k_distances_mean

In [25]:
# lwrf = LocallyWeightedRandomForest(n_estimators=50, max_samples=0.9, max_depth=10)
lwrf = LocallyWeightedRandomForest(n_estimators=50, max_samples=0.9, max_depth=10)
#TODO: lwrf class should allow execution of predict independent of execution of fit
#check_estimator(lwrf)

In [26]:
lwrf.fit(train_X, train_y)

In [27]:
pred = lwrf.predict(test_X, temperature=1)
print(classification_report(test_y, pred, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



In [28]:
pred = lwrf.predict(test_X, temperature=0.15, distance_function=euclidean_distance, distance_aggregation_function=mean_distance)
print(classification_report(test_y, pred, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



In [29]:
#TODO move to separate file 

def cross_validation(X, y, model, temperature, distance_function, distance_agg_func, folds=10, verbose=False):
    k_folds = KFold(n_splits=folds)
    validation_scores = []

    for train_index, val_index in k_folds.split(X):
        train_X, val_X = X[train_index], X[val_index]
        train_y, val_y = y[train_index], y[val_index]


        model.fit(train_X, train_y)
        predictions = model.predict(val_X, temperature, distance_function, distance_agg_func)

        accuracy = accuracy_score(val_y, predictions)

        if verbose:
            print("Fold Accuracy: ", accuracy)

        validation_scores.append(accuracy)

    return np.mean(np.array(validation_scores))

# Experiments

### Comparing sample sizes vs performance 

For this graph, we will keep the number of estimators constant at 300. 

In [30]:
performance = []

In [None]:
# For the random forest, we will try various values for the portion of the population we 
# sample the data from 
sample_sizes = range(0.1, 1.01, 0.1)

for size in sample_sizes:
    lwrf = LocallyWeightedRandomForest(n_estimators=300, max_samples=size, max_depth=10)


In [31]:
from sklearn.model_selection import cross_val_score


In [15]:
cross_validation(X=train_X, y=train_y, model=lwrf, temperature=1.0, distance_agg_func=mean_distance, distance_function=euclidean_distance, folds=5, verbose=True)

Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9451219512195121
Fold Accuracy:  1.0


0.9817073170731707

In [16]:
cross_validation(X=train_X, y=train_y, model=lwrf, temperature=2.0, distance_agg_func=mean_distance, distance_function=euclidean_distance, folds=5, verbose=True)

Fold Accuracy:  0.975609756097561
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9634146341463414
Fold Accuracy:  1.0


0.9829268292682926

In [33]:
cross_validation(X=train_X, y=train_y, model=lwrf, temperature=2.0, distance_agg_func=nearest_k_distance_mean(k=10), distance_function=euclidean_distance, folds=5, verbose=True)

Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9939024390243902
Fold Accuracy:  0.9451219512195121
Fold Accuracy:  1.0


0.9829268292682928

In [34]:
cross_validation(X=train_X, y=train_y, model=lwrf, temperature=2.0, distance_agg_func=nearest_k_distance_mean(k=1), distance_function=euclidean_distance, folds=5, verbose=True)

Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9817073170731707
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9634146341463414
Fold Accuracy:  0.9878048780487805


0.9817073170731707