In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier

from load_data import load_heart_data, load_tweet_data

In [2]:
# Load the heart dataset; the loader returns the train/test features and labels.
HEART_DATA_CSV = "heart_dataset.csv"
train_X, test_X, train_y, test_y = load_heart_data(HEART_DATA_CSV)

In [3]:
# Sanity-check the dimensions of every split returned by the loader.
for split_label, split_data in (("Train X", train_X),
                                ("Test X", test_X),
                                ("Train y", train_y),
                                ("Test y", test_y)):
    print("Shape of " + split_label + ": ", split_data.shape)

Shape of Train X:  (820, 13)
Shape of Test X:  (205, 13)
Shape of Train y:  (820,)
Shape of Test y:  (205,)


# Baseline Performance

Before evaluating the locally weighted random forest explored in this project, we first establish a baseline with the existing method we aim to improve upon: the standard random forest model.

In [4]:
# To find the best random forest model we need to tune its hyperparameters.
# We do so with an exhaustive grid search over the values below.
#
# max_samples is given as fractions of the training set. scikit-learn treats an
# *integer* max_samples as an absolute row count, so a bare `1` would bootstrap
# a single training sample per tree; None is used for "draw as many rows as the
# training set has" (the library default).

grid_search_parameters = {'max_depth': [5, 10, 15, 30, 50, None],
                          'n_estimators': [10, 25, 50, 100, 250, 500, 1000],
                          'max_samples': [0.3, 0.5, 0.7, 0.8, 0.9, None]}


In [5]:
# Exhaustive 10-fold cross-validated search over grid_search_parameters.
# The grid holds 6 * 7 * 6 = 252 combinations, i.e. 2520 forest fits, so the
# candidates are evaluated in parallel on all available cores (n_jobs=-1).
# random_state is fixed on the estimator, so parallelism does not change which
# parameters are selected.
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier_GS = GridSearchCV(random_forest_classifier, grid_search_parameters,
                                           cv=10, verbose=0, n_jobs=-1)
random_forest_classifier_GS.fit(train_X, train_y)


In [6]:
# Report the winning hyperparameter combination and its mean cross-validated score.
best_rf_params = random_forest_classifier_GS.best_params_
best_rf_score = random_forest_classifier_GS.best_score_
print("Best Parameters: ", best_rf_params)
print("Best Score: ", best_rf_score)

Best Parameters:  {'max_depth': 10, 'max_samples': 0.9, 'n_estimators': 50}
Best Score:  0.9902439024390244


In [7]:
# Train the baseline model with the best parameters found by the grid search and
# evaluate it on the held-out test split. The parameters are read from
# best_params_ rather than copied by hand from the printed output, so this cell
# cannot drift out of sync with the search if the grid or the data changes.
random_forest_classifier = RandomForestClassifier(
    random_state=0, **random_forest_classifier_GS.best_params_
)
random_forest_classifier.fit(train_X, train_y)
predictions = random_forest_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



# Locally Weighted Random Forest

In [26]:
# Bring the class itself into scope: later cells call
# LocallyWeightedRandomForest(...) unqualified, which a bare `import` of the
# module does not provide (NameError on a fresh kernel).
# NOTE(review): assumes the class is defined in the locallyWeightedRandomForest
# module under exactly this name — confirm against the module.
import locallyWeightedRandomForest
from locallyWeightedRandomForest import LocallyWeightedRandomForest

In [28]:
# Hyperparameters for the locally weighted forest.
# Alternative configuration kept for reference (the grid-search optimum reported
# for the baseline forest): n_estimators=50, max_samples=0.9, max_depth=10.
lwrf_params = {"n_estimators": 1000, "max_samples": 0.5, "max_depth": 6}
lwrf = LocallyWeightedRandomForest(**lwrf_params)

In [29]:
def average_euclidean_distance(point, dataset):
    '''
    Mean squared Euclidean distance between a test point and a set of points.

    Note that, despite the function's name, no square root is taken: the value
    returned is the average of the *squared* L2 distances.

    Input:
        point   - array-like holding one test point with d features, given as
                  shape (d,) or (1, d). Size-1 axes are squeezed away first.
        dataset - array-like of shape (n, d), one row per reference point.
    Output:
        A single float: the mean over all rows of dataset of
        ||row - point||^2.
    Raises:
        AssertionError if point and dataset disagree on the number of features.
    '''
    # Source: CSC2515 Homework 3 Starter Code.

    # Accept lists / DataFrames / Series as well as ndarrays.
    dataset = np.asarray(dataset, dtype=float)

    # Process test point shape. atleast_2d also covers single-feature data,
    # where squeezing a (1, 1) point leaves a 0-d array.
    point = np.atleast_2d(np.squeeze(np.asarray(point, dtype=float)))
    assert point.shape[1] == dataset.shape[1]

    # Compute squared distance via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    dataset_norm = (dataset**2).sum(axis=1).reshape(-1, 1)
    test_norm = (point**2).sum(axis=1).reshape(1, -1)
    dist = dataset_norm + test_norm - 2*dataset.dot(point.transpose())

    return np.mean(dist)

In [30]:
# Fit the locally weighted forest on the training split.
lwrf.fit(train_X, train_y)

In [31]:
# Predict the test split with the locally weighted forest, passing the distance
# function defined above and a temperature of 2, then report per-class metrics.
pred = lwrf.predict(test_X, average_euclidean_distance, temperature=2)
lwrf_report = classification_report(test_y, pred, digits=5)
print(lwrf_report)

              precision    recall  f1-score   support

           0    0.98039   0.92593   0.95238       108
           1    0.92233   0.97938   0.95000        97

    accuracy                        0.95122       205
   macro avg    0.95136   0.95265   0.95119       205
weighted avg    0.95292   0.95122   0.95125       205

