In [None]:
!pip3 install -r requirements.txt

In [1]:
import numpy as np
import pandas as pd
import statistics



from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from load_data import load_heart_data, load_tweet_data

In [2]:
train_X, test_X, train_y, test_y = load_heart_data("heart_dataset.csv")

In [3]:
print("Shape of Train X: ", train_X.shape, ", Type: ", type(train_X))
print("Shape of Test X: ", test_X.shape, ", Type: ", type(test_X))
print("Shape of Train y: ", train_y.shape, ", Type: ", type(train_y))
print("Shape of Test y: ", test_y.shape, ", Type: ", type(test_y))

Shape of Train X:  (820, 13) , Type:  <class 'numpy.ndarray'>
Shape of Test X:  (205, 13) , Type:  <class 'numpy.ndarray'>
Shape of Train y:  (820,) , Type:  <class 'numpy.ndarray'>
Shape of Test y:  (205,) , Type:  <class 'numpy.ndarray'>


# Baseline Performance

Before testing out the new method explored in the project regarding the locally weighted trees, we will first test out the existing method we aim to improve upon, the random forest model.

In [155]:
# To find the best random forrest model, we need to tune the hyperparameters. Will do 
# so by performing a grid search 

grid_search_parameters = {'max_depth':[5, 10, 15, 30, 50, None], 
                          'n_estimators':[10, 25, 50, 100, 250, 500, 1000], 
                          'max_samples': [0.3, 0.5, 0.7, 0.8, 0.9, 1]}


In [156]:
random_forest_classifier = RandomForestClassifier(random_state=1)
random_forest_classifier_GS = GridSearchCV(random_forest_classifier, grid_search_parameters, cv=10, verbose=0)
random_forest_classifier_GS.fit(train_X, train_y)


KeyboardInterrupt: ignored

In [None]:
print("Best Parameters: ", random_forest_classifier_GS.best_params_)
print("Best Score: ", random_forest_classifier_GS.best_score_)

In [138]:
# Train the model with the best parameters:
random_forest_classifier = RandomForestClassifier(random_state=1, n_estimators=500, max_samples=0.7, max_depth=10)
random_forest_classifier.fit(train_X, train_y)
predictions = random_forest_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.94444   0.97143       108
           1    0.94175   1.00000   0.97000        97

    accuracy                        0.97073       205
   macro avg    0.97087   0.97222   0.97071       205
weighted avg    0.97244   0.97073   0.97075       205



In [139]:
RF_prob = random_forest_classifier.predict_proba(test_X)
fpr, tpr, _ = metrics.roc_curve(test_y, RF_prob[:, 1])
print("AUC = ", metrics.auc(fpr, tpr)) 

AUC =  0.9985681557846506


### Other Model Performance

In [4]:
from sklearn import metrics

KNN Model 

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [118]:
grid_search_parameters = {'n_neighbors':[2, 5, 8, 10,15,25]}

KNN = KNeighborsClassifier()
KNN_classifier_GS = GridSearchCV(KNN, grid_search_parameters, cv=5, verbose=0)
KNN_classifier_GS.fit(train_X, train_y)

print("Best Parameters: ", KNN_classifier_GS.best_params_)
print("Best Score: ", KNN_classifier_GS.best_score_)

Best Parameters:  {'n_neighbors': 2}
Best Score:  0.9414634146341463


In [119]:
# Train the model with the best parameters:
KNN_classifier = KNeighborsClassifier(n_neighbors=2)
KNN_classifier.fit(train_X, train_y)
predictions = KNN_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.98131   0.97222   0.97674       108
           1    0.96939   0.97938   0.97436        97

    accuracy                        0.97561       205
   macro avg    0.97535   0.97580   0.97555       205
weighted avg    0.97567   0.97561   0.97562       205



In [129]:
knn_prob = KNN_classifier.predict_proba(test_X)
fpr, tpr, _ = metrics.roc_curve(test_y, knn_prob[:, 1])
print("AUC = ", metrics.auc(fpr, tpr)) 

AUC =  0.9844883543337152


### SVM

In [147]:
from sklearn.svm import SVC

grid_search_parameters = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                          'C': [0.25,0.5,1,2,3,5],
                          'gamma': [0.1, 1, 'scale', 'auto']}
                          
#Using LibLinear to allow for both l1 and l2 loss in grid search 
svm = SVC()
svm_GS = GridSearchCV(svm, grid_search_parameters, cv=5, verbose=0)
svm_GS.fit(train_X, train_y)

print("Best Parameters: ", svm_GS.best_params_)
print("Best Score: ", svm_GS.best_score_)

Best Parameters:  {'C': 5, 'gamma': 0.1, 'kernel': 'poly'}
Best Score:  0.9707317073170731


In [150]:
# Train the model with the best parameters:
SVM_classifier = SVC(C=1, gamma=0.1, kernel='rbf', probability=True)
SVM_classifier.fit(train_X, train_y)
predictions = SVM_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.95238   0.92593   0.93897       108
           1    0.92000   0.94845   0.93401        97

    accuracy                        0.93659       205
   macro avg    0.93619   0.93719   0.93649       205
weighted avg    0.93706   0.93659   0.93662       205



In [151]:
svm_prob = SVM_classifier.predict_proba(test_X)
fpr, tpr, _ = metrics.roc_curve(test_y, svm_prob[:, 1])
print("AUC = ", metrics.auc(fpr, tpr)) 

AUC =  0.9572355861015654


### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
grid_search_parameters = {'penalty':['l2', 'l1'], 
                          'C': [0.25,0.5,1,2,3,5]}
                          
#Using LibLinear to allow for both l1 and l2 loss in grid search 
LR = LogisticRegression(solver='liblinear')
LogisticRegression_GS = GridSearchCV(LR, grid_search_parameters, cv=5, verbose=0)
LogisticRegression_GS.fit(train_X, train_y)

print("Best Parameters: ", LogisticRegression_GS.best_params_)
print("Best Score: ", LogisticRegression_GS.best_score_)

Best Parameters:  {'C': 1, 'penalty': 'l1'}
Best Score:  0.8512195121951219


In [132]:
# Train the model with the best parameters:
LR_classifier = LogisticRegression(solver='liblinear', C=1, penalty='l1')
LR_classifier.fit(train_X, train_y)
predictions = LR_classifier.predict(test_X)

print(classification_report(test_y, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.91489   0.79630   0.85149       108
           1    0.80180   0.91753   0.85577        97

    accuracy                        0.85366       205
   macro avg    0.85835   0.85691   0.85363       205
weighted avg    0.86138   0.85366   0.85351       205



In [134]:
LR_prob = LR_classifier.predict_proba(test_X)
fpr, tpr, _ = metrics.roc_curve(test_y, LR_prob[:, 1])
print("AUC = ", metrics.auc(fpr, tpr)) 

AUC =  0.9080756013745706


# Locally Weighted Random Forest

In [7]:
from locallyWeightedRandomForest import LocallyWeightedRandomForest
from sklearn.utils.estimator_checks import check_estimator

# Distance functions

TODO move to a separate file, here for easier testing (for now) 

In [8]:
def euclidean_distance(x_1, x_2):
    return np.linalg.norm(x_1 - x_2)

def mean_distance(point, dataset, distance_function = lambda a,b: 1):
    distance_sum = 0
    for p in dataset:
        distance_sum += distance_function(point, p)
    return distance_sum/len(dataset)

def distance_to_dataset_mean(point, dataset, distance_function = lambda a,b: 1):
    mean_dataset_point = np.mean(dataset, axis=0)
    return distance_function(point, mean_dataset_point)

def median_distance(point, dataset, distance_function = lambda a,b: 1):
    distances = []
    for p in dataset:
        distances.append(distance_function(point, p))
    return statistics.median(distances)

def smallest_k_distance_mean(k):
    def _smallest_k_distances_mean(point, dataset, distance_function = lambda a,b: 1):
        distances = np.zeros(len(dataset))
        for i in range(len(dataset)):
            distances[i] = distance_function(point, dataset[i])
        smallest_k_distaces = np.sort(distances)[:k]
        return np.mean(smallest_k_distaces)

    return _smallest_k_distances_mean

In [None]:
# lwrf = LocallyWeightedRandomForest(n_estimators=50, max_samples=0.9, max_depth=10)
lwrf = LocallyWeightedRandomForest(n_estimators=50, max_samples=0.9, max_depth=10)
#TODO: lwrf class should allow execution of predict independent of execution of fit
#check_estimator(lwrf)

In [None]:
lwrf.fit(train_X, train_y)

In [None]:
pred = lwrf.predict(test_X, temperature=1)
print(classification_report(test_y, pred, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



In [None]:
pred = lwrf.predict(test_X, temperature=0.15, distance_function=euclidean_distance, distance_aggregation_function=mean_distance)
print(classification_report(test_y, pred, digits=5))

              precision    recall  f1-score   support

           0    1.00000   0.97222   0.98592       108
           1    0.97000   1.00000   0.98477        97

    accuracy                        0.98537       205
   macro avg    0.98500   0.98611   0.98534       205
weighted avg    0.98580   0.98537   0.98537       205



In [None]:
#TODO move to separate file 

def cross_validation(X, y, model, temperature, distance_function, distance_agg_func, folds=10, verbose=False):
    k_folds = KFold(n_splits=folds)
    validation_scores = []

    for train_index, val_index in k_folds.split(X):
        train_X, val_X = X[train_index], X[val_index]
        train_y, val_y = y[train_index], y[val_index]


        model.fit(train_X, train_y)
        predictions = model.predict(val_X, temperature, distance_function, distance_agg_func)

        accuracy = accuracy_score(val_y, predictions)

        if verbose:
            print("Fold Accuracy: ", accuracy)

        validation_scores.append(accuracy)

    return np.mean(np.array(validation_scores))

# Experiments

### Comparing sample sizes vs performance 

For this graph, we will keep the number of estimators constant at 300. 

In [None]:
performance = []

In [None]:
# For the random forest, we will try various values for the portion of the population we 
# sample the data from 
sample_sizes = range(0.1, 1.01, 0.1)

for size in sample_sizes:
    lwrf = LocallyWeightedRandomForest(n_estimators=300, max_samples=size, max_depth=10)


In [11]:
from sklearn.model_selection import cross_val_score


In [None]:
cross_validation(X=train_X, y=train_y, model=lwrf, temperature=1.0, distance_agg_func=mean_distance, distance_function=euclidean_distance, folds=5, verbose=True)

Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9451219512195121
Fold Accuracy:  1.0


0.9817073170731707

In [None]:
cross_validation(X=train_X, y=train_y, model=lwrf, temperature=2.0, distance_agg_func=mean_distance, distance_function=euclidean_distance, folds=5, verbose=True)

Fold Accuracy:  0.975609756097561
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9878048780487805
Fold Accuracy:  0.9634146341463414
Fold Accuracy:  1.0


0.9829268292682926

### Comparing Temperature vs Performance

In [9]:
temperature_list=[0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.75,0.85, 1, 2, 5, 10, 15, 25]

In [None]:
euc_mean_temp_scores = []
for temp in temperature_list:
  model = LocallyWeightedRandomForest(n_estimators=300, max_samples=0.7, max_depth=10, distance_aggregation_function=mean_distance, distance_function=euclidean_distance)
  euc_mean_temp_scores.append(np.mean(cross_val_score(model, train_X, train_y, cv=5, n_jobs=-1, verbose=2)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
euc_mean_temp_scores

In [None]:



# Baseline with random forest
model = RandomForestClassifier(random_state=1, n_estimators=300, max_samples=0.4, max_depth=10)
random_forest_cross_eval = np.mean(cross_val_score(model, train_X, train_y, cv=5))


plt.plot(temperature_list, k_1_temp_scores, label = "k=1 Euclidean")
plt.plot(temperature_list, k_3_temp_scores, label = "k=3 Euclidean")
plt.plot(temperature_list, k_5_temp_scores, label = "k=5 Euclidean")

plt.plot(temperature_list, euc_mean_temp_scores, label = " Euclidean Mean"  )
plt.plot(temperature_list, dataset_mean_temp_scores, label = " dataset euc Mean"  )

plt.axhline(y = random_forest_cross_eval, color = 'r', linestyle = '--', label = "Random Forest") 
plt.legend()
plt.show()