In [1]:
import copy
import itertools
import types

import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score as sklearn_accuracy_score

In [3]:
# Ask IPython to display the value of every expression statement in a cell,
# not only the last one. The per-parameter groupby loop near the end of the
# notebook appears to rely on this to show its results, and it is why the
# forest's fit() call is assigned to `_` in the experiment loop.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
def get_accuracy(classifier, X_test, y_test):
    """
    Score a fitted classifier on a labelled sample.

    Labels for ``X_test`` are obtained from ``classifier.predict`` and compared
    against ``y_test`` with ``sklearn_accuracy_score``; that score is returned.
    """
    return sklearn_accuracy_score(y_test, classifier.predict(X_test))


def calculate_weights_for_trees(self, X_train, y_train):
    """
    Compute one normalised weight per tree and store them on the forest.

    Every estimator in ``self.estimators_`` is scored with ``get_accuracy`` on
    ``(X_train, y_train)``. The scores are divided by their sum so that the
    weights add up to one, and the result is saved as ``self.tree_weights``.

    If the scores sum to zero (every tree has accuracy 0) the division would
    yield NaN weights, so equal weights are assigned instead.

    Nothing is returned; the outcome is the ``self.tree_weights`` attribute.
    """
    # accuracy of each individual tree on the supplied data
    accuracies = np.array([get_accuracy(estimator, X_train, y_train)
                           for estimator in self.estimators_], dtype=float)

    total = accuracies.sum()
    if total > 0:
        # ensuring that weights sum to one
        self.tree_weights = accuracies / total
    else:
        # degenerate case: no tree is better than another, weigh them equally
        self.tree_weights = np.full(accuracies.shape, 1.0 / max(accuracies.size, 1))
    

def predict_with_weights(self, X_test, X_train=None, y_train=None):
    """
    Return the accuracy-weighted average of the trees' scores for ``X_test``.

    Parameters
    ----------
    X_test : samples to score.
    X_train, y_train : optional data used to compute the tree weights. They
        are only needed when ``self.tree_weights`` does not exist yet; in that
        case ``self.calculate_weights_for_trees(X_train, y_train)`` is called.

    Returns
    -------
    One value per row of ``X_test``: the sum over trees of
    ``tree_weight * predict_proba(X_test)[:, 1]``.

    Raises
    ------
    ValueError
        If no weights have been computed yet and no training data was given.
    """
    # first, check if weights are already defined
    if not hasattr(self, 'tree_weights'):
        if X_train is None or y_train is None:
            raise ValueError("tree_weights are not set; pass X_train and y_train "
                             "so they can be calculated")
        self.calculate_weights_for_trees(X_train, y_train)

    # Raw per-tree scores, one row per tree. Column 1 of predict_proba is
    # taken -- presumably the positive class of a binary problem (confirm
    # before using this with more than two classes).
    raw_predictions = np.array([estimator.predict_proba(X_test)[:, 1]
                                for estimator in self.estimators_])

    # weighing raw predictions with trees' weights
    return np.sum(raw_predictions.T * self.tree_weights, axis=1)

In [5]:
# Accumulator for the experiment: the grid loop below appends one row
# [n_samples, n_features, n_estimators, max_depth, standard acc, weighted acc]
# per configuration.
results = []

In [6]:
# Grid of settings for the experiment; the loop below evaluates every combination.

# number of rows in each synthetic data set
n_samples = [
    500,
    1000,
    10000,
    50000,
    100000,
]

# number of columns in each synthetic data set
n_features = [
    5,
    10,
    50,
]

# number of trees in the forest
n_estimators = [
    25,
    50,
    100,
    500,
]

# maximum depth of each tree
max_depth = [
    1,
    5,
    10,
]

In [7]:
# Seeded generator so that Restart & Run All reproduces the same synthetic
# data sets and the same forests (the original drew both from unseeded RNGs).
seed_source = np.random.RandomState(42)

# Start from an empty list so that re-running this cell does not append a
# second copy of every row.
results = []

# itertools.product iterates in the same order as four nested loops:
# the last grid (max_depth) varies fastest.
for samples, n_feature, n_estimator, depth in itertools.product(
        n_samples, n_features, n_estimators, max_depth):

    print("Working on:", '-'.join([str(x) for x in (samples, n_feature, n_estimator, depth)]))

    # Independent, reproducible seeds for the data set and for the forest.
    data_seed = int(seed_source.randint(1, 1000000))
    forest_seed = int(seed_source.randint(1, 1000000))

    X, y = make_classification(n_samples=samples, n_features=n_feature,
                               random_state=data_seed)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    standard_rf = RandomForestClassifier(n_estimators=n_estimator,
                                         max_depth=depth,
                                         n_jobs=-1,
                                         random_state=forest_seed)
    # Assigning to `_` keeps the fitted estimator's repr out of the cell output.
    _ = standard_rf.fit(X_train, y_train)

    standard_accuracy = get_accuracy(standard_rf, X_test, y_test)
    print("Standard RF achieves accuracy of: ", standard_accuracy)

    # Getting a copy of RF instance that we can manipulate
    weighted_rf = copy.deepcopy(standard_rf)

    # Adding functions to copied RF instance
    weighted_rf.calculate_weights_for_trees = types.MethodType(calculate_weights_for_trees, weighted_rf)
    weighted_rf.predict_with_weights = types.MethodType(predict_with_weights, weighted_rf)

    weighted_scores = weighted_rf.predict_with_weights(X_test, X_train, y_train)

    # Turning weighted scores into 1s and 0s for sklearn accuracy:
    # strictly above 0.5 is class 1, everything else is class 0.
    weighted_predictions = (weighted_scores > 0.5).astype(int)

    weighted_accuracy = sklearn_accuracy_score(y_test, weighted_predictions)

    results.append([samples, n_feature, n_estimator, depth, standard_accuracy, weighted_accuracy])

# One row per grid configuration, in the order the configurations were run.
results_df = pd.DataFrame(data=results,
                          columns=['n_samples', 'n_features',
                                   'n_estimators', 'max_depth',
                                   'accuracy_standard', 'accuracy_weighted'])

Working on: 500-5-25-1
Standard RF achieves accuracy of:  0.909090909091
Working on: 500-5-25-5
Standard RF achieves accuracy of:  0.933333333333
Working on: 500-5-25-10
Standard RF achieves accuracy of:  0.951515151515
Working on: 500-5-50-1
Standard RF achieves accuracy of:  0.878787878788
Working on: 500-5-50-5
Standard RF achieves accuracy of:  0.951515151515
Working on: 500-5-50-10
Standard RF achieves accuracy of:  0.927272727273
Working on: 500-5-100-1
Standard RF achieves accuracy of:  0.90303030303
Working on: 500-5-100-5
Standard RF achieves accuracy of:  0.951515151515
Working on: 500-5-100-10
Standard RF achieves accuracy of:  0.915151515152
Working on: 500-5-500-1
Standard RF achieves accuracy of:  0.951515151515
Working on: 500-5-500-5
Standard RF achieves accuracy of:  0.939393939394
Working on: 500-5-500-10
Standard RF achieves accuracy of:  0.89696969697
Working on: 500-10-25-1
Standard RF achieves accuracy of:  0.915151515152
Working on: 500-10-25-5
Standard RF achiev

Standard RF achieves accuracy of:  0.962303030303
Working on: 50000-5-25-10
Standard RF achieves accuracy of:  0.913757575758
Working on: 50000-5-50-1
Standard RF achieves accuracy of:  0.854909090909
Working on: 50000-5-50-5
Standard RF achieves accuracy of:  0.923575757576
Working on: 50000-5-50-10
Standard RF achieves accuracy of:  0.897939393939
Working on: 50000-5-100-1
Standard RF achieves accuracy of:  0.943393939394
Working on: 50000-5-100-5
Standard RF achieves accuracy of:  0.945454545455
Working on: 50000-5-100-10
Standard RF achieves accuracy of:  0.893454545455
Working on: 50000-5-500-1
Standard RF achieves accuracy of:  0.809212121212
Working on: 50000-5-500-5
Standard RF achieves accuracy of:  0.893696969697
Working on: 50000-5-500-10
Standard RF achieves accuracy of:  0.976303030303
Working on: 50000-10-25-1
Standard RF achieves accuracy of:  0.966121212121
Working on: 50000-10-25-5
Standard RF achieves accuracy of:  0.927575757576
Working on: 50000-10-25-10
Standard RF

In [8]:
# Preview only the first rows instead of dumping the whole grid; the complete
# table is written to CSV in the next cell.
results_df.head(10)

Unnamed: 0,n_samples,n_features,n_estimators,max_depth,accuracy_standard,accuracy_weighted
0,500,5,25,1,0.909091,0.909091
1,500,5,25,5,0.933333,0.933333
2,500,5,25,10,0.951515,0.951515
3,500,5,50,1,0.878788,0.866667
4,500,5,50,5,0.951515,0.951515
5,500,5,50,10,0.927273,0.927273
6,500,5,100,1,0.903030,0.884848
7,500,5,100,5,0.951515,0.951515
8,500,5,100,10,0.915152,0.915152
9,500,5,500,1,0.951515,0.951515


In [9]:
# Accuracy gain of the weighted forest over the standard one
# (positive means the weighted variant did better).
results_df['diff_improvement'] = results_df['accuracy_weighted'].sub(results_df['accuracy_standard'])

# Persist the full results table, including the new column.
results_df.to_csv('results_df.csv')

In [10]:
# Mean accuracy gain across all configurations, scaled by 100 (percentage points)
100 * results_df['diff_improvement'].mean()

-0.0706734006734003

In [11]:
# Mean accuracy gain (scaled by 100) for each value of every grid-search parameter
col_list = ['n_samples', 'n_features', 'n_estimators', 'max_depth']

for col in col_list:
    # A bare expression inside a loop; it is shown only because the notebook
    # sets ast_node_interactivity = "all" near the top.
    results_df.groupby([col])['diff_improvement'].mean().mul(100)

n_samples
500      -0.151515
1000      0.008418
10000    -0.029461
50000    -0.129293
100000   -0.051515
Name: diff_improvement, dtype: float64

n_features
5    -0.084747
10   -0.049192
50   -0.078081
Name: diff_improvement, dtype: float64

n_estimators
25     0.009832
50    -0.068215
100   -0.163636
500   -0.060673
Name: diff_improvement, dtype: float64

max_depth
1    -0.182020
5    -0.016061
10   -0.013939
Name: diff_improvement, dtype: float64