In [1]:
import copy
import types
import numbers
import numpy as np
import pandas as pd

In [2]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.metrics import *
from sklearn.metrics import accuracy_score as sklearn_accuracy_score

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [12]:
def get_accuracy(classifier, X_test, y_test):
    """
    Calculating overall accuracy for classifier.
    """
    predictions = classifier.predict(X_test)
    
    return sklearn_accuracy_score(y_test, predictions)


def joint_sort_descending(l1, l2):
    # l1 and l2 have to be numpy arrays
    idx = np.argsort(l1)[::-1]
    return l1[idx], l2[idx]

def generate_binary_at_k(y_scores, k):
    cutoff_index = int(len(y_scores) * (k / 100.0))
    predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
    return predictions_binary, cutoff_index


def precision_at_k(y_true, y_scores, k, cutoff_index_req=False):
    #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True))
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k, cutoff_index = generate_binary_at_k(y_scores_sorted, k)
    #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
    #precision = precision[1]  # only interested in precision for label 1
    precision = precision_score(y_true_sorted, preds_at_k)
    if cutoff_index_req:
        return precision, cutoff_index
    else:
        return precision


def calculate_weights_for_trees(self, X_train, y_train, k, use_oob=False):
    """
    Method to calculate trees' weights.
    """
    # calculating weights for each tree
    n_samples = X_train.shape[0]    
        
    if use_oob:
        
        list_of_weights=[]
        
        for estimator in self.estimators_:
            #Get tree's random seed to recalculate bootstrap
            random_seed = estimator.random_state
            oob_indices = _generate_unsampled_indices(random_seed, n_samples)
            X_oob, y_oob = X_train[oob_indices], y_train[oob_indices]
            list_of_weights.append(precision_at_k(y_oob, estimator.predict_proba(X_oob)[:,1], k))
        self.tree_weights=np.array(list_of_weights)
    
    else:
        self.tree_weights = np.array([precision_at_k(y_train, estimator.predict_proba(X_train)[:,1], k) 
                                     for estimator in self.estimators_])
    # ensuring that weights sum to one
    self.tree_weights = self.tree_weights / self.tree_weights.sum()
    

def predict_with_weights(self, X_test, X_train, y_train, k):
    """
    Method for weighted RF instance to leverage trees' weights.
    """
    #first, check if weights are already defined
    if not hasattr(self, 'tree_weights'):
        self.calculate_weights_for_trees(X_train, y_train, k)
    
    #getting raw predictions from trees
    raw_predictions = np.array([estimator.predict_proba(X_test)[:,1] for estimator in self.estimators_])
    #weighing raw predictions with trees' weights
    return np.sum(raw_predictions.T * self.tree_weights, axis=1)

In [10]:
#functions from sklearn
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance
    Parameters
    ----------
    seed : None | int | instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)

def _generate_sample_indices(random_state, n_samples):
    """Private function used to _parallel_build_trees function."""
    random_instance = check_random_state(random_state)
    sample_indices = random_instance.randint(0, n_samples, n_samples)

    return sample_indices

def _generate_unsampled_indices(random_state, n_samples):
    """Private function used to forest._set_oob_score function."""
    sample_indices = _generate_sample_indices(random_state, n_samples)
    sample_counts = np.bincount(sample_indices, minlength=n_samples)
    unsampled_mask = sample_counts == 0
    indices_range = np.arange(n_samples)
    unsampled_indices = indices_range[unsampled_mask]

    return unsampled_indices

In [6]:
results = []

In [7]:
n_samples = [1000, 10000, 50000, 100000]
n_features = [5, 10, 50]
n_estimators = [25, 50, 100, 250]
max_depth = [1, 3, 5, 10]
precision_at_k_params = [1, 5, 10, 25]

In [13]:
for samples in n_samples:
    for n_feature in n_features:
        for n_estimator in n_estimators:
            for depth in max_depth:
                
                print("Working on:", '-'.join([str(x) for x in (samples, n_feature, n_estimator, depth)]))

                X, y = make_classification(n_samples=samples, n_features=n_feature)

                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

                standard_rf = RandomForestClassifier(n_estimators=n_estimator, 
                                                     max_depth=depth, 
                                                     n_jobs=-1,
                                                     random_state=np.random.randint(1, 1000000))
                _ = standard_rf.fit(X_train, y_train) 

                pred_probs = standard_rf.predict_proba(X_test)[:,1]
                standard_prec_at_k_list = [precision_at_k(y_test, pred_probs, k) for k in precision_at_k_params]

                # Getting a copy of RF instance that we can manipulate
                weighted_rf = copy.deepcopy(standard_rf)

                # Adding functions to copied RF instance
                weighted_rf.calculate_weights_for_trees = types.MethodType(calculate_weights_for_trees, weighted_rf)
                weighted_rf.predict_with_weights = types.MethodType(predict_with_weights, weighted_rf)

                weighted_prec_at_k_list = []
                cutoff_index_at_k_list = []

                for k in precision_at_k_params:

                    weighted_rf.calculate_weights_for_trees(X_train, y_train, k, use_oob=True)
                    weighted_predictions = weighted_rf.predict_with_weights(X_test, X_train, y_train, k)

                    weighted_prec_at_k, cutoff_index = precision_at_k(y_test, weighted_predictions, 
                                                                      k, cutoff_index_req=True)
                    weighted_prec_at_k_list.append(weighted_prec_at_k)
                    cutoff_index_at_k_list.append(cutoff_index)
                    
                    
                results.append([samples, n_feature, n_estimator, depth, len(X_test)] + 
                               cutoff_index_at_k_list + standard_prec_at_k_list + weighted_prec_at_k_list)



Working on: 1000-5-25-1
Working on: 1000-5-25-3
Working on: 1000-5-25-5
Working on: 1000-5-25-10
Working on: 1000-5-50-1
Working on: 1000-5-50-3
Working on: 1000-5-50-5
Working on: 1000-5-50-10
Working on: 1000-5-100-1
Working on: 1000-5-100-3
Working on: 1000-5-100-5
Working on: 1000-5-100-10
Working on: 1000-5-250-1
Working on: 1000-5-250-3
Working on: 1000-5-250-5
Working on: 1000-5-250-10
Working on: 1000-10-25-1
Working on: 1000-10-25-3
Working on: 1000-10-25-5
Working on: 1000-10-25-10
Working on: 1000-10-50-1
Working on: 1000-10-50-3
Working on: 1000-10-50-5
Working on: 1000-10-50-10
Working on: 1000-10-100-1
Working on: 1000-10-100-3
Working on: 1000-10-100-5
Working on: 1000-10-100-10
Working on: 1000-10-250-1
Working on: 1000-10-250-3
Working on: 1000-10-250-5
Working on: 1000-10-250-10
Working on: 1000-50-25-1
Working on: 1000-50-25-3
Working on: 1000-50-25-5
Working on: 1000-50-25-10
Working on: 1000-50-50-1
Working on: 1000-50-50-3
Working on: 1000-50-50-5
Working on: 1000

In [14]:
results_df = pd.DataFrame(data=results,
                          columns=['n_samples', 'n_features', 
                                   'n_estimators', 'max_depth', 'n_samples_testset'] +
                                  ['cutoff_index_at_' + str(k) for k in precision_at_k_params] +
                                  ['precision_standard_at_' + str(k) for k in precision_at_k_params] +
                                  ['precision_weighted_at_' + str(k) for k in precision_at_k_params])
results_df

Unnamed: 0,n_samples,n_features,n_estimators,max_depth,n_samples_testset,cutoff_index_at_1,cutoff_index_at_5,cutoff_index_at_10,cutoff_index_at_25,precision_standard_at_1,precision_standard_at_5,precision_standard_at_10,precision_standard_at_25,precision_weighted_at_1,precision_weighted_at_5,precision_weighted_at_10,precision_weighted_at_25
0,1000,5,25,1,330,3,16,33,82,1.000000,1.000000,1.000000,0.975610,1.000000,1.000000,1.000000,0.975610
1,1000,5,25,3,330,3,16,33,82,1.000000,1.000000,0.969697,0.951220,1.000000,1.000000,0.969697,0.951220
2,1000,5,25,5,330,3,16,33,82,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,1000,5,25,10,330,3,16,33,82,1.000000,1.000000,1.000000,0.975610,1.000000,1.000000,1.000000,0.975610
4,1000,5,50,1,330,3,16,33,82,1.000000,1.000000,1.000000,0.987805,1.000000,1.000000,1.000000,0.987805
5,1000,5,50,3,330,3,16,33,82,1.000000,1.000000,1.000000,0.975610,1.000000,1.000000,1.000000,0.975610
6,1000,5,50,5,330,3,16,33,82,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
7,1000,5,50,10,330,3,16,33,82,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
8,1000,5,100,1,330,3,16,33,82,1.000000,1.000000,0.939394,0.902439,1.000000,1.000000,0.939394,0.902439
9,1000,5,100,3,330,3,16,33,82,1.000000,1.000000,0.969697,0.975610,1.000000,1.000000,0.969697,0.975610


In [15]:
for k in precision_at_k_params:
    results_df["improved_at" + str(k)] = results_df["precision_weighted_at_" + str(k)] -\
                                            results_df["precision_standard_at_" + str(k)]
    
results_df.to_csv('results_df_top_k_oob.csv')

In [17]:
for k in precision_at_k_params:
    print("For precision at %s, we improved by: %s" % (k, results_df["improved_at" + str(k)].mean()* 100))

For precision at 1, we improved by: -0.5160984848484849
For precision at 5, we improved by: -0.051846590909090995
For precision at 10, we improved by: -0.0007891414141413325
For precision at 25, we improved by: 0.012412232076866418


In [18]:
#Mean of improvement when grouped by grid search params
col_list = ['n_samples', 'n_features', 'n_estimators', 'max_depth', 'n_samples_testset']

for k in precision_at_k_params:
    print("----"*10)
    print("For k =", k)
    for col in col_list:
        print("\nGrouped by col: ", col, "precision at ", str(k), "improved:")
        results_df.groupby([col])['improved_at' + str(k)].mean() * 100

----------------------------------------
For k = 1

Grouped by col:  n_samples precision at  1 improved:


n_samples
1000     -2.083333
10000     0.063131
50000     0.000000
100000   -0.044192
Name: improved_at1, dtype: float64


Grouped by col:  n_features precision at  1 improved:


n_features
5    -0.037879
10   -0.501894
50   -1.008523
Name: improved_at1, dtype: float64


Grouped by col:  n_estimators precision at  1 improved:


n_estimators
25    -0.726010
50    -0.744949
100   -0.568182
250   -0.025253
Name: improved_at1, dtype: float64


Grouped by col:  max_depth precision at  1 improved:


max_depth
1    -1.363636
3     0.000000
5    -0.700758
10    0.000000
Name: improved_at1, dtype: float64


Grouped by col:  n_samples_testset precision at  1 improved:


n_samples_testset
330     -2.083333
3300     0.063131
16500    0.000000
33000   -0.044192
Name: improved_at1, dtype: float64

----------------------------------------
For k = 5

Grouped by col:  n_samples precision at  5 improved:


n_samples
1000     -0.260417
10000     0.037879
50000     0.017677
100000   -0.002525
Name: improved_at5, dtype: float64


Grouped by col:  n_features precision at  5 improved:


n_features
5    -0.001894
10   -0.123224
50   -0.030421
Name: improved_at5, dtype: float64


Grouped by col:  n_estimators precision at  5 improved:


n_estimators
25    -0.093592
50    -0.133996
100   -0.111269
250    0.131471
Name: improved_at5, dtype: float64


Grouped by col:  max_depth precision at  5 improved:


max_depth
1    -0.194760
3    -0.011364
5     0.000000
10   -0.001263
Name: improved_at5, dtype: float64


Grouped by col:  n_samples_testset precision at  5 improved:


n_samples_testset
330     -0.260417
3300     0.037879
16500    0.017677
33000   -0.002525
Name: improved_at5, dtype: float64

----------------------------------------
For k = 10

Grouped by col:  n_samples precision at  10 improved:


n_samples
1000     -0.063131
10000     0.050505
50000     0.012626
100000   -0.003157
Name: improved_at10, dtype: float64


Grouped by col:  n_features precision at  10 improved:


n_features
5    -0.005682
10   -0.035985
50    0.039299
Name: improved_at10, dtype: float64


Grouped by col:  n_estimators precision at  10 improved:


n_estimators
25    -0.080177
50    -0.008838
100    0.012626
250    0.073232
Name: improved_at10, dtype: float64


Grouped by col:  max_depth precision at  10 improved:


max_depth
1     2.312965e-16
3     7.638889e-02
5    -2.335859e-02
10   -5.618687e-02
Name: improved_at10, dtype: float64


Grouped by col:  n_samples_testset precision at  10 improved:


n_samples_testset
330     -0.063131
3300     0.050505
16500    0.012626
33000   -0.003157
Name: improved_at10, dtype: float64

----------------------------------------
For k = 25

Grouped by col:  n_samples precision at  25 improved:


n_samples
1000      0.025407
10000    -0.022727
50000     0.019697
100000    0.027273
Name: improved_at25, dtype: float64


Grouped by col:  n_features precision at  25 improved:


n_features
5     0.001136
10   -0.023674
50    0.059775
Name: improved_at25, dtype: float64


Grouped by col:  n_estimators precision at  25 improved:


n_estimators
25     0.032071
50     0.009848
100   -0.001361
250    0.009091
Name: improved_at25, dtype: float64


Grouped by col:  max_depth precision at  25 improved:


max_depth
1    -0.021465
3     0.062023
5     0.007071
10    0.002020
Name: improved_at25, dtype: float64


Grouped by col:  n_samples_testset precision at  25 improved:


n_samples_testset
330      0.025407
3300    -0.022727
16500    0.019697
33000    0.027273
Name: improved_at25, dtype: float64