## Reproduce results of Scheme B

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics

In [21]:
trainset = 'ePBRN_F_dup' 
testset = 'ePBRN_D_dup'

import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold


def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    unique_match_id = unique_match_id[~np.isnan(unique_match_id)] # remove nan values
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return ((f11 == f22) & (f12 == f21)).astype(float)

def join_names_space(f11, f12, f21, f22):
    return ((f11+" "+f12 == f21) | (f11+" "+f12 == f22)| (f21+" "+f22 == f11)| (f21+" "+f22 == f12)).astype(float)

def join_names_dash(f11, f12, f21, f22):
    return ((f11+"-"+f12 == f21) | (f11+"-"+f12 == f22)| (f21+"-"+f22 == f11)| (f21+"-"+f22 == f12)).astype(float)

def abb_surname(f1, f2):
    return ((f1[0]==f2) | (f1==f2[0])).astype(float)

def reset_day(f11, f12, f21, f22):
    return (((f11 == 1) & (f12 == 1))|((f21 == 1) & (f22 == 1))).astype(float)

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='levenshtein', label='y_name_leven')
    c.string('surname', 'surname', method='levenshtein', label='y_surname_leven')  
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name_jaro')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname_jaro')  
    c.string('postcode', 'postcode', method='jarowinkler', label='y_postcode')      
    exact_fields = ['postcode', 'address_1', 'address_2', 'street_number']
    for field in exact_fields:
        c.exact(field, field, label='y_'+field+'_exact')
    c.compare_vectorized(reset_day,('day', 'month'), ('day', 'month'),label='reset_day_flag')    
    c.compare_vectorized(swap_fields_flag,('day', 'month'), ('day', 'month'),label='swap_day_month')    
    c.compare_vectorized(swap_fields_flag,('surname', 'given_name'), ('surname', 'given_name'),label='swap_names')    
    c.compare_vectorized(join_names_space,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_space')
    c.compare_vectorized(join_names_dash,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_dash')
    c.compare_vectorized(abb_surname,'surname', 'surname',label='abb_surname')
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

def generate_train_X_y(df):
    # This routine is to generate the feature vector X and the corresponding labels y
    # with exactly equal number of samples for both classes to train the classifier.
    pos = extract_features(df, train_true_links)
    train_false_links = generate_false_links(df, len(train_true_links))    
    neg = extract_features(df, train_false_links)
    X = pos.values.tolist() + neg.values.tolist()
    y = [1]*len(pos)+[0]*len(neg)
    X, y = shuffle(X, y, random_state=0)
    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'svm': # Support Vector Machine
        model = svm.SVC(C = modelparam, kernel = modeltype_2)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nb': # Naive Bayes
        model = GaussianNB()
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=30000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

    
def evaluation(test_labels, result):
    true_pos = np.logical_and(test_labels, result)
    count_true_pos = np.sum(true_pos)
    true_neg = np.logical_and(np.logical_not(test_labels),np.logical_not(result))
    count_true_neg = np.sum(true_neg)
    false_pos = np.logical_and(np.logical_not(test_labels), result)
    count_false_pos = np.sum(false_pos)
    false_neg = np.logical_and(test_labels,np.logical_not(result))
    count_false_neg = np.sum(false_neg)
    precision = count_true_pos/(count_true_pos+count_false_pos)
    sensitivity = count_true_pos/(count_true_pos+count_false_neg) # sensitivity = recall
    confusion_matrix = [count_true_pos, count_false_pos, count_false_neg, count_true_neg]
    no_links_found = np.count_nonzero(result)
    no_false = count_false_pos + count_false_neg
    Fscore = 2*precision*sensitivity/(precision+sensitivity)
    metrics_result = {'no_false':no_false, 'confusion_matrix':confusion_matrix ,'precision':precision,
                     'sensitivity':sensitivity ,'no_links':no_links_found, 'F-score': Fscore}
    return metrics_result

def blocking_performance(candidates, true_links, df):
    count = 0
    for candi in candidates:
        if df.loc[candi[0]]["match_id"]==df.loc[candi[1]]["match_id"]:
            count = count + 1
    return count

In [22]:
## TRAIN SET CONSTRUCTION

# Import
print("Import train set...")
df_train = pd.read_csv(trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set
df_train['postcode'] = df_train['postcode'].astype(str)

# Final train feature vectors and labels
X_train, y_train = generate_train_X_y(df_train)
print("Finished building X_train, y_train")

Import train set...
Train set size: 14078 , number of matched pairs:  3192
Finished building X_train, y_train


In [23]:
print(X_train.shape)

(6384, 15)


In [24]:
# Blocking Criteria: declare non-match of all of the below fields disagree
# Import
print("Import test set...")
df_test = pd.read_csv(testset+".csv", index_col = "rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
print("Test set size:", len(df_test), ", number of matched pairs: ", str(leng_test_true_links))

print("BLOCKING PERFORMANCE:")
blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    block_indexer = rl.Index()
    block_indexer.block(on=field)
#     block_indexer = rl.BlockIndex(on=field)
    candidates = block_indexer.index(df_test)
    detects = blocking_performance(candidates, test_true_links, df_test)
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    print("Number of pairs of matched "+ field +": "+str(len(candidates)), ", detected ",
         detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )
detects = blocking_performance(all_candidate_pairs, test_true_links, df_test)
print("Number of pairs of at least 1 field matched: " + str(len(all_candidate_pairs)), ", detected ",
     detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )

Import test set...
Test set size: 11731 , number of matched pairs:  2653
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 252552 , detected  1567 /2653 true matched pairs, missed 1086
Number of pairs of matched surname: 33832 , detected  1480 /2653 true matched pairs, missed 1173
Number of pairs of matched postcode: 79940 , detected  2462 /2653 true matched pairs, missed 191
Number of pairs of at least 1 field matched: 362910 , detected  2599 /2653 true matched pairs, missed 54


In [25]:
## TEST SET CONSTRUCTION

# Preprocess test set
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
labels = [0]*len(vectors)
feature_index = df_X_test.index
for i in range(0, len(feature_index)):
    if df_test.loc[feature_index[i][0]]["match_id"]==df_test.loc[feature_index[i][1]]["match_id"]:
        labels[i] = 1
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...
Count labels of y_test: Counter({0: 360311, 1: 2599})
Finished building X_test, y_test


In [26]:
def argmax(a):
    np_a = np.array(a)
    i = np.argmax(np_a)
    return np_a[i], i

def argmin(a):
    np_a = np.array(a)
    i = np.argmin(np_a)
    return np_a[i], i

In [27]:
print(np.average(y_train), np.average(y_test))

0.5 0.007161555206525034


In [29]:
## BASE LEARNERS CLASSIFICATION AND EVALUATION
# Choose model
print("BASE LEARNERS CLASSIFICATION PERFORMANCE:")
models = {
    'svm': ['linear', 'rbf'],
    'lg': ['none', 'l2'],
    'nn': ['relu', 'logistic'],
}

modelparam_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000] # C for svm, C for lg, alpha for NN
for model, model_types in models.items():
    for model_type in model_types:
        precision = []
        sensitivity = []
        Fscore = []
        nb_false = []
        print("Model:", model,", Param_1:",model_type, ", tuning range:", modelparam_range)
        for modelparam in modelparam_range:
            md = train_model(model, modelparam, X_train, y_train, model_type)
            final_result = classify(md, X_test)
            final_eval = evaluation(y_test, final_result)
            precision += [final_eval['precision']]
            sensitivity += [final_eval['sensitivity']]
            Fscore += [final_eval['F-score']]
            nb_false  += [final_eval['no_false']]
        print("No_false:",nb_false,"\n")
        print("Precision:",precision,"\n")
        print("Sensitivity:",sensitivity,"\n")
        print("F-score:", Fscore,"\n")

        print("MIN No_false:",argmin(nb_false),"\n")
        print("MAX Precision:",argmax(precision),"\n")
        print("MAX Sensitivity:",argmax(sensitivity),"\n")
        print("MAX F-score:", argmax(Fscore),"\n")
        print("")
    


BASE LEARNERS CLASSIFICATION PERFORMANCE:
Model: svm , Param_1: linear , tuning range: [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
No_false: [1217, 1754, 50444, 77628, 77631, 77633, 77637, 78113, 78429, 79735, 91408, 95127, 92091, 96668, 96668, 96206, 96216, 96204, 96226, 96253] 

Precision: [0.6910951327433629, 0.600715137067938, 0.048555415070483665, 0.03218565674452245, 0.03218445270219717, 0.03218365005735947, 0.03220537629045933, 0.032177727115031966, 0.032052231437598735, 0.03155516688529369, 0.02764687736019658, 0.026594764955078485, 0.027447460133065794, 0.02618191342540824, 0.02618191342540824, 0.02630433682505946, 0.026301674846936195, 0.026304869285345586, 0.02629901340753858, 0.026291830210820218] 

Sensitivity: [0.9615236629472874, 0.9696036937283571, 0.9899961523662947, 0.9930742593305117, 0.9930742593305117, 0.9930742593305117, 0.993843786071566, 0.9992304732589458, 0.9992304732589458, 0.9996152366294728, 1.0,



No_false: [77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476, 77476] 

Precision: [0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299, 0.03245707149547299] 

Sensitivity: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 

F-score: [0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.06287345477417325, 0.062873454774

  precision = count_true_pos/(count_true_pos+count_false_pos)


No_false: [95155, 98707, 121895, 95580, 97085, 100988, 96651, 92362, 93091, 87075, 78045, 77442, 72352, 52326, 24554, 5871, 1780, 1447, 1202, 2599] 

Precision: [0.026558088145025985, 0.02562685093780849, 0.020861113342437145, 0.026462409729366348, 0.02604386123317081, 0.025071681646602242, 0.02616730145295528, 0.027349221226449866, 0.027160622844602363, 0.028982759774293553, 0.03220486111111111, 0.03244749303447156, 0.03448966244444148, 0.04692311897516264, 0.09464199992614748, 0.30402491614758026, 0.5974304068522484, 0.6489141675284384, 0.693222683264177, nan] 

Sensitivity: [0.9988457098884186, 0.9988457098884186, 0.9992304732589458, 0.9996152366294728, 0.9988457098884186, 0.9992304732589458, 0.9992304732589458, 0.9992304732589458, 1.0, 1.0, 0.9992304732589458, 0.9992304732589458, 0.9942285494420932, 0.990765679107349, 0.9861485186610235, 0.9765294343978453, 0.966140823393613, 0.9657560600230858, 0.9642170065409773, 0.0] 

F-score: [0.05174046060171206, 0.049971607041453724, 0.04086

  precision = count_true_pos/(count_true_pos+count_false_pos)
  precision = count_true_pos/(count_true_pos+count_false_pos)


No_false: [84565, 97293, 116997, 95893, 86765, 79602, 78447, 76008, 80946, 77116, 69881, 61873, 34483, 6014, 2017, 1557, 1209, 2599, 2599, 360311] 

Precision: [0.029817355789087237, 0.026018099547511313, 0.021731496036656743, 0.02638792998416115, 0.029083299762768005, 0.03161762022359825, 0.03206820817807171, 0.033051332612429235, 0.0310865323613555, 0.032580195957897906, 0.03565315812720848, 0.04006952419379869, 0.06939117281616336, 0.2989520781820323, 0.5653052064631957, 0.6309703368526898, 0.6918829376035339, nan, nan, 0.007161555206525034] 

Sensitivity: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9996152366294728, 0.9992304732589458, 0.9992304732589458, 0.993843786071566, 0.9934590227010388, 0.9884570988841862, 0.9769141977683724, 0.9692189303578299, 0.9657560600230858, 0.9642170065409773, 0.0, 0.0, 1.0] 

F-score: [0.05790804674531822, 0.05071664829106947, 0.042538565407749904, 0.051419018508076876, 0.0565227319682916, 0.061297169811320745, 0.062143582999581566, 0.06398699571449681, 0

In [30]:
## ENSEMBLE CLASSIFICATION AND EVALUATION

print("BAGGING PERFORMANCE:\n")
modeltypes = ['svm', 'nn', 'lg'] 
modeltypes_2 = ['rbf', 'relu', 'l2']
modelparams = [0.001, 2000, 0.005]
nFold = 10
kf = KFold(n_splits=nFold)
model_raw_score = [0]*3
model_binary_score = [0]*3
model_i = 0
for model_i in range(3):
    modeltype = modeltypes[model_i]
    modeltype_2 = modeltypes_2[model_i]
    modelparam = modelparams[model_i]
    print(modeltype, "per fold:")
    iFold = 0
    result_fold = [0]*nFold
    final_eval_fold = [0]*nFold
    for train_index, valid_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        md =  train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold[iFold] = classify(md, X_test)
        final_eval_fold[iFold] = evaluation(y_test, result_fold[iFold])
        print("Fold", str(iFold), final_eval_fold[iFold])
        iFold = iFold + 1
    bagging_raw_score = np.average(result_fold, axis=0)
    bagging_binary_score  = np.copy(bagging_raw_score)
    bagging_binary_score[bagging_binary_score > 0.5] = 1
    bagging_binary_score[bagging_binary_score <= 0.5] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    print(modeltype, "bagging:", bagging_eval)
    print('')
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score
    
thres = .99
print("STACKING PERFORMANCE:\n")
stack_raw_score = np.average(model_raw_score, axis=0)
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[stack_binary_score > thres] = 1
stack_binary_score[stack_binary_score <= thres] = 0
stacking_eval = evaluation(y_test, stack_binary_score)
print(stacking_eval)

BAGGING PERFORMANCE:

svm per fold:
Fold 0 {'no_false': 4713, 'confusion_matrix': [2552, 4666, 47, 355645], 'precision': 0.35356054308672763, 'sensitivity': 0.9819161215852251, 'no_links': 7218, 'F-score': 0.5199144341448508}
Fold 1 {'no_false': 4513, 'confusion_matrix': [2550, 4464, 49, 355847], 'precision': 0.3635585970915312, 'sensitivity': 0.9811465948441709, 'no_links': 7014, 'F-score': 0.5305315718298138}
Fold 2 {'no_false': 4616, 'confusion_matrix': [2552, 4569, 47, 355742], 'precision': 0.35837663249543605, 'sensitivity': 0.9819161215852251, 'no_links': 7121, 'F-score': 0.5251028806584361}
Fold 3 {'no_false': 4674, 'confusion_matrix': [2552, 4627, 47, 355684], 'precision': 0.35548126480011144, 'sensitivity': 0.9819161215852251, 'no_links': 7179, 'F-score': 0.5219881366332583}
Fold 4 {'no_false': 4604, 'confusion_matrix': [2550, 4555, 49, 355756], 'precision': 0.3589021815622801, 'sensitivity': 0.9811465948441709, 'no_links': 7105, 'F-score': 0.5255564715581204}
Fold 5 {'no_fals

In [31]:
print(model_raw_score)

[array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])]


In [59]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = [1, 'log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='f1')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [60]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 1,
 'max_depth': 20,
 'bootstrap': False}

In [61]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
#     errors = abs(predictions - test_labels)
#     print(errors)
#     mape = 100 * np.mean(errors / test_labels)
#     accuracy = 100 - mape
    print('Model Performance')
#     print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(predictions))
    
    return accuracy
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
print(evaluation(y_test, base_model.predict(X_test)))

best_random = rf_random.best_estimator_
print(evaluation(y_test, best_random.predict(X_test)))

# print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

{'no_false': 178505, 'confusion_matrix': [2599, 178505, 0, 181806], 'precision': 0.014350870218217158, 'sensitivity': 1.0, 'no_links': 181104, 'F-score': 0.02829567290681154}
{'no_false': 115029, 'confusion_matrix': [2599, 115029, 0, 245282], 'precision': 0.022095079402863264, 'sensitivity': 1.0, 'no_links': 117628, 'F-score': 0.043234880684039356}


In [62]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [15, 20, 25],
    'max_features': [1, 'log2', 'sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1700, 1750, 1800, 1850, 1900]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring='f1')

In [63]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 405 candidates, totalling 1215 fits
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=450; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=350; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=500; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=450; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=1, 

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=450; total time=   1.6s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=400; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=450; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.7s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=450; total time=   1.7s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=400; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=500; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   1.0s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=350; total time=   1.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=450; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=350; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.5s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=500; total time=   1.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=450; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.3s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=350; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=450; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   1.0s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=350; total time=   1.0s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=500; total time=   1.4s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.6s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=300; total time=   0.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=500; total time=   0.2s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=450; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   1.6s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=350; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=300; total time=   0.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=450; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.7s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=350; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=450; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   1.4s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=350; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.7s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=450; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=350; total time=   0.8s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=450; total time=   1.7s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=1, n_estimators=500; total time=   0.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=350; total time=   0.9s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, m

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=450; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=350; total time=   1.2s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=500; total time=   1.5s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   1.3s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=500; total time=   1.4s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=450; total time=   1.4s
[CV] END bootstrap=False, m

{'bootstrap': False,
 'max_depth': 15,
 'max_features': 1,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1700}

In [64]:
best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
# grid_accuracy = evaluate(best_grid, test_features, test_labels)
print(evaluation(y_test, best_grid.predict(X_test)))

{'no_false': 115203, 'confusion_matrix': [2599, 115203, 0, 245108], 'precision': 0.022062443761566016, 'sensitivity': 1.0, 'no_links': 117802, 'F-score': 0.04317239890034136}


In [65]:
best_grid

[CV] END bootstrap=False, max_depth=15, max_features=log2, min_samples_leaf=3, min_samples_split=4, n_estimators=1850; total time=   7.5s
[CV] END bootstrap=False, max_depth=15, max_features=log2, min_samples_leaf=3, min_samples_split=5, n_estimators=1800; total time=   7.3s
[CV] END bootstrap=False, max_depth=15, max_features=log2, min_samples_leaf=3, min_samples_split=6, n_estimators=1750; total time=   6.8s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1700; total time=   7.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1900; total time=   7.7s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1800; total time=   9.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1800; total time=   9.6s
[CV] END bootstrap=False, max_dept

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1700; total time=   7.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1850; total time=   7.6s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1800; total time=   8.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1750; total time=   7.9s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1700; total time=   7.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1900; total time=   8.0s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1850; total time=   7.2s
[CV] END bootstrap=False, max_depth=2

[CV] END bootstrap=False, max_depth=25, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1750; total time=  10.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1850; total time=   8.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1800; total time=   8.5s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1800; total time=   7.8s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1700; total time=   7.4s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1700; total time=   6.9s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1850; total time=   7.5s
[CV] END bootstrap=False, max_depth=20, 

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1700; total time=   7.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1850; total time=   7.9s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1800; total time=   8.4s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1750; total time=   9.7s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1700; total time=   8.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1900; total time=   9.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1850; total time=   7.6s
[CV] END bootstrap=False, max_dept

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1750; total time=   9.9s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1700; total time=   7.2s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1900; total time=   8.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1850; total time=   7.4s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1750; total time=   8.0s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1750; total time=   7.6s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1700; total time=   7.1s
[CV] END bootstrap=False, max_dept

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1750; total time=   7.7s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1700; total time=   7.8s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1700; total time=   7.2s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1850; total time=   8.2s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1800; total time=   7.4s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1750; total time=   7.7s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1700; total time=   6.6s
[CV] END bootstrap=False, max_depth=20, 

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1750; total time=   7.4s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1700; total time=   7.5s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1900; total time=   8.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1850; total time=   8.5s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1800; total time=   7.7s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1750; total time=   7.9s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1700; total time=   6.7s
[CV] END bootstrap=False, max_depth=20, 

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1900; total time=  10.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1900; total time=   8.8s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1850; total time=   8.4s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1800; total time=   7.8s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1800; total time=   7.7s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1700; total time=   7.9s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1900; total time=   8.5s
[CV] END bootstrap=False, max_dept

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1900; total time=   8.9s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1900; total time=   7.4s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1850; total time=   7.2s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=6, n_estimators=1800; total time=   7.3s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=4, n_estimators=1750; total time=   6.6s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=5, n_estimators=1700; total time=   6.6s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=5, n_estimators=1900; total time=   8.1s
[CV] END bootstrap=False, max_depth=20, max_features

[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1850; total time=   6.9s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=6, n_estimators=1750; total time=   7.2s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=4, n_estimators=1750; total time=   6.5s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=4, n_estimators=1900; total time=   7.8s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=5, n_estimators=1900; total time=   7.9s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=6, n_estimators=1850; total time=   7.7s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=3, min_samples_split=4, n_estimators=1800; total time=   8.7s
[CV] END bootstrap=False, max_depth=20, max_features=1,

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1850; total time=   7.9s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1800; total time=   7.1s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1700; total time=   7.6s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1700; total time=   6.3s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1850; total time=   7.4s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=6, n_estimators=1800; total time=   7.5s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=4, n_estimators=1750; total time=   6.8s
[CV] END bootstrap=False, max_depth=20, max_featu

[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1700; total time=   7.7s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1700; total time=   7.2s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1850; total time=   8.4s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=1800; total time=   8.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=1750; total time=   7.6s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1750; total time=   7.4s
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1900; total time=   7.7s
[CV] END bootstrap=False, max_depth=20, 

In [55]:
np.unique(y_train)

array([0, 1])