## Reproduce results of Scheme A

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics


In [1]:
import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold

In [2]:
def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return int((f11 == f22) and (f12 == f21))

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name')
    c.string('given_name_soundex', 'given_name_soundex', method='jarowinkler', label='y_name_soundex')
    c.string('given_name_nysiis', 'given_name_nysiis', method='jarowinkler', label='y_name_nysiis')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname')
    c.string('surname_soundex', 'surname_soundex', method='jarowinkler', label='y_surname_soundex')
    c.string('surname_nysiis', 'surname_nysiis', method='jarowinkler', label='y_surname_nysiis')
    c.exact('street_number', 'street_number', label='y_street_number')
    c.string('address_1', 'address_1', method='levenshtein', threshold=0.7, label='y_address1')
    c.string('address_2', 'address_2', method='levenshtein', threshold=0.7, label='y_address2')
    c.exact('postcode', 'postcode', label='y_postcode')
    c.exact('day', 'day', label='y_day')
    c.exact('month', 'month', label='y_month')
    c.exact('year', 'year', label='y_year')
        
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

def generate_train_X_y(df):
    # This routine is to generate the feature vector X and the corresponding labels y
    # with exactly equal number of samples for both classes to train the classifier.
    pos = extract_features(df, train_true_links)
    train_false_links = generate_false_links(df, len(train_true_links))    
    neg = extract_features(df, train_false_links)
    X = pos.values.tolist() + neg.values.tolist()
    y = [1]*len(pos)+[0]*len(neg)
    X, y = shuffle(X, y, random_state=0)
    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'svm': # Support Vector Machine
        model = svm.SVC(C = modelparam, kernel = modeltype_2)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nb': # Naive Bayes
        model = GaussianNB()
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=10000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

    
def evaluation(test_labels, result):
    true_pos = np.logical_and(test_labels, result)
    count_true_pos = np.sum(true_pos)
    true_neg = np.logical_and(np.logical_not(test_labels),np.logical_not(result))
    count_true_neg = np.sum(true_neg)
    false_pos = np.logical_and(np.logical_not(test_labels), result)
    count_false_pos = np.sum(false_pos)
    false_neg = np.logical_and(test_labels,np.logical_not(result))
    count_false_neg = np.sum(false_neg)
    precision = count_true_pos/(count_true_pos+count_false_pos)
    sensitivity = count_true_pos/(count_true_pos+count_false_neg) # sensitivity = recall
    confusion_matrix = [count_true_pos, count_false_pos, count_false_neg, count_true_neg]
    no_links_found = np.count_nonzero(result)
    no_false = count_false_pos + count_false_neg
    Fscore = 2*precision*sensitivity/(precision+sensitivity)
    metrics_result = {'no_false':no_false, 'confusion_matrix':confusion_matrix ,'precision':precision,
                     'sensitivity':sensitivity ,'no_links':no_links_found, 'F-score': Fscore}
    return metrics_result

def blocking_performance(candidates, true_links, df):
    count = 0
    for candi in candidates:
        if df.loc[candi[0]]["match_id"]==df.loc[candi[1]]["match_id"]:
            count = count + 1
    return count

In [3]:
trainset = 'febrl3_UNSW'
testset = 'febrl4_UNSW'

In [4]:
## TRAIN SET CONSTRUCTION

# Import
print("Import train set...")
df_train = pd.read_csv(trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set
df_train['postcode'] = df_train['postcode'].astype(str)
df_train['given_name_soundex'] = phonetic(df_train['given_name'], method='soundex')
df_train['given_name_nysiis'] = phonetic(df_train['given_name'], method='nysiis')
df_train['surname_soundex'] = phonetic(df_train['surname'], method='soundex')
df_train['surname_nysiis'] = phonetic(df_train['surname'], method='nysiis')

# Final train feature vectors and labels
X_train, y_train = generate_train_X_y(df_train)
print("Finished building X_train, y_train")

Import train set...
Train set size: 5000 , number of matched pairs:  1165
Finished building X_train, y_train


In [5]:
print(X_train.shape)

(2330, 13)


In [6]:
# Blocking Criteria: declare non-match of all of the below fields disagree
# Import
print("Import test set...")
df_test = pd.read_csv(testset+".csv", index_col = "rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
print("Test set size:", len(df_test), ", number of matched pairs: ", str(leng_test_true_links))

print("BLOCKING PERFORMANCE:")
blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    block_indexer = rl.Index()
    block_indexer.block(on=field)
#     block_indexer = rl.BlockIndex(on=field)
    candidates = block_indexer.index(df_test)
    detects = blocking_performance(candidates, test_true_links, df_test)
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    print("Number of pairs of matched "+ field +": "+str(len(candidates)), ", detected ",
         detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )
detects = blocking_performance(all_candidate_pairs, test_true_links, df_test)
print("Number of pairs of at least 1 field matched: " + str(len(all_candidate_pairs)), ", detected ",
     detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )

Import test set...
Test set size: 10000 , number of matched pairs:  5000
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 154898 , detected  3287 /5000 true matched pairs, missed 1713
Number of pairs of matched surname: 170843 , detected  3325 /5000 true matched pairs, missed 1675
Number of pairs of matched postcode: 53197 , detected  4219 /5000 true matched pairs, missed 781
Number of pairs of at least 1 field matched: 372073 , detected  4894 /5000 true matched pairs, missed 106


In [7]:
## TEST SET CONSTRUCTION

# Preprocess test set
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)
df_test['given_name_soundex'] = phonetic(df_test['given_name'], method='soundex')
df_test['given_name_nysiis'] = phonetic(df_test['given_name'], method='nysiis')
df_test['surname_soundex'] = phonetic(df_test['surname'], method='soundex')
df_test['surname_nysiis'] = phonetic(df_test['surname'], method='nysiis')

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
labels = [0]*len(vectors)
feature_index = df_X_test.index
for i in range(0, len(feature_index)):
    if df_test.loc[feature_index[i][0]]["match_id"]==df_test.loc[feature_index[i][1]]["match_id"]:
        labels[i] = 1
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...
Count labels of y_test: Counter({0: 367179, 1: 4894})
Finished building X_test, y_test


In [8]:
def argmax(a):
    np_a = np.array(a)
    i = np.argmax(np_a)
    return np_a[i], i

def argmin(a):
    np_a = np.array(a)
    i = np.argmin(np_a)
    return np_a[i], i

In [9]:
## BASE LEARNERS CLASSIFICATION AND EVALUATION
# Choose model
## BASE LEARNERS CLASSIFICATION AND EVALUATION
# Choose model
print("BASE LEARNERS CLASSIFICATION PERFORMANCE:")
models = {
#     'svm': ['linear', 'rbf'],
    'lg': ['none', 'l2'],
    'nn': ['relu', 'logistic'],
}

modelparam_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000] # C for svm, C for lg, alpha for NN
for model, model_types in models.items():
    for model_type in model_types:
        precision = []
        sensitivity = []
        Fscore = []
        nb_false = []
        print("Model:", model,", Param_1:",model_type, ", tuning range:", modelparam_range)
        for modelparam in modelparam_range:
            md = train_model(model, modelparam, X_train, y_train, model_type)
            final_result = classify(md, X_test)
            final_eval = evaluation(y_test, final_result)
            precision += [final_eval['precision']]
            sensitivity += [final_eval['sensitivity']]
            Fscore += [final_eval['F-score']]
            nb_false  += [final_eval['no_false']]
        print("No_false:",nb_false,"\n")
        print("Precision:",precision,"\n")
        print("Sensitivity:",sensitivity,"\n")
        print("F-score:", Fscore,"\n")

        print("MIN No_false:",argmin(nb_false),"\n")
        print("MAX Precision:",argmax(precision),"\n")
        print("MAX Sensitivity:",argmax(sensitivity),"\n")
        print("MAX F-score:", argmax(Fscore),"\n")
        print("")

BASE LEARNERS CLASSIFICATION PERFORMANCE:
Model: lg , Param_1: none , tuning range: [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]




No_false: [22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708, 22708] 

Precision: [0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594, 0.17714223575467594] 

Sensitivity: [0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009, 0.9985696771557009]

  precision = count_true_pos/(count_true_pos+count_false_pos)


No_false: [9645, 9933, 8956, 9332, 10093, 8628, 5098, 6722, 3764, 2300, 823, 623, 512, 418, 396, 395, 417, 421, 4894, 367179] 

Precision: [0.3365669074647403, 0.33000472302813577, 0.3533150368337426, 0.3439521800281294, 0.3265033704865514, 0.3618469735089537, 0.48977955911823645, 0.4212476305359297, 0.5653330249768732, 0.6804898413581965, 0.8571679241972275, 0.8886260236578708, 0.9070977331846897, 0.9238636363636363, 0.9283809523809524, 0.9301969783897495, 0.9303980003845415, 0.9315068493150684, nan, 0.013153332813722038] 

Sensitivity: [0.9995913363302003, 0.9993870044953004, 0.9995913363302003, 0.9993870044953004, 0.9995913363302003, 0.9991826726604005, 0.9987740089906008, 0.9989783408255006, 0.9989783408255006, 0.9991826726604005, 0.9981610134859011, 0.9977523498161014, 0.9975480179812015, 0.9967306906416019, 0.9959133633020024, 0.9938700449530037, 0.9887617490805067, 0.9865140988966081, 0.0, 1.0] 

F-score: [0.5035771269751403, 0.4961704286076592, 0.5220917822838848, 0.51177147640

  precision = count_true_pos/(count_true_pos+count_false_pos)


No_false: [7125, 12184, 7930, 7273, 6766, 3909, 3666, 2028, 1698, 958, 541, 470, 404, 391, 407, 420, 4894, 367179, 367179, 4894] 

Precision: [0.4071268004329365, 0.2864925023430178, 0.3815543071161049, 0.4021390374331551, 0.41964285714285715, 0.5560104628681906, 0.5717960710944808, 0.7072606306045705, 0.7427095990279465, 0.8371017471736896, 0.9020875669684094, 0.9142322097378277, 0.9264817629179332, 0.9309090909090909, 0.9303663917130252, 0.9313536444273043, nan, 0.013153332813722038, 0.013153332813722038, nan] 

Sensitivity: [0.9991826726604005, 0.9993870044953004, 0.9991826726604005, 0.9987740089906008, 0.9987740089906008, 0.9989783408255006, 0.9991826726604005, 0.9991826726604005, 0.9991826726604005, 0.9985696771557009, 0.9977523498161014, 0.9975480179812015, 0.996526358806702, 0.9938700449530037, 0.9910093992644053, 0.9869227625664079, 0.0, 1.0, 1.0, 0.0] 

F-score: [0.5785270629991127, 0.44532459255212603, 0.5522303783173348, 0.5734060648718401, 0.5909805343972917, 0.714400526046

  precision = count_true_pos/(count_true_pos+count_false_pos)


In [10]:
## ENSEMBLE CLASSIFICATION AND EVALUATION

print("BAGGING PERFORMANCE:\n")
modeltypes = ['svm', 'nn', 'lg'] 
modeltypes_2 = ['linear', 'relu', 'l2']
modelparams = [0.005, 100, 0.2]
nFold = 10
kf = KFold(n_splits=nFold)
model_raw_score = [0]*3
model_binary_score = [0]*3
model_i = 0
for model_i in range(3):
    modeltype = modeltypes[model_i]
    modeltype_2 = modeltypes_2[model_i]
    modelparam = modelparams[model_i]
    print(modeltype, "per fold:")
    iFold = 0
    result_fold = [0]*nFold
    final_eval_fold = [0]*nFold
    for train_index, valid_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        md =  train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold[iFold] = classify(md, X_test)
        final_eval_fold[iFold] = evaluation(y_test, result_fold[iFold])
        print("Fold", str(iFold), final_eval_fold[iFold])
        iFold = iFold + 1
    bagging_raw_score = np.average(result_fold, axis=0)
    bagging_binary_score  = np.copy(bagging_raw_score)
    bagging_binary_score[bagging_binary_score > 0.5] = 1
    bagging_binary_score[bagging_binary_score <= 0.5] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    print(modeltype, "bagging:", bagging_eval)
    print('')
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score

BAGGING PERFORMANCE:

svm per fold:
Fold 0 {'no_false': 297, 'confusion_matrix': [4881, 284, 13, 366895], 'precision': 0.9450145208131655, 'sensitivity': 0.9973436861463016, 'no_links': 5165, 'F-score': 0.9704742022069788}
Fold 1 {'no_false': 257, 'confusion_matrix': [4881, 244, 13, 366935], 'precision': 0.952390243902439, 'sensitivity': 0.9973436861463016, 'no_links': 5125, 'F-score': 0.974348737398942}
Fold 2 {'no_false': 276, 'confusion_matrix': [4881, 263, 13, 366916], 'precision': 0.9488724727838258, 'sensitivity': 0.9973436861463016, 'no_links': 5144, 'F-score': 0.972504482964734}
Fold 3 {'no_false': 276, 'confusion_matrix': [4881, 263, 13, 366916], 'precision': 0.9488724727838258, 'sensitivity': 0.9973436861463016, 'no_links': 5144, 'F-score': 0.972504482964734}
Fold 4 {'no_false': 247, 'confusion_matrix': [4881, 234, 13, 366945], 'precision': 0.9542521994134897, 'sensitivity': 0.9973436861463016, 'no_links': 5115, 'F-score': 0.9753222100109901}
Fold 5 {'no_false': 248, 'confusi

In [11]:
thres = .99

print("STACKING PERFORMANCE:\n")
stack_raw_score = np.average(model_raw_score, axis=0)
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[stack_binary_score > thres] = 1
stack_binary_score[stack_binary_score <= thres] = 0
stacking_eval = evaluation(y_test, stack_binary_score)
print(stacking_eval)

STACKING PERFORMANCE:

{'no_false': 199, 'confusion_matrix': [4870, 175, 24, 367004], 'precision': 0.9653121902874133, 'sensitivity': 0.9950960359624029, 'no_links': 5045, 'F-score': 0.9799778649763556}


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = [1, 'log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='f1')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [13]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 1,
 'max_depth': 20,
 'bootstrap': False}

In [15]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
print(evaluation(y_test, base_model.predict(X_test)))

best_random = rf_random.best_estimator_
print(evaluation(y_test, best_random.predict(X_test)))


{'no_false': 14904, 'confusion_matrix': [4888, 14898, 6, 352281], 'precision': 0.24704336399474375, 'sensitivity': 0.9987740089906008, 'no_links': 19786, 'F-score': 0.3961102106969206}
{'no_false': 5967, 'confusion_matrix': [4891, 5964, 3, 361215], 'precision': 0.45057577153385536, 'sensitivity': 0.9993870044953004, 'no_links': 10855, 'F-score': 0.6211188011937265}


In [16]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [15, 20, 25],
    'max_features': [1],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1700, 1750, 1800, 1850, 1900]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring='f1')

In [17]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 135 candidates, totalling 405 fits


{'bootstrap': False,
 'max_depth': 15,
 'max_features': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 1700}

In [18]:
best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
# grid_accuracy = evaluate(best_grid, test_features, test_labels)
print(evaluation(y_test, best_grid.predict(X_test)))

{'no_false': 3945, 'confusion_matrix': [4890, 3941, 4, 363238], 'precision': 0.5537311742724493, 'sensitivity': 0.9991826726604005, 'no_links': 8831, 'F-score': 0.7125683060109289}
[CV] END bootstrap=False, max_depth=20, max_features=1, min_samples_leaf=2, min_samples_split=5, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   1.6s
[CV] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   2.3s
[CV] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.5s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=   2.

[CV] END bootstrap=True, max_depth=100, max_features=1, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   2.1s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   2.4s
[CV] END bootstrap=False, max_depth=110, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1800; total time=   3.2s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1850; total time=   3.9s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=1, min_samples_split=5, n_estimators=1800; total time=   3.8s
[CV] END bootstrap=False, max_depth=15, max_fea

[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1750; total time=   3.7s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=1, min_samples_split=4, n_estimators=1900; total time=   4.1s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=1, min_samples_split=6, n_estimators=1700; total time=   3.5s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=1, min_samples_split=6, n_estimators=1850; total time=   4.4s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=2, min_samples_split=4, n_estimators=1850; total time=   3.4s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=2, min_samples_split=5, n_estimators=1800; total time=   3.8s
[CV] END bootstrap=False, max_depth=15, max_features=1, min_samples_leaf=2, min_samples_split=6, n_estimators=1750; total time=   3.4s
[CV] END bootstrap=False, max_depth=15, max_features=1,

[CV] END bootstrap=True, max_depth=20, max_features=1, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=   1.2s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   5.2s
[CV] END bootstrap=False, max_depth=40, max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   2.0s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=   2.3s
[CV] END bootstrap=True, max_depth=80, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   6.0s
[CV] END bootstrap=False, max_depth=None, max