In [1]:
import itertools

import numpy as np
from sklearn.model_selection import KFold

from FEBRL_utils import *
from utils import *

# --- Configuration and data loading -----------------------------------------
# Dataset identifiers handed to the preprocessing helpers below. The helpers
# come from the star imports at the top of the cell; how they resolve these
# names to files is not visible here.
training_set_file_name = 'FEBRL_train'
testing_set_file_name = 'FEBRL_test'
# Fixed seed, set before any data is generated or any model is trained.
# NOTE(review): which RNGs `set_seed` covers is not visible here -- confirm.
set_seed(42)

print('Importing training data')
# Returns the training feature matrix and its label vector. Both are indexed
# with integer index arrays further down, so they are presumably NumPy arrays.
X_train, y_train = preprocess_and_generate_train_data(training_set_file_name)

print('Import testing data')
# Same for the test set. The recorded output shows this step also printing
# blocking statistics, presumably from inside the helper.
X_test, y_test = preprocess_and_generate_test_data(testing_set_file_name)

print('Base model performance')

# Model registry: key -> [categorical option, numeric setting].
# Both values are forwarded positionally to `train_model`; their exact meaning
# per model type is defined by that helper and is not visible in this cell.
models = {
    'svm': ['linear', 0.005],
    # Alternative configurations, currently disabled:
    # 'nn': ['relu', 100],
    # 'lg': ['l2', 0.2],
    # 'rf': ['gini', 400],
    # 'dnn': ['relu', 200],
}

# Fit every enabled model once on the full training set and score it on the
# test set; this is the baseline the fold-ensemble below is compared against.
for model in models:
    model_params = models[model]
    model_option = model_params[0]
    model_value = model_params[1]
    # 10000 is the sixth positional argument of `train_model`.
    # NOTE(review): possibly an iteration cap -- confirm against the helper.
    md = train_model(model, model_value, X_train, y_train, model_option, 10000)
    prediction = classify(md, X_test)
    eval_results = evaluate_results(y_test, prediction)
    print(model, eval_results)

print('Bagging performance')

# One ensemble per configured model: train `number_folds` copies, each on the
# training rows of one K-fold split, and average their test-set predictions.
number_models = len(models)
number_folds = 10
kf = KFold(n_splits=number_folds)

# Row i holds the fold-averaged score of model i for every test pair.
model_raw_score = np.zeros((number_models, len(X_test)))
# Thresholded (0/1) version of each row, filled in per model below.
model_binary_score = [0] * number_models
# Model keys in row order; '.  ' is only a placeholder until overwritten.
model_names = np.array(['.  '] * number_models, dtype=object)

for model_i, (model, model_params) in enumerate(models.items()):
    model_names[model_i] = model

    # Collect one test-set prediction vector per fold. Only the training
    # indices of each split are used; the held-out part is ignored.
    result_fold = []
    for train_index, _unused_valid_index in kf.split(X_train):
        md = train_model(
            model,
            model_params[1],
            X_train[train_index],
            y_train[train_index],
            model_params[0],
            10000,
        )
        result_fold.append(classify(md, X_test))

    # Mean prediction across the fold models for each test pair.
    bagging_raw_score = np.average(result_fold, axis=0)

    # Strict majority vote: scores above 0.5 become 1, everything else
    # (including an exact 0.5 tie) becomes 0. Same dtype as the raw score.
    bagging_binary_score = (bagging_raw_score > 0.5).astype(bagging_raw_score.dtype)

    bagging_eval = evaluate_results(y_test, bagging_binary_score)
    print(model, bagging_eval)

    model_raw_score[model_i, :] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score



Importing training data
Train set size: 5000 , number of matched pairs:  1165
Import testing data
Test set size: 10000 , number of matched pairs:  5000
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 154898 , detected  3287 /5000 true matched pairs, missed 1713
Number of pairs of matched surname: 170843 , detected  3325 /5000 true matched pairs, missed 1675
Number of pairs of matched postcode: 53197 , detected  4219 /5000 true matched pairs, missed 781
Number of pairs of at least 1 field matched: 372073 , detected  4894 /5000 true matched pairs, missed 106
Processing test set...
Preprocess...
Extract feature vectors...
Base model performance
svm confusion_matrx: [[366980    199]
 [    13   4881]], precision: 0.9608267716535434, recall: 0.9973436861463016, f_score: 0.9787447363144175
Bagging performance
svm confusion_matrx: [[367001    178]
 [    13   4881]], precision: 0.9648151808657838, recall: 0.9973436861463016, f_score: 0.9808098060886167


In [3]:
# Show the distinct values present in the fold-averaged score matrix
# (`model_raw_score`, filled by the ensemble loop in the first cell).
np.unique(model_raw_score)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [4]:
# Tally how many entries of the averaged score matrix take each distinct
# value; the resulting mapping is displayed as the cell output.
unique, counts = np.unique(model_raw_score, return_counts=True)
{score: count for score, count in zip(unique, counts)}

{0.0: 366974,
 0.1: 14,
 0.2: 11,
 0.3: 5,
 0.4: 5,
 0.5: 5,
 0.6: 1,
 0.7: 5,
 0.8: 5,
 0.9: 21,
 1.0: 5027}