In [157]:
import numpy as np
import pandas as pd
from time import time
import os
import joblib
from sklearn.metrics import confusion_matrix

In [158]:
#Initialise repositories
_projroot = os.path.abspath('.')
_datadir = os.path.join(_projroot, 'data')
_preprocesseddir = os.path.join(_datadir, 'preprocesseddata')
_experimentdir = os.path.join(_datadir, 'experiments')
_rawdir = os.path.join(_datadir, 'rawdata')
_src = os.path.join(_projroot,'src')
_sampling = os.path.join(_experimentdir,'sampling')
splitsampling = os.path.join(_sampling,'splitsampling')
modelsdir = os.path.join(splitsampling,'models')

In [201]:
#Import pickle files
models = {}
for filename in os.listdir(modelsdir):
    if filename.startswith('split_data_4'):
        models[os.path.splitext(filename)[0]] = joblib.load(os.path.join(modelsdir,filename))

#k max is max number of classifiers added to adaboost
#W_1(i) is the error associated with the ith sample in classifier 1
#Pick weak classifier with smallest error
print(models)

{'split_data_4_2_model': LogisticRegression(C=1, max_iter=1000), 'split_data_4_4_model': LogisticRegression(C=1, max_iter=1000), 'split_data_4_3_model': LogisticRegression(C=0.1, max_iter=1000), 'split_data_4_1_model': LogisticRegression(C=1, max_iter=1000)}




In [202]:
#Import training files
features = pd.read_excel(os.path.join(splitsampling, 'splitsampling_features.xlsx'), sheet_name = 'train_test_split')
target = pd.read_excel(os.path.join(splitsampling, 'splitsampling_target.xlsx'), sheet_name = 'train_test_split')
features_test = pd.read_excel(os.path.join(splitsampling, 'splitsampling_features.xlsx'), sheet_name = 'test_data')
target_test = pd.read_excel(os.path.join(splitsampling, 'splitsampling_target.xlsx'), sheet_name = 'test_data')


In [203]:
features = features.drop("Unnamed: 0", axis = 1)
target = target.values[:,1]
total_positives = np.sum(target)
total_negatives = len(target) - np.sum(target)
target = np.where(target == 0, -1, target)


In [204]:
errors = {}
k_max = 4
W = np.ones(len(target)) * (1/len(target))
classifier = 0
b = total_negatives/total_positives
beta = 0.5
for p in range(0,4):
    errors = {}
    Ws_new = {}
    for key in models:
        y_pred = models[key].predict(features)
        y_pred = np.where(y_pred == 0, -1, y_pred)
        tn, fp, fn, tp = confusion_matrix(target, y_pred, labels = [1., -1.]).ravel()
        gamma = tp/ total_positives
        error = 0
        W_tot = 0
        for i in range(0, len(target)):
            W_tot = W_tot + W[i]
            if (target[i]+y_pred[i]) == 0:
                error = error + W[i]
        errors[key] = error/W_tot
        
        
        #print(W_new)
        
    #if gamma > 0.5 or errors[key] < 0.5*((1-(2*gamma-1))/b+1):
    print(errors)
    min_val = min(errors.values())
    min_key = [k for k, v in errors.items() if v == min_val]
    min_model = models[min_key[0]]
        
    logplus = 1 + (2*gamma - 1)/(b+1)
    logminus = 1 - (2*gamma -1)/b+1
    logtot = logplus/logminus
    k = 0.5*(np.log(logtot))/(np.exp(beta*(2*gamma-1)))
    print('k =', k)                                                             
             
    alpha = 0.5*np.log((1-errors[min_key[0]])/(errors[min_key[0]])) + k*np.exp(beta*(2*gamma-1))
    print('alpha',alpha)
    y_pred_min = min_model.predict(features)
    y_pred_min = np.where(y_pred_min == 0, -1, y_pred_min)
    
    W_new = np.array([])
    for i in range(0, len(target)):
        W_new = np.append(W_new,(W[i]*np.exp(-(alpha*target[i]*y_pred[i]))))
    W = W_new
    classifier = classifier + alpha*y_pred_min
    del models[min_key[0]]
    print(models)
print(np.sign(classifier))
    
    
print(errors)

{'split_data_4_2_model': 0.07999999999999882, 'split_data_4_4_model': 0.09655172413792967, 'split_data_4_3_model': 0.06758620689655068, 'split_data_4_1_model': 0.07999999999999882}
k = 7.869428826768949e-06
alpha 2.683957074893497
{'split_data_4_2_model': LogisticRegression(C=1, max_iter=1000), 'split_data_4_4_model': LogisticRegression(C=1, max_iter=1000), 'split_data_4_1_model': LogisticRegression(C=1, max_iter=1000)}
{'split_data_4_2_model': 0.7862218847286935, 'split_data_4_4_model': 0.7871376999796423, 'split_data_4_1_model': 0.9490959356347581}
k = 7.869428826768949e-06
alpha 0.7206204778855557
{'split_data_4_4_model': LogisticRegression(C=1, max_iter=1000), 'split_data_4_1_model': LogisticRegression(C=1, max_iter=1000)}
{'split_data_4_4_model': 0.8176277478353855, 'split_data_4_1_model': 0.9874673732192502}
k = 7.869428826768949e-06
alpha 0.6215920773018113
{'split_data_4_1_model': LogisticRegression(C=1, max_iter=1000)}
{'split_data_4_1_model': 0.9963522569208207}
k = 7.8694288

In [205]:
print(np.sign(classifier))

[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1.  1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1

In [206]:
classifier = np.where(np.sign(classifier) == -1, 0, np.sign(classifier))
target = np.where(target == -1, 0, target)
print(classifier)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [207]:
from sklearn.metrics import f1_score, recall_score, roc_auc_score
print(roc_auc_score(target, classifier))
print(f1_score(target, classifier))
print(recall_score(target, classifier))
        
    

0.5702711351602956
0.22857142857142856
0.1568627450980392
