# Loading data

In [3]:
import os
import numpy as np
import pandas as pd
from time import process_time
from glob import glob

# settings to display all columns
pd.set_option("display.max_columns", None)

# Training data

In [4]:
train_data_path = "../data/train/logistic/old_histogram/logistic_365_days_100000_samples_90.npy"
train_data = np.load(train_data_path)
train_data.shape

(365, 551)

In [5]:
train_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Testing data

In [6]:
test_data_dir = "../data/test/logistic/old_histogram"

In [7]:
test_data_paths = glob(f"{test_data_dir}/*.npy")

In [8]:
id2file = {}
for i in range(len(test_data_paths)):
    test_file = os.path.split(test_data_paths[i])[1].replace(".npy", "")
    id2file[i] = test_file
id2file

{0: 'logistic_183_days_10000_samples_70',
 1: 'logistic_365_days_10000_samples_90',
 2: 'logistic_183_days_10000_samples_90',
 3: 'logistic_365_days_10000_samples_70'}

In [9]:
test_data_all = [np.load(test_data_path) for test_data_path in test_data_paths]

In [10]:
test_data_all[0].shape

(183, 551)

In [11]:
test_data_all[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

# Preprocessing

## Training data

In [12]:
X_train = train_data[:, :-1]
X_train = X_train / sum(X_train[0])
X_train.shape

(365, 550)

In [13]:
y_train = train_data[1:, -1]
y_train.shape

(364,)

In [14]:
print(f"0: {len(y_train[y_train == 0])}")
print(f"1: {len(y_train[y_train == 1])}")

0: 335
1: 29


## Testing data

In [15]:
X_test_all = [test_data[:, :-1] / sum(test_data[0, :-1]) for test_data in test_data_all]
print(X_test_all[0].shape)

(183, 550)


In [16]:
y_test_all = [test_data[1:, -1] for test_data in test_data_all]
y_test_all[0].shape

(182,)

# Preparing data for training

In [17]:
start_time = process_time()


# Preparing interval frequencies
min_edge, max_edge = 300, 850
bin_edges = np.arange(min_edge, max_edge + 1, 1)

train_size = len(bin_edges)
num_days_train = X_train.shape[0]
percent_days_train = np.zeros((num_days_train, train_size, train_size))

for i in range(num_days_train):
    hist = X_train[i]
    for j in range(train_size - 1):
        for k in range(j + 1, train_size):
            percent_days_train[i, j, k] = np.sum(hist[j: k])
            

# Preparing PSIs
epsilon = 1e-8 # Smoothing hyperparameters

psi_train = []
for i in range(1, num_days_train):
    psi_train.append((percent_days_train[i] - percent_days_train[i - 1]) * np.log((percent_days_train[i] + epsilon) / (percent_days_train[i - 1] + epsilon)))
psi_train = np.array(psi_train)

# PSI_0
psi_0_train = psi_train[(1 - y_train).astype(bool)]
psi_0_train = np.sum(psi_0_train, axis=0)
# Normalization
psi_0_train = psi_0_train / np.sum(1 - y_train)


# PSI_1
psi_1_train = psi_train[y_train.astype(bool)]
psi_1_train = np.sum(psi_1_train, axis=0)
# Normalization
psi_1_train = psi_1_train / np.sum(y_train)


end_time = process_time()
preparing_data_time = end_time - start_time

print(f"Time for preparing data: {preparing_data_time} s")

Time for preparing data: 290.44382638 s


# Models

In [18]:
from ortools.linear_solver import pywraplp

## Declare the model

In [19]:
solver = pywraplp.Solver.CreateSolver('SCIP')

## Create the variables

In [20]:
x = np.empty(shape=(train_size, train_size), dtype=object)

for i in range(train_size):
    for j in range(train_size):
        if j > i:
            x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]')
        else:
            x[i, j] = 0

## Create the constraints

In [21]:
start_time = process_time()

# Each row/column has at most one 1
# Non-overlap bins (a.k.a flow constraint)
for i in range(1, train_size - 1):
    solver.Add(solver.Sum(x[: i, i]) <= 1)
    solver.Add(solver.Sum(x[i, i + 1:]) <= 1)
    solver.Add(solver.Sum(x[: i, i]) == solver.Sum(x[i, i + 1:]))
    
# Ensure in-and-out
solver.Add(solver.Sum(x[0, 1:]) == 1)
solver.Add(solver.Sum(x[0: -1, -1]) == 1)

# Ensure at most k bins
max_num_bins = 25
min_num_bins = 5
solver.Add(solver.Sum(x.flatten()) <= max_num_bins)
solver.Add(solver.Sum(x.flatten()) >= min_num_bins)

end_time = process_time()
constraints_time = end_time - start_time

print(f"Time for creating constraints: {constraints_time} s")

Time for creating constraints: 3.1517625459999863 s


## Create the objective function & Invoke the solver & Print the solution & Testing

In [22]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

In [23]:
def precision_0_recall_1_inverse_weighted_fbeta(y_true, y_pred, beta=2.0):
    precisions, recalls, fbeta_scores, supports = precision_recall_fscore_support(y_true, y_pred, beta=beta, average=None)

    precision_0 = round(precisions[0], 4)
    recall_1 = round(recalls[1], 4)
    ratio_0, ratio_1 = supports / sum(supports)
    inverse_weighted_fbeta_score = round(fbeta_scores[0]*ratio_1 + fbeta_scores[1]*ratio_0, 4)
    
    return precision_0, recall_1, inverse_weighted_fbeta_score

In [24]:
train_dir_path, file_name = os.path.split(train_data_path)
dist, num_days, _, num_samples, _, ratio = file_name.replace(".npy", "").split("_")

print(dist)
print(num_days)
print(num_samples)

logistic
365
100000


In [25]:
# Array fir storing results
results = []

alphas = np.arange(0, 1.05, 0.05)
alphas = [round(alpha, 2) for alpha in alphas]
# alphas = [0.5, 0.9]

for alpha in alphas:  
    ########################
    ### current solution ###
    ########################
    result = [dist, num_days, num_samples, alpha, preparing_data_time, constraints_time]
    print(f"alpha = {alpha}")

    
    #######################
    ### Multi-objective ###
    #######################
    solver.Maximize(solver.Sum((alpha * psi_1_train * x).flatten()) - solver.Sum(((1 - alpha) * psi_0_train * x).flatten()))
    
    
    #########################
    ### Invoke the solver ###
    #########################
    start_time = process_time()
    status = solver.Solve()
    end_time = process_time()
    solving_time = end_time - start_time
    
    result.append(solving_time)
    print(f"Time for solving: {solving_time} s")
    
    
    ##########################
    ### Print the solution ###
    ##########################
    x_solution_value = np.zeros((train_size, train_size))

    for i in range(train_size):
        for j in range(train_size):
            if j > i:
                x_solution_value[i, j] = x[i, j].solution_value()
                
    final_bin_edges = []

    if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
        total_cost = solver.Objective().Value()
        result.append(total_cost)
        print(f"Total cost = {total_cost}")
        
        objective_0 = np.sum(psi_0_train * x_solution_value)
        result.append(objective_0)
        print(f"Objective_0 = {objective_0}")
        
        objective_1 = np.sum(psi_1_train * x_solution_value)
        result.append(objective_1)
        print(f"Objective_1 = {objective_1}", "\n")

        for i in range(train_size):
            for j in range(train_size):
                if j > i and x[i, j].solution_value() == 1:
                    final_bin_edges.append(i + 300)
        final_bin_edges.append(max_edge)
    else:
        print('No solution found.')
        
    print("final_bin_edges =", final_bin_edges, "\n")
              
    
    ###############
    ### Evaluation ###
    ###############
    # thresholds = np.arange(0.01, 1.01, 0.01)
    # thresholds = [round(threshold, 2) for threshold in thresholds]
    thresholds = [0.1]
              
    # Training Acccuracy & F1 & F0.5
    num_days_train = X_train.shape[0]
    best_train_threshold = best_train_precision_0 = best_train_recall_1 = best_train_inverse_weighted_f2 = 0
    best_y_train_pred = [0] * (num_days_train - 1)
    train_acc = train_f1 = 0
    
    for threshold in thresholds:
        y_train_pred = []
        
        for i in range(num_days_train - 1):
            hist_1 = []
            for j in range(len(final_bin_edges) - 1):
                hist_1.append(np.sum(X_train[i, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_1 = np.array(hist_1)

            hist_2 = []
            for j in range(len(final_bin_edges) - 1):
                hist_2.append(np.sum(X_train[i + 1, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_2 = np.array(hist_2)

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)
      
            if (y_train[i] == 0 and psi < threshold) or (y_train[i] == 1 and psi >= threshold):
                y_train_pred.append(y_train[i])
            else:
                y_train_pred.append(1 - y_train[i])
        
        train_precision_0, train_recall_1, train_inverse_weighted_f2 = precision_0_recall_1_inverse_weighted_fbeta(y_train, y_train_pred, beta=2.0)
        if train_inverse_weighted_f2 > best_train_inverse_weighted_f2:
            best_train_inverse_weighted_f2 = train_inverse_weighted_f2
            best_train_threshold = threshold
            best_train_precision_0 = train_precision_0
            best_train_recall_1 = train_recall_1
            best_y_train_pred = y_train_pred
            train_acc = accuracy_score(y_train, y_train_pred)

    print("Best threshold:", best_train_threshold)
    result.append(best_train_threshold)

    print("Training Accuracy:", train_acc)
    result.append(train_acc)

    print("Best Training Precision 0:", best_train_precision_0)
    result.append(best_train_precision_0)   

    print("Best Training Recall 1:", best_train_recall_1)
    result.append(best_train_recall_1)

    print("Best Training Inverse Weighted F2", best_train_inverse_weighted_f2)
    result.append(best_train_inverse_weighted_f2) 

    print(confusion_matrix(y_train, best_y_train_pred))
              
    # Testing Acccuracy & F1 & F2
    for i in range(len(test_data_paths)):
        X_test, y_test = X_test_all[i], y_test_all[i]
        num_days_test = X_test.shape[0]
        y_test_pred = []

        for i in range(num_days_test - 1):
            hist_1 = []
            for j in range(len(final_bin_edges) - 1):
                hist_1.append(np.sum(X_test[i, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_1 = np.array(hist_1)

            hist_2 = []
            for j in range(len(final_bin_edges) - 1):
                hist_2.append(np.sum(X_test[i + 1, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_2 = np.array(hist_2)

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)

            if (y_test[i] == 0 and psi < best_train_threshold) or (y_test[i] == 1 and psi >= best_train_threshold):
                y_test_pred.append(y_test[i])
            else:
                y_test_pred.append(1 - y_test[i])

        test_acc = accuracy_score(y_test, y_test_pred)
        print("Testing Accuracy:", test_acc)
        result.append(test_acc)
        
        test_precision_0, test_recall_1, test_inverse_weighted_f2 = precision_0_recall_1_inverse_weighted_fbeta(y_test, y_test_pred, beta=2.0)

        print("Testing Precision 0:", test_precision_0)
        result.append(test_precision_0)   

        print("Testing Recall 1:", test_recall_1)
        result.append(test_recall_1)

        print("Testing Inverse Weighted F2:", test_inverse_weighted_f2)
        result.append(test_inverse_weighted_f2)

        print(confusion_matrix(y_test, y_test_pred))

    results.append(result)

alpha = 0.0
Time for solving: 375.047555188 s
Total cost = 0.0
Objective_0 = 8.168242582045924e-33
Objective_1 = 1.2750984459391355e-32 

final_bin_edges = [300, 301, 302, 303, 304, 850] 

Best threshold: 0.1
Training Accuracy: 0.9203296703296703
Best Training Precision 0: 0.9203
Best Training Recall 1: 0.0
Best Training Inverse Weighted F2 0.0783
[[335   0]
 [ 29   0]]
Testing Accuracy: 0.7582417582417582
Testing Precision 0: 0.7582
Testing Recall 1: 0.0
Testing Inverse Weighted F2: 0.2273
[[138   0]
 [ 44   0]]
Testing Accuracy: 0.8928571428571429
Testing Precision 0: 0.8929
Testing Recall 1: 0.0
Testing Inverse Weighted F2: 0.1046
[[325   0]
 [ 39   0]]
Testing Accuracy: 0.8736263736263736
Testing Precision 0: 0.8736
Testing Recall 1: 0.0
Testing Inverse Weighted F2: 0.1228
[[159   0]
 [ 23   0]]
Testing Accuracy: 0.6593406593406593
Testing Precision 0: 0.6593
Testing Recall 1: 0.0
Testing Inverse Weighted F2: 0.3088
[[240   0]
 [124   0]]
alpha = 0.05


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Time for solving: 492.96401857399997 s
Total cost = 4.980994794539719e-05
Objective_0 = 0.00650824894276591
Objective_1 = 0.1246529288714602 

final_bin_edges = [300, 303, 316, 317, 318, 320, 326, 329, 342, 349, 357, 364, 366, 367, 368, 387, 390, 392, 396, 399, 401, 402, 403, 644, 649, 850] 

Best threshold: 0.1
Training Accuracy: 0.9725274725274725
Best Training Precision 0: 0.971
Best Training Recall 1: 0.6552
Best Training Inverse Weighted F2 0.7268
[[335   0]
 [ 10  19]]
Testing Accuracy: 0.8296703296703297
Testing Precision 0: 0.8166
Testing Recall 1: 0.2955
Testing Inverse Weighted F2: 0.4921
[[138   0]
 [ 31  13]]
Testing Accuracy: 0.9423076923076923
Testing Precision 0: 0.9393
Testing Recall 1: 0.4615
Testing Inverse Weighted F2: 0.5676
[[325   0]
 [ 21  18]]
Testing Accuracy: 0.9505494505494505
Testing Precision 0: 0.9464
Testing Recall 1: 0.6087
Testing Inverse Weighted F2: 0.7019
[[159   0]
 [  9  14]]
Testing Accuracy: 0.782967032967033
Testing Precision 0: 0.7524
Testing R

# Saving the results

In [26]:
df_columns = ["distribution", "num_days", "num_samples", "alpha", 
            "preparing_data_time", "creating_constraints_time", "solving_time", 
            "total_cost", "objective_0", "objective_1", "best_threshold",
            "training_acc", "training_precision_0", "training_recall_1", "training_inverse_weighted_f2"]

for i in range(len(test_data_paths)):
    df_columns.append(f"{id2file[i]}_acc")
    df_columns.append(f"{id2file[i]}_precision_0")
    df_columns.append(f"{id2file[i]}_recall_1")
    df_columns.append(f"{id2file[i]}_inverse_weighted_f2")
    
len(df_columns)

31

In [27]:
results_df = pd.DataFrame(results, columns=df_columns)
results_df

Unnamed: 0,distribution,num_days,num_samples,alpha,preparing_data_time,creating_constraints_time,solving_time,total_cost,objective_0,objective_1,best_threshold,training_acc,training_precision_0,training_recall_1,training_inverse_weighted_f2,logistic_183_days_10000_samples_70_acc,logistic_183_days_10000_samples_70_precision_0,logistic_183_days_10000_samples_70_recall_1,logistic_183_days_10000_samples_70_inverse_weighted_f2,logistic_365_days_10000_samples_90_acc,logistic_365_days_10000_samples_90_precision_0,logistic_365_days_10000_samples_90_recall_1,logistic_365_days_10000_samples_90_inverse_weighted_f2,logistic_183_days_10000_samples_90_acc,logistic_183_days_10000_samples_90_precision_0,logistic_183_days_10000_samples_90_recall_1,logistic_183_days_10000_samples_90_inverse_weighted_f2,logistic_365_days_10000_samples_70_acc,logistic_365_days_10000_samples_70_precision_0,logistic_365_days_10000_samples_70_recall_1,logistic_365_days_10000_samples_70_inverse_weighted_f2
0,logistic,365,100000,0.0,290.443826,3.151763,375.047555,0.0,8.168243000000001e-33,1.275098e-32,0.1,0.92033,0.9203,0.0,0.0783,0.758242,0.7582,0.0,0.2273,0.892857,0.8929,0.0,0.1046,0.873626,0.8736,0.0,0.1228,0.659341,0.6593,0.0,0.3088
1,logistic,365,100000,0.05,290.443826,3.151763,492.964019,5e-05,0.006508249,0.1246529,0.1,0.972527,0.971,0.6552,0.7268,0.82967,0.8166,0.2955,0.4921,0.942308,0.9393,0.4615,0.5676,0.950549,0.9464,0.6087,0.7019,0.782967,0.7524,0.3629,0.5938
2,logistic,365,100000,0.1,290.443826,3.151763,466.473017,0.008633,0.01093129,0.1847139,0.1,0.989011,0.9882,0.8621,0.8954,0.967033,0.9583,0.8636,0.9129,0.978022,0.976,0.7949,0.8467,0.989011,0.9876,0.913,0.9378,0.906593,0.8759,0.7258,0.8376
3,logistic,365,100000,0.15,290.443826,3.151763,490.492781,0.018847,0.0124516,0.1962026,0.1,0.997253,0.997,0.9655,0.9744,0.978022,0.9718,0.9091,0.9424,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.953297,0.9339,0.8629,0.9209
4,logistic,365,100000,0.2,290.443826,3.151763,444.606359,0.029418,0.01313219,0.1996171,0.1,0.997253,0.997,0.9655,0.9744,0.978022,0.9718,0.9091,0.9424,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.956044,0.9375,0.871,0.9257
5,logistic,365,100000,0.25,290.443826,3.151763,460.634753,0.040082,0.01325554,0.2000945,0.1,0.997253,0.997,0.9655,0.9744,0.972527,0.965,0.8864,0.9277,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.956044,0.9375,0.871,0.9257
6,logistic,365,100000,0.3,290.443826,3.151763,457.915477,0.050756,0.01331068,0.2002455,0.1,0.997253,0.997,0.9655,0.9744,0.972527,0.965,0.8864,0.9277,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.956044,0.9375,0.871,0.9257
7,logistic,365,100000,0.35,290.443826,3.151763,442.685098,0.061434,0.01331068,0.2002455,0.1,0.997253,0.997,0.9655,0.9744,0.972527,0.965,0.8864,0.9277,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.956044,0.9375,0.871,0.9257
8,logistic,365,100000,0.4,290.443826,3.151763,441.654494,0.07212,0.01335501,0.2003313,0.1,0.997253,0.997,0.9655,0.9744,0.972527,0.965,0.8864,0.9277,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.958791,0.9412,0.879,0.9304
9,logistic,365,100000,0.45,290.443826,3.151763,454.010998,0.082804,0.01335501,0.2003313,0.1,0.997253,0.997,0.9655,0.9744,0.972527,0.965,0.8864,0.9277,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.958791,0.9412,0.879,0.9304


# Defalt threshold 0.1

In [28]:
# Array fir storing results
results = []

# alphas = np.arange(0, 1.05, 0.05)
# alphas = [round(alpha, 2) for alpha in alphas]
alphas = [0.5, 0.9]

for alpha in alphas:  
    ########################
    ### current solution ###
    ########################
    result = [dist, num_days, num_samples, alpha]
    print(f"alpha = {alpha}")

    
    #######################
    ### Multi-objective ###
    #######################
    # solver.Maximize(solver.Sum((alpha * psi_1_train * x).flatten()) - solver.Sum(((1 - alpha) * psi_0_train * x).flatten()))
    
    
    #########################
    ### Invoke the solver ###
    #########################
    # start_time = process_time()
    # status = solver.Solve()
    # end_time = process_time()
    # solving_time = solving_time
    
    # result.append(solving_time)
    # print(f"Time for solving: {solving_time} s")
    
    
    ##########################
    ### Print the solution ###
    ##########################
    # x_solution_value = np.zeros((train_size, train_size))

    # for i in range(train_size):
    #     for j in range(train_size):
    #         if j > i:
    #             x_solution_value[i, j] = x[i, j].solution_value()
                
    final_bin_edges = [300, 573, 588, 596, 603, 609, 615, 620, 624, 628, 631, 634, 637, 640, 644, 647, 650, 653, 656, 659, 663, 668, 673, 679, 689, 850]

    # if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
    #     total_cost = solver.Objective().Value()
    #     result.append(total_cost)
    #     print(f"Total cost = {total_cost}")
        
    #     objective_0 = np.sum(psi_0_train * x_solution_value)
    #     result.append(objective_0)
    #     print(f"Objective_0 = {objective_0}")
        
    #     objective_1 = np.sum(psi_1_train * x_solution_value)
    #     result.append(objective_1)
    #     print(f"Objective_1 = {objective_1}", "\n")

    #     for i in range(train_size):
    #         for j in range(train_size):
    #             if j > i and x[i, j].solution_value() == 1:
    #                 final_bin_edges.append(i + 300)
    #     final_bin_edges.append(max_edge)
    # else:
    #     print('No solution found.')
        
    print("final_bin_edges =", final_bin_edges, "\n")
              
    
    ###############
    ### Evaluation ###
    ###############
    # thresholds = np.arange(0.01, 1.01, 0.01)
    # thresholds = [round(threshold, 2) for threshold in thresholds]
    thresholds = [0.1]
              
    # Training Acccuracy & F1 & F2
    num_days_train = X_train.shape[0]
    best_train_threshold = best_train_precision_0 = best_train_recall_1 = best_train_inverse_weighted_f2 = 0
    best_y_train_pred = [0] * (num_days_train - 1)
    train_acc = train_f1 = 0
    
    for threshold in thresholds:
        y_train_pred = []
        
        for i in range(num_days_train - 1):
            hist_1 = []
            for j in range(len(final_bin_edges) - 1):
                hist_1.append(np.sum(X_train[i, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_1 = np.array(hist_1)

            hist_2 = []
            for j in range(len(final_bin_edges) - 1):
                hist_2.append(np.sum(X_train[i + 1, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_2 = np.array(hist_2)

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)
      
            if (y_train[i] == 0 and psi < threshold) or (y_train[i] == 1 and psi >= threshold):
                y_train_pred.append(y_train[i])
            else:
                y_train_pred.append(1 - y_train[i])
        
        train_precision_0, train_recall_1, train_inverse_weighted_f2 = precision_0_recall_1_inverse_weighted_fbeta(y_train, y_train_pred, beta=2.0)
        if train_inverse_weighted_f2 > best_train_inverse_weighted_f2:
            best_train_inverse_weighted_f2 = train_inverse_weighted_f2
            best_train_threshold = threshold
            best_train_precision_0 = train_precision_0
            best_train_recall_1 = train_recall_1
            best_y_train_pred = y_train_pred
            train_acc = accuracy_score(y_train, y_train_pred)

    print("Best threshold:", best_train_threshold)
    result.append(best_train_threshold)

    print("Training Accuracy:", train_acc)
    result.append(train_acc)

    print("Best Training Precision 0:", best_train_precision_0)
    result.append(best_train_precision_0)   

    print("Best Training Recall 1:", best_train_recall_1)
    result.append(best_train_recall_1)

    print("Best Training Inverse Weighted F2", best_train_inverse_weighted_f2)
    result.append(best_train_inverse_weighted_f2) 

    print(confusion_matrix(y_train, best_y_train_pred))
              
    # Testing Acccuracy & F1 & F2
    for i in range(len(test_data_paths)):
        X_test, y_test = X_test_all[i], y_test_all[i]
        num_days_test = X_test.shape[0]
        y_test_pred = []

        for i in range(num_days_test - 1):
            hist_1 = []
            for j in range(len(final_bin_edges) - 1):
                hist_1.append(np.sum(X_test[i, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_1 = np.array(hist_1)

            hist_2 = []
            for j in range(len(final_bin_edges) - 1):
                hist_2.append(np.sum(X_test[i + 1, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_2 = np.array(hist_2)

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)

            if (y_test[i] == 0 and psi < best_train_threshold) or (y_test[i] == 1 and psi >= best_train_threshold):
                y_test_pred.append(y_test[i])
            else:
                y_test_pred.append(1 - y_test[i])

        test_acc = accuracy_score(y_test, y_test_pred)
        print("Testing Accuracy:", test_acc)
        result.append(test_acc)
        
        test_precision_0, test_recall_1, test_inverse_weighted_f2 = precision_0_recall_1_inverse_weighted_fbeta(y_test, y_test_pred, beta=2.0)

        print("Testing Precision 0:", test_precision_0)
        result.append(test_precision_0)   

        print("Testing Recall 1:", test_recall_1)
        result.append(test_recall_1)

        print("Testing Inverse Weighted F2:", test_inverse_weighted_f2)
        result.append(test_inverse_weighted_f2)

        print(confusion_matrix(y_test, y_test_pred))

    results.append(result)

alpha = 0.5
final_bin_edges = [300, 573, 588, 596, 603, 609, 615, 620, 624, 628, 631, 634, 637, 640, 644, 647, 650, 653, 656, 659, 663, 668, 673, 679, 689, 850] 

Best threshold: 0.1
Training Accuracy: 0.9972527472527473
Best Training Precision 0: 0.997
Best Training Recall 1: 0.9655
Best Training Inverse Weighted F2 0.9744
[[335   0]
 [  1  28]]
Testing Accuracy: 0.978021978021978
Testing Precision 0: 0.9718
Testing Recall 1: 0.9091
Testing Inverse Weighted F2: 0.9424
[[138   0]
 [  4  40]]
Testing Accuracy: 0.9835164835164835
Testing Precision 0: 0.9819
Testing Recall 1: 0.8462
Testing Inverse Weighted F2: 0.8862
[[325   0]
 [  6  33]]
Testing Accuracy: 0.9945054945054945
Testing Precision 0: 0.9938
Testing Recall 1: 0.9565
Testing Inverse Weighted F2: 0.9692
[[159   0]
 [  1  22]]
Testing Accuracy: 0.9532967032967034
Testing Precision 0: 0.9339
Testing Recall 1: 0.8629
Testing Inverse Weighted F2: 0.9209
[[240   0]
 [ 17 107]]
alpha = 0.9
final_bin_edges = [300, 573, 588, 596, 603, 

In [29]:
df_columns = ["distribution", "num_days", "num_samples", "alpha", "best_threshold",
            "training_acc", "training_precision_0", "training_recall_1", "training_inverse_weighted_f2"]

for i in range(len(test_data_paths)):
    df_columns.append(f"{id2file[i]}_acc")
    df_columns.append(f"{id2file[i]}_precision_0")
    df_columns.append(f"{id2file[i]}_recall_1")
    df_columns.append(f"{id2file[i]}_inverse_weighted_f2")
    
len(df_columns)

25

In [30]:
results_df = pd.DataFrame(results, columns=df_columns)
results_df

Unnamed: 0,distribution,num_days,num_samples,alpha,best_threshold,training_acc,training_precision_0,training_recall_1,training_inverse_weighted_f2,logistic_183_days_10000_samples_70_acc,logistic_183_days_10000_samples_70_precision_0,logistic_183_days_10000_samples_70_recall_1,logistic_183_days_10000_samples_70_inverse_weighted_f2,logistic_365_days_10000_samples_90_acc,logistic_365_days_10000_samples_90_precision_0,logistic_365_days_10000_samples_90_recall_1,logistic_365_days_10000_samples_90_inverse_weighted_f2,logistic_183_days_10000_samples_90_acc,logistic_183_days_10000_samples_90_precision_0,logistic_183_days_10000_samples_90_recall_1,logistic_183_days_10000_samples_90_inverse_weighted_f2,logistic_365_days_10000_samples_70_acc,logistic_365_days_10000_samples_70_precision_0,logistic_365_days_10000_samples_70_recall_1,logistic_365_days_10000_samples_70_inverse_weighted_f2
0,logistic,365,100000,0.5,0.1,0.997253,0.997,0.9655,0.9744,0.978022,0.9718,0.9091,0.9424,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.953297,0.9339,0.8629,0.9209
1,logistic,365,100000,0.9,0.1,0.997253,0.997,0.9655,0.9744,0.978022,0.9718,0.9091,0.9424,0.983516,0.9819,0.8462,0.8862,0.994505,0.9938,0.9565,0.9692,0.953297,0.9339,0.8629,0.9209
