# Loading data

In [43]:
import os
import numpy as np
import pandas as pd
from time import process_time
from glob import glob

## Training data

In [44]:
train_data_path = "../data/train/histogram/logistic_60_days_100000_samples_80.npy"
train_data = np.load(train_data_path)
train_data.shape

(60, 551)

In [45]:
train_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Testing data

In [46]:
test_data_dir = "../data/test/histogram"

In [47]:
test_data_paths = glob(f"{test_data_dir}/*.npy")

In [48]:
id2file = {}
for i in range(len(test_data_paths)):
    test_file = os.path.split(test_data_paths[i])[1].replace(".npy", "")
    id2file[i] = test_file
print(len(id2file))
id2file

4


{0: 'logistic_30_days_100000_samples_80',
 1: 'logistic_30_days_1000000_samples_80',
 2: 'logistic_60_days_1000000_samples_80',
 3: 'logistic_183_days_100000_samples_80'}

In [49]:
test_data_all = [np.load(test_data_path) for test_data_path in test_data_paths]

In [50]:
test_data_all[0].shape

(30, 551)

# Preprocessing

## Training data

In [51]:
X_train = train_data[:, :-1]
X_train = X_train / sum(X_train[0])
X_train.shape

(60, 550)

In [52]:
y_train = train_data[1:, -1]
y_train.shape

(59,)

In [53]:
print(f"0: {len(y_train[y_train == 0])}")
print(f"1: {len(y_train[y_train == 1])}")

0: 52
1: 7


## Testing data

In [54]:
X_test_all = [test_data[:, :-1] / sum(test_data[0, :-1]) for test_data in test_data_all]
print(X_test_all[0].shape)

(30, 550)


In [55]:
y_test_all = [test_data[1:, -1] for test_data in test_data_all]
y_test_all[0].shape

(29,)

In [56]:
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, confusion_matrix, classification_report

In [57]:
train_dir_path, file_name = os.path.split(train_data_path)
dist, num_days, _, num_samples, _, ratio = file_name.replace(".npy", "").split("_")

print(dist)
print(num_days)
print(num_samples)
print(ratio)

logistic
60
100000
80


In [58]:
# Get cut points of EWB for histogram data
def equal_width_cut_points(lower_bound, upper_bound, n_bins, hist_data):
    for i in range(len(hist_data)):
        if hist_data[i] != 0:
            min_value = i + lower_bound
            break
    
    for i in range(len(hist_data) - 1, -1, -1):
        if hist_data[i] != 0:
            max_value = i + lower_bound
            break
    
    bin_width = (max_value - min_value) / n_bins
    cut_points = [round(min_value + i * bin_width) for i in range(0, n_bins + 1)]
    
    if lower_bound not in cut_points:
        cut_points.insert(0, lower_bound)
    if upper_bound not in cut_points:
        cut_points.append(upper_bound)
    
    return cut_points

def equal_width_cut_points_naive(lower_bound, upper_bound, n_bins):    
    bin_width = (upper_bound - lower_bound) / n_bins
    cut_points = [round(lower_bound + i * bin_width) for i in range(0, n_bins + 1)]
    
    if lower_bound not in cut_points:
        cut_points.insert(0, lower_bound)
    if upper_bound not in cut_points:
        cut_points.append(upper_bound)
    
    return cut_points

# Get cut points of EFB for histogram data
def equal_freq_cut_points(lower_bound, upper_bound, n_bins, hist_data):
    total_count = sum(hist_data)
    bin_size = total_count / n_bins
    cumulative_count = 0
    cut_points = []
    for i in range(len(hist_data)):
        cumulative_count += hist_data[i]
        if cumulative_count >= bin_size:
            cut_point = i + 1 + lower_bound
            cut_points.append(cut_point)
            cumulative_count = 0
        if len(cut_points) == n_bins - 1:
            break
    
    if lower_bound not in cut_points:
        cut_points.insert(0, lower_bound)
    if upper_bound not in cut_points:
        cut_points.append(upper_bound)
            
    return cut_points

In [59]:
# Array for storing results
results = []
num_bins = range(5, 26)
epsilon = 1e-8 # Smoothing hyperparameters

for num_bin in num_bins:
    ########################
    ### current solution ###
    ########################
    result = [dist, num_days, num_samples, num_bin]
    print(f"num_bin = {num_bin}")

    #########################
    ### Invoke the solver ###
    #########################
    start_time = process_time()
#     final_bin_edges = equal_width_cut_points(300, 850, num_bin, np.sum(X_train, axis=0))
    final_bin_edges = equal_width_cut_points_naive(300, 850, num_bin)
    end_time = process_time()
    solving_time = end_time - start_time
    result.append(solving_time)
    
    print(f"Time for solving: {solving_time} s")
    print("final_bin_edges =", final_bin_edges, "\n")


    ###############
    ### Evaluation ###
    ###############
    thresholds = np.arange(0.01, 1.01, 0.01)
    thresholds = [round(threshold, 2) for threshold in thresholds]
              
    # Training Acccuracy & F1 & F0.5
    num_days_train = X_train.shape[0]
    best_train_threshold = best_train_f2 = 0
    best_y_train_pred = [0] * (num_days_train - 1)
    train_acc = train_f1 = 0
    
    for threshold in thresholds:
        y_train_pred = []
        
        for i in range(num_days_train - 1):
            hist_1 = []
            for j in range(len(final_bin_edges) - 1):
                hist_1.append(np.sum(X_train[i, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_1 = np.array(hist_1)

            hist_2 = []
            for j in range(len(final_bin_edges) - 1):
                hist_2.append(np.sum(X_train[i + 1, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_2 = np.array(hist_2)

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)
      
            if (y_train[i] == 0 and psi < threshold) or (y_train[i] == 1 and psi >= threshold):
                y_train_pred.append(y_train[i])
            else:
                y_train_pred.append(1 - y_train[i])
        
        train_f2 = fbeta_score(y_train, y_train_pred, beta=2.0)
        if train_f2 >= best_train_f2:
            best_train_f2 = train_f2
            best_train_threshold = threshold
            best_y_train_pred = y_train_pred
            train_acc = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)

    print("Best threshold:", best_train_threshold)
    result.append(best_train_threshold)

    print("Training Accuracy:", train_acc)
    result.append(train_acc)

    print("Training F1", train_f1)
    result.append(train_f1)

    print("Best Training F2", best_train_f2)
    result.append(best_train_f2)    

    print(confusion_matrix(y_train, best_y_train_pred))
    print(classification_report(y_train, best_y_train_pred))
    print()
              
    # Testing Acccuracy & F1 & F0.5
    for i in range(len(test_data_paths)):
        X_test, y_test = X_test_all[i], y_test_all[i]
        num_days_test = X_test.shape[0]
        y_test_pred = []

        for i in range(num_days_test - 1):
            hist_1 = []
            for j in range(len(final_bin_edges) - 1):
                hist_1.append(np.sum(X_test[i, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_1 = np.array(hist_1)

            hist_2 = []
            for j in range(len(final_bin_edges) - 1):
                hist_2.append(np.sum(X_test[i + 1, final_bin_edges[j] - 300: final_bin_edges[j + 1] - 300]))
            hist_2 = np.array(hist_2)

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)

            if (y_test[i] == 0 and psi < best_train_threshold) or (y_test[i] == 1 and psi >= best_train_threshold):
                y_test_pred.append(y_test[i])
            else:
                y_test_pred.append(1 - y_test[i])

        test_acc = accuracy_score(y_test, y_test_pred)
        result.append(test_acc)
        print("Testing Accuracy:", test_acc)

        test_f1 = f1_score(y_test, y_test_pred)
        result.append(test_f1)
        print("Testing F1:", test_f1)
        
        test_f2 = fbeta_score(y_test, y_test_pred, beta=2.0)
        result.append(test_f2)
        print("Testing F2:", test_f2)

        print(confusion_matrix(y_test, y_test_pred))
        print(classification_report(y_test, y_test_pred))
        print("\n", "#"*30, "\n")

    results.append(result)

num_bin = 5
Time for solving: 2.492299999801162e-05 s
final_bin_edges = [300, 410, 520, 630, 740, 850] 

Best threshold: 0.04
Training Accuracy: 1.0
Training F1 1.0
Best Training F2 1.0
[[52  0]
 [ 0  7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00         7

    accuracy                           1.00        59
   macro avg       1.00      1.00      1.00        59
weighted avg       1.00      1.00      1.00        59


Testing Accuracy: 0.896551724137931
Testing F1: 0.4
Testing F2: 0.3571428571428571
[[25  1]
 [ 2  1]]
              precision    recall  f1-score   support

           0       0.93      0.96      0.94        26
           1       0.50      0.33      0.40         3

    accuracy                           0.90        29
   macro avg       0.71      0.65      0.67        29
weighted avg       0.88      0.90      0.89        29


 ############################## 

Tes

# Saving the results

In [60]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,logistic,60,100000,5,2.5e-05,0.04,1.0,1.0,1.0,0.896552,...,0.357143,1.0,1.0,1.0,0.949153,0.888889,0.833333,0.961538,0.901408,0.851064
1,logistic,60,100000,6,1e-05,0.04,0.983051,0.933333,0.972222,1.0,...,1.0,1.0,1.0,1.0,0.983051,0.965517,0.945946,0.983516,0.96,0.9375
2,logistic,60,100000,7,2.2e-05,0.04,1.0,1.0,1.0,0.931034,...,0.384615,0.965517,0.909091,0.862069,0.949153,0.888889,0.833333,0.93956,0.835821,0.76087
3,logistic,60,100000,8,1e-05,0.05,1.0,1.0,1.0,0.965517,...,0.714286,1.0,1.0,1.0,0.983051,0.965517,0.945946,1.0,1.0,1.0
4,logistic,60,100000,9,9e-06,0.05,0.983051,0.933333,0.972222,1.0,...,1.0,0.965517,0.909091,0.862069,0.983051,0.965517,0.945946,0.978022,0.945946,0.91623
5,logistic,60,100000,10,1.1e-05,0.07,1.0,1.0,1.0,0.931034,...,0.384615,0.965517,0.909091,0.862069,0.949153,0.888889,0.833333,0.923077,0.78125,0.690608
6,logistic,60,100000,11,1e-05,0.07,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.983051,0.965517,0.945946,0.989011,0.973684,0.958549
7,logistic,60,100000,12,3e-05,0.06,1.0,1.0,1.0,1.0,...,1.0,0.965517,0.909091,0.862069,0.983051,0.965517,0.945946,0.983516,0.96,0.9375
8,logistic,60,100000,13,1e-05,0.07,1.0,1.0,1.0,1.0,...,1.0,0.965517,0.909091,0.862069,0.949153,0.888889,0.833333,0.983516,0.96,0.9375
9,logistic,60,100000,14,1.7e-05,0.07,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [61]:
df_columns = ["distribution", "num_days", "num_samples", "num_bin", "solving_time", 
           "best_threshold", "training_acc", "training_f1", "training_f2"]

for i in range(len(test_data_paths)):
    df_columns.append(f"{id2file[i]}_acc")
    df_columns.append(f"{id2file[i]}_f1")
    df_columns.append(f"{id2file[i]}_f2")
    
len(df_columns)

21

In [62]:
results_df.columns = df_columns
results_df

Unnamed: 0,distribution,num_days,num_samples,num_bin,solving_time,best_threshold,training_acc,training_f1,training_f2,logistic_30_days_100000_samples_80_acc,...,logistic_30_days_100000_samples_80_f2,logistic_30_days_1000000_samples_80_acc,logistic_30_days_1000000_samples_80_f1,logistic_30_days_1000000_samples_80_f2,logistic_60_days_1000000_samples_80_acc,logistic_60_days_1000000_samples_80_f1,logistic_60_days_1000000_samples_80_f2,logistic_183_days_100000_samples_80_acc,logistic_183_days_100000_samples_80_f1,logistic_183_days_100000_samples_80_f2
0,logistic,60,100000,5,2.5e-05,0.04,1.0,1.0,1.0,0.896552,...,0.357143,1.0,1.0,1.0,0.949153,0.888889,0.833333,0.961538,0.901408,0.851064
1,logistic,60,100000,6,1e-05,0.04,0.983051,0.933333,0.972222,1.0,...,1.0,1.0,1.0,1.0,0.983051,0.965517,0.945946,0.983516,0.96,0.9375
2,logistic,60,100000,7,2.2e-05,0.04,1.0,1.0,1.0,0.931034,...,0.384615,0.965517,0.909091,0.862069,0.949153,0.888889,0.833333,0.93956,0.835821,0.76087
3,logistic,60,100000,8,1e-05,0.05,1.0,1.0,1.0,0.965517,...,0.714286,1.0,1.0,1.0,0.983051,0.965517,0.945946,1.0,1.0,1.0
4,logistic,60,100000,9,9e-06,0.05,0.983051,0.933333,0.972222,1.0,...,1.0,0.965517,0.909091,0.862069,0.983051,0.965517,0.945946,0.978022,0.945946,0.91623
5,logistic,60,100000,10,1.1e-05,0.07,1.0,1.0,1.0,0.931034,...,0.384615,0.965517,0.909091,0.862069,0.949153,0.888889,0.833333,0.923077,0.78125,0.690608
6,logistic,60,100000,11,1e-05,0.07,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.983051,0.965517,0.945946,0.989011,0.973684,0.958549
7,logistic,60,100000,12,3e-05,0.06,1.0,1.0,1.0,1.0,...,1.0,0.965517,0.909091,0.862069,0.983051,0.965517,0.945946,0.983516,0.96,0.9375
8,logistic,60,100000,13,1e-05,0.07,1.0,1.0,1.0,1.0,...,1.0,0.965517,0.909091,0.862069,0.949153,0.888889,0.833333,0.983516,0.96,0.9375
9,logistic,60,100000,14,1.7e-05,0.07,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [63]:
results_df.to_csv("../output/test/results.csv", index=False)