In [1]:
import os
import numpy as np
import pandas as pd
from time import process_time
from glob import glob

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# settings to display all columns
pd.set_option("display.max_columns", None)

In [2]:
# Custom metrics
def precision_0_recall_1_inverse_weighted_fbeta(y_true, y_pred, beta=2.0):
    """Compute class-0 precision, class-1 recall and an inverse-weighted F-beta.

    The inverse-weighted F-beta weights each class's F-beta score by the
    *other* class's share of the support, so the rarer class contributes more.

    Args:
        y_true: Ground-truth labels (expected to take the values 0 and 1).
        y_pred: Predicted labels, same length as ``y_true``.
        beta: Beta of the F-beta score (default 2.0).

    Returns:
        Tuple ``(precision_0, recall_1, inverse_weighted_fbeta_score)``, each
        rounded to 4 decimals.
    """
    # labels=[0, 1] pins the output arrays to exactly two entries in a fixed
    # order, even when one class is absent from both y_true and y_pred;
    # without it the indexing and the two-way unpack below would fail.
    # zero_division=0 yields the same 0.0 as the default but without emitting
    # an undefined-metric warning when a class is never predicted.
    precisions, recalls, fbeta_scores, supports = precision_recall_fscore_support(
        y_true, y_pred, beta=beta, labels=[0, 1], average=None, zero_division=0
    )

    precision_0 = round(precisions[0], 4)
    recall_1 = round(recalls[1], 4)

    # Share of each class in y_true.
    ratio_0, ratio_1 = supports / sum(supports)
    inverse_weighted_fbeta_score = round(fbeta_scores[0] * ratio_1 + fbeta_scores[1] * ratio_0, 4)

    return precision_0, recall_1, inverse_weighted_fbeta_score

In [3]:
# Get cut points of EWB for histogram data
def equal_width_cut_points(lower_bound, upper_bound, n_bins, hist_data):
    """Equal-width cut points spanning the occupied part of a histogram.

    Position ``i`` of ``hist_data`` is mapped to the value ``i + lower_bound``.
    The span between the first and the last non-zero position is divided into
    ``n_bins`` equal-width intervals whose edges are rounded to integers.
    ``lower_bound`` and ``upper_bound`` are then added as outer edges when they
    are not already present, so the result can hold up to ``n_bins + 3`` edges.
    Rounding may produce repeated edges when the occupied span is narrow.

    Args:
        lower_bound: Value represented by position 0 of ``hist_data``.
        upper_bound: Largest edge to include in the result.
        n_bins: Number of equal-width intervals over the occupied span.
        hist_data: Sequence of counts or frequencies (list or 1-D array).

    Returns:
        List of cut points.
    """
    first_nonzero = next(
        (i for i in range(len(hist_data)) if hist_data[i] != 0), None
    )

    if first_nonzero is None:
        # Empty or all-zero histogram: there is no occupied span, so divide the
        # whole [lower_bound, upper_bound] range instead of failing on
        # undefined min/max values.
        min_value, max_value = lower_bound, upper_bound
    else:
        last_nonzero = next(
            i for i in range(len(hist_data) - 1, -1, -1) if hist_data[i] != 0
        )
        min_value = first_nonzero + lower_bound
        max_value = last_nonzero + lower_bound

    bin_width = (max_value - min_value) / n_bins
    cut_points = [round(min_value + i * bin_width) for i in range(0, n_bins + 1)]

    if lower_bound not in cut_points:
        cut_points.insert(0, lower_bound)
    if upper_bound not in cut_points:
        cut_points.append(upper_bound)

    return cut_points

def equal_width_cut_points_naive(lower_bound, upper_bound, n_bins):
    """Equal-width cut points over the full [lower_bound, upper_bound] range.

    The range is divided into ``n_bins`` intervals of equal width and every
    edge is rounded to an integer. Both bounds are guaranteed to be part of
    the returned list.
    """
    step = (upper_bound - lower_bound) / n_bins

    edges = []
    for k in range(n_bins + 1):
        edges.append(round(lower_bound + k * step))

    # Rounding could in principle displace an outer edge; restore it if so.
    if lower_bound not in edges:
        edges = [lower_bound] + edges
    if upper_bound not in edges:
        edges = edges + [upper_bound]

    return edges

# Get cut points of EFB for histogram data
def equal_freq_cut_points(lower_bound, upper_bound, n_bins, hist_data):
    """Approximately equal-frequency cut points for a histogram.

    Position ``i`` of ``hist_data`` is mapped to the value ``i + lower_bound``.
    Counts are accumulated from the left; every time the running count reaches
    ``sum(hist_data) / n_bins`` a cut is placed just after the current position
    and the running count restarts from zero (any overshoot is not carried
    over). At most ``n_bins - 1`` interior cuts are produced, and
    ``lower_bound`` / ``upper_bound`` are added as outer edges when missing.

    Args:
        lower_bound: Value represented by position 0 of ``hist_data``.
        upper_bound: Largest edge to include in the result.
        n_bins: Target number of bins.
        hist_data: Sequence of counts or frequencies (list or 1-D array).

    Returns:
        List of cut points in ascending order.
    """
    total_count = sum(hist_data)
    bin_size = total_count / n_bins
    cut_points = []

    # An empty or all-zero histogram has no mass to split: a target of 0 would
    # otherwise be "reached" at the very first position.
    if total_count > 0:
        cumulative_count = 0
        for i in range(len(hist_data)):
            # Checked before accumulating so that n_bins == 1 yields no
            # interior cut at all.
            if len(cut_points) >= n_bins - 1:
                break
            cumulative_count += hist_data[i]
            if cumulative_count >= bin_size:
                cut_point = i + 1 + lower_bound
                # A cut at or beyond the upper bound would duplicate it or
                # leave the edges unsorted; the upper bound closes the last bin.
                if cut_point >= upper_bound:
                    break
                cut_points.append(cut_point)
                cumulative_count = 0

    if lower_bound not in cut_points:
        cut_points.insert(0, lower_bound)
    if upper_bound not in cut_points:
        cut_points.append(upper_bound)

    return cut_points

In [4]:
# Loading data
def load_data(train_data_path, test_data_dir):
    """Load the training array and every ``.npy`` test array in a directory.

    Args:
        train_data_path: Path of the training ``.npy`` file.
        test_data_dir: Directory whose ``*.npy`` files are the test sets.

    Returns:
        Tuple ``(train_data, test_data_all)`` where ``test_data_all`` is a list
        of arrays in the order ``glob`` returns the files.
    """
    train_array = np.load(train_data_path)

    test_arrays = []
    for npy_path in glob(f"{test_data_dir}/*.npy"):
        test_arrays.append(np.load(npy_path))

    return train_array, test_arrays

In [5]:
# Preprocessing data
def preprocess_data(train_data, test_data_all):
    """Split raw arrays into scaled features and labels.

    For every array the last column is taken as the label column and the
    remaining columns as features. Features are divided by the sum of the
    first row's feature values. Labels are taken from the second row onward,
    so there is one label fewer than there are feature rows
    (presumably one label per pair of consecutive rows — confirm with the
    data generator).

    Args:
        train_data: 2-D training array.
        test_data_all: List of 2-D test arrays.

    Returns:
        Tuple ``(X_train, y_train, X_test_all, y_test_all)``; the last two are
        lists aligned with ``test_data_all``.
    """
    def _features_and_labels(day_matrix):
        bin_counts = day_matrix[:, :-1]
        # Built-in sum over the first row gives the scaling constant.
        return bin_counts / sum(bin_counts[0]), day_matrix[1:, -1]

    X_train, y_train = _features_and_labels(train_data)

    X_test_all = []
    y_test_all = []
    for test_matrix in test_data_all:
        test_features, test_labels = _features_and_labels(test_matrix)
        X_test_all.append(test_features)
        y_test_all.append(test_labels)

    return X_train, y_train, X_test_all, y_test_all

In [6]:
# Solve
def solve(train_data_path, test_data_dir, method, lower_bound=300, upper_bound=850):
    """Evaluate PSI-based change detection for a range of bin counts.

    For every bin count from 5 to 25 the function computes bin edges with the
    chosen binning method, re-bins each day's histogram, computes the PSI
    between consecutive days and predicts label 1 when the PSI reaches the
    threshold. Metrics are reported for the training set and for every test
    set, both on stdout and in the returned rows.

    Args:
        train_data_path: Path of the training ``.npy`` file. The file name
            (without extension) must consist of six ``_``-separated fields;
            fields 1, 2, 4 and 6 are stored as distribution, number of days,
            number of samples and ratio.
        test_data_dir: Directory containing the test ``.npy`` files.
        method: ``"ewb"`` (equal-width) or ``"efb"`` (equal-frequency).
        lower_bound: Value represented by the first histogram column.
        upper_bound: Upper bound handed to the cut-point function.

    Returns:
        One list per bin count: ``[dist, num_days, num_samples, ratio, num_bin,
        solving_time, best_threshold, train_acc, train_precision_0,
        train_recall_1, train_inverse_weighted_f2]`` followed by
        ``[acc, precision_0, recall_1, inverse_weighted_f2]`` for each test set.

    Raises:
        ValueError: If ``method`` is not a supported binning method.
    """
    # Fail fast on an unknown method, before any file is read.
    cut_point_functions = {
        "ewb": equal_width_cut_points,
        "efb": equal_freq_cut_points,
    }
    if method not in cut_point_functions:
        raise ValueError(f"Not implemented method: {method!r}")
    compute_cut_points = cut_point_functions[method]

    _, file_name = os.path.split(train_data_path)
    dist, num_days, _, num_samples, _, ratio = file_name.replace(".npy", "").split("_")

    # Load data
    train_data, test_data_all = load_data(train_data_path, test_data_dir)

    # Preprocess data
    X_train, y_train, X_test_all, y_test_all = preprocess_data(train_data, test_data_all)

    # Array for storing results
    results = []
    num_bins = range(5, 26)
    epsilon = 1e-8  # Smoothing term that keeps the PSI log-ratio finite for empty bins
    # Candidate PSI thresholds; extend this list to sweep several values.
    thresholds = [0.1]

    for num_bin in num_bins:
        result = [dist, num_days, num_samples, ratio, num_bin]
        print(f"num_bin = {num_bin}")

        #########################
        ### Invoke the solver ###
        #########################
        # The column sum stays inside the timed region so that solving_time
        # measures the same work as before.
        start_time = process_time()
        final_bin_edges = compute_cut_points(lower_bound, upper_bound, num_bin, np.sum(X_train, axis=0))
        end_time = process_time()
        solving_time = end_time - start_time
        result.append(solving_time)

        print(f"Time for solving: {solving_time} s")
        print("final_bin_edges =", final_bin_edges, "\n")

        ##################
        ### Evaluation ###
        ##################
        # PSI values do not depend on the threshold, so compute them once.
        train_psis = _consecutive_day_psis(X_train, final_bin_edges, lower_bound, epsilon)

        # Fallback values, only reported when no threshold could be evaluated.
        best_train_threshold = best_train_precision_0 = best_train_recall_1 = best_train_inverse_weighted_f2 = 0
        best_y_train_pred = [0] * (X_train.shape[0] - 1)
        train_acc = 0
        has_best = False

        for threshold in thresholds:
            y_train_pred = _threshold_predictions(train_psis, threshold, np.asarray(y_train).dtype)

            train_precision_0, train_recall_1, train_inverse_weighted_f2 = precision_0_recall_1_inverse_weighted_fbeta(y_train, y_train_pred, beta=2.0)
            # The first evaluated threshold is always recorded, so a score of
            # exactly 0 no longer leaves the placeholder threshold 0 in place.
            if not has_best or train_inverse_weighted_f2 > best_train_inverse_weighted_f2:
                has_best = True
                best_train_inverse_weighted_f2 = train_inverse_weighted_f2
                best_train_threshold = threshold
                best_train_precision_0 = train_precision_0
                best_train_recall_1 = train_recall_1
                best_y_train_pred = y_train_pred
                train_acc = accuracy_score(y_train, y_train_pred)

        print("Best threshold:", best_train_threshold)
        result.append(best_train_threshold)

        print("Training Accuracy:", train_acc)
        result.append(train_acc)

        print("Best Training Precision 0:", best_train_precision_0)
        result.append(best_train_precision_0)

        print("Best Training Recall 1:", best_train_recall_1)
        result.append(best_train_recall_1)

        print("Best Training Inverse Weighted F2", best_train_inverse_weighted_f2)
        result.append(best_train_inverse_weighted_f2)

        print(confusion_matrix(y_train, best_y_train_pred))

        # Test sets are scored with the threshold selected on the training set.
        for X_test, y_test in zip(X_test_all, y_test_all):
            test_psis = _consecutive_day_psis(X_test, final_bin_edges, lower_bound, epsilon)
            y_test_pred = _threshold_predictions(test_psis, best_train_threshold, np.asarray(y_test).dtype)

            test_acc = accuracy_score(y_test, y_test_pred)
            print("Testing Accuracy:", test_acc)
            result.append(test_acc)

            test_precision_0, test_recall_1, test_inverse_weighted_f2 = precision_0_recall_1_inverse_weighted_fbeta(y_test, y_test_pred, beta=2.0)

            print("Testing Precision 0:", test_precision_0)
            result.append(test_precision_0)

            print("Testing Recall 1:", test_recall_1)
            result.append(test_recall_1)

            print("Testing Inverse Weighted F2:", test_inverse_weighted_f2)
            result.append(test_inverse_weighted_f2)

            print(confusion_matrix(y_test, y_test_pred))

        results.append(result)

    return results


def _consecutive_day_psis(X, bin_edges, lower_bound, epsilon):
    """Return the PSI between each pair of consecutive rows of ``X``."""
    # Column range of every coarse bin; edges are values, columns are offsets.
    spans = [(lo - lower_bound, hi - lower_bound) for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
    # Re-bin every day once instead of twice per day pair.
    binned = np.array([[np.sum(day[start:stop]) for start, stop in spans] for day in X])

    psis = []
    for hist_1, hist_2 in zip(binned[:-1], binned[1:]):
        terms = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
        psis.append(np.sum(terms))
    return psis


def _threshold_predictions(psis, threshold, label_dtype):
    """Predict 1 where the PSI reaches ``threshold`` and 0 otherwise."""
    # For binary 0/1 labels this matches keeping the true label when it agrees
    # with the PSI test and flipping it otherwise, without reading the labels.
    return np.array([1 if psi >= threshold else 0 for psi in psis], dtype=label_dtype)

In [7]:
# Save results
def save_results(results, test_data_dir):
    """Wrap result rows in a DataFrame with descriptive column names.

    Nothing is written to disk here; the caller decides where to store the
    frame. The per-test-set columns are named after the ``*.npy`` files found
    in ``test_data_dir``, in the order ``glob`` returns them.

    Args:
        results: List of result rows.
        test_data_dir: Directory containing the test ``.npy`` files.

    Returns:
        ``pandas.DataFrame`` holding ``results``.
    """
    column_names = [
        "distribution", "num_days", "num_samples", "ratio", "num_bin", "solving_time",
        "best_threshold", "training_acc", "training_precision_0", "training_recall_1",
        "training_inverse_weighted_f2",
    ]

    metric_suffixes = ("acc", "precision_0", "recall_1", "inverse_weighted_f2")
    for test_path in glob(f"{test_data_dir}/*.npy"):
        test_name = os.path.basename(test_path).replace(".npy", "")
        column_names.extend(f"{test_name}_{suffix}" for suffix in metric_suffixes)

    return pd.DataFrame(results, columns=column_names)

In [8]:
def main(train_data_dir, test_data_dir, method, save_dir):
    """Run the experiment for every training file and write one CSV.

    Args:
        train_data_dir: Directory containing the training ``.npy`` files. Its
            last path component is used in the output file name.
        test_data_dir: Directory containing the test ``.npy`` files.
        method: Binning method forwarded to ``solve``.
        save_dir: Directory receiving ``{method}_{dist}_results.csv``; it is
            created when missing.
    """
    # Sorted so the row order of the CSV does not depend on directory listing order.
    train_data_paths = sorted(glob(f"{train_data_dir}/*.npy"))
    dist = train_data_dir.split("/")[-1]

    # Create the output directory up front so a missing folder cannot discard
    # the results after all the computation has been done.
    os.makedirs(save_dir, exist_ok=True)

    # Collect the per-file frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    result_frames = []
    for train_data_path in train_data_paths:
        results = solve(train_data_path, test_data_dir, method)
        result_frames.append(save_results(results, test_data_dir))

    # pd.concat rejects an empty list; keep writing an empty frame in that case.
    total_results_df = pd.concat(result_frames) if result_frames else pd.DataFrame()

    total_results_df.to_csv(f"{save_dir}/{method}_{dist}_results.csv", index=False)

In [9]:
# Experiment configuration: input folders, binning method ("ewb" or "efb")
# and the folder that receives the results CSV.
train_data_dir = "../data/train/logistic/old_histogram"
test_data_dir = "../data/test/logistic/old_histogram"
method = "ewb"
save_dir = "../output/"  # plain string: there is nothing to interpolate

if __name__ == "__main__":
    main(train_data_dir, test_data_dir, method, save_dir)

num_bin = 5
Time for solving: 0.0010406969999996463 s
final_bin_edges = [300, 420, 506, 592, 677, 763, 849, 850] 

Best threshold: 0.1
Training Accuracy: 0.9152542372881356
Best Training Precision 0: 0.9153
Best Training Recall 1: 0.0
Best Training Inverse Weighted F2 0.0832
[[54  0]
 [ 5  0]]
Testing Accuracy: 0.8406593406593407
Testing Precision 0: 0.8263
Testing Recall 1: 0.3409
Testing Inverse Weighted F2: 0.5297
[[138   0]
 [ 29  15]]


  _warn_prf(average, modifier, msg_start, len(result))


Testing Accuracy: 0.9230769230769231
Testing Precision 0: 0.9207
Testing Recall 1: 0.2821
Testing Inverse Weighted F2: 0.3994
[[325   0]
 [ 28  11]]
Testing Accuracy: 0.9230769230769231
Testing Precision 0: 0.9191
Testing Recall 1: 0.3913
Testing Inverse Weighted F2: 0.5134
[[159   0]
 [ 14   9]]
Testing Accuracy: 0.7692307692307693
Testing Precision 0: 0.7407
Testing Recall 1: 0.3226
Testing Inverse Weighted F2: 0.5644
[[240   0]
 [ 84  40]]
num_bin = 6
Time for solving: 0.00022606100000022167 s
final_bin_edges = [300, 420, 492, 563, 634, 706, 778, 849, 850] 

Best threshold: 0.1
Training Accuracy: 0.9830508474576272
Best Training Precision 0: 0.9818
Best Training Recall 1: 0.8
Best Training Inverse Weighted F2 0.8471
[[54  0]
 [ 1  4]]
Testing Accuracy: 0.8681318681318682
Testing Precision 0: 0.8519
Testing Recall 1: 0.4545
Testing Inverse Weighted F2: 0.6205
[[138   0]
 [ 24  20]]
Testing Accuracy: 0.9368131868131868
Testing Precision 0: 0.9339
Testing Recall 1: 0.4103
Testing Inver