# Loading data

In [1]:
import os
import numpy as np
import pandas as pd
from time import process_time
from glob import glob

# Training data

In [2]:
train_data_path = "/kaggle/input/pseudo-psi/data/train/plain/logistic_365_days_100000_samples.npy"
train_data = np.load(train_data_path)
train_data.shape

(365, 100001)

In [3]:
train_data

array([[627, 630, 619, ..., 638, 633,   0],
       [724, 657, 601, ..., 634, 693,   1],
       [666, 631, 634, ..., 666, 668,   1],
       ...,
       [612, 667, 659, ..., 640, 642,   0],
       [643, 651, 641, ..., 700, 679,   0],
       [631, 632, 631, ..., 634, 619,   1]])

## Testing data

In [4]:
test_data_dir = "/kaggle/input/pseudo-psi/data/test/plain"

In [5]:
test_data_paths = glob(f"{test_data_dir}/*.npy")

In [6]:
id2file = {}
for i in range(len(test_data_paths)):
    test_file = os.path.split(test_data_paths[i])[1].replace(".npy", "")
    id2file[i] = test_file
id2file

{0: 'logistic_30_days_10000_samples', 1: 'logistic_365_days_10000_samples'}

In [7]:
test_data_all = [np.load(test_data_path) for test_data_path in test_data_paths]

In [22]:
test_data_all[0].shape

(30, 10001)

In [8]:
test_data_all[0]

array([[643, 621, 649, ..., 679, 575,   0],
       [636, 639, 679, ..., 692, 671,   0],
       [658, 639, 623, ..., 672, 682,   0],
       ...,
       [607, 616, 626, ..., 624, 660,   0],
       [641, 638, 612, ..., 678, 660,   0],
       [646, 628, 645, ..., 653, 629,   0]])

# Preprocessing

## Training data

In [9]:
X_train = train_data[:, :-1]
X_train.shape

(365, 100000)

In [10]:
y_train = train_data[1:, -1]
y_train.shape

(364,)

In [11]:
print(f"0: {len(y_train[y_train == 0])}")
print(f"1: {len(y_train[y_train == 1])}")

0: 254
1: 110


## Testing data

In [12]:
X_test_all = [test_data[:, :-1] for test_data in test_data_all]
print(X_test_all[0].shape)

(30, 10000)


In [13]:
y_test_all = [test_data[1:, -1] for test_data in test_data_all]
y_test_all[0].shape

(29,)

# Preparing data for training

In [14]:
start_time = process_time()


# Preparing interval frequencies
min_edge, max_edge = 300, 850
bin_edges = np.arange(min_edge, max_edge + 1, 1)

train_size = len(bin_edges)
num_days_train = X_train.shape[0]
percent_days_train = np.zeros((num_days_train, train_size, train_size))

for i in range(num_days_train):
    hist, _ = np.histogram(X_train[i], bins=bin_edges)
    hist = hist / X_train.shape[1]
    for j in range(train_size - 1):
        for k in range(j + 1, train_size):
            percent_days_train[i, j, k] = np.sum(hist[j: k])
            

# Preparing PSIs
epsilon = 1e-8 # Smoothing hyperparameters

psi_train = []
for i in range(1, num_days_train):
    psi_train.append((percent_days_train[i] - percent_days_train[i - 1]) * np.log((percent_days_train[i] + epsilon) / (percent_days_train[i - 1] + epsilon)))
psi_train = np.array(psi_train)

# PSI_0
psi_0_train = psi_train[(1 - y_train).astype(bool)]
psi_0_train = np.sum(psi_0_train, axis=0)
# Normalization
psi_0_train = psi_0_train / np.sum(1 - y_train)


# PSI_1
psi_1_train = psi_train[y_train.astype(bool)]
psi_1_train = np.sum(psi_1_train, axis=0)
# Normalization
psi_1_train = psi_1_train / np.sum(y_train)


end_time = process_time()
preparing_data_time = end_time - start_time

print(f"Time for preparing data: {preparing_data_time} s")

Time for preparing data: 283.7831822 s


# Models

In [15]:
from ortools.linear_solver import pywraplp

## Declare the model

In [16]:
solver = pywraplp.Solver.CreateSolver('SCIP')

## Create the variables

In [17]:
x = np.empty(shape=(train_size, train_size), dtype=object)

for i in range(train_size):
    for j in range(train_size):
        if j > i:
            x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]')
        else:
            x[i, j] = 0

## Create the constraints

In [18]:
start_time = process_time()

# Each row/column has at most one 1
# Non-overlap bins (a.k.a flow constraint)
for i in range(1, train_size - 1):
    solver.Add(solver.Sum(x[: i, i]) <= 1)
    solver.Add(solver.Sum(x[i, i + 1:]) <= 1)
    solver.Add(solver.Sum(x[: i, i]) == solver.Sum(x[i, i + 1:]))
    
# Ensure in-and-out
solver.Add(solver.Sum(x[0, 1:]) == 1)
solver.Add(solver.Sum(x[0: -1, -1]) == 1)

# Ensure at most k bins
max_num_bins = 25
min_num_bins = 5
solver.Add(solver.Sum(x.flatten()) <= max_num_bins)
solver.Add(solver.Sum(x.flatten()) >= min_num_bins)

end_time = process_time()
constraints_time = end_time - start_time

print(f"Time for creating constraints: {constraints_time} s")

Time for creating constraints: 4.76595519 s


## Create the objective function & Invoke the solver & Print the solution & Testing

In [19]:
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, confusion_matrix, classification_report

In [20]:
train_dir_path, file_name = os.path.split(train_data_path)
dist, num_days, _, num_samples, _ = file_name.split("_")

print(dist)
print(num_days)
print(num_samples)

logistic
365
100000


In [None]:
# Array fir storing results
results = []

alphas = np.arange(0, 1.05, 0.05)
alphas = [round(alpha, 2) for alpha in alphas]
# alphas = [0.5, 0.9]

for alpha in alphas:  
    ########################
    ### current solution ###
    ########################
    result = [dist, num_days, num_samples, alpha, preparing_data_time, constraints_time]
    print(f"alpha = {alpha}")

    
    #######################
    ### Multi-objective ###
    #######################
    solver.Maximize(solver.Sum((alpha * psi_1_train * x).flatten()) - solver.Sum(((1 - alpha) * psi_0_train * x).flatten()))
    
    
    #########################
    ### Invoke the solver ###
    #########################
    start_time = process_time()
    status = solver.Solve()
    end_time = process_time()
    solving_time = end_time - start_time
    
    result.append(solving_time)
    print(f"Time for solving: {solving_time} s")
    
    
    ##########################
    ### Print the solution ###
    ##########################
    x_solution_value = np.zeros((train_size, train_size))

    for i in range(train_size):
        for j in range(train_size):
            if j > i:
                x_solution_value[i, j] = x[i, j].solution_value()
                
    final_bin_edges = []

    if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
        total_cost = solver.Objective().Value()
        result.append(total_cost)
        print(f"Total cost = {total_cost}")
        
        objective_0 = np.sum(psi_0_train * x_solution_value)
        result.append(objective_0)
        print(f"Objective_0 = {objective_0}")
        
        objective_1 = np.sum(psi_1_train * x_solution_value)
        result.append(objective_1)
        print(f"Objective_1 = {objective_1}", "\n")

        for i in range(train_size):
            for j in range(train_size):
                if j > i and x[i, j].solution_value() == 1:
                    final_bin_edges.append(i + 300)
        final_bin_edges.append(max_edge)
    else:
        print('No solution found.')
        
    print("final_bin_edges =", final_bin_edges, "\n")
    result.append(len(final_bin_edges))
              
    
    ###############
    ### Evaluation ###
    ###############
    thresholds = np.arange(0.01, 1.01, 0.01)
    thresholds = [round(threshold, 2) for threshold in thresholds]
              
    # Training Acccuracy & F1 & F0.5
    num_days_train = X_train.shape[0]
    best_train_threshold = best_train_f_onehalf = 0
    best_y_train_pred = [0] * (num_days_train - 1)
    train_acc = train_f1 = 0
    
    for threshold in thresholds:
        y_train_pred = []
        
        for i in range(num_days_train - 1):
            hist_1, _ = np.histogram(X_train[i], bins=final_bin_edges)
            hist_1 = hist_1 / X_train.shape[1]
            
            hist_2, _ = np.histogram(X_train[i + 1], bins=final_bin_edges)
            hist_2 = hist_2 / X_train.shape[1]

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)
      
            if (y_train[i] == 0 and psi < threshold) or (y_train[i] == 1 and psi >= threshold):
                y_train_pred.append(y_train[i])
            else:
                y_train_pred.append(1 - y_train[i])
        
        train_f_onehalf = fbeta_score(y_train, y_train_pred, beta=0.5)
        if train_f_onehalf >= best_train_f_onehalf:
            best_train_f_onehalf = train_f_onehalf
            best_train_threshold = threshold
            best_y_train_pred = y_train_pred
            train_acc = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)

    print("Best threshold:", best_train_threshold)
    result.append(best_train_threshold)

    print("Training Accuracy:", train_acc)
    result.append(train_acc)

    print("Training F1", train_f1)
    result.append(train_f1)

    print("Best Training F0.5", best_train_f_onehalf)
    result.append(best_train_f_onehalf)    

    print(confusion_matrix(y_train, best_y_train_pred))
    print(classification_report(y_train, best_y_train_pred))
    print()
              
    # Testing Acccuracy & F1 & F0.5
    for i in range(len(test_data_paths)):
        X_test, y_test = X_test_all[i], y_test_all[i]
        num_days_test = X_test.shape[0]
        y_test_pred = []

        for i in range(num_days_test - 1):
            hist_1, _ = np.histogram(X_test[i], bins=final_bin_edges)
            hist_1 = hist_1 / X_test.shape[1]
            hist_2, _ = np.histogram(X_test[i + 1], bins=final_bin_edges)
            hist_2 = hist_2 / X_test.shape[1]

            psis = (hist_1 - hist_2) * np.log((hist_1 + epsilon) / (hist_2 + epsilon))
            psi = np.sum(psis)

            if (y_test[i] == 0 and psi < best_train_threshold) or (y_test[i] == 1 and psi >= best_train_threshold):
                y_test_pred.append(y_test[i])
            else:
                y_test_pred.append(1 - y_test[i])

        test_acc = accuracy_score(y_test, y_test_pred)
        result.append(test_acc)
        print("Testing Accuracy:", test_acc)

        test_f1 = f1_score(y_test, y_test_pred)
        result.append(test_f1)
        print("Testing F1:", test_f1)
        
        test_f_onehalf = fbeta_score(y_test, y_test_pred, beta=0.5)
        result.append(test_f_onehalf)
        print("Testing F0.5:", test_f_onehalf)

        print(confusion_matrix(y_test, y_test_pred))
        print(classification_report(y_test, y_test_pred))
        print("\n", "#"*30, "\n")

    results.append(result)

alpha = 0.5


# Saving the results

In [None]:
results_df = pd.DataFrame(results)
results_df

In [None]:
df_columns = ["distribution", "num_days", "num_samples", "alpha", 
            "preparing_data_time", "creating_constraints_time", "solving_time", 
            "total_cost", "objective_0", "objective_1", "num_bins", "best_threshold",
            "training_acc", "training_f1", "training_f0.5"]

for i in range(len(test_data_paths)):
    df_columns.append(f"{id2file[i]}_acc")
    df_columns.append(f"{id2file[i]}_f1")
    df_columns.append(f"{id2file[i]}_f0.5")
    
len(df_columns)

In [None]:
results_df.columns = df_columns
results_df

In [None]:
results_df.to_csv("/kaggle/working/results.csv", index=False)