# Loading data

In [1]:
import numpy as np
import os

## Training data

In [2]:
# Location of the training set; the file name is parsed later for run metadata.
train_data_path = "/kaggle/input/logistic/logistic_30_days_10000_samples.npy"
# Load the whole array into memory and show its shape as the cell output.
train_data = np.load(train_data_path)
train_data.shape

(31, 10001)

In [3]:
# Preview the raw array (NumPy abbreviates it). Judging from the output and the
# slicing done in Preprocessing, row 0 looks like a sample-index header and the
# last column looks like a per-day label -- TODO confirm against the data generator.
train_data

array([[   0,    1,    2, ..., 9998, 9999,   -1],
       [ 529,  567,  567, ...,  542,  548,    0],
       [ 643,  452,  455, ...,  581,  513,    0],
       ...,
       [ 563,  507,  531, ...,  422,  556,    0],
       [ 638,  533,  552, ...,  524,  563,    0],
       [ 548,  709,  884, ...,  513,  573,    0]])

## Testing data

In [4]:
# Location of the held-out set.
test_data_path = "/kaggle/input/logistic/logistic_30_days_1000_samples.npy"
# Load the whole array into memory and show its shape as the cell output.
test_data = np.load(test_data_path)
test_data.shape

(31, 1001)

In [5]:
# Preview the raw test array (NumPy abbreviates it); it is sliced the same way
# as the training array in Preprocessing.
test_data

array([[  0,   1,   2, ..., 998, 999,  -1],
       [627, 480, 486, ..., 569, 584,   0],
       [534, 537, 570, ..., 589, 829,   0],
       ...,
       [570, 629, 508, ..., 365, 614,   0],
       [470, 399, 625, ..., 560, 847,   0],
       [609, 588, 458, ..., 476, 538,   0]])

# Preparing results for saving

In [6]:
# List for storing results: one row (a plain list of values) per experiment run
results = []

In [7]:
# Split the training-file path into its directory and its file name.
train_dir_path, file_name = os.path.split(train_data_path)

In [8]:
# Parse run metadata from a file name shaped like
# "<dist>_<days>_days_<samples>_samples.npy". The unpacking needs exactly five
# "_"-separated parts and raises ValueError otherwise. All parsed values are
# strings. NOTE: assigning to `_` shadows IPython's "last output" variable.
dist, num_days, _, num_samples, _ = file_name.split("_")

In [9]:
# Sanity-check the values parsed from the file name.
print(dist)
print(num_days)
print(num_samples)

logistic
30
10000


# Preprocessing

## Training data

In [10]:
# Features: drop the first row and the last column, leaving one row per day
# and one column per sample.
X_train = train_data[1:, :-1]
X_train.shape

(30, 10000)

In [11]:
# Labels: last column from row 2 onward, i.e. one label fewer than there are
# rows in X_train. Presumably label k describes the change between X_train[k]
# and X_train[k + 1] -- TODO confirm against the data generator.
y_train = train_data[2:, -1]
y_train.shape

(29,)

In [12]:
# Class balance of the training labels: one "<label>: <count>" line per class.
for label in (0, 1):
    print(f"{label}: {np.count_nonzero(y_train == label)}")

0: 20
1: 9


## Testing data

In [13]:
# Features: drop the first row and the last column, mirroring X_train.
X_test = test_data[1:, :-1]
X_test.shape

(30, 1000)

In [14]:
# Labels: last column from row 2 onward, mirroring y_train (one label fewer
# than there are rows in X_test).
y_test = test_data[2:, -1]
y_test.shape

(29,)

In [15]:
# Class balance of the test labels: one "<label>: <count>" line per class.
for label in (0, 1):
    print(f"{label}: {np.count_nonzero(y_test == label)}")

0: 23
1: 6


# Preparing interval frequencies

In [16]:
# Candidate cut points: every integer from min_edge to max_edge inclusive.
# NOTE(review): np.histogram ignores samples outside [min_edge, max_edge] when
# these edges are used as bins, while later cells divide counts by the full
# number of samples, so frequencies may not sum to 1 -- confirm this is intended.
min_edge, max_edge = 300, 850
bin_edges = np.arange(min_edge, max_edge + 1, 1)
len(bin_edges)

551

In [17]:
# Build, for every day i, an upper-triangular table whose entry [j, k] (j < k)
# is the number of samples falling in the merged interval
# [bin_edges[j], bin_edges[k]). Entries with j >= k stay 0.
train_size = len(bin_edges)
num_days_train = X_train.shape[0]
percent_days_train = np.zeros((num_days_train, train_size, train_size))

for i in range(num_days_train):
    # Index [0] keeps only the counts; this avoids assigning to `_`, which
    # would shadow IPython's "last output" variable.
    hist = np.histogram(X_train[i], bins=bin_edges)[0]

    # cum_counts[k] == hist[:k].sum(), so hist[j:k].sum() is simply
    # cum_counts[k] - cum_counts[j]. One outer difference replaces the
    # ~train_size**2 / 2 separate np.sum calls per day and yields the same
    # integer counts.
    cum_counts = np.concatenate(([0], np.cumsum(hist)))
    interval_counts = cum_counts[np.newaxis, :] - cum_counts[:, np.newaxis]

    # Keep only j < k; the diagonal and lower triangle remain zero.
    percent_days_train[i] = np.triu(interval_counts, k=1)

In [18]:
# Sanity check: number of candidate edges and number of days.
print(train_size)
print(num_days_train)

551
30


In [19]:
# Convert interval counts to fractions of the per-day sample count.
# NOTE: this cell rebinds the same name, so re-running it without re-running
# the cell that builds the counts divides a second time.
percent_days_train = percent_days_train / X_train.shape[1]

In [20]:
# Expect (days, edges, edges).
percent_days_train.shape

(30, 551, 551)

# Preparing PSIs

In [21]:
# Smoothing constant added to both frequencies inside the PSI logarithm so that
# empty intervals do not produce log(0) or a division by zero.
epsilon = 1e-8 # Smoothing hyperparameter

In [22]:
# PSI contribution of every candidate interval for each pair of consecutive
# days: (p_today - p_yesterday) * log(p_today / p_yesterday), with epsilon
# smoothing inside the log. The result has one (edges x edges) table per pair.
psi_train = np.array([
    (today - yesterday) * np.log((today + epsilon) / (yesterday + epsilon))
    for yesterday, today in zip(percent_days_train[:-1], percent_days_train[1:])
])

In [23]:
# Select the day pairs whose label is 0 (mask is True where 1 - y is non-zero).
label0_mask = (1 - y_train).astype(bool)
psi_0_train = psi_train[label0_mask]
print(psi_0_train.shape)

# Average the per-interval PSI over those pairs: sum, then divide by their count.
label0_count = np.sum(1 - y_train)
psi_0_train = psi_0_train.sum(axis=0) / label0_count
print(psi_0_train.shape)

(20, 551, 551)
(551, 551)


In [24]:
# Select the day pairs whose label is 1 (mask is True where y is non-zero).
label1_mask = y_train.astype(bool)
psi_1_train = psi_train[label1_mask]
print(psi_1_train.shape)

# Average the per-interval PSI over those pairs: sum, then divide by their count.
label1_count = np.sum(y_train)
psi_1_train = psi_1_train.sum(axis=0) / label1_count
print(psi_1_train.shape)

(9, 551, 551)
(551, 551)


# Models

In [25]:
from ortools.linear_solver import pywraplp

## Declare the model

In [26]:
# Create the MIP solver with the SCIP backend.
solver = pywraplp.Solver.CreateSolver('SCIP')

# CreateSolver returns None when the requested backend is not available; fail
# here with a clear message instead of an AttributeError in a later cell.
if solver is None:
    raise RuntimeError("OR-Tools could not create a 'SCIP' solver (backend unavailable).")

## Create the variables

In [27]:
# Decision matrix: x[i, j] is a 0/1 integer variable for every pair i < j and
# the plain constant 0 on and below the diagonal.
x = np.empty(shape=(train_size, train_size), dtype=object)

# np.ndindex walks the matrix in row-major order, so variables are created in
# the same order as a nested i/j loop would create them.
for i, j in np.ndindex(x.shape):
    x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]') if j > i else 0

## Create the constraints

In [28]:
from time import process_time

In [29]:
start_time = process_time()

# Interior edges: at most one selected interval may end at an edge, at most one
# may start there, and the two counts must match (flow conservation). Together
# these keep the selected intervals from overlapping.
for node in range(1, train_size - 1):
    incoming = x[:node, node]
    outgoing = x[node, node + 1:]
    solver.Add(solver.Sum(incoming) <= 1)
    solver.Add(solver.Sum(outgoing) <= 1)
    solver.Add(solver.Sum(incoming) == solver.Sum(outgoing))

# Exactly one interval starts at the first edge and exactly one ends at the last.
solver.Add(solver.Sum(x[0, 1:]) == 1)
solver.Add(solver.Sum(x[:-1, -1]) == 1)

# Bound the number of selected intervals (bins) from above and below.
max_num_bins = 20
min_num_bins = 10
all_entries = x.flatten()
solver.Add(solver.Sum(all_entries) <= max_num_bins)
solver.Add(solver.Sum(all_entries) >= min_num_bins)

end_time = process_time()
constraints_time = end_time - start_time

print(f"Time for creating constraints: {constraints_time} s")

Time for creating constraints: 4.697742847000001 s


## Create the objective function, invoke the solver, print the solution, and evaluate

In [30]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [31]:
# Trade-off weights to sweep; each value is solved and evaluated separately.
# A full grid alternative would be: np.arange(0, 1.05, 0.05)
alphas = [0.5, 0.9]
print(alphas)

[0.5, 0.9]


In [32]:
def _predict_from_psi(X, edges, threshold, epsilon):
    """Predict a 0/1 label for each pair of consecutive rows of ``X``.

    Every row of ``X`` is binned with ``edges`` and divided by the number of
    columns of ``X``. The PSI between row ``d`` and row ``d + 1`` is summed
    over the bins, and the pair is labelled 1 when that total is strictly
    greater than ``threshold``, otherwise 0.

    Returns an integer array with ``X.shape[0] - 1`` entries.
    """
    freqs = np.array([np.histogram(day, bins=edges)[0] for day in X]) / X.shape[1]
    prev_freqs, next_freqs = freqs[:-1], freqs[1:]
    psi_per_bin = (prev_freqs - next_freqs) * np.log((prev_freqs + epsilon) / (next_freqs + epsilon))
    total_psi = psi_per_bin.sum(axis=1)
    return (total_psi > threshold).astype(int)


def _report_metrics(split_name, y_true, y_pred):
    """Print accuracy, F1, confusion matrix and classification report.

    ``split_name`` is the prefix used in the printed lines. Returns the pair
    ``(accuracy, f1)``.
    """
    acc = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy:", acc)

    f1 = f1_score(y_true, y_pred)
    print(f"{split_name} F1:", f1)

    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    return acc, f1


# Total-PSI decision threshold; it does not depend on alpha, so set it once.
threshold = 0.1

# Values appended to a result row after the solving time: total cost,
# objective_0, objective_1, train accuracy, train F1, test accuracy, test F1.
_NUM_SOLUTION_FIELDS = 7

for alpha in alphas:
    ########################
    ### current solution ###
    ########################
    result = [dist, num_days, num_samples, alpha, constraints_time]
    print(f"alpha = {alpha}")

    #######################
    ### Multi-objective ###
    #######################
    # Reward PSI on label-1 day pairs and penalise PSI on label-0 day pairs,
    # weighted by alpha and (1 - alpha) respectively.
    solver.Maximize(solver.Sum((alpha * psi_1_train * x).flatten()) - solver.Sum(((1 - alpha) * psi_0_train * x).flatten()))

    #########################
    ### Invoke the solver ###
    #########################
    start_time = process_time()
    status = solver.Solve()
    end_time = process_time()
    solving_time = end_time - start_time

    result.append(solving_time)
    print(f"Time for solving: {solving_time} s")

    ##########################
    ### Print the solution ###
    ##########################
    final_bin_edges = []

    # Check the status BEFORE reading any variable value: without a solution
    # there is nothing meaningful to read or to evaluate.
    if status not in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
        print('No solution found.')
        print("final_bin_edges =", final_bin_edges, "\n")
        # Pad with NaN so every row of `results` has the same length.
        result.extend([float("nan")] * _NUM_SOLUTION_FIELDS)
        results.append(result)
        continue

    # MIP solvers return integer variables only up to a tolerance (e.g.
    # 0.9999999), so round instead of comparing floats with `== 1`.
    x_solution_value = np.zeros((train_size, train_size))
    for i in range(train_size):
        for j in range(i + 1, train_size):
            x_solution_value[i, j] = round(x[i, j].solution_value())

    total_cost = solver.Objective().Value()
    result.append(total_cost)
    print(f"Total cost = {total_cost}")

    objective_0 = np.sum(psi_0_train * x_solution_value)
    result.append(objective_0)
    print(f"Objective_0 = {objective_0}")

    objective_1 = np.sum(psi_1_train * x_solution_value)
    result.append(objective_1)
    print(f"Objective_1 = {objective_1}", "\n")

    # Each selected x[i, j] is an interval starting at edge i. Collect the
    # start edges in increasing order and close the last interval with the
    # final edge. Values come from bin_edges rather than a hard-coded offset.
    start_nodes = np.flatnonzero(x_solution_value.any(axis=1))
    final_bin_edges = [int(bin_edges[node]) for node in start_nodes]
    final_bin_edges.append(int(bin_edges[-1]))

    print("final_bin_edges =", final_bin_edges, "\n")

    ###############
    ### Testing ###
    ###############
    # Predictions depend only on the PSI and the threshold, never on the true
    # labels; the labels are used solely for scoring.

    # Training accuracy & F1
    y_train_pred = _predict_from_psi(X_train, final_bin_edges, threshold, epsilon)
    train_acc, train_f1 = _report_metrics("Training", y_train, y_train_pred)
    result.append(train_acc)
    result.append(train_f1)
    print()

    # Testing accuracy & F1
    y_test_pred = _predict_from_psi(X_test, final_bin_edges, threshold, epsilon)
    test_acc, test_f1 = _report_metrics("Testing", y_test, y_test_pred)
    result.append(test_acc)
    result.append(test_f1)
    print("\n", "#"*30, "\n")

    results.append(result)

alpha = 0.5
Time for solving: 598.117998758 s
Total cost = 0.052953107680284994
Objective_0 = 0.00895962534993031
Objective_1 = 0.11486584071050031 

final_bin_edges = [300, 394, 396, 397, 400, 402, 403, 437, 438, 482, 483, 528, 552, 566, 579, 593, 610, 655, 708, 709, 850] 

Training Accuracy: 0.8620689655172413
Training F1: 0.7142857142857143
[[20  0]
 [ 4  5]]
              precision    recall  f1-score   support

           0       0.83      1.00      0.91        20
           1       1.00      0.56      0.71         9

    accuracy                           0.86        29
   macro avg       0.92      0.78      0.81        29
weighted avg       0.89      0.86      0.85        29


Testing Accuracy: 0.8275862068965517
Testing F1: 0.2857142857142857
[[23  0]
 [ 5  1]]
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        23
           1       1.00      0.17      0.29         6

    accuracy                           0.83        29
  

# Saving the results