# Loading data

In [28]:
import pandas as pd

In [29]:
# Read the pre-generated pseudo score table from the Kaggle input mount.
# (The author left a `generate_data()` alternative disabled here.)
csv_path = "/kaggle/input/pseudo-psi/new_pseudo_data.csv"
data = pd.read_csv(csv_path)

In [30]:
# Display the loaded frame; the notebook renders the cell's last expression.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,Labels
0,642,638,628,638,630,619,584,680,697,569,...,587,481,537,560,549,576,563,569,539,1
1,876,787,520,614,674,765,579,748,552,627,...,665,840,503,812,568,488,611,621,649,1
2,536,839,465,773,534,598,820,642,636,594,...,546,804,818,610,463,545,536,574,540,1
3,485,621,765,559,799,513,783,556,668,779,...,564,631,490,515,404,684,513,640,575,1
4,535,793,726,637,566,804,845,618,540,701,...,660,638,507,807,469,606,803,626,657,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,557,569,570,519,590,575,549,585,524,620,...,577,634,475,598,644,594,660,535,579,1
996,578,606,490,536,586,550,530,577,528,531,...,533,681,578,574,487,520,538,540,582,0
997,520,569,650,540,502,603,535,541,526,555,...,582,559,537,615,543,552,533,566,521,1
998,525,638,590,564,643,535,532,575,585,544,...,572,515,577,546,588,588,571,595,576,0


In [31]:
# Class balance of the "Labels" column as loaded (before any relabelling).
data["Labels"].value_counts()

1    710
0    290
Name: Labels, dtype: int64

In [32]:
# Invert the label encoding: rows labelled 0 become 1.0, everything else becomes 0.0.
# NOTE: this overwrites the column in place, so re-running the cell flips the labels back.
data["Labels"] = (data["Labels"] == 0.0).astype(float)

In [34]:
# Restrict the experiment to the first NUM_DAYS rows. Every recorded output below
# (35 rows, 28/7 train-test split) was produced on this subset, so the slice must actually
# run for Restart & Run All to reproduce them. Slicing from the start is idempotent:
# re-running the cell leaves `data` unchanged.
NUM_DAYS = 35
data = data.iloc[:NUM_DAYS]

In [35]:
# Sanity check: (number of rows, number of columns including "Labels").
data.shape

(35, 10001)

In [36]:
# Class balance of the "Labels" column at this point of the notebook.
data["Labels"].value_counts()

0.0    30
1.0     5
Name: Labels, dtype: int64

# Preprocessing

In [37]:
import numpy as np

In [38]:
# Every column except the last one holds scores; the last column is the label.
score_frame = data.iloc[:, :-1]
scores = score_frame.to_numpy()
print(scores, scores.shape, sep="\n")

[[642 638 628 ... 563 569 539]
 [876 787 520 ... 611 621 649]
 [536 839 465 ... 536 574 540]
 ...
 [576 606 560 ... 609 591 598]
 [558 643 607 ... 569 570 567]
 [658 504 615 ... 607 615 463]]
(35, 10000)


In [39]:
# The last column carries one label per row.
label_series = data.iloc[:, -1]
labels = label_series.to_numpy()
print(labels.shape)

(35,)


# Train-Test split

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
# Keep the original row order (shuffle=False): PSI is later computed between
# consecutive rows, so the split must not reorder them.
X_train, X_test, y_train, y_test = train_test_split(
    scores,
    labels,
    test_size=0.2,
    shuffle=False,
)

In [42]:
# PSI is computed between each row and its predecessor, so the first row of every split
# has no PSI value. Drop its label so that labels line up one-to-one with PSI rows.
# The length guard makes the cell idempotent: re-running it must not drop a second label
# and silently misalign labels and PSI rows.
if len(y_train) == len(X_train):
    y_train = y_train[1:]
if len(y_test) == len(X_test):
    y_test = y_test[1:]

In [43]:
# Training split: feature matrix shape, then label vector shape (one fewer label than rows).
print(X_train.shape, y_train.shape, sep="\n")

(28, 10000)
(27,)


In [44]:
# Test split: feature matrix shape, then label vector shape (one fewer label than rows).
print(X_test.shape, y_test.shape, sep="\n")

(7, 10000)
(6,)


# Preparing interval frequencies

In [45]:
# Candidate cut points: every integer from min_edge to max_edge inclusive.
min_edge = 300
max_edge = 850
bin_edges = np.arange(min_edge, max_edge + 1)  # step defaults to 1
len(bin_edges)

551

In [46]:
# For every training day i and every pair of candidate edges (j, k) with j < k, store the
# number of that day's scores falling between edge j and edge k. Entries with k <= j stay 0.
train_size = len(bin_edges)
num_days_train = X_train.shape[0]
percent_days_train = np.zeros((num_days_train, train_size, train_size))

for i in range(num_days_train):
    # NOTE(review): np.histogram ignores values outside [bin_edges[0], bin_edges[-1]];
    # the data preview shows scores above 850, so those are silently dropped here and the
    # later division by the full score count may not sum to 1 — confirm this is intended.
    hist, _ = np.histogram(X_train[i], bins=bin_edges)

    # Prefix sums: cumulative[k] is the count in unit bins 0..k-1, therefore
    # cumulative[k] - cumulative[j] == hist[j:k].sum(). This replaces the per-(j, k)
    # np.sum call (cubic work per day) with a single quadratic outer difference.
    cumulative = np.concatenate(([0], np.cumsum(hist)))
    interval_counts = cumulative[np.newaxis, :] - cumulative[:, np.newaxis]

    # Keep only the strictly upper triangle (k > j), matching the original loop bounds.
    percent_days_train[i] = np.triu(interval_counts, k=1)

# with open('/content/drive/MyDrive/Distributed Data Discretization/data/percent_days.npy', 'rb') as f:
#     percent_days = np.load(f)

In [47]:
# Number of candidate edges, then number of training days.
print(train_size, num_days_train, sep="\n")

551
28


In [48]:
# Turn interval counts into fractions of a day's scores.
# NOTE: this rebinds the same name, so re-running the cell divides a second time.
scores_per_day = X_train.shape[1]
percent_days_train = percent_days_train / scores_per_day

In [49]:
# Expected layout: (training days, candidate edges, candidate edges).
percent_days_train.shape

(28, 551, 551)

# Preparing PSIs

In [50]:
# Small constant added to both fractions inside the PSI log-ratio computed below, so that
# intervals with a fraction of 0 do not produce log(0) or a division by zero.
epsilon = 1e-8 # Smoothing hyperparameters

In [51]:
# PSI contribution of every candidate interval between each day and the day before it:
# (p_today - p_yesterday) * ln((p_today + eps) / (p_yesterday + eps)).
# Row t of the result compares training day t + 1 with training day t.
psi_train = np.array([
    (today - yesterday) * np.log((today + epsilon) / (yesterday + epsilon))
    for yesterday, today in zip(percent_days_train[:-1], percent_days_train[1:])
])

In [52]:
# One PSI matrix per consecutive-day pair, hence one fewer than the number of training days.
psi_train.shape

(27, 551, 551)

In [53]:
# Average PSI matrix over the day pairs whose label is 0.
label_0_mask = (1 - y_train).astype(bool)
psi_0_days = psi_train[label_0_mask]
print(psi_0_days.shape)

# Sum over the selected pairs, then divide by how many there are (normalization).
num_label_0 = np.sum(1 - y_train)
psi_0_train = psi_0_days.sum(axis=0) / num_label_0
print(psi_0_train.shape)

(25, 551, 551)
(551, 551)


In [54]:
# Average PSI matrix over the day pairs whose label is 1.
label_1_mask = y_train.astype(bool)
psi_1_days = psi_train[label_1_mask]
print(psi_1_days.shape)

# Sum over the selected pairs, then divide by how many there are (normalization).
num_label_1 = np.sum(y_train)
psi_1_train = psi_1_days.sum(axis=0) / num_label_1
print(psi_1_train.shape)

(2, 551, 551)
(551, 551)


# Models

In [55]:
from ortools.linear_solver import pywraplp

## Declare the model

In [56]:
# Instantiate the MIP solver with the SCIP backend.
solver = pywraplp.Solver.CreateSolver('SCIP')

# CreateSolver returns None when the requested backend is not available; fail here with a
# clear message instead of an AttributeError in a later cell.
if solver is None:
    raise RuntimeError("SCIP backend is not available in this OR-Tools installation.")

## Create the variables

In [57]:
# Decision matrix: x[i, j] (j > i) is a 0/1 variable meaning "edges i and j delimit one
# selected bin". Entries on or below the diagonal are the plain constant 0.
x = np.zeros((train_size, train_size), dtype=object)

for i in range(train_size):
    for j in range(i + 1, train_size):
        x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]')

## Create the constraints

In [58]:
from time import process_time

In [59]:
start_time = process_time()

# Flow conservation at every interior edge: at most one selected bin ends there, at most
# one starts there, and a bin ends there exactly when another one starts there. Together
# this forces the selected bins to be non-overlapping and contiguous.
for i in range(1, train_size - 1):
    inflow = solver.Sum(x[:i, i])
    outflow = solver.Sum(x[i, i + 1:])
    solver.Add(inflow <= 1)
    solver.Add(outflow <= 1)
    solver.Add(inflow == outflow)

# Exactly one bin starts at the first edge and exactly one bin ends at the last edge.
solver.Add(solver.Sum(x[0, 1:]) == 1)
solver.Add(solver.Sum(x[:-1, -1]) == 1)

# Bound the number of selected bins from both sides.
max_num_bins = 20
min_num_bins = 10
num_selected_bins = solver.Sum(x.flatten())
solver.Add(num_selected_bins <= max_num_bins)
solver.Add(num_selected_bins >= min_num_bins)

end_time = process_time()

print(f"Time for creating constraints: {end_time - start_time} s")

Time for creating constraints: 3.826341094 s


## Create the objective function

In [60]:
# Trade-off weight of the multi-objective below: the psi_1 term is scaled by alpha and the
# psi_0 term by (1 - alpha). With alpha = 0 only the psi_0 term contributes.
alpha = 0

In [66]:
# Single-objective alternatives kept for reference:
#   objective_0: solver.Minimize(solver.Sum((psi_0_train * x).flatten()))
#   objective_1: solver.Maximize(solver.Sum((psi_1_train * x).flatten()))

# Multi-objective: reward PSI on label-1 day pairs, penalise PSI on label-0 day pairs.
psi_1_term = solver.Sum((alpha * psi_1_train * x).flatten())
psi_0_term = solver.Sum(((1 - alpha) * psi_0_train * x).flatten())
solver.Maximize(psi_1_term - psi_0_term)

## Invoke the solver

In [67]:
# Run the solver and report its status code plus the CPU time spent
# (process_time measures CPU time, not wall-clock time).
start_time = process_time()
status = solver.Solve()
end_time = process_time()

print(status, f"Time for solving: {end_time - start_time} s", sep="\n")

0
Time for solving: 374.236550758 s


## Print the solution

In [68]:
# Collect the left edge of every selected bin (in ascending order) and close the list with
# the right-most edge, giving an edge list usable as `bins=` for np.histogram.
final_bin_edges = []

if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
    print(f'Total cost = {solver.Objective().Value()}\n')
    for i in range(train_size):
        for j in range(i + 1, train_size):
            # Solver values are floats and an integer variable may come back as e.g.
            # 0.9999999, so test against 0.5 instead of exact equality with 1.
            if x[i, j].solution_value() > 0.5:
                # Map matrix indices back to score values through bin_edges rather than a
                # hard-coded offset, so changing min_edge cannot desynchronise this cell.
                left_edge, right_edge = int(bin_edges[i]), int(bin_edges[j])
                print(left_edge, right_edge)
                final_bin_edges.append(left_edge)
    final_bin_edges.append(max_edge)
else:
    print('No solution found.')

Total cost = -3.415203361307368e-07

300 321
321 343
343 377
377 378
378 379
379 380
380 381
381 383
383 384
384 850


In [None]:
# Show the learned bin edges (left edges of the selected bins followed by max_edge).
print(final_bin_edges)

# Testing

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [None]:
# threshold: PSI value the evaluation cells compare each day pair against.
# epsilon:   smoothing constant inside the PSI log-ratio (same value as used in training).
threshold, epsilon = 0.01, 1e-8

## Training Accuracy & F1

In [None]:
num_days_train = X_train.shape[0]

# Fraction of each training day's scores in every learned bin. Each day's histogram is
# computed once here instead of twice (once per neighbouring pair).
train_bin_fractions = [
    np.histogram(day_scores, bins=final_bin_edges)[0] / X_train.shape[1]
    for day_scores in X_train
]

# Predict 1.0 when the PSI between two consecutive days exceeds the threshold, else 0.0.
# The prediction is computed from PSI alone: the previous version derived it from the true
# label, which made the tie case (psi == threshold) always count as an error.
y_train_pred = []
for previous_day, current_day in zip(train_bin_fractions[:-1], train_bin_fractions[1:]):
    psis = (previous_day - current_day) * np.log((previous_day + epsilon) / (current_day + epsilon))
    psi = np.sum(psis)
    y_train_pred.append(1.0 if psi > threshold else 0.0)

In [None]:
# Score the training predictions against the aligned training labels.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_confusion = confusion_matrix(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred)

print("Training Accuracy:", train_accuracy)
print("Training F1:", train_f1)
print(train_confusion)
print(train_report)

## Testing Accuracy & F1

In [None]:
num_days_test = X_test.shape[0]

# Fraction of each test day's scores in every learned bin. Each day's histogram is
# computed once here instead of twice (once per neighbouring pair).
test_bin_fractions = [
    np.histogram(day_scores, bins=final_bin_edges)[0] / X_test.shape[1]
    for day_scores in X_test
]

# Predict 1.0 when the PSI between two consecutive days exceeds the threshold, else 0.0.
# The prediction is computed from PSI alone: the previous version derived it from the true
# label, which made the tie case (psi == threshold) always count as an error.
y_test_pred = []
for previous_day, current_day in zip(test_bin_fractions[:-1], test_bin_fractions[1:]):
    psis = (previous_day - current_day) * np.log((previous_day + epsilon) / (current_day + epsilon))
    psi = np.sum(psis)
    y_test_pred.append(1.0 if psi > threshold else 0.0)

In [None]:
# Score the test predictions against the aligned test labels.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Testing Accuracy:", test_accuracy)
print("Testing F1:", test_f1)
print(test_confusion)
print(test_report)