# Loading data

In [3]:
import pandas as pd

In [4]:
# Load the input table from a CSV path given relative to the working directory.
DATA_PATH = "../data/new_pseudo_data.csv"
data = pd.read_csv(DATA_PATH)
# Alternative source kept for reference:
# data = generate_data()

In [5]:
# Preview the loaded frame (the recorded output is a truncated view of rows and columns).
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,Labels
0,642,638,628,638,630,619,584,680,697,569,...,587,481,537,560,549,576,563,569,539,1
1,876,787,520,614,674,765,579,748,552,627,...,665,840,503,812,568,488,611,621,649,1
2,536,839,465,773,534,598,820,642,636,594,...,546,804,818,610,463,545,536,574,540,1
3,485,621,765,559,799,513,783,556,668,779,...,564,631,490,515,404,684,513,640,575,1
4,535,793,726,637,566,804,845,618,540,701,...,660,638,507,807,469,606,803,626,657,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,557,569,570,519,590,575,549,585,524,620,...,577,634,475,598,644,594,660,535,579,1
996,578,606,490,536,586,550,530,577,528,531,...,533,681,578,574,487,520,538,540,582,0
997,520,569,650,540,502,603,535,541,526,555,...,582,559,537,615,543,552,533,566,521,1
998,525,638,590,564,643,535,532,575,585,544,...,572,515,577,546,588,588,571,595,576,0


In [6]:
# Class balance of the labels as loaded, before the 0/1 swap in the next cell.
data["Labels"].value_counts()

1    710
0    290
Name: Labels, dtype: int64

In [7]:
# Swap the label encoding: a label equal to 0 becomes 1.0, anything else becomes 0.0.
# Note: this overwrites the column in place, so re-running the cell swaps it back.
label_was_zero = data["Labels"] == 0.0
data["Labels"] = label_was_zero.astype(float)

In [8]:
# Re-check the class balance after the swap: the two counts should have traded places.
data["Labels"].value_counts()

0.0    710
1.0    290
Name: Labels, dtype: int64

In [9]:
# data = data[0: 35]

In [10]:
# (rows, columns) of the frame; the last column is "Labels".
data.shape

(1000, 10001)

In [11]:
# Class balance of the frame that goes into preprocessing (the subsetting line above is commented out).
data["Labels"].value_counts()

0.0    710
1.0    290
Name: Labels, dtype: int64

# Preprocessing

In [12]:
import numpy as np

In [13]:
# Everything except the last column is treated as the score matrix.
feature_frame = data.iloc[:, :-1]
scores = feature_frame.to_numpy()
print(scores, scores.shape, sep="\n")

[[642 638 628 ... 563 569 539]
 [876 787 520 ... 611 621 649]
 [536 839 465 ... 536 574 540]
 ...
 [520 569 650 ... 533 566 521]
 [525 638 590 ... 571 595 576]
 [624 614 511 ... 560 597 548]]
(1000, 10000)


In [14]:
# The last column of the frame is the label vector.
label_column = data.iloc[:, -1]
labels = label_column.to_numpy()
print(labels.shape)

(1000,)


# Train-Test split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scores,
    labels,
    test_size=0.15,
    random_state=42,
)

In [17]:
# Drop the first label of each split so that the labels pair with the row-to-row
# differences computed later (there is one fewer difference than there are rows).
# The start index is derived from the number of rows in X_* instead of a fixed
# `[1:]`, so re-running this cell does not keep discarding labels.
# NOTE(review): y_test is trimmed the same way as in the original; its consumer is
# not visible in this part of the notebook.
train_start = max(len(y_train) - (X_train.shape[0] - 1), 0)
test_start = max(len(y_test) - (X_test.shape[0] - 1), 0)
y_train = y_train[train_start:]
y_test = y_test[test_start:]

In [18]:
# Shapes of the training features and of the (trimmed) training labels.
for train_part in (X_train, y_train):
    print(train_part.shape)

(850, 10000)
(849,)


In [19]:
# Shapes of the test features and of the (trimmed) test labels.
for test_part in (X_test, y_test):
    print(test_part.shape)

(150, 10000)
(149,)


# Preparing interval frequencies

In [20]:
# Candidate cut points: every integer from min_edge to max_edge inclusive (unit step).
min_edge = 300
max_edge = 850
bin_edges = np.arange(min_edge, max_edge + 1)
len(bin_edges)

551

In [21]:
size = len(bin_edges)
num_days = X_train.shape[0]
# percent_days[i, j, k] (for j < k) is the number of scores in training row i that
# fall in the interval from bin_edges[j] to bin_edges[k]; all other entries stay 0.
# These are raw counts here; they are scaled to percentages in a later cell.
percent_days = np.zeros((num_days, size, size))

for i in range(num_days):
    # Histogram the training row itself so that row i lines up with the training
    # labels. Indexing the full, unshuffled `scores` array here would pair the
    # counts with labels that belong to different rows.
    hist, _ = np.histogram(X_train[i], bins=bin_edges)
    # Prefix sums turn each interval count into a difference:
    # sum(hist[j:k]) == cum_counts[k] - cum_counts[j]
    cum_counts = np.concatenate(([0], np.cumsum(hist)))
    # Keep only the strict upper triangle (k > j); the rest is zeroed by triu.
    percent_days[i] = np.triu(cum_counts[np.newaxis, :] - cum_counts[:, np.newaxis], k=1)

# with open('/content/drive/MyDrive/Distributed Data Discretization/data/percent_days.npy', 'rb') as f:
#     percent_days = np.load(f)

In [22]:
# Number of candidate edges, then number of training rows.
for dimension in (size, num_days):
    print(dimension)

551
850


In [23]:
# Turn raw interval counts into percentages of the number of scores in one row.
# Note: percent_days is rebound from its own value, so running this cell twice
# rescales it twice.
scores_per_day = X_train.shape[1]
percent_days = percent_days / scores_per_day * 100

In [24]:
# Expected layout: (training rows, edges, edges).
percent_days.shape

(850, 551, 551)

## Preparing PSIs

In [25]:
# Small constant added to both the numerator and the denominator of the PSI
# log-ratio so that empty intervals do not produce log(0) or a division by zero.
epsilon = 1e-8 # Smoothing hyperparameter

In [26]:
def _psi_terms(previous_day, current_day):
    """Elementwise (current - previous) * ln((current + epsilon) / (previous + epsilon))."""
    smoothed_ratio = (current_day + epsilon) / (previous_day + epsilon)
    return (current_day - previous_day) * np.log(smoothed_ratio)


# One PSI matrix per pair of consecutive rows, so the result has num_days - 1 entries.
psi = np.array([
    _psi_terms(percent_days[day - 1], percent_days[day])
    for day in range(1, num_days)
])

In [27]:
# One fewer entry along the first axis than percent_days (consecutive-row differences).
psi.shape

(849, 551, 551)

In [28]:
# Aggregate PSI over the entries whose label makes (1 - label) non-zero,
# i.e. label 0 when the labels are binary.
label_is_zero = (1 - y_train).astype(bool)
psi_0 = psi[label_is_zero].sum(axis=0)
psi_0.shape

(551, 551)

In [29]:
# Aggregate PSI over the entries whose label is non-zero (label 1 for binary labels).
label_is_one = y_train.astype(bool)
psi_1 = psi[label_is_one].sum(axis=0)
psi_1.shape

(551, 551)

# Models

In [30]:
from ortools.linear_solver import pywraplp

## Declare the model

In [31]:
# Build a mixed-integer solver on the SCIP backend.
solver = pywraplp.Solver.CreateSolver('SCIP')
# CreateSolver returns None when the requested backend is not available; fail here
# with a clear message instead of an AttributeError in a later cell.
if solver is None:
    raise RuntimeError("SCIP backend is not available in this OR-Tools installation.")

## Create the variables

In [32]:
# x[i, j] is a 0/1 decision variable for each pair i < j (the interval from edge i
# to edge j); the diagonal and lower triangle hold the plain constant 0.
x = np.zeros((size, size), dtype=object)

for i in range(size):
    for j in range(i + 1, size):
        x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]')

## Create the constraints

In [33]:
# Flow conservation at every interior edge i:
#   - at most one selected interval ends at i (column i),
#   - at most one selected interval starts at i (row i),
#   - an interval starts at i exactly when one ends there, so the selected
#     intervals chain together without overlapping.
for i in range(1, size - 1):
    incoming = solver.Sum(x[: i, i])
    outgoing = solver.Sum(x[i, i + 1:])
    solver.Add(incoming <= 1)
    solver.Add(outgoing <= 1)
    solver.Add(incoming == outgoing)

In [34]:
# Exactly one selected interval leaves the first edge and exactly one reaches the last edge.
leaving_first_edge = x[0, 1:]
entering_last_edge = x[:-1, -1]
solver.Add(solver.Sum(leaving_first_edge) == 1)
solver.Add(solver.Sum(entering_last_edge) == 1)

<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7f65c10afe70> >

In [35]:
# Bound the number of selected intervals (bins) from both sides.
min_num_bins = 10
max_num_bins = 20

selected_bin_count = solver.Sum(x.flatten())
solver.Add(selected_bin_count <= max_num_bins)
solver.Add(selected_bin_count >= min_num_bins)

<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7f65c000afc0> >

## Create the objective function

In [None]:
# objective_0: minimise the psi_0 cost summed over the selected intervals.
weighted_choices = psi_0 * x
solver.Minimize(solver.Sum(weighted_choices.flatten()))

# objective_1
# solver.Maximize(solver.Sum((psi_1 * x).flatten()))

## Invoke the solver

In [None]:
# Run the solver; the returned status code is checked in the next cell.
status = solver.Solve()

## Print the solution

In [None]:
# Objective_0
# Report the objective value and the selected intervals as (lower edge, upper edge).
if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
    print(f'Total cost = {solver.Objective().Value()}\n')
    for i in range(size):
        # Only pairs with j > i carry decision variables.
        for j in range(i + 1, size):
            # solution_value() is a float, so a 0/1 variable can come back as
            # e.g. 0.9999999; threshold at 0.5 instead of testing == 1.
            if x[i, j].solution_value() > 0.5:
                # Map indices back to score values through bin_edges rather than
                # a hard-coded offset, so changing min_edge cannot desync the output.
                print(bin_edges[i], bin_edges[j])
else:
    print('No solution found.')

In [None]:
# Objective_1
# if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
#     print(f'Total cost = {solver.Objective().Value()}\n')
#     for i in range(size):
#         for j in range(size):
#             if j > i and x[i, j].solution_value() == 1:
#                 print(i + 300, j + 300)
# else:
#     print('No solution found.')