# Loading data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("../data/new_pseudo_data.csv")
# data = generate_data()

In [7]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,Labels
0,642,638,628,638,630,619,584,680,697,569,...,587,481,537,560,549,576,563,569,539,1
1,876,787,520,614,674,765,579,748,552,627,...,665,840,503,812,568,488,611,621,649,1
2,536,839,465,773,534,598,820,642,636,594,...,546,804,818,610,463,545,536,574,540,1
3,485,621,765,559,799,513,783,556,668,779,...,564,631,490,515,404,684,513,640,575,1
4,535,793,726,637,566,804,845,618,540,701,...,660,638,507,807,469,606,803,626,657,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,557,569,570,519,590,575,549,585,524,620,...,577,634,475,598,644,594,660,535,579,1
996,578,606,490,536,586,550,530,577,528,531,...,533,681,578,574,487,520,538,540,582,0
997,520,569,650,540,502,603,535,541,526,555,...,582,559,537,615,543,552,533,566,521,1
998,525,638,590,564,643,535,532,575,585,544,...,572,515,577,546,588,588,571,595,576,0


In [3]:
data["Labels"].value_counts()

1    710
0    290
Name: Labels, dtype: int64

In [4]:
data["Labels"] = data["Labels"].apply(lambda x: 1.0 if x == 0.0 else 0.0)

In [5]:
data["Labels"].value_counts()

0.0    710
1.0    290
Name: Labels, dtype: int64

In [6]:
# data = data[0: 35]

In [7]:
data.shape

(35, 10001)

In [8]:
data["Labels"].value_counts()

0.0    30
1.0     5
Name: Labels, dtype: int64

# Preprocessing

In [9]:
import numpy as np

In [10]:
scores = data.iloc[:, :-1].to_numpy()
print(scores)
print(scores.shape)

[[642 638 628 ... 563 569 539]
 [876 787 520 ... 611 621 649]
 [536 839 465 ... 536 574 540]
 ...
 [576 606 560 ... 609 591 598]
 [558 643 607 ... 569 570 567]
 [658 504 615 ... 607 615 463]]
(35, 10000)


In [11]:
labels = data.iloc[:, -1].to_numpy()
labels = labels.reshape(-1, 1, 1)
print(labels.shape)

(35, 1)


# Train-Test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(scores, labels, test_size=0.15, random_state=42)

In [14]:
y_train = y_train[1:]
y_test = y_test[1:]

In [15]:
print(X_train.shape)
print(y_train.shape)

(29, 10000)
(28, 1)


In [16]:
print(X_test.shape)
print(y_test.shape)

(6, 10000)
(5, 1)


# Preparing interval frequencies

In [17]:
min_edge, max_edge = 300, 850
bin_edges = np.arange(min_edge, max_edge + 1, 1)
len(bin_edges)

551

In [19]:
size = len(bin_edges)
num_days = X_train.shape[0]
percent_days = np.zeros((num_days, size, size))

for i in range(num_days):
    hist, _ = np.histogram(scores[i], bins=bin_edges)
    for j in range(size - 1):
        for k in range(j + 1, size):
            percent_days[i, j, k] = np.sum(hist[j: k])

In [20]:
print(size)
print(num_days)

551
29


In [21]:
percent_days = percent_days / X_train.shape[1] * 100

In [22]:
percent_days.shape

(29, 551, 551)

## Preparing PSIs

In [23]:
epsilon = 1e-8 # Smoothing hyperparameters

In [24]:
psi = []

for i in range(1, num_days):
    psi.append((percent_days[i] - percent_days[i - 1]) * np.log((percent_days[i] + epsilon) / (percent_days[i - 1] + epsilon)))

psi = np.array(psi)

In [25]:
psi.shape

(28, 551, 551)

# Models

In [46]:
from ortools.linear_solver import pywraplp

## Declare the model

In [47]:
solver = pywraplp.Solver.CreateSolver('SCIP')

## Create the variables

In [48]:
x = np.empty(shape=(size, size), dtype=object)

for i in range(size):
    for j in range(size):
        x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]')

## Create the constraints

In [49]:
# Ensure one direction flow
for i in range(size):
    solver.Add(solver.Sum(x[i, : i + 1]) == 0)

In [50]:
# Each row/column has at most one 1
# Non-overlap bins (a.k.a flow constraint)
for i in range(size):
    solver.Add(solver.Sum(x[: i, i]) <= 1)
    solver.Add(solver.Sum(x[i, i + 1:]) <= 1)
    solver.Add(solver.Sum(x[: i, i]) == solver.Sum(x[i, i + 1:]))

In [51]:
# Ensure in-and-out
solver.Add(solver.Sum(x[0, 1:]) == 1)
solver.Add(solver.Sum(x[0: -1, -1]) == 1)

<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7fc2daf44870> >

In [52]:
# Ensure at most k bins
max_num_bins = 20
solver.Add(solver.Sum(x.flatten()) <= max_num_bins)

<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7fc2eadfe510> >

## Create the objective function

In [53]:
# objective_0
# solver.Minimize(solver.Sum(((1 - y_train) * (x * psi)).flatten()))

# # objective_1
solver.Maximize(solver.Sum((y_train * (x * psi)).flatten()))

## Invoke the solver

In [54]:
status = solver.Solve()

## Print the solution

In [55]:
if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
    print(f'Total cost = {solver.Objective().Value()}\n')
    for i in range(size):
        for j in range(size):
            if x[i, j].solution_value() != 0:
                print(i + 300, j + 300)
else:
    print('No solution found.')

No solution found.


In [56]:
for j in range(size):
    for k in range(size):
        if x[j, k].solution_value() == 1.0:
            print(x[j, k].solution_value())