# Loading data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("../data/new_pseudo_data.csv", index_col=0)

In [3]:
data = data[0: 65]

In [4]:
data["Labels"].value_counts()

1.0    43
0.0    22
Name: Labels, dtype: int64

In [5]:
data["Labels"] = data["Labels"].apply(lambda x: 1.0 if x == 0.0 else 0.0)

In [6]:
data["Labels"].value_counts()

0.0    43
1.0    22
Name: Labels, dtype: int64

# Preprocessing

In [7]:
import numpy as np

In [8]:
scores = data.iloc[:, :-1].to_numpy()
scores

array([[ 0.40959253,  0.94494696, -0.12819402, ...,  0.73761301,
         0.71226716,  0.64223865],
       [ 0.58667494,  0.49399238, -0.08458038, ..., -0.09755332,
         0.64293718,  0.75908591],
       [ 0.47084428,  0.42749783,  0.57500986, ...,  0.26635691,
        -0.07087235,  0.69639212],
       ...,
       [ 0.09490054,  0.4908168 ,  0.66920938, ...,  0.09700787,
         0.33219844,  0.6323111 ],
       [ 0.93496226,  0.4016521 ,  0.70545613, ...,  0.51206863,
         0.99274307,  0.56813488],
       [ 0.31636877,  0.5633752 ,  0.61427664, ...,  0.60725233,
         0.63451596,  0.2321254 ]])

In [9]:
labels = data.iloc[:, -1].to_numpy()
labels

array([0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0.])

# Train-Test split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(scores, labels, test_size=0.15, random_state=42)

In [12]:
print(X_train.shape)
print(y_train.shape)

(55, 10000)
(55,)


In [13]:
print(X_test.shape)
print(y_test.shape)

(10, 10000)
(10,)


# Preparing interval frequencies

In [14]:
bin_edges = list(np.round(np.arange(-1.0, 1.005, 0.005), 3))
bin_edges.insert(0, -np.inf)
bin_edges.append(np.inf)
len(bin_edges)

403

In [15]:
size = len(bin_edges) - 1
num_days = X_train.shape[0]
percent_days = np.zeros((num_days, size, size))

for i in range(num_days):
    hist, _ = np.histogram(scores[i], bins=bin_edges)
    for j in range(size):
        for k in range(j, size):
            percent_days[i, j, k] = np.sum(hist[j: k + 1])

In [16]:
percent_days = percent_days / X_train.shape[1] * 100

# Models

In [17]:
from ortools.linear_solver import pywraplp

## Declare the model

In [18]:
solver = pywraplp.Solver.CreateSolver('SCIP')

## Create the variables

In [19]:
x = np.empty(shape=(size, size), dtype=object)

for i in range(size):
    for j in range(size):
        x[i, j] = solver.IntVar(0, 1, f'x[{i}, {j}]')

## Create the constraints

In [20]:
# Each row has at most one 1
for i in range(size):
    solver.Add(solver.Sum(x[i]) <= 1)

In [None]:
# Non-overlap bins
for i in range(size):
    for j in range(i, size):
        # invalid_bins = [x[a, b] for b in range(size) for a in range(i + 1, j)]
        # solver.Add(x[i, j] + solver.Sum(invalid_bins) <= 1)
        solver.Add(x[i, j] + solver.Sum(x[i + 1: j - 1, :].flatten()) <= 1)

In [None]:
# Ensure in-and-out
solver.Add(solver.Sum(x[0]) == 1)
solver.Add(solver.Sum(x[-1]) == 1)

<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7f1ab91b9bd0> >

In [None]:
# Ensure k bins
num_bins = 10
solver.Add(solver.Sum(x.flatten()) == num_bins)

<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7f1abb44efc0> >

## Create the objective function

In [None]:
from math import log

In [None]:
objective_0 = []
epsilon = 1e-8

In [None]:
# Objective_0
for i in range(1, num_days):
    if labels[i] == 0.0:
        for j in range(size):
            for k in range(size):
                objective_0.append(x[j, k] * (percent_days[i, j, k] - percent_days[i - 1, j, k]) * log((percent_days[i, j, k] + epsilon) / (percent_days[i - 1, j, k] + epsilon)))

In [None]:
solver.Minimize(solver.Sum(objective_0))

## Invoke the solver

In [None]:
status = solver.Solve()

## Print the solution

In [None]:
if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
    print(f'Total cost = {solver.Objective().Value()}\n')
    for i in range(size):
        for j in range(size):
            if x[i, j].solution_value() != 0:
                print(i, j)
else:
    print('No solution found.')

Total cost = 0.0

0 401
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
401 400
