## Full evaluation on the Telecom Italia (Milan) dataset

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited project
# modules are re-imported automatically before a cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from copy import deepcopy
import os
import json
import numpy as np
import pandas as pd

from lib.problems import ProblemDataset
from baselines.utils import eval_method
from baselines.CPP import methods_registry
from baselines.CPP.methods_registry import CUDA_METHODS
from lib.ltr.utils import load_model
from lib.ltr.ccp.method import CKMeans

In [None]:
# --- Evaluation configuration -------------------------------------------
# Number of instances to sample from the dataset (used as "sample_size").
SIZE = 1# 00
# Base random seed; the evaluated seeds are SEED, SEED+1, ..., SEED+NSEEDS-1.
SEED = 1
# Number of seeds (repetitions); results are reshaped to (NSEEDS, -1).
NSEEDS = 1 #3
# Forwarded as `cuda=` to eval_method; result ids of methods listed in
# CUDA_METHODS get a "_cuda" suffix when this is enabled.
CUDA = False
# Forwarded as `k_not_known=` to eval_method for every method except
# "random_select", which is always run with k_not_known=True.
K_NOT_KNOWN = False
# Problem size tag; in this notebook it is only used to build SAVE_DIR.
# NOTE(review): DS_PTH below hard-codes "n2020" instead of using N - keep
# the two in sync when switching datasets.
N = 2020 #500 #2020 #200
# Forwarded as `num_cores=` to the "ccp_mh" and "rpack" baselines.
CORES = 4
# Time budget from which the "ccp_mh" and "rpack" limits are derived
# (presumably seconds - TODO confirm against the baseline implementations).
T_LIM = 180 #480 #900 #180
DSET = "telecom_italia"

# Output locations: baseline and model results go to separate sub-directories.
SAVE_DIR = f"./outputs_eval/{DSET}_ccp{N}/"
BL_DIR = os.path.join(SAVE_DIR, "baselines")
M_DIR = os.path.join(SAVE_DIR, "model")
# Keyword arguments for ds.sample(); also forwarded as `sample_cfg=`.
smp_cfg = {"sample_size": SIZE}
# Cost value the cells below compare against to detect infeasible runs.
INF = float("inf")

# Alternative data path for the sub-sampled test set (currently disabled):
#DS_PTH = f"data/CCP/benchmark/{DSET}/sub/test_n{N}_s100_cap1_1_seed4321.npz"
DS_PTH = f"data/CCP/benchmark/{DSET}/full_test_n2020_k25_cap1_1.npz"
# Model checkpoint loaded via load_model() in the "ncc_*" cells.
CKPT = "outputs/final/italia_tel_ccp_200/gnn_pool_pointwise/2023-01-15_10-58-21_399618/checkpoints/epoch=197_val_acc=0.9814.ckpt"

# Forwarded as `num_init=` to the baselines and to CKMeans.
NUM_INIT = 8

In [None]:
# Containers filled by the method cells below:
#   metrics: method id -> summary dict
#   RESULTS: method id -> results returned by eval_method
metrics, RESULTS = {}, {}
# one evaluation seed per repetition: SEED, SEED+1, ..., SEED+NSEEDS-1
seeds = list(range(SEED, SEED + NSEEDS))
# load the CCP dataset from DS_PTH and draw the sample described by smp_cfg
ds = ProblemDataset(problem="CCP", seed=SEED, data_pth=DS_PTH).sample(**smp_cfg)

Our evaluation mainly hinges on the random selection method. This method sequentially assigns a label to each node
at random while trying to respect the weight constraints.
For the subsamples of the telecom dataset the value of k is not known, but the results depend strongly on it,
so we set k to the smallest value for which the random method finds a feasible solution.
This choice is also motivated by the fact that, whenever another method does not find a feasible solution for
the same value of k, we replace its cost with the cost achieved by the random method.

In [None]:

# Random selection baseline. It is always evaluated on CPU and with
# k_not_known=True, independent of the CUDA / K_NOT_KNOWN settings.
mthd = "random_select"
result, smry = eval_method(
    method=getattr(methods_registry, mthd),
    dataset=ds,
    seeds=seeds,
    save_dir=BL_DIR,
    cuda=False,
    k_not_known=True,
    sample_cfg=smp_cfg,
    method_str=mthd,
    verbose=False,
)
m_id = mthd
RESULTS[m_id], metrics[m_id] = result, smry
print(smry)
rs_res = deepcopy(result)

if "full" not in DS_PTH:
    # Only for data paths without "full" in their name: take, per instance,
    # the smallest number of clusters ('nc') reported over all seeds and
    # store it as `num_components` of that instance.
    nc_per_run = np.array([run['nc'] for run in rs_res])
    k_vals = nc_per_run.reshape(NSEEDS, -1).min(axis=0)
    data = ds.data.copy()
    assert len(k_vals) == len(data), f"{len(k_vals)} != {len(data)}"
    for idx, instance in enumerate(data):
        data[idx] = instance.update(num_components=int(k_vals[idx]))
    ds.data = data


In [None]:
# Random-center + kNN baseline.
# This cell runs unconditionally (no CUDA guard): the per-instance mean cost
# `rnd_mean_cost` computed here is the fallback cost that the baseline cells
# below substitute for their infeasible runs.
mthd = "random_center_knn"
result, smry = eval_method(
    method=getattr(methods_registry, mthd),
    dataset=ds,
    seeds=seeds,
    save_dir=BL_DIR,
    cuda=CUDA,
    k_not_known=K_NOT_KNOWN,
    sample_cfg=smp_cfg,
    method_str=mthd,
    verbose=False,
)
m_id = f"{mthd}{'_cuda' if CUDA and mthd in CUDA_METHODS else ''}"
RESULTS[m_id] = result
rnd_res = deepcopy(result)
costs = np.array([r['tot_center_dist'] for r in rnd_res])

# Largest cost among the runs that are not marked infeasible (cost == INF).
# np.nanmax raises an opaque "zero-size array" ValueError on an empty
# selection, so fail with an explicit message when nothing was feasible.
feasible_costs = costs[costs != INF]
if feasible_costs.size == 0:
    raise ValueError(
        f"'{mthd}' did not produce a single feasible run; "
        "cannot derive a fallback cost for infeasible runs."
    )
max_cost = np.nanmax(feasible_costs)
print(max_cost)

# rows: seeds, columns: instances
costs = costs.reshape(NSEEDS, -1)
# instances for which no seed produced a feasible solution
is_inf = np.all(costs == INF, axis=0)
print(f"inf: {is_inf.sum()}")
# infeasible runs are charged the worst feasible cost that was observed
costs[costs == INF] = max_cost
# buffer the per-instance mean cost as drop-in replacement for other methods
rnd_mean_cost = np.nanmean(costs, axis=0)
# NOTE(review): only the mean is adapted here; unlike the other baseline
# cells, 'center_dist_std' keeps the value returned by eval_method.
smry['center_dist_mean'] = rnd_mean_cost.mean()
print(smry)
metrics[m_id] = smry

In [None]:
# Top-k center + kNN baseline.
mthd = "topk_center_knn"
# run when CUDA is disabled, or when the method is listed in CUDA_METHODS
if not CUDA or mthd in CUDA_METHODS:
    result, smry = eval_method(
        method=getattr(methods_registry, mthd),
        dataset=ds,
        seeds=seeds,
        save_dir=BL_DIR,
        cuda=CUDA,
        k_not_known=K_NOT_KNOWN,
        sample_cfg=smp_cfg,
        method_str=mthd,
        verbose=False,
    )
    cuda_tag = "_cuda" if CUDA and mthd in CUDA_METHODS else ""
    m_id = f"{mthd}{cuda_tag}"
    RESULTS[m_id] = result

    # cost matrix with one row per seed and one column per instance
    res = deepcopy(result)
    costs = np.array([run['tot_center_dist'] for run in res]).reshape(NSEEDS, -1)
    # infeasible runs (cost == INF) are charged the mean cost that the
    # random baseline achieved on the same instance
    for col, fallback in enumerate(rnd_mean_cost):
        infeasible = costs[:, col] == INF
        costs[infeasible, col] = fallback

    # overwrite the summary with statistics of the adapted costs
    smry['center_dist_mean'] = np.mean(costs)
    smry['center_dist_std'] = np.mean(np.std(costs, axis=0))
    print(f"adapted summary: {smry}")
    metrics[m_id] = smry


In [None]:
# "ccp_mh" baseline, limited to T_LIM in total and T_LIM//10 per local step.
mthd = "ccp_mh"
# run when CUDA is disabled, or when the method is listed in CUDA_METHODS
if not CUDA or mthd in CUDA_METHODS:
    result, smry = eval_method(
        method=getattr(methods_registry, mthd),
        dataset=ds,
        seeds=seeds,
        save_dir=BL_DIR,
        cuda=CUDA,
        k_not_known=K_NOT_KNOWN,
        sample_cfg=smp_cfg,
        num_init=NUM_INIT,
        num_cores=CORES,
        t_total=T_LIM,
        t_local=T_LIM//10,
        g_initial=40,
        #raise_error=True
    )
    cuda_tag = "_cuda" if CUDA and mthd in CUDA_METHODS else ""
    m_id = f"{mthd}{cuda_tag}"
    RESULTS[m_id] = result

    # cost matrix with one row per seed and one column per instance
    res = deepcopy(result)
    costs = np.array([run['tot_center_dist'] for run in res]).reshape(NSEEDS, -1)
    # infeasible runs (cost == INF) are charged the mean cost that the
    # random baseline achieved on the same instance
    for col, fallback in enumerate(rnd_mean_cost):
        infeasible = costs[:, col] == INF
        costs[infeasible, col] = fallback

    # overwrite the summary with statistics of the adapted costs
    smry['center_dist_mean'] = np.mean(costs)
    smry['center_dist_std'] = np.mean(np.std(costs, axis=0))
    print(f"adapted summary: {smry}")
    metrics[m_id] = smry

In [None]:
# "rpack" baseline; all of its timeouts are derived from T_LIM.
mthd = "rpack"
# run when CUDA is disabled, or when the method is listed in CUDA_METHODS
if not CUDA or mthd in CUDA_METHODS:
    result, smry = eval_method(
        method=getattr(methods_registry, mthd),
        dataset=ds,
        seeds=seeds,
        save_dir=BL_DIR,
        cuda=CUDA,
        k_not_known=K_NOT_KNOWN,
        sample_cfg=smp_cfg,
        num_init=NUM_INIT,
        num_cores=CORES,
        gurobi_timeout=T_LIM//8,  # 4 # 2
        timeout=T_LIM,
        timeout_kill=(T_LIM*2)+1,
        verbose=False,
    )
    cuda_tag = "_cuda" if CUDA and mthd in CUDA_METHODS else ""
    m_id = f"{mthd}{cuda_tag}"
    RESULTS[m_id] = result

    # cost matrix with one row per seed and one column per instance
    res = deepcopy(result)
    costs = np.array([run['tot_center_dist'] for run in res]).reshape(NSEEDS, -1)
    # infeasible runs (cost == INF) are charged the mean cost that the
    # random baseline achieved on the same instance
    for col, fallback in enumerate(rnd_mean_cost):
        infeasible = costs[:, col] == INF
        costs[infeasible, col] = fallback

    # overwrite the summary with statistics of the adapted costs
    smry['center_dist_mean'] = np.mean(costs)
    smry['center_dist_std'] = np.mean(np.std(costs, axis=0))
    print(f"adapted summary: {smry}")
    metrics[m_id] = smry

In [None]:
# Capacitated k-means baseline with "ckm++" initialization.
mthd = "cap_kmeans"
# run when CUDA is disabled, or when the method is listed in CUDA_METHODS
if not CUDA or mthd in CUDA_METHODS:
    result, smry = eval_method(
        method=getattr(methods_registry, mthd),
        dataset=ds,
        seeds=seeds,
        save_dir=BL_DIR,
        cuda=CUDA,
        k_not_known=K_NOT_KNOWN,
        sample_cfg=smp_cfg,
        verbose=False,
        num_init=NUM_INIT,
        tol=1e-4,
        max_iter=80,
        init_method="ckm++",
    )
    cuda_tag = "_cuda" if CUDA and mthd in CUDA_METHODS else ""
    m_id = f"{mthd}{cuda_tag}"
    RESULTS[m_id] = result

    # cost matrix with one row per seed and one column per instance
    res = deepcopy(result)
    costs = np.array([run['tot_center_dist'] for run in res]).reshape(NSEEDS, -1)
    # infeasible runs (cost == INF) are charged the mean cost that the
    # random baseline achieved on the same instance
    for col, fallback in enumerate(rnd_mean_cost):
        infeasible = costs[:, col] == INF
        costs[infeasible, col] = fallback

    # overwrite the summary with statistics of the adapted costs
    smry['center_dist_mean'] = np.mean(costs)
    smry['center_dist_std'] = np.mean(np.std(costs, axis=0))
    print(f"adapted summary: {smry}")
    metrics[m_id] = smry


In [None]:
# NCC, greedy variant: the last 'opt_last_frac' fraction of all nodes,
# ordered by their absolute priority, is greedily assigned to the
# closest center (a single sample, i.e. opt_last_samples=1).

mthd = "ncc_greedy"
model = load_model("ccp", CKPT)

ckmeans = CKMeans(
    max_iter=50,
    num_init=NUM_INIT,
    model=model,
    seed=SEED,
    nbh_knn=25,
    init_method="ckm++",
    convergence_criterion="inertia",
    tol=0.001,
    pre_iter=0, #2 for full
    verbose=False,
    opt_last_frac=0.25,
    opt_last_samples=1, # no multiple samples
    opt_last_prio=True
)

result, smry = eval_method(
    method=ckmeans.inference,
    dataset=ds,
    seeds=seeds,
    save_dir=M_DIR,
    cuda=CUDA,
    k_not_known=K_NOT_KNOWN,
    sample_cfg=smp_cfg,
    method_str=mthd,
)
cuda_tag = "_cuda" if CUDA and mthd in CUDA_METHODS else ""
m_id = f"{mthd}{cuda_tag}"
RESULTS[m_id] = result

# Cost matrix with one row per seed and one column per instance.
# NOTE(review): unlike the baseline cells, infeasible runs (cost == INF) are
# NOT replaced by the random baseline's cost here - confirm this is intended.
res = deepcopy(result)
costs = np.array([run['tot_center_dist'] for run in res]).reshape(NSEEDS, -1)

smry['center_dist_mean'] = np.mean(costs)
smry['center_dist_std'] = np.mean(np.std(costs, axis=0))
print(f"adapted summary: {smry}")
metrics[m_id] = smry

In [None]:
# NCC, sampling variant: several assignments (opt_last_samples=64) are
# sampled for the last 'opt_last_frac' fraction of all nodes and the
# best one is selected.

mthd = "ncc_samp"
model = load_model("ccp", CKPT)

ckmeans = CKMeans(
    max_iter=50,
    num_init=NUM_INIT,
    model=model,
    seed=SEED,
    nbh_knn=25,
    init_method="ckm++",
    convergence_criterion="inertia",
    permute_k=False,
    tol=0.001,
    pre_iter=0,
    verbose=False,
    opt_last_frac=0.25,
    opt_last_samples=64,
    opt_last_prio=True,
)
# 2, 25, 128  (presumably other values tried for opt_last_samples)

result, smry = eval_method(
    method=ckmeans.inference,
    dataset=ds,
    seeds=seeds,
    save_dir=M_DIR,
    cuda=CUDA,
    k_not_known=K_NOT_KNOWN,
    sample_cfg=smp_cfg,
    method_str=mthd,
)
cuda_tag = "_cuda" if CUDA and mthd in CUDA_METHODS else ""
m_id = f"{mthd}{cuda_tag}"
RESULTS[m_id] = result

# Cost matrix with one row per seed and one column per instance.
# NOTE(review): unlike the baseline cells, infeasible runs (cost == INF) are
# NOT replaced by the random baseline's cost here - confirm this is intended.
res = deepcopy(result)
costs = np.array([run['tot_center_dist'] for run in res]).reshape(NSEEDS, -1)

smry['center_dist_mean'] = np.mean(costs)
smry['center_dist_std'] = np.mean(np.std(costs, axis=0))
print(f"adapted summary: {smry}")
metrics[m_id] = smry

In [None]:
# Tabulate the collected summaries: one column per method id, one row per
# summary entry. The bare name on the last line displays the table.
metric_df = pd.DataFrame.from_dict(metrics, orient="columns")
metric_df


In [None]:
# Persist the metrics table as JSON below SAVE_DIR.
save_metrics = metric_df.to_dict()
os.makedirs(SAVE_DIR, exist_ok=True)
file = os.path.join(SAVE_DIR, f"full_results.json")
if not os.path.exists(file):
    # nothing to overwrite, save right away
    sv = True
else:
    # ask interactively before replacing an existing result file
    inp = input("File exists! Overwrite? (y/n)")
    sv = str(inp).lower() == "y"
if sv:
    with open(file, 'w') as fp:
        json.dump(save_metrics, fp)

In [None]:
# Ad-hoc inspection of a previously stored baseline result file.
# NOTE(review): the hard-coded path points to a "shanghai_telecom" rpack run,
# not to the dataset configured via DSET in this notebook - confirm intended.
# NOTE(security): torch.load unpickles the file; only load trusted files.
# This also rebinds `res`, which earlier cells use for copied results.
import torch

res = torch.load("outputs_eval/shanghai_telecom_ccp2373/baselines/rpack/full_test_n2373_k40_cap1_1/eval_results_full_1.pkl")
res
