In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.linear_model import LinearRegression  # type: ignore
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import time
import matplotlib.pyplot as plt

import json

from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import dask.dataframe as dd
import os

# Per-dataset settings, keyed by dataset name.
# The "M", "efC" and "efS" values are looked up by dataset name when the
# training-data file path is built; "label" is a display name for the dataset.
# Presumably M / efC / efS are HNSW index build/search parameters — TODO confirm.
# NOTE(review): the "GLOVE100" key carries the label "GLOVE1M" — confirm the
# mismatch between key and label is intended.
dataset_params = {
    name: {"M": m_val, "efC": efc_val, "efS": efs_val, "li": li_val, "label": label}
    for name, m_val, efc_val, efs_val, li_val, label in (
        # name,       M,  efC,  efS, li, label
        ("SIFT100M", 32, 500, 500, 1, "SIFT100M"),
        ("GIST1M", 32, 500, 1000, 1, "GIST1M"),
        ("GLOVE100", 16, 500, 500, 1, "GLOVE1M"),
        ("DEEP100M", 32, 500, 750, 1, "DEEP100M"),
        ("T2I100M", 80, 1000, 2500, 2, "T2I100M"),
    )
}

# Seed constant for the notebook.
SEED = 42

# Column-name groups. They are concatenated (together with a few extra
# columns) into the list of CSV columns that gets loaded.
index_metric_feats = [
    "step",
    "dists",
    "inserts",
]
neighbor_distances_feats = [
    "first_nn_dist",
    "nn_dist",
    "furthest_dist",
]
neighbor_stats_feats = [
    "avg_dist",
    "variance",
    "percentile_25",
    "percentile_50",
    "percentile_75",
]
data_dims_feats = [
    "dim_l2_norm",
    "dim_l1_norm",
    "dim_mean",
    "dim_median",
    "dim_std",
    "dim_var",
    "dim_min",
    "dim_max",
    "dim_range",
    "dim_energy",
    "dim_skewness",
    "dim_kurtosis",
    "dim_perc_25",
    "dim_perc_75",
    "dim_perc_95",
]
neighbor_stats_feats_new = [
    "std",
    "range",
    "energy",
    "skewness",
    "kurtosis",
    "percentile_95",
]

In [None]:
# Experiment selection: which dataset / run file to load.
ds_name = "SIFT100M"
k = 50
# NOTE(review): li is set by hand here and not read from
# dataset_params[ds_name]["li"] (which is 1 for SIFT100M) — confirm intended.
li = 5
queries = 10000

# Pull the per-dataset M / efC / efS values used in the file name.
M, efC, efS = (dataset_params[ds_name][key] for key in ("M", "efC", "efS"))

# Only these columns are read from the CSV: ids/timing, the feature groups,
# and the trailing "r" and "feats_collect_time_ms" columns.
columns_to_load = [
    "qid",
    "elaps_ms",
    *index_metric_feats,
    *neighbor_distances_feats,
    *neighbor_stats_feats,
    *data_dims_feats,
    *neighbor_stats_feats_new,
    "r",
    "feats_collect_time_ms",
]

# Absolute path of the training-data file for the selected configuration.
datapath = (
    f"/data/mchatzakis/et_training_data/early-stop-training/{ds_name}/k{k}/"
    f"M{M}_efC{efC}_efS{efS}_qs{queries}_li{li}_imp.txt"
)

# Read through dask, then materialise the result in memory with compute().
all_queries_dask = dd.read_csv(datapath, usecols=columns_to_load)
all_queries_data = all_queries_dask.compute()

In [None]:
# Preview the first rows of the loaded data as a quick sanity check.
all_queries_data.head()

In [None]:
# Show 10 randomly chosen rows. The draw is seeded with the notebook's SEED
# constant so the displayed sample is identical on every Restart & Run All.
all_queries_data.sample(10, random_state=SEED)

In [None]:
# Summary statistics for the loaded columns.
all_queries_data.describe()

In [None]:
# Average feature-collection time: mean of the "feats_collect_time_ms" column
# over all loaded rows (milliseconds, going by the column name).
all_queries_data["feats_collect_time_ms"].mean()