In [1]:
import math
import multiprocessing
import time
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.neighbors import KDTree
from tqdm import tqdm


In [2]:

# Feature columns; every scaled/weighted array below is laid out in this order
columns_order = [
    'IMPERV', 'HEIGHT', 'COAST', 'ELEV', 'POP',
    'RH', 'SP', 'PRECIP', 'T_2M_COR', 'WS', 'TCC',
    'CAPE', 'BLH', 'SSR', 'SOLAR_ELEV', 'DECL',
]

# Read the min/max table; its first column becomes the index so that rows
# can be looked up by feature name
min_max_df = pd.read_csv("data/CLUSTER3_min_max.csv", sep=';')
min_max_df = min_max_df.set_index(min_max_df.columns[0])

# Fail early if the table lacks a row for any required feature
missing_cols = [col for col in columns_order if col not in min_max_df.index]
if len(missing_cols) > 0:
    raise ValueError(f"The following required columns are missing in CLUSTER3_min_max.csv: {missing_cols}")

# Extract both bounds in one lookup, as float arrays aligned with columns_order
bounds = min_max_df.loc[columns_order, ['min', 'Max']].astype(float)
min_vals = bounds['min'].to_numpy()
max_vals = bounds['Max'].to_numpy()

# A feature whose min equals its max has zero range; use 1.0 there so the
# scaling division never divides by zero
value_range = max_vals - min_vals
scale = np.where(value_range == 0, 1.0, value_range)

def min_max_scale(data, min_vals, scale):
    """Shift ``data`` by ``min_vals`` and divide the result by ``scale``.

    Args:
        data: Values to rescale (scalar or array-like supporting ``-`` and ``/``).
        min_vals: Offset subtracted from ``data``.
        scale: Divisor applied after the shift; the caller is responsible
            for making sure it contains no zeros.

    Returns:
        ``(data - min_vals) / scale``, following the operands' own
        broadcasting rules.
    """
    shifted = data - min_vals
    return shifted / scale

In [3]:
# Load importances, indexed by the feature name used in the importances file
importances_df = pd.read_csv('AOA_data/importances_CL3.csv').set_index('Feature')

# Map feature names (as indexed in importances_df) to the column names used
# in columns_order
importance_to_column_map = {
    'IMPERV': 'IMPERV',
    'HEIGHT': 'HEIGHT',
    'COAST': 'COAST',
    'ELEV': 'ELEV',
    'POP': 'POP',
    'RH': 'RH',
    'SP': 'SP',
    'PRECIP': 'PRECIP',
    'T_2M': 'T_2M_COR',  # Important mapping
    'wind_speed': 'WS',
    'TCC': 'TCC',
    'CAPE': 'CAPE',
    'BLH': 'BLH',
    'SSR': 'SSR',
    'SOLAR_ELEV': 'SOLAR_ELEV',
    'DECL': 'DECL'
}

# Invert the mapping so the lookup can be driven by columns_order. Iterating
# the dict instead would tie the weight order to the dict's insertion order
# rather than to the column order the scaled arrays actually use.
column_to_importance_map = {
    column: feature for feature, column in importance_to_column_map.items()
}

# A column without a mapping would otherwise be dropped silently, leaving a
# weights array that is shorter than (and misaligned with) the feature matrix
unmapped_columns = [col for col in columns_order if col not in column_to_importance_map]
if unmapped_columns:
    raise ValueError(f"No importance mapping defined for columns: {unmapped_columns}")

# Build weights array with exactly one entry per column, in columns_order
weights = np.array([
    importances_df.loc[column_to_importance_map[column], 'Importance']
    for column in columns_order
])


In [4]:
# File path
test_file = 'data/CLUSTER3_VALIDATION_cleaned_cities.csv'

# columns_order is deliberately NOT re-declared in this cell: min_vals, scale
# and weights were all built against the columns_order defined with the
# min/max scaling setup, and a second copy here could silently drift from it.

# weights must hold exactly one entry per feature column; check explicitly so
# a mismatch fails here with a clear message instead of broadcasting wrongly
if np.shape(weights) != (len(columns_order),):
    raise ValueError(
        f"weights has shape {np.shape(weights)}, expected ({len(columns_order)},) "
        "to match columns_order"
    )

# Load test data with LC_CORINE and min-max scale the feature columns
test_df = pd.read_csv(test_file, usecols=columns_order + ['LC_CORINE'])
test_scaled = min_max_scale(test_df[columns_order].to_numpy(), min_vals, scale)
test_df_scaled = pd.DataFrame(test_scaled, columns=columns_order)
test_df_scaled['LC_CORINE'] = test_df['LC_CORINE'].values
print('test data ready, working on trees now')

# Apply the weights once for all rows (loop-invariant) rather than per class
test_weighted = test_scaled * weights
test_lc_values = test_df['LC_CORINE'].to_numpy()

# Build one KDTree per LC_CORINE class (class ids 1..15), over the weighted
# features of that class's test points
trees_by_class = {}
for lc_class in range(1, 16):
    class_mask = test_lc_values == lc_class
    if not class_mask.any():
        # Prints the id of each class that has no test points; such a class
        # gets no tree and is absent from trees_by_class
        print(lc_class)
        continue
    trees_by_class[lc_class] = KDTree(test_weighted[class_mask], leaf_size=40)


test data ready, working on trees now
5


In [5]:
train_file = 'data/CLUSTER3_TRAIN_cleaned_cities.csv'

# Per-chunk results; reset here so re-running the cell does not append to
# distances left over from a previous run
distances_list = []
chunk_size = 1_000_000
print("Processing TRAIN data in chunks...")
chunk_idx = 0
# perf_counter is monotonic, so reported durations cannot be skewed by
# system clock adjustments during this multi-hour loop
chunk_start_time = time.perf_counter()

train_chunks = pd.read_csv(
    train_file, usecols=columns_order + ['LC_CORINE'], chunksize=chunk_size
)
for chunk_idx, chunk in enumerate(train_chunks, start=1):
    # Scale and weight the features exactly as was done for the tree points
    chunk_scaled = min_max_scale(chunk[columns_order].to_numpy(), min_vals, scale)
    weighted_features = chunk_scaled * weights
    lc_values = chunk['LC_CORINE'].to_numpy()

    # Rows keep np.inf unless a tree for their class supplies a distance
    chunk_dists = np.full(len(chunk), np.inf)

    # For each class in this chunk, query the corresponding KDTree
    for lc_class in np.unique(lc_values):
        mask = lc_values == lc_class

        if lc_class not in trees_by_class:
            # No tree exists for this class, so these rows stay at inf.
            # Say which class and how many rows so the gap is traceable.
            print(
                f"Chunk {chunk_idx}: no KDTree for LC_CORINE class {lc_class}; "
                f"{int(mask.sum())} rows left with distance inf"
            )
            continue

        # Nearest-neighbour distance (k=1) within the same class; the result
        # has shape (n_rows, 1), so take its single column
        dists, _ = trees_by_class[lc_class].query(weighted_features[mask], k=1)
        chunk_dists[mask] = dists[:, 0]

    # Save distances (kept as DataFrames so they can be concatenated later)
    distances_list.append(pd.DataFrame(chunk_dists, columns=['dist']))

    duration = time.perf_counter() - chunk_start_time
    print(f"Chunk {chunk_idx} processed in {duration:.2f}s")
    chunk_start_time = time.perf_counter()


Processing TRAIN data in chunks...
Chunk 1 processed in 319.42s
Chunk 2 processed in 336.29s
Chunk 3 processed in 374.01s
Chunk 4 processed in 332.08s
Chunk 5 processed in 388.23s
Chunk 6 processed in 394.34s
Chunk 7 processed in 367.44s
Chunk 8 processed in 346.32s
Chunk 9 processed in 289.43s
Chunk 10 processed in 278.19s
Chunk 11 processed in 281.28s
Chunk 12 processed in 265.24s
Chunk 13 processed in 358.34s
Chunk 14 processed in 357.71s
Chunk 15 processed in 431.86s
Chunk 16 processed in 380.23s
Chunk 17 processed in 394.37s
Chunk 18 processed in 362.25s
Chunk 19 processed in 353.45s
Chunk 20 processed in 365.60s
Chunk 21 processed in 324.94s
Chunk 22 processed in 315.83s
Chunk 23 processed in 416.01s
Chunk 24 processed in 397.36s
Chunk 25 processed in 407.20s
Chunk 26 processed in 404.10s
Chunk 27 processed in 395.61s
Chunk 28 processed in 355.89s
Chunk 29 processed in 290.30s
Chunk 30 processed in 318.77s
Chunk 31 processed in 333.41s
Chunk 32 processed in 308.91s
Chunk 33 proce

In [6]:
# Write distances to CSV: one 'dist' column, rows in the order the chunks
# were appended to distances_list
all_distances = pd.concat(distances_list, ignore_index=True)
output_file = 'results/CLUSTER3_TRAIN_min_dist.csv'

# Create the output directory if needed, so the write cannot fail on a
# missing folder after the long distance computation
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

all_distances.to_csv(output_file, index=False)
print(f"\nAll distances written to: {output_file}")



All distances written to: results/CLUSTER3_TRAIN_min_dist.csv
