In [6]:
import sys
sys.path.append("..")
from config import *
import os
import json
from scripts.run_active_learning import *
import numpy as np

In [7]:
# Build one "<folder>_indices.json" file per folder, collecting candidate
# initial-point indices from three inputs: the top-5 similarity summary CSV,
# the min/max summary CSV and the mapped-centroids JSON.
# NOTE(review): `pd` and the path constants used below are not imported
# explicitly in this notebook; presumably they arrive via the star imports in
# the first cell -- confirm.

# Paths (MAPPED_CENTROIDS_JSON is used under its existing name; re-assigning it
# to itself was a no-op)
TOP5_SUMMARY = Top5_Similarity_Summary
MIN_MAX_SUMMARY = EDX_min_max_summary
OUTPUT_DIR = DATA_CLEAN_InIT_CHOICES
# Create output dir
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load inputs
top5_df = pd.read_csv(TOP5_SUMMARY)
minmax_df = pd.read_csv(MIN_MAX_SUMMARY)
with open(MAPPED_CENTROIDS_JSON, 'r') as f:
    centroids_mapped = json.load(f)

# Folder labels as strings, converted once instead of on every loop iteration
top5_folders = top5_df['Folder'].astype(str)
minmax_folders = minmax_df['Folder'].astype(str)

# Every distinct folder label that appears in either summary
short_folders = sorted(set(top5_folders) | set(minmax_folders))

for folder in short_folders:
    folder_indices = {}

    # Top5 Similarity indices (rows whose folder label starts with `folder`)
    top5_indices = top5_df[top5_folders.str.startswith(folder)]['index'].tolist()
    if top5_indices:
        folder_indices['Top5Similarity'] = top5_indices

    # Max/Min Comp grouping.
    # Whole columns are read with .tolist() rather than row-by-row through
    # iterrows(): iterrows() coerces each row to a single dtype, which can turn
    # integer indices into floats or numpy scalars before they reach json.dump.
    minmax_subset = minmax_df[minmax_folders.str.startswith(folder)]
    min_comp = minmax_subset['MinIndex'].tolist()
    max_comp = minmax_subset['MaxIndex'].tolist()

    if max_comp:
        folder_indices['Max Comp'] = max_comp
    if min_comp:
        folder_indices['Min Comp'] = min_comp

    # Centroids mapped: wafer keys are matched on the folder label prefixed
    # with "00"
    matching_wafer_id = f"00{folder}"
    for wafer_key, centroids_dict in centroids_mapped.items():
        if not wafer_key.startswith(matching_wafer_id):
            continue
        for sat_level, centroid_entries in centroids_dict.items():
            # Entries that are not dicts or lack the index key are ignored
            nearest_indices = [
                entry['nearest_stage_index']
                for entry in centroid_entries
                if isinstance(entry, dict) and 'nearest_stage_index' in entry
            ]

            cleaned_key = f"Centroids_{sat_level.replace(' ', '_')}"
            folder_indices[cleaned_key] = nearest_indices[:5]  # Limit to 5

    # Only save if we collected something
    if folder_indices:
        save_path = os.path.join(OUTPUT_DIR, f"{folder}_indices.json")
        with open(save_path, 'w') as f:
            json.dump(folder_indices, f, indent=4)
   


In [8]:
# Dataset CSV paths (constants expected to come from config), processed in the
# order listed here.
datasets = [
    DATASET_10272_Ag_Au_Pd_RT,
    DATASET_10275_Ag_Au_Pd_Pt_Rh_RT,
    DATASET_10304_Au_Pd_Pt_Rh_RT,
    DATASET_10311_Au_Pd_Pt_Rh_Ru_RT,
    DATASET_10403_Ag_Au_Cu_Pd_Pt_RT,
    DATASET_10402_Ag_Au_Pd_Pt_RT,
    DATASET_10399_Au_Cu_Pd_Pt_RT,
    DATASET_10374_Ir_Pd_Pt_Rh_Ru,
]

In [9]:

# Helper to convert non-serializable types
def convert_to_serializable(obj):
    """Turn a NumPy scalar or array into its plain-Python counterpart.

    Intended as the ``default=`` hook for ``json.dump``.

    Args:
        obj: Any object.

    Returns:
        ``int`` for NumPy integers, ``float`` for NumPy floats, a (nested)
        ``list`` for ``np.ndarray``, ``bool`` for ``np.bool_``; every other
        object is handed back unchanged.
    """
    # Checked in order; the first matching NumPy type decides the conversion.
    converters = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
        (np.bool_, bool),
    )
    for numpy_type, to_builtin in converters:
        if isinstance(obj, numpy_type):
            return to_builtin(obj)
    return obj

# For every dataset: load its CSV and its "<name>_indices.json" file, add the
# strategies returned by select_initial_indices, rename the centroid keys and
# write the merged dictionary back to the same JSON file.

# Centroid key markers and their replacements. Order matters: the longest "+"
# run is tested first, because "+++" also contains "++" and "+".
_saturation_key_renames = (
    ("Centroids_saturation_and_contrast_+++", "Centroids_saturation_high"),
    ("Centroids_saturation_and_contrast_++", "Centroids_saturation_medium"),
    ("Centroids_saturation_and_contrast_+", "Centroids_saturation_low"),
)

for dataset_path in datasets:

    # The dataset name is the part of the file name before the first "_"
    dataset_name = os.path.basename(dataset_path).split("_")[0]
    print(f"\n Processing dataset: {dataset_name}")

    data_exp = pd.read_csv(dataset_path)

    if data_exp.empty:
        print(f" Warning: Dataset {dataset_name} is empty.")
        continue

    json_path = os.path.join(DATA_CLEAN_InIT_CHOICES, f"{dataset_name}_indices.json")

    # Datasets without a previously generated indices file are skipped silently
    if not os.path.exists(json_path):
        continue

    try:
        with open(json_path, "r") as f:
            init_choices = json.load(f)
    except json.JSONDecodeError as e:
        # Best effort: start from an empty dict. The file is rewritten below,
        # so only the strategies computed in this iteration will remain in it.
        print(f" JSON decode error in {json_path}: {e}")
        init_choices = {}

    # Every column except the identifiers/coordinates and the target is a feature
    all_columns = data_exp.columns.tolist()
    features = [col for col in all_columns if col not in ["ID", "x", "y", "Resistance"]]
    target = ["Resistance"]

    # The target column is replaced by its natural logarithm
    data_exp[target] = np.log(data_exp[target])

    output_dir = os.path.join(UNCERTAINTY_PATH, dataset_name + "_results")
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): Resistance and select_initial_indices are defined outside
    # this notebook (presumably via the star imports) -- confirm their contracts.
    device = Resistance(data_exp, features=features, target=target)
    X_all = device.get_features()
    y_all = device.df[target[0]].values.reshape(-1, 1)
    init_strategies = select_initial_indices(X_all, n_init=5)
    # Computed strategies overwrite entries loaded from the file under the same key
    init_choices.update(init_strategies)

    renamed_init_choices = {}
    for key, val in init_choices.items():
        new_key = key
        for marker, replacement in _saturation_key_renames:
            if marker in key:
                new_key = key.replace(marker, replacement)
                break
        renamed_init_choices[new_key] = val

    init_choices = renamed_init_choices

    print(init_choices)
    # Serialize BEFORE opening the file for writing: json_path is also the
    # input read above, and opening with "w" truncates it immediately. If
    # serialization fails, the existing file is left untouched.
    serialized_choices = json.dumps(init_choices, indent=4, default=convert_to_serializable)
    with open(json_path, "w") as f:
        f.write(serialized_choices)
       


 Processing dataset: 10272
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Resistance']
{'Top5Similarity': [323, 334, 310, 295, 278], 'Max Comp': [178, 23, 278], 'Min Comp': [114, 220, 94], 'Centroids_saturation_high': [130, 192, 252, 268, 103], 'Centroids_saturation_medium': [172, 233, 110, 83, 270], 'Centroids_saturation_low': [151, 231, 213, 268, 110], 'Random': [262, 149, 222, 30, 148], 'LHS': [82, 36, 127, 336, 44], 'K-Means': [5, 11, 0, 112, 105], 'Farthest': [32, 178, 278, 325, 339], 'ODAL': [10, 23, 38, 55, 74], 'K-Center': [180, 240, 5, 338, 38]}

 Processing dataset: 10275
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Pt' 'Rh' 'Resistance']
{'Top5Similarity': [24, 11, 0, 12, 1], 'Max Comp': [220, 74, 295, 335, 0], 'Min Comp': [114, 220, 39, 74, 323], 'Centroids_saturation_high': [231, 124, 103, 145, 147], 'Centroids_saturation_medium': [125, 146, 83, 83, 231], 'Centroids_saturation_low': [104, 146, 83, 167, 231], 'Random': [262, 149, 222, 30, 148], 'LHS': [224, 1