In [1]:
import numpy as np
import pandas as pd
import sys
import os
from itertools import product


from SigMA.SigMA import SigMA
from Loop_functions import setup_ICRS_ps, remove_field_stars, extract_signal_remove_spurious, extract_signal, \
    save_output_summary, consensus_function
from IsochroneArchive.myTools import my_utility
from PlotlyResults import plot

In [2]:
def scale_factors(filepath: str, c_solution: int):

    if c_solution == 6:
        stds = np.genfromtxt(filepath, usecols=(1,2,3), skip_header=2,max_rows=5)
    elif c_solution == 4:
        stds = np.genfromtxt(filepath, usecols=(1,2,3), skip_header=10, max_rows=5)
    else:
        print("Only 4C and 6C solutions available at the moment")
        stds = None

    sfs = np.empty(shape=(5,3))
    for h, row in enumerate(stds[:]):
        flipped_row = row[::-1]
        sfs[h] = 1/flipped_row

    return sfs

In [3]:
# set sys and output paths
sys.path.append('/Users/alena/PycharmProjects/Sigma_Orion')
script_name = my_utility.get_calling_script_name(__file__)
output_path = my_utility.set_output_path(main_path='/Users/alena/Library/CloudStorage/OneDrive-Personal/Work/PhD/Projects/Sigma_Orion/Coding/Code_output/', script_name=script_name)

NameError: name '__file__' is not defined

In [None]:
region = 0.0
run = 2

df_load = pd.read_csv("/Users/alena/PycharmProjects/SigMA_Orion/Start_data/Orion_labeled_segments_KNN_300_15-11-23.csv")
df_region = df_load[df_load.region == region]
result_path = output_path + f"Run_{run}/Region_{int(region)}/"
if not os.path.exists(result_path):
    os.makedirs(result_path)

In [None]:
# define fixed SigMA parameters
step = 2
alpha = 0.05
beta = 0.99
knn_initcluster_graph = 35
knn = 30
bh = False
n_resampling = 0
scaling = None

feature_space = ['ra', 'dec', 'plx', 'pmra', 'pmdec']

In [None]:
std_path = "/Users/alena/PycharmProjects/SigMA_Orion/Start_data/Region_0/simulated_sfs.txt"
ra_scaling, dec_scaling, plx_scaling, pmra_scaling, pmdec_scaling = scale_factors(std_path, 6)
pmdec_new = pmdec_scaling[1:]
pmra_new = pmra_scaling[:2]
# create the 243 possible combinations
combinations = np.array(list(product(ra_scaling, dec_scaling, plx_scaling, pmra_new, pmdec_new)))

In [None]:
seed_value = 42
np.random.seed(seed_value)

sampled_rows = np.random.choice(combinations.shape[0], size=108, replace=False)

# Use the sampled rows to extract the corresponding entries
sampled_entries = combinations[sampled_rows]
# Calculate the mean of each column
column_means = np.mean(sampled_entries, axis=0)

# Print or use the column means
print(column_means)

In [None]:
df_region.rename(columns={"parallax": "plx"}, inplace=True)

In [None]:
# --------------------- Evaluate sampled gps ---------------------
# initialize SigMA for computational efficiency
setup_kwargs, df_focus = setup_ICRS_ps(df_fit=df_region, sf_params=['ra', 'dec', 'plx'],
                                       sf_range=[ra_scaling, dec_scaling, plx_scaling], KNN_list=[knn], beta=beta,
                                       knn_initcluster_graph=knn_initcluster_graph, scaling=scaling)
sigma_kwargs = setup_kwargs["sigma_kwargs"]
scale_factor_list = setup_kwargs["scale_factor_list"]
clusterer = SigMA(
    data=df_focus, **sigma_kwargs)

# set mean of sampled list as SF
scale_factors = {'pos': {'features': ['ra', 'dec', 'plx'], 'factor': list(column_means[:3])},
                 'vel': {'features': ['pmra', 'pmdec'], 'factor': list(column_means[3:])}}
clusterer.set_scaling_factors(scale_factors)
print(clusterer.scale_factors)
# save X_mean
X_mean_sf = clusterer.X

label_matrix_rfs = np.empty(shape=(len(sampled_rows), len(df_focus)))
label_matrix_rsc = np.empty(shape=(len(sampled_rows), len(df_focus)))
label_matrix_simple = np.empty(shape=(len(sampled_rows), len(df_focus)))

In [None]:
# --------------------- Loop ---------------------
outer = np.empty(shape=(1, 3))
# outer_names = []
rho_sum = np.zeros(df_focus.shape[0], dtype=np.float32)

# Evaluate every grid point of the sample
for j, combo in enumerate(sampled_entries[:]):
    print(f"--- Gridpoint {j} ---")

    scale_factors = {'pos': {'features': ['ra', 'dec', 'plx'], 'factor': list(combo[:3])},
                     'vel': {'features': ['pmra', 'pmdec'], 'factor': list(combo[3:])}}
    #                 'vel': {'features': ['pmra', 'pmdec'], 'factor': [0.5,0.5]}}
    clusterer.set_scaling_factors(scale_factors)
    print(f"Performing clustering for scale factor {clusterer.scale_factors['pos']['factor']}, {clusterer.scale_factors['vel']['factor']}...")

    # Fit
    clusterer.fit(alpha=alpha, knn=knn, bh_correction=bh)
    label_array = clusterer.labels_
    # density and X
    rho, X = clusterer.weights_, clusterer.X
    rho_sum += rho

    # a) remove field stars
    nb_rfs = remove_field_stars(label_array, rho, label_matrix_rfs, j)
    # b) remove spurious clusters
    nb_es, nb_rsc = extract_signal_remove_spurious(df_focus, label_array, rho, X, label_matrix_rsc, j)
    # c) extract signal
    nb_simple = extract_signal(label_array, clusterer, label_matrix_simple, j)

    labels_rsc = label_matrix_rsc[j, :].reshape(label_matrix_rsc.shape[1], )
    labels_rfs = label_matrix_rfs[j, :].reshape(label_matrix_rfs.shape[1], )
    labels_simple = label_matrix_simple[j, :].reshape(label_matrix_simple.shape[1], )

# Perform consensus clustering on the a) and b) arrays (automatically generates and saves a html-plot)
df_save = df_focus
label_lists = [label_matrix_rfs, label_matrix_rsc, label_matrix_simple]

# Perform consensus clustering on the c) and d) steps
labels_cc, n_cc = zip(
    *(consensus_function(jl, rho_sum, df_focus, f"Run_{run}_real_{name}_CC",
                         output_path, plotting=False) for jl, name in zip(label_lists, ["rfs", "rsc", "simple"])))
n_occ = list(n_cc)
labels_occ = list(labels_cc)

names = ["rfs", "rsc", "new"]
for i, entry in enumerate(labels_occ):
    plot(entry, df_focus, f"Run_{run}_{names[i]}", output_path, icrs=True, return_fig=False)


save_output_summary(
    summary_str={"run": run, "knn": knn,
                 "n_rfs": n_occ[0], "n_rsc": n_occ[1], "n_new": n_occ[2]},
    file=output_path + f"summary_run_{run}_real_CC.csv")

# save the labels in a csv file and plot the result
df_save["rsc"] = labels_occ[1]
df_save["rfs"] = labels_occ[0]
df_save["simple"] = labels_occ[2]
df_save.to_csv(output_path + f"Run_{run}_results_CC.csv")


outer[0,:] = [(len(np.unique(i)) - 1) for i in labels_occ]
summary_df = pd.DataFrame(data=outer, columns=["n_rfs", "n_rsc", "n_new"])
summary_df.to_csv(output_path + f"{run}_real_knn_{knn}_bh_{bh}_summary.csv")

##########
# Output log-file
all_fixed = {"step": step, "alpha": alpha, "beta": beta, "knn_initcluster_graph": knn_initcluster_graph,
             "KNN": knn, "sfs_list": f"cc_{len(sampled_rows)}_seed_{seed_value}", "scaling": scaling, "bh_correction":
                 bh}

filename = output_path + f"Parameters_run_{run}.txt"
with open(filename, 'w') as file:
    for key, value in all_fixed.items():
        file.write(f"{key} = {value}\n")
###########
