In [17]:
# Python modules
import os
import sys
import pandas as pd
import seaborn as sns
import time, datetime

# SigMA modules
from coordinate_transformations.sky_convert import transform_sphere_to_cartesian

#### DistantSigMA modules
from DistantSigMA.DistantSigMA.clustering_routine import *
from DistantSigMA.DistantSigMA.PlotlyResults import plot
from DistantSigMA.DistantSigMA.cluster_simulations import calculate_std_devs
from DistantSigMA.DistantSigMA.scalefactor_sampling import lhc_lloyd
from DistantSigMA.Analysis.IsochroneArchive.myTools import my_utility


# 1.) Paths
# ---------------------------------------------------------
# set sys and output paths
sys.path.append('/Users/alena/PycharmProjects/Sigma_Orion')

In [18]:
script_name = my_utility.get_calling_script_name("kd_tree_data_test")
output_path = my_utility.set_output_path(main_path='/Users/alena/Library/CloudStorage/OneDrive-Personal/Work/PhD/'
                                                   'Projects/Sigma_Orion/Coding/Code_output/', script_name=script_name)


In [19]:
df = pd.read_csv("../../Collab/ISM-FLOW-WS2/Vela_clusters_DR3_preprocessed.csv", usecols=['ra', 'dec', 'parallax','X', 'Y', 'Z', 'v_a_lsr', 'v_d_lsr', 'label', 'distance'])

df.head()

Unnamed: 0,label,ra,dec,parallax,distance,X,Y,Z,v_a_lsr,v_d_lsr
0,3,119.534503,-45.289629,2.5518,391.880241,-67.308483,-381.88728,-56.584425,-9.259812,12.231144
1,3,117.478434,-41.361619,2.8974,345.137019,-83.825874,-331.653957,-45.808709,-7.282962,12.960175
2,3,117.383706,-41.167946,2.8569,350.029753,-86.22596,-336.075311,-46.252531,-8.473165,14.458632
3,3,115.778062,-42.446179,2.4887,401.816209,-94.921531,-385.052296,-64.659868,-9.041788,13.458839
4,3,119.258807,-43.488479,2.4041,415.956075,-83.36003,-403.829318,-54.703236,-10.48907,12.865119


In [20]:
cluster_features = ['X', 'Y', 'Z', 'v_a_lsr', 'v_d_lsr']   # Cols for Galactic Cart. clustering
df_far = df[df["distance"] < 500]
df_near = df[df["distance"] < 350]
# Parameters to vary
knn = 100
alpha = 0.05
bh_correction = True
# Fixed parameters that work out quite well
beta = 0.99
knn_initcluster_graph = 30

sigma_kwargs = dict(
    cluster_features=cluster_features,  # Columns to cluster on
    scale_factors={'vel': {'features': ['v_a_lsr', 'v_d_lsr'], 'factor': 11}},  # Columns that will be scaled
    nb_resampling=0,  # Resampling - e.g., important for calculating stability of membership
    max_knn_density=knn + 1,  # This need
    beta=beta,
    knn_initcluster_graph=knn_initcluster_graph,
)



In [21]:
# Initialize SigMA instance
clusterer = SigMA(
    data=df_near,
    kd_tree_data = df_near,
    **sigma_kwargs
)

In [22]:
# Fit
st = time.time()
print('Start clustering...')
clusterer.fit(alpha=alpha, knn=knn, bh_correction=True)
labels= clusterer.labels_

delta_t = str(datetime.timedelta(seconds=time.time() - st)).split('.')[0]
print(f'Done! [took {delta_t}]. Found {np.unique(labels).size} clusters')

# Save point-wise density
rho = clusterer.weights_

Start clustering...
Performing gradient ascend using a 100-NN density estimation.
Updated significance threshold: 2.86e-02
Done! [took 0:00:00]. Found 7 clusters


In [23]:
ln = LabelEncoder().fit_transform(labels)  # Sets labels to 0-(N-1)
print(ln.shape)

df_near["SigMA_label"] =ln

(3189,)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
plot(labels=ln, df=df_near, filename="VelaNear_same_kdtree_data", output_pathname=output_path)
