In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from seaborn import kdeplot


# --- Array inputs (.npy) ------------------------------------------------------
freq = np.load("Data/frequencies.npy")
mapping = np.load("Data/mapping.npy")
shifted_signals = np.load("Data/Processed/shifted_signals.npy")
shifted_residual = np.load("Data/Processed/shifted_residual.npy")
subtraction_signals = np.load("Data/Processed/subtraction_signals.npy")

# --- Tabular inputs (.csv) ----------------------------------------------------
data_info = pd.read_csv("Data/data_info.csv")  # Provided
n_lines = pd.read_csv("Data/Processed/almagal_nlines_all.csv")
# Calculated table; rows without an "Automatic velocity" are discarded and the
# index is renumbered so that index labels coincide with row positions.
velocity_info = (
    pd.read_csv("Data/velocity_info.csv")
    .dropna(subset=["Automatic velocity"])
    .reset_index(drop=True)
)

# Position in `freq` of the channel closest to the reference value below.
# NOTE(review): the name suggests a CO line frequency in the units of `freq` — confirm.
CO_idx = np.abs(freq - 220398.42455376702).argmin()

def _order_labels_by_size(labels, noise_label=-1):
    """Relabel clusters by decreasing size.

    The most populated cluster becomes 0, the next one 1, and so on. Entries
    equal to ``noise_label`` (unclustered signals) are left unchanged.

    Parameters
    ----------
    labels : np.ndarray
        One cluster label per signal.
    noise_label : int, default -1
        Label marking signals that do not belong to any cluster.

    Returns
    -------
    ordered : np.ndarray
        Copy of ``labels`` with the clusters renumbered by decreasing size.
    clustered_mask : np.ndarray of bool
        True where the signal belongs to a cluster (``labels != noise_label``).
    """
    clustered_mask = labels != noise_label
    ordered = labels.copy()

    unique_labels, sizes = np.unique(labels[clustered_mask], return_counts=True)
    for rank, label in enumerate(unique_labels[np.argsort(sizes)[::-1]]):
        # Select on the *original* labels so an already assigned rank is never
        # picked up again by a later iteration.
        ordered[labels == label] = rank

    return ordered, clustered_mask


# Load the cluster labels and renumber them so that the largest cluster is 0
# and the smallest cluster has the highest number; -1 keeps marking
# unclustered signals.
labels_10 = np.load("Data/Processed/labels10.npy")
labels_23 = np.load("Data/Processed/labels23.npy")

ordered_labels_10, labels_mask_10 = _order_labels_by_size(labels_10)
ordered_labels_23, labels_mask_23 = _order_labels_by_size(labels_23)

# Match every (CLUMP, ID) row of data_info against velocity_info and n_lines,
# keeping only the cores that are present in all three tables.
sc_pair = []            # (source, core) of every matched row
data_index_list = []    # row positions in data_info
v_index_list = []       # index of the matching row in velocity_info
nlines_index_list = []  # index of the matching row in n_lines

# velocity_info already had its rows without an "Automatic velocity" dropped
# and its index reset when it was loaded, so no second dropna is needed here
# and its index labels can be used as row positions.
for i in range(len(data_info)):
    source = data_info['CLUMP'].iloc[i]
    core = data_info['ID'].iloc[i]

    # Combine the boolean masks with `&` (logical AND) rather than `*`.
    v_index = velocity_info[(velocity_info["Source"] == source) & (velocity_info["Core"] == core)].index.to_list()
    nlines_index = n_lines[(n_lines["Region"] == source) & (n_lines["Core"] == core)].index.to_list()

    # Both lookups must have found something. Testing len(n_lines) (the whole
    # table) instead of len(nlines_index) would let an unmatched core through
    # and fail with IndexError on nlines_index[0] below.
    if len(v_index) > 0 and len(nlines_index) > 0:
        # Ambiguous matches are reported; the first hit is the one that is used.
        if len(v_index) > 1:
            print(f"v_index: {v_index}")
        if len(nlines_index) > 1:
            print(f"n_lines: {nlines_index}")

        v_index_list.append(v_index[0])
        nlines_index_list.append(nlines_index[0])
        data_index_list.append(i)

        sc_pair.append((source, core))


# Cut the three tables down to the matched rows, in matching order.
data_info = data_info.iloc[data_index_list]
n_lines = n_lines.iloc[nlines_index_list]
velocity_info = velocity_info.iloc[v_index_list]

# Sanity check: row k must describe the same (source, core) in every table.
matched_sources = velocity_info["Source"].values
matched_cores = velocity_info["Core"].values

assert np.all(matched_sources == n_lines["Region"].values)
assert np.all(matched_sources == data_info["CLUMP"].values)
assert np.all(matched_cores == n_lines["Core"].values)
assert np.all(matched_cores == data_info["ID"].values)

# Keep only the entries that belong to the matched rows.
# NOTE(review): this assumes the label arrays and subtraction_signals hold one
# entry per row of velocity_info (after its NaN rows were dropped) — confirm.
ordered_labels_10, ordered_labels_23 = (
    cluster_ids[v_index_list] for cluster_ids in (ordered_labels_10, ordered_labels_23)
)

# Scale every selected signal by its own maximum (row-wise, in place).
subtraction_signals = subtraction_signals[v_index_list]
subtraction_signals = np.divide(
    subtraction_signals,
    subtraction_signals.max(axis=1, keepdims=True),
    out=subtraction_signals,
)

# Boolean mask of the rows whose RADIO_MATCH equals 0.
# NOTE(review): intended for discarding H II regions; only the mask is built
# here, nothing is removed by these lines.
h2_mask = data_info['RADIO_MATCH'].eq(0)

In [None]:
import seaborn as sns

In [None]:
# Names of the quantities selected for the cells that follow.
# NOTE(review): "Llump/Mclump" may be a typo for "Lclump/Mclump" — check it
# against the actual column names before changing it.
variables = [
    "Tclump",
    "Llump/Mclump",
    "Surfd_nd",
    "n(H2)",
]