In [1]:
import pandas as pd
import os
import csv
import ast
import numpy as np

In [2]:
# List the input tables to load. os.listdir returns *every* directory entry in
# arbitrary order, so keep only the CSV files (hidden files or checkpoint
# folders would make pd.read_csv fail) and sort them so the load order is
# reproducible from run to run.
file_names = sorted(
    entry
    for entry in os.listdir('GadgetX-NewMDCLUSTER-2/combined-csv/subset/')
    if entry.lower().endswith('.csv')
)

In [3]:
# Read each listed file into its own DataFrame, preserving the order of file_names.
dataframes = [
    pd.read_csv(f'GadgetX-NewMDCLUSTER-2/combined-csv/subset/{file_name}')
    for file_name in file_names
]

In [4]:
# Stack the per-file tables into one frame. ignore_index=True assigns a fresh
# 0..N-1 index; without it each file keeps its own 0..n-1 labels, the labels
# repeat across files, and any later label-based lookup (.loc[label]) can
# return several rows instead of one.
all_data = pd.concat(dataframes, ignore_index=True)

# Convert the 'ProgenitorsID' column from string to list
all_data['ProgenitorsID'] = all_data['ProgenitorsID'].apply(ast.literal_eval)

# Sort the snapshots in descending order (and halos by ascending ID within a snapshot)
all_data = all_data.sort_values(by=['snapshot', 'ID'], ascending=[False, True])

In [5]:
# Distinct snapshot numbers present in the data, highest first.
unique_snapshots = all_data['snapshot'].unique()
snapshots = sorted(unique_snapshots, reverse=True)

In [6]:
# Create the pairs of halos: every halo of snapshot n is joined with each of
# the progenitors it lists that is present in snapshot n - 1. The progenitor's
# fields are stored under a 'progenitor_' prefix.
pairs = []

position_columns = ['Xc', 'Yc', 'Zc']
velocity_columns = ['VXc', 'VYc', 'VZc']

for snapshot in snapshots[:-1]:
    halos_n = all_data[all_data['snapshot'] == snapshot]
    halos_n_minus_1 = all_data[all_data['snapshot'] == snapshot - 1]

    # Map halo ID -> row *position* inside halos_n_minus_1. Positions are always
    # unique, so the lookup below returns exactly one row even if the frame's
    # index labels are duplicated (label-based .loc would return several rows).
    id_to_position_n_minus_1 = {
        halo_id: position
        for position, halo_id in enumerate(halos_n_minus_1['ID'])
    }

    for _, halo in halos_n.iterrows():
        for progenitor in halo['ProgenitorsID']:
            # -1 entries are skipped; they appear to be the placeholder for "no progenitor".
            if progenitor == -1:
                continue
            position = id_to_position_n_minus_1.get(progenitor)
            if position is None:
                # The listed progenitor is not part of the loaded snapshot n - 1.
                continue

            progenitor_halo = halos_n_minus_1.iloc[position]
            pair = pd.concat([halo, progenitor_halo.rename(lambda x: f'progenitor_{x}')])

            # Sum of squared per-axis differences for position and velocity
            rel_location = sum(
                (pair[column] - pair[f'progenitor_{column}']) ** 2
                for column in position_columns
            )
            rel_velocity = sum(
                (pair[column] - pair[f'progenitor_{column}']) ** 2
                for column in velocity_columns
            )

            # compute the square root to get the Euclidean distance
            pair['rel_location'] = np.sqrt(rel_location)
            pair['rel_velocity'] = np.sqrt(rel_velocity)

            pairs.append(pair.to_dict())

# One row per (halo, progenitor) pair; all of them are positive examples.
pairs = pd.DataFrame(pairs)
pairs['Is_Progenitor'] = 1
pairs.head()


   hostHalo  numSubStruct          Mvir  npart             Xc             Yc  \
0         0             4  6.414070e+12  13083  495272.854964  497052.659591   
1         0             4  6.414070e+12  13083  495272.854964  497052.659591   
2         0             9  6.013450e+12  10385  510320.252899  502030.028865   
3         0             5  5.575940e+12   9760  500891.359031  508205.209655   
4         0             5  5.575940e+12   9760  500891.359031  508205.209655   

              Zc     VXc     VYc     VZc  ...  progenitor_Phi0  \
0  483021.971854   59.96   97.43  558.35  ...        2698450.0   
1  483021.971854   59.96   97.43  558.35  ...          28978.6   
2  508462.315490 -380.67  134.33 -250.79  ...        1292910.0   
3  498181.761168  -12.89 -365.33   83.94  ...        1166280.0   
4  498181.761168  -12.89 -365.33   83.94  ...          63277.8   

   progenitor_cNFW   progenitor_ID  progenitor_numProgenitors  \
0         11.81760  59000000000001                       

In [7]:
# Write the progenitor (positive) pairs to disk without the row index.
# NOTE(review): the file name is spelled 'paris' -- presumably a typo for
# 'pairs'; left unchanged because other code may read this exact path.
pairs.to_csv(
    'GadgetX-NewMDCLUSTER-2/progenitor-pair/progenitor-paris.csv',
    index=False,
)

In [8]:
# Euclidean distance between the centre coordinates of two halos
def compute_distance(halo1, halo2):
    """Return the Euclidean distance between two halos.

    Both arguments only need to support lookup by the keys 'Xc', 'Yc' and
    'Zc'; the result is the square root of the summed squared differences
    of those three entries.
    """
    delta_x = halo1['Xc'] - halo2['Xc']
    delta_y = halo1['Yc'] - halo2['Yc']
    delta_z = halo1['Zc'] - halo2['Zc']
    squared_distance = delta_x**2 + delta_y**2 + delta_z**2
    return np.sqrt(squared_distance)

# Determine the number of non-progenitor pairs to create: as many negative
# examples as there are progenitor pairs.
num_non_progenitor_pairs = len(pairs)

# Define the maximum distance for non-progenitors
# NOTE(review): expressed in the units of the Xc/Yc/Zc columns -- confirm the unit.
max_distance = 5000


def collect_non_progenitor_pairs(halo_table, snapshot_ids, max_pairs, max_separation):
    """Collect (halo, nearby non-progenitor) pair records.

    For every snapshot n in ``snapshot_ids[:-1]`` each halo of snapshot n is
    paired with the halos of snapshot n - 1 that are closer than
    ``max_separation`` and whose ID is not listed in the halo's
    'ProgenitorsID'. Pairs are produced in table order and collection stops
    as soon as ``max_pairs`` records exist.

    NOTE(review): because collection stops once the quota is filled, the
    negatives come from the first snapshots/halos visited rather than from a
    sample of the whole table -- confirm this is intended.

    Parameters
    ----------
    halo_table : DataFrame with 'snapshot', 'ID', 'ProgenitorsID' and the
        Xc/Yc/Zc, VXc/VYc/VZc columns.
    snapshot_ids : sequence of snapshot numbers to walk through.
    max_pairs : maximum number of records to return; <= 0 returns an empty list.
    max_separation : exclusive upper bound on the pair distance.

    Returns
    -------
    list of dict, one per pair, with the second halo's fields prefixed by
    'progenitor_' plus 'rel_location' and 'rel_velocity'.
    """
    records = []
    if max_pairs <= 0:
        # Nothing requested. An equality test against the quota would never
        # trigger here and every close pair would be generated instead.
        return records

    position_columns = ['Xc', 'Yc', 'Zc']
    velocity_columns = ['VXc', 'VYc', 'VZc']

    for snapshot in snapshot_ids[:-1]:
        halos_n = halo_table[halo_table['snapshot'] == snapshot]
        halos_n_minus_1 = halo_table[halo_table['snapshot'] == snapshot - 1]
        if halos_n_minus_1.empty:
            continue

        for _, halo in halos_n.iterrows():
            # Distance from this halo to every halo of the previous snapshot at
            # once (compute_distance only subtracts/squares the Xc/Yc/Zc entries,
            # so passing the whole frame yields one distance per row).
            separation = compute_distance(halo, halos_n_minus_1)
            is_progenitor = halos_n_minus_1['ID'].isin(halo['ProgenitorsID'])
            # Plain boolean arrays keep the row filter positional, independent
            # of the frame's index labels.
            is_candidate = (separation.to_numpy() < max_separation) & ~is_progenitor.to_numpy()

            for _, other_halo in halos_n_minus_1.loc[is_candidate].iterrows():
                pair = pd.concat([halo, other_halo.rename(lambda x: f'progenitor_{x}')])

                # Sum of squared per-axis differences for position and velocity
                rel_location = sum(
                    (pair[column] - pair[f'progenitor_{column}']) ** 2
                    for column in position_columns
                )
                rel_velocity = sum(
                    (pair[column] - pair[f'progenitor_{column}']) ** 2
                    for column in velocity_columns
                )

                # compute the square root to get the Euclidean distance
                pair['rel_location'] = np.sqrt(rel_location)
                pair['rel_velocity'] = np.sqrt(rel_velocity)

                records.append(pair.to_dict())

                # Stop adding non-progenitor pairs when we reach the desired number
                if len(records) >= max_pairs:
                    return records

    return records


non_progenitor_pairs = pd.DataFrame(
    collect_non_progenitor_pairs(all_data, snapshots, num_non_progenitor_pairs, max_distance)
)

# Add the target variable
non_progenitor_pairs['Is_Progenitor'] = 0

non_progenitor_pairs.head()


Unnamed: 0,hostHalo,numSubStruct,Mvir,npart,Xc,Yc,Zc,VXc,VYc,VZc,...,progenitor_Phi0,progenitor_cNFW,progenitor_ID,progenitor_numProgenitors,progenitor_ProgenitorsID,progenitor_redshift,progenitor_snapshot,rel_location,rel_velocity,Is_Progenitor
0,0,4,6414070000000.0,13083,495272.854964,497052.659591,483021.971854,59.96,97.43,558.35,...,851794.0,3.24499,59000000000009,2,"[58000000000007, 58000000005047]",3.602,59,3884.336562,255.661674,0
1,0,4,6414070000000.0,13083,495272.854964,497052.659591,483021.971854,59.96,97.43,558.35,...,933438.0,6.51166,59000000000015,1,[58000000000016],3.602,59,3410.850014,486.773204,0
2,0,4,6414070000000.0,13083,495272.854964,497052.659591,483021.971854,59.96,97.43,558.35,...,759210.0,6.7452,59000000000037,1,[58000000000038],3.602,59,4765.201826,460.730587,0
3,0,4,6414070000000.0,13083,495272.854964,497052.659591,483021.971854,59.96,97.43,558.35,...,675215.0,8.59044,59000000000076,2,"[58000000000076, 58000000007710]",3.602,59,3267.978497,514.887903,0
4,0,4,6414070000000.0,13083,495272.854964,497052.659591,483021.971854,59.96,97.43,558.35,...,492646.0,4.41347,59000000000083,2,"[58000000000083, 58000000003416]",3.602,59,3278.185041,327.023288,0


In [9]:
# Write the non-progenitor (negative) pairs to disk without the row index.
non_progenitor_pairs.to_csv(
    'GadgetX-NewMDCLUSTER-2/progenitor-pair/non_progenitor_pairs.csv',
    index=False,
)

In [10]:
# Stack positive and negative pairs, shuffle the rows with a fixed seed so the
# order is reproducible, and renumber the index from 0.
training_data = (
    pd.concat([pairs, non_progenitor_pairs])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

training_data.head()

Unnamed: 0,hostHalo,numSubStruct,Mvir,npart,Xc,Yc,Zc,VXc,VYc,VZc,...,progenitor_Phi0,progenitor_cNFW,progenitor_ID,progenitor_numProgenitors,progenitor_ProgenitorsID,progenitor_redshift,progenitor_snapshot,rel_location,rel_velocity,Is_Progenitor
0,0,0,242019000000.0,381,506912.297927,499620.030324,508184.983944,-103.75,106.71,-449.36,...,18540.7,11.0848,59000000013620,1,[58000000013909],3.602,59,4585.597579,414.003574,0
1,0,1,350722000000.0,515,482453.955256,503169.318904,505760.369094,308.02,-259.43,47.77,...,18486.9,8.677,59000000012231,1,[58000000013154],3.602,59,4924.119153,273.4356,0
2,0,1,241070000000.0,332,505227.992168,500811.517239,502602.059108,-399.17,16.79,198.14,...,32903.2,11.0625,59000000008993,1,[58000000008353],3.602,59,1595.207634,573.391194,0
3,0,0,284420000000.0,443,506901.903764,492927.711653,504353.645993,-134.22,453.85,111.22,...,18535.1,-1.0,59000000011175,0,[-1],3.602,59,2184.298394,675.501887,0
4,0,2,568451000000.0,1137,494688.52465,492990.911007,517620.535997,15.56,-206.26,-284.93,...,20338.3,7.9286,59000000012105,1,[58000000013171],3.602,59,2112.470953,174.366665,0


In [12]:
# Write the shuffled, labelled pairs to disk without the row index.
# NOTE(review): the variable is called training_data but the file is named
# 'testing_data.csv' -- confirm which role this data set plays.
training_data.to_csv(
    'GadgetX-NewMDCLUSTER-2/progenitor-pair/testing_data.csv',
    index=False,
)