In [1]:
import h5py
from tqdm.notebook import tqdm
import scipy
import torch
from torch_geometric.data import Data
import os

import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import matplotlib.pyplot as plt
import illustris_python as il

# Project root: the parent of the resolved ".." directory, i.e. two levels
# above the kernel's current working directory.
ROOT = Path("..").resolve().parent
# Base path handed to the illustris_python group-catalog loaders further down.
tng_base_path = f"{ROOT}/illustris_data/TNG300-1/output"


# Get cosmic web params

In [2]:
# Open the DisPerSE catalog for snapshot 099 explicitly read-only: older h5py
# releases default to an append/create mode when no mode is given, which could
# silently create an empty file if the path were wrong.
# The handle is left open on purpose; the following cell reads the datasets from it.
cw = h5py.File(f"{ROOT}/illustris_data/TNG300-1/postprocessing/disperse/disperse_099.hdf5", "r")
cw.keys()

<KeysViewHDF5 ['d_minima', 'd_node', 'd_saddle_1', 'd_saddle_2', 'd_skel', 'subhalo_ID']>

In [3]:
# Copy every dataset of the open DisPerSE file into an in-memory DataFrame
# indexed by subhalo id. `cw` is rebound from the file handle to the frame, so
# keep a separate reference to the handle and close it once the data is read
# (the original left the HDF5 file open for the lifetime of the kernel).
disperse_file = cw
try:
    cw = (
        pd.DataFrame({key: disperse_file[key][:] for key in disperse_file.keys()})
        .rename({"subhalo_ID": "subhalo_id"}, axis=1)
        .set_index("subhalo_id")
    )
finally:
    disperse_file.close()

# normalize: z-score each column (subtract column mean, divide by column std)
cw = (cw - cw.mean(0)) / cw.std(0)

cw.head()

Unnamed: 0_level_0,d_minima,d_node,d_saddle_1,d_saddle_2,d_skel
subhalo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-0.405779,-1.471773,-1.259529,-1.104576,-1.163798
1,-0.671898,-1.505375,-0.801792,-0.777483,-1.195676
2,-0.488557,-1.194772,-1.493097,-1.245941,-1.161969
3,-0.48864,-0.783244,-1.150201,-1.423575,-1.180119
4,-0.490822,-1.1321,-1.446336,-1.392441,-1.150521


# Make PyG dataset

In [5]:
# --- Catalog / graph-construction settings ---------------------------------
snapshot = 99    # snapshot number passed to the illustris_python loaders
h = 0.6774       # divisor applied to catalog positions and masses below
r_link = 5       # linking radius for KD-tree pairs (same units as the converted positions)
pad = 2.5        # margin trimmed from every face of a sub-box
split = 6        # number of sub-boxes along each axis

# --- Switches ---------------------------------------------------------------
use_gal = False          # not read by the cells visible here
undirected = True        # store each pair in both directions
periodic = False         # apply the periodic wrap to pair separations
use_loops = True         # append a self-loop for every node
in_projection = False    # build the graph from (x, y) only

# Number of consecutive graphs that form one cross-validation fold.
train_test_frac_split = split * split

# Selection thresholds applied to the subhalo catalog.
cuts = dict(
    minimum_log_stellar_mass=9,
    minimum_log_halo_mass=10,
    minimum_n_star_particles=50,
)

# NOTE(review): none of these three values is referenced by the visible cells,
# and `h_reduced` differs from `h` above - confirm whether this dict is still needed.
config_params = {
    "boxsize": 51.7e3,     # box size in comoving kpc/h
    "h_reduced": 0.704,    # reduced Hubble constant
    "snapshot": 99,        # z = 0
}

# Divisors used to rescale radii and velocities when the catalog is loaded.
# NOTE(review): `minimum_n_star_particles` here (10) is not the value used for
# the particle cut, which comes from `cuts` (50).
normalization_params = {
    "minimum_n_star_particles": 10.0,  # min star particles to be considered a galaxy
    "norm_half_mass_radius": 8.0,
    "norm_velocity": 100.0,  # note: use value of 1 if `use_central_galaxy_frame=True`
}

In [6]:
# Load the subhalo and halo group catalogs for `snapshot`, rescale the raw
# fields, assemble them into the `halos` and `subhalos` DataFrames, flag
# centrals, and apply the selection in `cuts`.

# Columns used as node features (`x`) and as the regression target (`y`)
# when the graphs are built in a later cell.
use_cols = ['subhalo_x', 'subhalo_y', 'subhalo_z', 'subhalo_vx', 'subhalo_vy', 'subhalo_vz', 'subhalo_loghalomass', 'subhalo_logvmax'] 
y_cols = ['subhalo_logstellarmass']

# Group-catalog fields requested for every subhalo.
subhalo_fields = [
    "SubhaloPos", "SubhaloMassType", "SubhaloLenType", "SubhaloHalfmassRadType", 
    "SubhaloVel", "SubhaloVmax", "SubhaloGrNr", "SubhaloFlag"
]
subhalos = il.groupcat.loadSubhalos(tng_base_path, snapshot, fields=subhalo_fields) 

# Box extent is estimated from the data: the rounded global min/max over all
# three coordinates. Only `max_box` is used below.
pos = subhalos["SubhaloPos"][:,:3]
min_box, max_box = np.rint(np.min(pos)), np.rint(np.max(pos))
box_size = max_box/(h*1e3) # in Mpc

halo_fields = ["Group_M_Crit200", "GroupFirstSub", "GroupPos", "GroupVel"]
halos = il.groupcat.loadHalos(tng_base_path, snapshot, fields=halo_fields)

# Per-subhalo arrays. Positions get the same h*1e3 divisor as `box_size`;
# velocities and the half-mass radius are divided by the constants in
# `normalization_params`.
# NOTE(review): column 4 of the *Type fields is treated as the stellar
# component and column 1 as the halo component, judging by the variable
# names - confirm against the catalog specification.
subhalo_pos = subhalos["SubhaloPos"][:] / (h*1e3)
subhalo_stellarmass = subhalos["SubhaloMassType"][:,4]
subhalo_halomass = subhalos["SubhaloMassType"][:,1]
subhalo_n_stellar_particles = subhalos["SubhaloLenType"][:,4]
subhalo_stellarhalfmassradius = subhalos["SubhaloHalfmassRadType"][:,4]  / normalization_params["norm_half_mass_radius"]
subhalo_vel = subhalos["SubhaloVel"][:] /  normalization_params["norm_velocity"]
subhalo_vmax = subhalos["SubhaloVmax"][:] / normalization_params["norm_velocity"]
subhalo_flag = subhalos["SubhaloFlag"][:]
halo_id = subhalos["SubhaloGrNr"][:].astype(int)

# Per-halo arrays, rescaled the same way as the subhalo ones.
halo_mass = halos["Group_M_Crit200"][:]
halo_primarysubhalo = halos["GroupFirstSub"][:].astype(int)
group_pos = halos["GroupPos"][:] / (h*1e3)
group_vel = halos["GroupVel"][:]  / normalization_params["norm_velocity"]

# `halos` is rebound from the loader's result to a DataFrame indexed by halo
# id (row position in the catalog). np.column_stack produces one array with a
# single common dtype, which is why the id column is cast back to int.
halos = pd.DataFrame(
    np.column_stack((np.arange(len(halo_mass)), group_pos, group_vel, halo_mass, halo_primarysubhalo)),
    columns=['halo_id', 'halo_x', 'halo_y', 'halo_z', 'halo_vx', 'halo_vy', 'halo_vz', 'halo_mass', 'halo_primarysubhalo']
)
halos['halo_id'] = halos['halo_id'].astype(int)
halos.set_index("halo_id", inplace=True)

# get subhalos/galaxies
# `subhalos` is likewise rebound to a DataFrame; `subhalo_id` is the row
# position in the catalog.
subhalos = pd.DataFrame(
    np.column_stack([halo_id, subhalo_flag, np.arange(len(subhalo_stellarmass)), subhalo_pos, subhalo_vel, subhalo_n_stellar_particles, subhalo_stellarmass, subhalo_halomass, subhalo_stellarhalfmassradius, subhalo_vmax]), 
    columns=['halo_id', 'subhalo_flag', 'subhalo_id', 'subhalo_x', 'subhalo_y', 'subhalo_z', 'subhalo_vx', 'subhalo_vy', 'subhalo_vz', 'subhalo_n_stellar_particles', 'subhalo_stellarmass', 'subhalo_halomass', 'subhalo_stellarhalfmassradius', 'subhalo_vmax'],
)
# A subhalo is central when it is the `halo_primarysubhalo` of its parent halo.
subhalos["is_central"] = (halos.loc[subhalos.halo_id]["halo_primarysubhalo"].values == subhalos["subhalo_id"].values)

# Keep only rows whose SubhaloFlag is non-zero, then restore integer ids.
subhalos = subhalos[subhalos["subhalo_flag"] != 0].copy()
subhalos['halo_id'] = subhalos['halo_id'].astype(int)
subhalos['subhalo_id'] = subhalos['subhalo_id'].astype(int)

subhalos.drop("subhalo_flag", axis=1, inplace=True)

# impose stellar mass and particle cuts
subhalos = subhalos[subhalos["subhalo_n_stellar_particles"] > cuts["minimum_n_star_particles"]].copy()
# NOTE(review): `log10(m / h) + 10` presumably converts catalog masses in
# 1e10 Msun/h to log10(Msun) - confirm the catalog units.
subhalos["subhalo_logstellarmass"] = np.log10(subhalos["subhalo_stellarmass"] / h)+10

# The logs below are taken before the halo-mass cut, so any zero entry yields
# -inf; presumably this is the source of the ufunc warning in the cell output.
subhalos["subhalo_loghalomass"] = np.log10(subhalos["subhalo_halomass"] / h)+10
subhalos["subhalo_logvmax"] = np.log10(subhalos["subhalo_vmax"])
subhalos["subhalo_logstellarhalfmassradius"] = np.log10(subhalos["subhalo_stellarhalfmassradius"])

# Strict lower bounds on log halo mass and log stellar mass.
subhalos = subhalos[subhalos["subhalo_loghalomass"] > cuts["minimum_log_halo_mass"]].copy()

subhalos = subhalos[subhalos["subhalo_logstellarmass"] > cuts["minimum_log_stellar_mass"]].copy()


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
# Attach the normalized DisPerSE columns to each subhalo by `subhalo_id`.
# Any previously joined DisPerSE columns are dropped first so the cell can be
# re-run: a second plain `join` would fail on the overlapping column names.
# With how="left", subhalos that have no row in `cw` are kept and get NaN.
subhalos = subhalos.drop(columns=cw.columns, errors="ignore").join(cw, on="subhalo_id", how="left")

In [24]:
# Build one PyG `Data` graph per sub-box of a split x split x split partition
# of the volume and pickle the resulting list to `data_path`.
#
# For each sub-box (shrunk by `pad` on every face):
#   * select the subhalos strictly inside it and shift them to a local origin,
#   * link every pair closer than `r_link` (both directions if `undirected`),
#   * compute edge features [distance, cos1, cos2] relative to the centroid,
#   * optionally append self-loops with features [0, 1, 0],
#   * store node features, target, positions, central flag, a neighbour-mass
#     "overdensity" and the DisPerSE columns.
#
# NOTE(review): the file name ends in ".h5" but the content is a pickle.
data_path = f"{ROOT}/illustris_data/TNG300-1/postprocessing/pyg_data_disperse.h5"

position_cols = ['subhalo_x', 'subhalo_y', 'subhalo_z']
cosmic_web_cols = ["d_minima", "d_node", "d_saddle_1", "d_saddle_2", "d_skel"]

# Loop-invariant work, hoisted out of the triple loop.
all_pos = subhalos[position_cols].to_numpy()
cell_size = box_size/split
proj_str = "-projected" if in_projection else ""

# NOTE(review): the original cell rebinds the global `use_gal` to True inside
# the loop without reading it; the assignment is kept (once) so kernel state
# after this cell is unchanged - confirm whether it can be deleted.
use_gal = True

# NOTE(review): this directory is created as before, but `data_path` does not
# point into it - confirm whether it is still needed.
os.makedirs(os.path.join(tng_base_path, 'cosmic_graphs'), exist_ok=True)

data = []
for n in tqdm(range(split), position=0):
    for g in tqdm(range(split), position=1, leave=False):
        for k in tqdm(range(split), position=2, leave=False):
            # Padded bounds of this sub-box along x, y, z.
            lower = np.array([cell_size*n+pad, cell_size*g+pad, cell_size*k+pad])
            upper = np.array([cell_size*(n+1)-pad, cell_size*(g+1)-pad, cell_size*(k+1)-pad])
            in_cell = np.all((all_pos > lower) & (all_pos < upper), axis=1)

            # The original called reset_index without keeping the result.
            df = subhalos.iloc[in_cell].reset_index(drop=True)

            # remove extraneous columns
            df = df.drop(columns=["subhalo_n_stellar_particles", "subhalo_stellarmass", "subhalo_halomass"])

            # set new zero point (local origin = padded lower corner)
            df[position_cols] = df[position_cols] - lower

            # positions used for linking; these (un-normalized) values are
            # also what ends up in `Data.pos`
            link_cols = position_cols[:2] if in_projection else position_cols
            pos = df[link_cols].to_numpy(copy=True)

            # Only let the tree wrap around the box when `periodic` is set;
            # the original always passed `boxsize`, contradicting periodic=False.
            kd_tree = scipy.spatial.KDTree(pos, leafsize=25, boxsize=box_size if periodic else None)
            pairs = kd_tree.query_pairs(r=r_link, output_type="ndarray").astype(int)

            # normalize the position columns that go into the node features
            df[position_cols] = df[position_cols]/(box_size/2)

            if undirected:
                # Columns come out as (a0,b0),(b0,a0),(a1,b1),(b1,a1),... which
                # is exactly the ordering the original append+reshape produced,
                # and it also works when there are no pairs at all.
                edge_index = np.stack([pairs.reshape(-1), pairs[:, ::-1].reshape(-1)])
            else:
                # The original never transposed in this branch, so unpacking
                # into (row, col) failed; (E, 2) -> (2, E) is what PyG expects.
                edge_index = pairs.T

            row, col = edge_index

            diff = pos[row]-pos[col]

            if periodic:
                # Take into account periodic boundary conditions: wrap any
                # component whose magnitude exceeds the linking radius.
                diff = np.where(diff > r_link, diff - box_size,
                                np.where(diff < -r_link, diff + box_size, diff))

            # Distance is taken after the periodic correction (the original
            # computed it before, so wrapped pairs kept the unwrapped length).
            dist = np.linalg.norm(diff, axis=1)

            # Reference point for the angular features: invariant to
            # translations/rotations of the sub-box, but not to stretches.
            centroid = np.mean(pos, axis=0)

            rel_row = pos[row]-centroid
            rel_col = pos[col]-centroid
            unitrow = rel_row/np.linalg.norm(rel_row, axis=1, keepdims=True)
            unitcol = rel_col/np.linalg.norm(rel_col, axis=1, keepdims=True)
            unitdiff = diff/dist.reshape(-1,1)

            # Row-wise dot products between unit vectors.
            cos1 = np.einsum("ij,ij->i", unitrow, unitcol)
            cos2 = np.einsum("ij,ij->i", unitrow, unitdiff)

            edge_attr = np.column_stack([dist, cos1, cos2])

            if use_loops:
                # One self-loop per node with attributes (dist=0, cos1=1, cos2=0).
                n_nodes = pos.shape[0]
                loops = np.tile(np.arange(n_nodes), (2, 1))
                atrloops = np.tile(np.array([0., 1., 0.]), (n_nodes, 1))
                edge_index = np.append(edge_index, loops, 1)
                edge_attr = np.append(edge_attr, atrloops, 0)
            edge_index = edge_index.astype(int)

            x = torch.tensor(df[use_cols].to_numpy(), dtype=torch.float)
            y = torch.tensor(df[y_cols].to_numpy(), dtype=torch.float)
            edge_index = torch.tensor(edge_index, dtype=torch.long)
            edge_attr = torch.tensor(edge_attr, dtype=torch.float)
            pos = torch.tensor(pos, dtype=torch.float)
            is_central = torch.tensor(df["is_central"].to_numpy(), dtype=torch.bool)

            # log10 of the summed mass over each node's neighbours (including
            # itself when self-loops are on). The mass is the 2nd-to-last
            # column of `x`, stored as a log10. A single scatter-add replaces
            # the per-node mask over all edges.
            neighbor_mass = 10**x[edge_index[1], -2]
            summed_mass = torch.zeros(len(x), dtype=x.dtype).index_add_(0, edge_index[0], neighbor_mass)
            overdensity = torch.log10(summed_mass)

            cosmic_web = torch.tensor(df[cosmic_web_cols].to_numpy(), dtype=torch.float)

            data.append(Data(x=x, y=y, pos=pos, is_central=is_central, edge_index=edge_index, edge_attr=edge_attr, overdensity=overdensity, cosmic_web=cosmic_web))

# Write the complete list once; the original re-pickled the growing list after
# every sub-box, which produced the same final file at quadratic I/O cost.
with open(data_path, 'wb') as handle:
    pickle.dump(data, handle)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

# Train random forest

In [25]:
#TODO - join subhalo cat with disperse

def train_validate_disperse(data, k, split=6, random_state=None):
    """Train a random forest on every fold but the k-th and predict the k-th.

    `data` is the list of graph objects built earlier in this notebook. It is
    cut into consecutive folds of ``split**2`` graphs; fold ``k`` is held out
    for validation and the rest is used for training. The features of each
    node are the last two columns of ``x`` followed by all DisPerSE columns;
    the target is column 0 of ``y``.

    Parameters
    ----------
    data : list
        Graph objects exposing ``x``, ``y`` and ``cosmic_web`` tensors (the
        older attribute name ``disperse`` is still accepted).
    k : int
        Index of the fold held out for validation.
    split : int, default 6
        Number of sub-boxes per axis used to build `data`; the fold size is
        ``split**2``. The original ignored this argument and read the global
        ``train_test_frac_split`` instead, which has the same value for the
        default.
    random_state : int or None, default None
        Forwarded to the regressor so a run can be made reproducible; None
        keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per validation node, with columns ``p_RF_disperse``
        (prediction) and ``log_Mstar`` (true value).

    Raises
    ------
    ValueError
        If fold `k` leaves either the training or the validation set empty.
    """
    fold_size = split**2
    fold_start, fold_stop = k*fold_size, (k+1)*fold_size

    data_train = data[:fold_start] + data[fold_stop:]
    data_valid = data[fold_start:fold_stop]
    if not data_train or not data_valid:
        raise ValueError(
            f"fold k={k} with fold size {fold_size} leaves an empty train or "
            f"validation set for {len(data)} graphs"
        )

    def _features(graph):
        # Graphs built in this notebook store the DisPerSE distances as
        # `cosmic_web`; the original read a `disperse` attribute that they
        # do not have, so that name is only kept as a fallback.
        web = graph.cosmic_web if hasattr(graph, "cosmic_web") else graph.disperse
        return torch.hstack([graph.x[:, -2:], web]).numpy()

    X_train_disperse = np.concatenate([_features(d) for d in data_train], 0)
    y_train = np.concatenate([d.y[:, 0].numpy() for d in data_train])
    X_valid_disperse = np.concatenate([_features(d) for d in data_valid], 0)
    y_valid = np.concatenate([d.y[:, 0].numpy() for d in data_valid])

    # NOTE(review): RandomForestRegressor is not imported in the visible
    # cells (presumably sklearn.ensemble) - add it to the import cell.
    rf_disperse = RandomForestRegressor(random_state=random_state)
    rf_disperse.fit(X_train_disperse, y_train)

    p_log_Mstar_rf_disperse = rf_disperse.predict(X_valid_disperse)

    return pd.DataFrame({
        "p_RF_disperse": p_log_Mstar_rf_disperse,
        "log_Mstar": y_valid,
    })