In [21]:
%load_ext autoreload
%autoreload 2
%aimport

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Modules to reload:
all-except-skipped

Modules to skip:



In [22]:
%matplotlib inline

In [23]:
import numpy as np
import matplotlib.pyplot as plt 
import warnings
from tqdm import tqdm

In [24]:
from multicam.mah import get_mah

# Collect MAH data

In [25]:
# Load the processed data set through `get_mah`; both cutoffs are set to 0.05.
mah_dir = '../../data/processed/bolshoi_m12/'
mah_cutoffs = dict(cutoff_missing=0.05, cutoff_particle=0.05)
mah_data = get_mah(mah_dir, **mah_cutoffs)

In [26]:
# Unpack the entries of `mah_data` that the rest of the notebook works with.
cat = mah_data['cat']  # catalog
ma, am = mah_data['ma'], mah_data['am']
indices, scales, mass_bins = (
    mah_data[key] for key in ('indices', 'scales', 'mass_bins')
)
# ma_peak = mah_data['ma_peak']

# Two catalog columns pulled out under short names.
cvir, xoff = cat['cvir'], cat['x0']

# Shape check: print the shape of each array, in the order listed.
print(*(arr.shape for arr in (cvir, xoff, am, ma, scales, mass_bins)))

(10000,) (10000,) (10000, 100) (10000, 165) (165,) (100,)


# Construct indicators and merger ratio at each snapshot

In [27]:
# Coarse scale-factor grid: 20 evenly spaced edges (i.e. 19 bins) spanning the
# full range of the simulation `scales`.
min_scale = np.min(scales)
max_scale = np.max(scales)
scale_bins = np.linspace(start=min_scale, stop=max_scale, num=20)
scale_bins

array([0.18635   , 0.22919211, 0.27203421, 0.31487632, 0.35771842,
       0.40056053, 0.44340263, 0.48624474, 0.52908684, 0.57192895,
       0.61477105, 0.65761316, 0.70045526, 0.74329737, 0.78613947,
       0.82898158, 0.87182368, 0.91466579, 0.95750789, 1.00035   ])

In [28]:
# Merger ratio of every halo at every simulation snapshot, under two
# normalisations:
#   merger_ratio_inst[i, j]    = m2_a{idx} / mvir_a{idx}  (mass at that snapshot)
#   merger_ratio_present[i, j] = m2_a{idx} / mvir         (the catalog `mvir` column)
# where idx = indices[j]. Each snapshot is filled with one array division over
# all halos, so every catalog column is looked up once per snapshot instead of
# once per (halo, snapshot) pair.
#
# Both divisions follow numpy semantics: a zero denominator yields inf/nan
# (with a RuntimeWarning) rather than raising ZeroDivisionError.
#
# The original cell also built the name f'coprog_mvir_a{idx-1}' (co-progenitor
# mass at the previous timestep) but never used it; it is omitted here.
merger_ratio_inst = np.zeros((len(cat), len(scales)))
merger_ratio_present = np.zeros((len(cat), len(scales)))

# Denominator of the "present" ratio; identical for every snapshot.
mvir_present = np.asarray(cat['mvir'], dtype=float)

for j, idx in enumerate(indices):
    # TODO: Check what this corresponds to
    m2 = np.asarray(cat[f'm2_a{idx}'], dtype=float)
    mvir_at_snapshot = np.asarray(cat[f'mvir_a{idx}'], dtype=float)

    # get inst ratios
    merger_ratio_inst[:, j] = m2 / mvir_at_snapshot

    # present ratios
    merger_ratio_present[:, j] = m2 / mvir_present
    

In [29]:
# Indicator, for each halo, of whether its largest merger ratio inside a given
# scale bin exceeds a threshold.
#   inst_mask[kk, i, jj]    uses merger_ratio_inst
#   present_mask[kk, i, jj] uses merger_ratio_present
# with kk indexing the threshold in `Mu` and jj the bin
# [scale_bins[jj], scale_bins[jj+1]).
#
# Thresholds, written as r / (1 + r) for r = 0.3, 0.1, 0.03, 0.01.
Mu = [0.3/1.3, 0.1 / 1.1 , 0.03 / (1 + 0.03), 0.01 / 1.01]
n_scale_bins = len(scale_bins) - 1
inst_mask = np.zeros((len(Mu), len(cat), n_scale_bins))
present_mask = np.zeros((len(Mu), len(cat), n_scale_bins))

# np.nanmax emits a RuntimeWarning and returns NaN for an all-NaN row; that is
# expected here, and NaN > mu evaluates to False. The filter is scoped to this
# loop so RuntimeWarnings raised elsewhere in the notebook stay visible.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    for jj in range(n_scale_bins):
        lower, upper = scale_bins[jj], scale_bins[jj + 1]
        if jj == n_scale_bins - 1:
            # The last edge equals max(scales); close the final bin on the
            # right so the last snapshot is not dropped from every bin.
            mask = (scales >= lower) & (scales <= upper)
        else:
            mask = (scales >= lower) & (scales < upper)

        if not mask.any():
            # No snapshot in this bin: nanmax would raise on an empty slice,
            # so leave the indicators at 0.
            continue

        # Largest merger ratio in this scale bin; it does not depend on the
        # threshold, so compute it once per bin.
        max_inst = np.nanmax(merger_ratio_inst[:, mask], axis=1)
        max_present = np.nanmax(merger_ratio_present[:, mask], axis=1)

        for kk, mu in enumerate(Mu):
            inst_mask[kk, :, jj] = max_inst > mu
            present_mask[kk, :, jj] = max_present > mu

In [30]:
# Feature blocks for the regression.
#   x1: `ma`, unchanged.
#   x2: per-snapshot merger ratios with NaN replaced by 0.
#   x3: x2 reduced to the coarse `scale_bins` grid (as in the 'merger
#       residuals' plots) by taking the max over the snapshots in each bin.

x1 = ma

mask2 = np.isnan(merger_ratio_inst)
x2 = np.where(mask2, 0, merger_ratio_inst)

# One column per entry of `scale_bins`. Column jj collects every snapshot whose
# scale's last edge at or below it is scale_bins[jj], so the final column only
# receives snapshots with scale >= scale_bins[-1].
x3 = np.zeros((x2.shape[0], scale_bins.shape[0]))
for ii, scale in enumerate(scales):
    jj = np.where(scale >= scale_bins)[0][-1]
    # Shape check derived from the data instead of a hard-coded halo count.
    assert x3[:, jj].shape == x2[:, ii].shape == (x2.shape[0],)
    # Running element-wise maximum over the snapshots that fall in bin jj.
    x3[:, jj] = np.maximum(x3[:, jj], x2[:, ii])

In [31]:
from sklearn.model_selection import train_test_split

# Features: `x1` followed by the per-snapshot merger ratios `x2`.
# NOTE(review): the coarse-binned `x3` built earlier is not used here; confirm
# that `x2` is the intended merger feature block.
x = np.concatenate((x1, x2), axis=1)

# Targets: one column per catalog property, in this order.
target_names = ['cvir', 'x0', 't/|u|', 'spin_bullock', 'c_to_a']
y = np.concatenate([cat[name][:, None] for name in target_names], axis=1)

# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Training

In [33]:
from multicam.models import LinearRegression

## Train standard MultiCAM

In [34]:
# Model constructed with 165 and 5, matching the 165 feature columns and
# 5 target columns it is fitted on.
n_mah_features, n_targets = 165, 5
mc = LinearRegression(n_mah_features, n_targets)

In [35]:
# Fit on the first 165 feature columns only (the merger-ratio columns that
# follow them are left out).
x_train_mah = x_train[:, :165]
mc.fit(x_train_mah, y_train)

In [36]:
# Predict the first target for test row 4 in two ways: as part of the whole
# test batch, and as a single-row input.
pred_in_batch = mc.predict(x_test[:, :165])[4, 0]
pred_single_row = mc.predict(x_test[4, None, :165])[0, 0]
pred_in_batch, pred_single_row  # almost the same

(5.641537189483643, 5.6473517417907715)

In [37]:
# Sanity check: relative difference between the mean true cvir (target
# column 0) and the mean predicted cvir on the test set.
cvir_true = y_test[:, 0]
cvir_predicted = mc.predict(x_test[:, :165])[:, 0]
cvir_mean = np.mean(cvir_true)
cvir_pred_mean = np.mean(cvir_predicted)
np.abs(cvir_mean - cvir_pred_mean) / np.abs(cvir_mean)  # very low error

0.008627065044904316

## Train with merger ratios added (uses the per-snapshot `x2`, not the coarse `x3`)

In [38]:
# Model constructed with 330 and 5, matching the 330 columns of `x`
# (165 from `x1` plus 165 from `x2`) and the 5 target columns.
n_features_with_mergers, n_targets_merger = 330, 5
mc_merger = LinearRegression(n_features_with_mergers, n_targets_merger)

In [39]:
# Fit on the full training feature matrix (all columns of `x_train`).
mc_merger.fit(x_train, y_train)

## Correlations

In [40]:
# Test-set predictions from both models: `mc` sees only the first 165 feature
# columns, `mc_merger` sees every column.
x_test_mah = x_test[:, :165]
y_pred1 = mc.predict(x_test_mah)
y_pred2 = mc_merger.predict(x_test)

In [41]:
from multicam.correlations import spearmanr

# `spearmanr` between true and predicted cvir (target column 0) on the test
# set, for the two models in turn.
cvir_test = y_test[:, 0]
spearmans1, spearmans2 = (
    spearmanr(cvir_test, y_pred[:, 0]) for y_pred in (y_pred1, y_pred2)
)

print(spearmans1, spearmans2)

0.7852376886380997 0.6919254816649411
