Files were obtained from: https://www.slac.stanford.edu/~behroozi/MultiDark_Hlists_Rockstar/

In [None]:
import numpy as np
import astropy
from astropy.table import Table 
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides NumPy's divide-by-zero /
# invalid-value RuntimeWarnings raised by the log10, arccos and division calls below.
warnings.filterwarnings("ignore")

# pandas example. 

In [None]:
#import pandas as pd
# cat = pd.read_table("catalogues/hlist_0.07835.txt", header=0, comment='#', sep='\s+', engine='python')

# Astropy table 

In [None]:
# Load one halo-catalogue snapshot and strip the "(<index>)" suffix from its column names.
# NOTE(review): the number in an hlist file name is presumably the scale factor a, which
# would put hlist_0.11515 at z = 1/a - 1 ~ 7.7 rather than z = 0; the z ~ 0 snapshot would
# then be the commented-out hlist_1.00035 file -- confirm which one is intended.
cat = Table.read("../data/hlist_0.11515.list.gz", format="ascii")
#cat = Table.read("catalogues/hlist_1.00035.list.gz", format="ascii")


def _strip_index_suffix(name):
    """Return `name` without a trailing "(<digits>)" suffix.

    Only a purely numeric parenthesised suffix at the very end is removed, so
    "A[x](500c)(45)" becomes "A[x](500c)". Names without such a suffix are
    returned unchanged, which makes the renaming safe to run more than once
    (slicing with rfind() == -1 would otherwise drop the last character).
    """
    head, sep, tail = name.rpartition('(')
    if sep and tail.endswith(')') and tail[:-1].isdigit():
        return head
    return name


new_names = [_strip_index_suffix(name) for name in cat.colnames]
for name, new_name in zip(cat.colnames, new_names):
    if new_name != name:
        cat.rename_column(name, new_name)

In [None]:
# Show the column names available in the catalogue table.
print(cat.colnames)

In [None]:
# Derive quantities that are not stored in the catalogue:
#   cvir  : rvir / rs
#   phi_l : angle (radians) between the vectors (A[x], A[y], A[z]) and (Jx, Jy, Jz)
#   q     : mean of the axis ratios b_to_a and c_to_a
cvir = cat['rvir'] / cat['rs']

a_dot_j = cat['A[x]']*cat['Jx'] + cat['A[y]']*cat['Jy'] + cat['A[z]']*cat['Jz']
a_norm = np.sqrt(cat['A[x]']**2 + cat['A[y]']**2 + cat['A[z]']**2)
j_norm = np.sqrt(cat['Jx']**2 + cat['Jy']**2 + cat['Jz']**2)

# Floating-point rounding can push the cosine marginally outside [-1, 1] for (anti-)aligned
# vectors, which would make arccos return a spurious NaN; clip to the valid domain first.
# A zero-length vector still gives 0/0 = NaN here (clip leaves NaN untouched), so some
# phi_l can legitimately be NaN.
cos_phi_l = np.clip(a_dot_j / (a_norm * j_norm), -1.0, 1.0)
phi_l = np.arccos(cos_phi_l)

q = (1/2)*(cat['b_to_a'] + cat['c_to_a'])

In [None]:
#now we add these columns to our table.
# Item assignment adds the column when it is absent and replaces it when it already
# exists, so re-running this cell does not fail on a duplicate column name.
cat['cvir'] = cvir
cat['phi_l'] = phi_l
cat['q'] = q

In [None]:
print("length of catalogue:", len(cat))

# Build the NaN mask once and count it with a vectorized call; the builtin sum() would
# iterate over the array element by element in Python.
phi_l_is_nan = np.isnan(cat['phi_l'])
print("Number of nan:", np.count_nonzero(phi_l_is_nan))

#We can ignore the 'nan's from Phi_L too for simplicity.
cat = cat[~phi_l_is_nan]

print("final length of catalogue:", len(cat))

# Spearman correlation example

In [None]:
# Catalogue columns that enter the correlation analysis.
params = [
    'mvir',
    'cvir',
    'T/|U|',
    'Xoff',
    'Voff',
    'Spin',
    'q',
    'phi_l',
    #rho_rms
]

In [None]:
print("Spearman correlation coefficient between parameters...")
# Visit every unordered pair of parameters exactly once (first index < second index).
n_params = len(params)
for first in range(n_params):
    param1 = params[first]
    for second in range(first + 1, n_params):
        param2 = params[second]
        corr, p = spearmanr(cat[param1], cat[param2])
        print(f"{param1}, {param2}: {corr, p}")
        

In [None]:
# Full matrix of Spearman coefficients; rows and columns follow the order of `params`.
n_params = len(params)
corrs = np.zeros((n_params, n_params))
for i, param1 in enumerate(params):
    # Fill one row at a time: param1 against every parameter (itself included).
    corrs[i, :] = [spearmanr(cat[param1], cat[param2])[0] for param2 in params]


In [None]:
# Heat-map of the Spearman correlation matrix.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# Mask the strictly-lower triangle so only the diagonal and upper triangle are drawn.
mask = np.tri(corrs.shape[0], k=-1)
A = np.ma.array(corrs, mask=mask)
im = ax.matshow(A, cmap='bwr', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)
# Pin one tick per matrix row/column before labelling. Setting labels alone (with a
# leading '' placeholder) depends on where the automatic locator happens to put its
# ticks, so the names can end up next to the wrong cells.
tick_positions = np.arange(len(params))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(params)
ax.set_yticklabels(params);

# Correlation of Mvir with Xoff and rho_rms

Draw contour plots to illustrate how `Xoff` and `rho_rms` each correlate with `mvir`.

## mass and rho_rms

In [None]:
# plt.scatter(cat['mvir'], cat['rho_rms'])
# plt.xscale('log')
# plt.yscale('log')

## mass and xoff 

**Question**: How to resolve this ambiguity? 

In [None]:
# Xoff against mvir for every halo in the catalogue (no mass cut), on log-log axes.
mvir_all = cat['mvir']
xoff_all = cat['Xoff']
plt.scatter(mvir_all, xoff_all, alpha=0.1)
plt.yscale('log')
plt.xscale('log')
print("correlation:", spearmanr(mvir_all, xoff_all)[0])

We also plot **contours**: 

In [None]:
from astroML.plotting import scatter_contour

In [None]:
# Combined scatter / filled-contour view of the same two columns.
scatter_contour(
    cat['mvir'],
    cat['Xoff'],
    levels=5,
    threshold=100,
    filled_contour=True,
)

# Histograms of the variables

## halo masses 

Divide into relaxed/unrelaxed halos and by simulation box size (90 and 180 Mpc).

In [None]:
# Distribution of log10(mvir) over the whole catalogue, drawn as an outline.
log_mvir = np.log10(cat['mvir'])
plt.hist(log_mvir, histtype='step', bins=20);

The figure in the paper uses a 90 Mpc box and only halos with more than 100 particles and $\log_{10}(M_\mathrm{vir}) \sim 12$.

In [None]:
# Columns whose standardized log-distributions are plotted below.
params = [
    'cvir',
    'Spin',
    'q',
    'phi_l',
    'Xoff',
    'T/|U|',
]

In [None]:
# One histogram per parameter of its standardized log10 values, for halos with
# 11.5 < log10(mvir) < 12.5 and q != 0.
fig, axs = plt.subplots(3, 2, figsize=(8,10))
log_mvir = np.log10(cat['mvir'])  # computed once, used for both sides of the mass cut
cat12 = cat[(11.5 < log_mvir) & (log_mvir < 12.5) & (cat['q'] != 0)]
print('mean, median, and std of:')
for param, ax in zip(params, axs.flatten()):
    x = np.asarray(np.log10(cat12[param]), dtype=float)
    # Only q == 0 is excluded above; zero or negative entries in any other column turn
    # into -inf/NaN under log10 and would poison the mean/std and the histogram range.
    x = x[np.isfinite(x)]
    px = (x - np.mean(x))/ np.std(x)
    ax.set_title(param)
    ax.hist(px)
    print(f'{param}:', np.mean(x), np.median(x), np.std(x))
# Keep the panel titles from overlapping the neighbouring axes.
fig.tight_layout()


# PCA 


## mass threshold

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Columns that form the feature set for the PCA.
params = [
    'mvir',
    'cvir',
    'Spin',
    'q',
    'phi_l',
    'Xoff',
    'T/|U|',
]

In [None]:
# PCA model configured to keep 4 components.
pca = PCA(n_components=4)

In [None]:
def remove_outliers(x, p=0.95):
    """Return the entries of `x` that lie strictly below its `p`-quantile.

    Parameters
    ----------
    x : array-like
        One-dimensional numeric data.
    p : float, optional
        Quantile in [0, 1] used as the upper cut (default 0.95).

    Returns
    -------
    numpy.ndarray
        The values of `x` smaller than ``np.quantile(x, p)``; an empty input
        is returned as an empty array.
    """
    x = np.asarray(x)
    # The quantile of an empty array is undefined, so there is nothing to cut.
    if x.size == 0:
        return x
    return x[x < np.quantile(x, p)]

In [None]:
# Build X: one row per parameter, one column per halo, holding the standardized
# (zero mean, unit standard deviation) log10 of each parameter.
#cuts on mass >= 12 once we get z= 0 catalogue and different fixed mass regimes too.
# remove extreme outliers too.
cat12 = cat[(cat['q']!=0) &(cat['Spin']!=0)]

log_X = np.zeros((len(params), len(cat12)))
for i, param in enumerate(params):
    log_X[i] = np.log10(cat12[param])

# Only q and Spin are required to be non-zero above; a zero or negative entry in any
# other column becomes -inf/NaN under log10 and would propagate into the mean/std and
# into the PCA input. Keep only halos whose log-values are finite for every parameter,
# and keep cat12 aligned with the columns of X.
finite = np.isfinite(log_X).all(axis=0)
cat12 = cat12[finite]
log_X = log_X[:, finite]

X = np.zeros_like(log_X)
for i, x in enumerate(log_X):
    X[i] = (x - np.mean(x))/ np.std(x)
    

In [None]:
# Sanity check: X was allocated as (number of parameters, number of halos).
X.shape

In [None]:
# X holds one row per parameter, so the transpose passed here has one row per halo.
pca.fit(X.T)

In [None]:
# Summarize arrays with more than 50 elements when printing.
np.set_printoptions(threshold=50)
print(params)
# Fitted PCA attributes, printed in the same order as before.
for attr in ("singular_values_", "explained_variance_ratio_", "components_"):
    print(getattr(pca, attr))

In [None]:
# Element-wise square of the first component's entries.
np.square(pca.components_[0])

In [None]:
# Scratch arithmetic: three squared terms of 0.5 add up to 0.75.
0.5**2 + 0.5**2 + 0.5**2

## fixed mass

In [None]:
# paper uses 12, 13.3, 13.6 