In [None]:
%matplotlib inline

In [None]:
import numpy as np
import astropy
from astropy.table import Table 
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

In [None]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this is a blanket filter, so it also hides numerical warnings
# (e.g. from log10 of zero or division by zero) — consider narrowing it.
warnings.filterwarnings("ignore")

# preamble 

In [None]:
def scatter_binning(x, y, nxbins=10, ax=None, title=None):
    """Plot the binned median trend of ``y`` versus ``x`` with interquartile error bars.

    ``x`` is divided into equal-width bins spanning its smallest to largest
    finite value. For every non-empty bin the medians of ``x`` and ``y`` are
    drawn as one point, and the distances from each median to the 25% and 75%
    quantiles are drawn as asymmetric error bars on both axes.

    Parameters
    ----------
    x, y : array-like
        Paired samples of equal length. Pairs in which either value is NaN or
        infinite are ignored.
    nxbins : int
        Number of bin *edges* handed to ``np.linspace``; this yields
        ``nxbins - 1`` bins (kept as-is for compatibility with existing calls).
    ax : matplotlib Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.
    title : str, optional
        Title applied to the axes that were drawn on.

    Returns
    -------
    None
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # A single -inf/NaN (e.g. from log10(0)) would otherwise poison the bin edges.
    finite = np.isfinite(x) & np.isfinite(y)
    x, y = x[finite], y[finite]

    # Resolve the target axes once so the title also works without an explicit ``ax``.
    if ax is None:
        ax = plt.gca()

    xmeds, ymeds, xerrs, yerrs = [], [], [], []
    if x.size:
        edges = np.linspace(np.min(x), np.max(x), nxbins)
        n_bins = max(len(edges) - 1, 0)

        # Digitizing against the interior edges gives left-closed bins whose
        # last bin is closed on the right too, so no sample (in particular the
        # minimum and the maximum) falls between bins.
        bin_index = np.digitize(x, edges[1:-1])

        for b in range(n_bins):
            in_bin = bin_index == b
            if not in_bin.any():
                # Empty bins have no median/quantiles; leave them out of the plot.
                continue
            xb, yb = x[in_bin], y[in_bin]
            xmed, ymed = np.median(xb), np.median(yb)
            x25, x75 = np.quantile(xb, [0.25, 0.75])
            y25, y75 = np.quantile(yb, [0.25, 0.75])
            xmeds.append(xmed)
            ymeds.append(ymed)
            xerrs.append([xmed - x25, x75 - xmed])
            yerrs.append([ymed - y25, y75 - ymed])

    if xmeds:
        # errorbar expects asymmetric errors as a (2, N) array: lower row, upper row.
        ax.errorbar(xmeds, ymeds, xerr=np.array(xerrs).T, yerr=np.array(yerrs).T,
                    fmt='ro--', capsize=10)

    if title is not None:
        ax.set_title(title)

In [None]:
#functions I need to define myself. 
def cvir(cat):
    """Return the concentration-like ratio ``cat['rvir'] / cat['rs']``."""
    return cat['rvir'] / cat['rs']
def phi_l(cat):
    """Return the angle, in radians, between (A[x], A[y], A[z]) and (Jx, Jy, Jz).

    Parameters
    ----------
    cat : mapping of column name -> array-like
        Must provide the columns 'A[x]', 'A[y]', 'A[z]', 'Jx', 'Jy' and 'Jz'.

    Returns
    -------
    Element-wise ``arccos`` of the normalised dot product, in ``[0, pi]``.
    Rows where either vector has zero length still evaluate to NaN.
    """
    a_x, a_y, a_z = cat['A[x]'], cat['A[y]'], cat['A[z]']
    j_x, j_y, j_z = cat['Jx'], cat['Jy'], cat['Jz']

    dot = a_x * j_x + a_y * j_y + a_z * j_z
    norms = np.sqrt(a_x**2 + a_y**2 + a_z**2) * np.sqrt(j_x**2 + j_y**2 + j_z**2)

    # Floating-point rounding can push the cosine of (anti)parallel vectors
    # marginally outside [-1, 1]; arccos would then return NaN instead of 0 / pi.
    return np.arccos(np.clip(dot / norms, -1.0, 1.0))
def q(cat):
    """Return the mean of the 'b_to_a' and 'c_to_a' columns of ``cat``."""
    return (cat['b_to_a'] + cat['c_to_a']) / 2

In [None]:
# Simulation facts taken from the catalogue description page:
# https://www.cosmosim.org/cms/simulations/bolshoi/
box_size = 250                  # Mpc/h
total_particles = 2048 ** 3
particle_mass = 1.35e8          # presumably Msun/h — TODO confirm against the page above

# read

In [None]:
from astropy.io import ascii

In [None]:
# Location of the halo catalogue CSV; the relative path is resolved against
# the directory the notebook kernel runs in.
filename = '../data/hlist_1.00109.csv'

In [None]:
# Stream the catalogue instead of loading it whole: the fast reader hands back
# a generator that yields one table per chunk of at most ~100 MB.
# Being a generator, `tbls` can only be consumed once per run of this cell.
chunk_options = {'chunk_size': 100 * 1000000, 'chunk_generator': True}
tbls = ascii.read(filename, format='csv', guess=False, fast_reader=chunk_options)

In [None]:
# Columns kept from every chunk; everything else in the catalogue is dropped.
# 'cvir', 'q' and 'phi_l' are derived columns added while loading.
params = [
    'mvir',
    'cvir',
    'T/|U|',
    'Xoff',
    'Voff',
    'Spin',
    'q',
    'phi_l',
    'Acc_Rate_Inst',
    'Acc_Rate_1*Tdyn',
    'scale_of_last_MM',
]


In [None]:
# Reduce each chunk to the columns in `params`, after attaching the derived
# columns, then stack the reduced chunks into one table.
derived_columns = (('cvir', cvir), ('phi_l', phi_l), ('q', q))

ftbls = []
for i, tbl in enumerate(tbls):
    for col_name, compute in derived_columns:
        tbl.add_column(compute(tbl), name=col_name)

    ftbls.append(tbl[params])

    # Lightweight progress indicator: report every tenth chunk.
    if i % 10 == 0:
        print(i)

ftbl = astropy.table.vstack(ftbls)

In [None]:
# Keep only well-resolved halos: several authors suggest at least ~1000
# particles, since smaller halos may be too noisy.
# NOTE(review): with particle_mass = 1.35e8 the cut log10(mvir) > 12 is about
# 7400 particles, assuming mvir uses the same mass units — confirm.
# Halos with zero spin or zero q are dropped as well.
is_massive = np.log10(ftbl['mvir']) > 12
has_spin = ftbl['Spin'] != 0
has_shape = ftbl['q'] != 0
cat = ftbl[is_massive & has_spin & has_shape]

# histograms

In [None]:
# Distribution of log10 halo mass in the selected catalogue.
fig, ax = plt.subplots()
ax.hist(np.log10(cat['mvir']), bins=50, histtype='step')
ax.set_xlabel('log10(mvir)')
ax.set_ylabel('count');

In [None]:
# One panel per parameter: histogram of log10(parameter), standardized to
# zero mean and unit standard deviation so the shapes can be compared.
fig, axs = plt.subplots(4, 2, figsize=(10,12))
params = ['mvir', 'cvir' , 'T/|U|', 'Xoff', 'Voff', 'Spin', 'q', 'phi_l']
print('mean, median, and std of:')
for ax, param in zip(axs.flatten(), params):
    x = np.log10(cat[param])
    mu, sigma = np.mean(x), np.std(x)
    ax.set_title(param)
    ax.hist((x - mu) / sigma, histtype='step', bins=50)
    print(f'{param}:', mu, np.median(x), sigma)

fig.tight_layout()

# Vanilla Scatters and Contours

In [None]:
# Raw scatter of Xoff against mvir for every halo in the selected catalogue,
# on log-log axes, plus the Spearman rank correlation of the two columns.
fig, ax = plt.subplots()
ax.scatter(cat['mvir'], cat['Xoff'], alpha=0.1)
ax.set_xscale('log')
ax.set_yscale('log')
rho, pvalue = spearmanr(cat['mvir'], cat['Xoff'])
print("correlation:", rho)

In [None]:
from astroML.plotting import scatter_contour

# Same relation in log space, drawn with astroML's scatter_contour.
log_mvir = np.log10(cat['mvir'])
log_xoff = np.log10(cat['Xoff'])
scatter_contour(log_mvir, log_xoff, threshold=100, levels=10, filled_contour=True)

# correlations

In [None]:
# Parameters compared pairwise in the correlation matrix below.
params = [
    'mvir',
    'cvir',
    'T/|U|',
    'Xoff',
    'Voff',
    'Spin',
    'q',
    'phi_l',
]

In [None]:
# Spearman rank correlation for every ordered pair of parameters;
# corrs[i, j] pairs params[i] with params[j].
n_params = len(params)
corrs = np.zeros((n_params, n_params))
for i, j in np.ndindex(n_params, n_params):
    rho, pvalue = spearmanr(cat[params[i]], cat[params[j]])
    corrs[i, j] = rho

In [None]:
# Heat-map of the correlation matrix. Entries strictly below the diagonal are
# masked, so only the diagonal and the upper triangle are drawn.
fig, ax = plt.subplots(1,1,figsize=(8,8))
mask = np.tri(corrs.shape[0], k=-1)
A = np.ma.array(corrs, mask=mask)
im = ax.matshow(A, cmap='bwr', vmin=-1, vmax=1)
plt.colorbar(im, ax=ax)

# Pin one tick per row/column before labelling. Labelling the automatically
# chosen ticks (padded with a leading '') only lines up if the locator happens
# to return exactly the expected positions.
tick_positions = np.arange(len(params))
ax.set_xticks(tick_positions)
ax.set_xticklabels(params)
ax.set_yticks(tick_positions)
ax.set_yticklabels(params);

In [None]:
#print by hand. 
# Changes numpy's global print threshold (the element count above which
# arrays are summarised); the setting persists for later cells.
np.set_printoptions(threshold=100)
print(params)
# Bare last expression: the notebook displays the correlation matrix.
corrs

## Correlations between parameters, graphically

In [None]:
# Binned median trends (log10 on both axes) for three parameter pairs.
fig, axs = plt.subplots(1,3, figsize=(20,10))

panels = [
    ('Xoff', 'T/|U|', 'xoff and T/U'),
    ('mvir', 'T/|U|', 'mvir and T/U'),
    ('mvir', 'Xoff', 'mvir and xoff'),
]
for ax, (xcol, ycol, panel_title) in zip(axs, panels):
    scatter_binning(np.log10(cat[xcol]), np.log10(cat[ycol]), nxbins=10, ax=ax, title=panel_title)


# PCA 

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Parameters that enter the PCA; their order fixes the order of the entries
# in each printed component.
params = [
    'mvir',
    'cvir',
    'Spin',
    'q',
    'phi_l',
    'Xoff',
    'T/|U|',
]

In [None]:
# Number of principal components to keep.
N_COMPONENTS = 4
pca = PCA(n_components=N_COMPONENTS)

In [None]:
# Build the PCA input: one row per parameter holding log10 of the column,
# standardized to zero mean and unit standard deviation.
# TODO: cut on mass >= 12 once we get the z = 0 catalogue, and try different
# fixed mass regimes too.
# TODO: remove extreme outliers too.
X = np.zeros((len(params), len(cat['mvir'])))
for row, param in zip(X, params):
    logged = np.log10(cat[param])
    # `row` is a view into X, so this fills X in place.
    row[:] = (logged - np.mean(logged)) / np.std(logged)
    

In [None]:
# X holds one row per parameter, so transpose it to put one parameter per
# column before fitting.
pca.fit(X.T)

In [None]:
# Changes numpy's global print threshold again (persists for later cells).
np.set_printoptions(threshold=50)
# Print the parameter list first so the entries of each component can be
# matched to a parameter by position.
print(params)
print('singular values:', pca.singular_values_)
print('explained variance ratio:', pca.explained_variance_ratio_)  
print('components \n ', pca.components_)

# dynamical comparisons 

Useful to compare to the Power et al. 2011 paper

In [None]:
# Parameters of interest for the dynamical comparison.
# NOTE(review): none of the cells below read this list, and 'Acc_Rate_2*Tdyn'
# is not among the columns kept when the catalogue was loaded — confirm
# before using it to index `cat`.
params = [
    'mvir',
    'cvir',
    'T/|U|',
    'Xoff',
    'Voff',
    'Spin',
    'q',
    'phi_l',
    'Acc_Rate_Inst',
    'Acc_Rate_1*Tdyn',
    'Acc_Rate_2*Tdyn',
]

In [None]:
# Open question: not sure what Halfmass_Scale corresponds to, but maybe
# inverse of (1+zform)^(-1)?
# Binned median of cvir against log10(mvir). scatter_binning already sets the
# title on `ax`, so it is not set a second time here.
fig, ax = plt.subplots(1,1,figsize=(8,8))
scatter_binning(np.log10(cat['mvir']), cat['cvir'], ax=ax, title='mass and concentration')
ax.set_xlabel('log10(mvir)')
ax.set_ylabel('cvir');

In [None]:
# Binned median accretion rates against log10(mvir), one panel per rate,
# with a logarithmic y axis.
fig, axs = plt.subplots(1,2,figsize=(20,10))

log_mvir = np.log10(cat['mvir'])
for ax, rate in zip(axs, ('Acc_Rate_1*Tdyn', 'Acc_Rate_Inst')):
    scatter_binning(log_mvir, cat[rate], title=f'mvir and {rate}', ax=ax)
    ax.set_yscale('log')

In [None]:
# Binned median accretion rates against log10(Xoff) (top row) and
# log10(T/|U|), labelled "Eta" (bottom row); logarithmic y axis throughout.
fig, axs = plt.subplots(2,2,figsize=(20,10))
axes = axs.flatten()

# (x column, label used in the title, accretion-rate column, nxbins)
panels = [
    ('Xoff', 'Xoff', 'Acc_Rate_1*Tdyn', 20),
    ('Xoff', 'Xoff', 'Acc_Rate_Inst', 20),
    ('T/|U|', 'Eta', 'Acc_Rate_1*Tdyn', 15),
    ('T/|U|', 'Eta', 'Acc_Rate_Inst', 15),
]
for ax, (xcol, xlabel, rate, n_edges) in zip(axes, panels):
    scatter_binning(np.log10(cat[xcol]), cat[rate], title=f'{xlabel} and {rate}', ax=ax, nxbins=n_edges)
    ax.set_yscale('log')

In [None]:
# Binned median of log10(Xoff) and log10(T/|U|) ("Eta") against scale_of_last_MM.
fig, axs = plt.subplots(1,2,figsize=(20,10))
axes = axs.flatten()

for ax, (ycol, ylabel) in zip(axes, (('Xoff', 'Xoff'), ('T/|U|', 'Eta'))):
    scatter_binning(cat['scale_of_last_MM'], np.log10(cat[ycol]),
                    title=f'scale_of_last_MM and {ylabel}', ax=ax, nxbins=10)


In [None]:
#ToDo: 
    # estimate the statistical errors of these correlations. 
    # find a way to identify the outliers of a given parameter and recover the corresponding halo ids. 
    # build a framework that makes all of this preprocessing easy and keeps it under the hood. 
    # maybe look at relaxed halos or at dynamical properties? 