# Preamble

In [1]:
import astropy.units as u
from astropy.coordinates import SkyCoord
from astropy.time import Time
from astroquery.gaia import Gaia
import healpy as hp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import copy
from time import time as tictoc
import pandas as pd
import seaborn as sns
from os import listdir
import gzip
from tqdm import tqdm, tqdm_notebook
from time import time as tictoc
from scipy.special import erf
import scipy as sp

from IPython.core.display import display, HTML
from IPython.display import display, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(edgeitems=3, linewidth=200) 
pd.set_option('display.max_columns', None)
pd.set_option('max_rows',200) and pandas.set_option('max_columns',20)

from MyUnits import *

Created TAP+ (v1.2.1) - Connection:
	Host: gea.esac.esa.int
	Use HTTPS: True
	Port: 443
	SSL Port: 443
Created TAP+ (v1.2.1) - Connection:
	Host: geadata.esac.esa.int
	Use HTTPS: True
	Port: 443
	SSL Port: 443


In [4]:
edr3_data = '/Users/crimondino/Dropbox (PI)/MyLensVelocity2/data/gedr3_gaia_source/'

# Acceleration field

## Function for parallel binning

In [5]:
list_files = listdir(edr3_data); 
list_files = [file for file in list_files if file[-7:]=='.csv.gz'] #select only files ending with 'csv.gz'
print(list_files[0:10])
len(list_files)

['GaiaSource_003112-005263.csv.gz', 'GaiaSource_459543-459553.csv.gz', 'GaiaSource_006602-007952.csv.gz', 'GaiaSource_664985-665011.csv.gz', 'GaiaSource_005264-006601.csv.gz', 'GaiaSource_778885-779312.csv.gz', 'GaiaSource_000000-003111.csv.gz']


7

In [6]:
n = 8
nside = 2**n
fac_source_id = 2**(59-2*n)
npix = hp.nside2npix(nside)
print('nside =',nside,', npix =',npix)
print('linear pixel size =',str(np.sqrt(4*np.pi / npix) / arcsec)[0:7],' arcsec =', str(np.sqrt(4*np.pi / npix) / degree)[0:7],' degree')

nside = 256 , npix = 786432
linear pixel size = 824.516  arcsec = 0.22903  degree


In [7]:
# bin definitions
bins_parallax = np.concatenate([[-1000],np.logspace(np.log10(0.05),np.log10(2),10),[1000]])
print(bins_parallax)
bins_G = np.arange(3,23,1) # floor or the min and max G mag in the entire catalog are 3 and 21
print(bins_G)

[-1.00000000e+03  5.00000000e-02  7.53315095e-02  1.13496727e-01  1.70997595e-01  2.57630139e-01  3.88153345e-01  5.84803548e-01  8.81082680e-01  1.32746577e+00  2.00000000e+00  1.00000000e+03]
[ 3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]


In [5]:
def fn_acc_stats(tab, th_count=3, return_tab=False, n_sigma_out = 3): 
    """
    Bins the stars in tab in healpix, G mag and parallax and computes the mean and variance of acc_ra and acc_dec per bin.
    If return_tab=False, returns the statistic in each bin.
    If return_tab=True, returns the stars in tab removing the outliers at more than n_sigma_out from the mean.
    """

    ### healpix binning
    q_pix = np.floor(tab['source_id'].to_numpy() / fac_source_id).astype(int)
    bins_pix = np.arange(np.min(np.unique(q_pix)), np.max(np.unique(q_pix))+2,1) # should be +2 to include sources in the last bin
    q_binpix = np.digitize(q_pix, bins_pix)-1  # need to access the histogram matrix elements

    ### assign to G bins
    tab_G = tab['phot_g_mean_mag'].to_numpy()
    q_binG = np.digitize(tab_G, bins_G)-1      
    
    ### probabilistic assignment to parallax bins
    tab_parallax = tab['parallax'].to_numpy(); tab_parallax_error = tab['parallax_error'].to_numpy();
    prob_parallax = np.nan * np.ones((len(tab),len(bins_parallax)-1))
    for i in range(len(bins_parallax)-1):
        x1_list = (bins_parallax[i]-tab_parallax)/tab_parallax_error/np.sqrt(2)
        x2_list = (bins_parallax[i+1]-tab_parallax)/tab_parallax_error/np.sqrt(2)
        prob_parallax[:,i] = 0.5*(erf(x2_list)-erf(x1_list))

    tab_acc_ra = tab['accel_ra'].to_numpy(); tab_acc_dec = tab['accel_dec'].to_numpy();
    ### histogram of summed probabilities
    hist_prob = sp.stats.binned_statistic_dd([tab_G,q_pix],np.transpose(prob_parallax), bins=[bins_G,bins_pix],statistic='sum')[0] 
    ### histogram of average acc_ra weighted by probabilities
    hist_acc_ra = sp.stats.binned_statistic_dd([tab_G,q_pix],np.transpose(prob_parallax) * tab_acc_ra, bins=[bins_G,bins_pix],statistic='sum')[0] #sum first in each bin
    hist_acc_ra = hist_acc_ra / (hist_prob + 1e-20) #then divide by number in each bin
    hist_acc_dec = sp.stats.binned_statistic_dd([tab_G,q_pix],np.transpose(prob_parallax) * tab_acc_dec, bins=[bins_G,bins_pix],statistic='sum')[0] #sum first in each bin
    hist_acc_dec = hist_acc_dec / (hist_prob + 1e-20) #then divide by number in each bin
    
    ### For each star, get the mean acc of the corresponding bin
    mean_acc_ra = hist_acc_ra[:, q_binG, q_binpix].T; mean_acc_dec = hist_acc_dec[:, q_binG, q_binpix].T

    ### histogram of acc variance weighted by parallax bin probabilities
    hist_acc_ra_var = sp.stats.binned_statistic_dd([tab_G,q_pix],np.transpose(prob_parallax) * (mean_acc_ra.T - tab_acc_ra)**2,
                                                   bins=[bins_G,bins_pix],statistic='sum')[0] #sum first in each bin
    hist_acc_ra_var = hist_acc_ra_var / (hist_prob - 1 + 1e-20) # the estimator should have a -1 (this matches for example var() computed with panda's groupy)
    hist_acc_dec_var = sp.stats.binned_statistic_dd([tab_G,q_pix],np.transpose(prob_parallax) * (mean_acc_dec.T - tab_acc_dec)**2,
                                                    bins=[bins_G,bins_pix],statistic='sum')[0] #sum first in each bin
    hist_acc_dec_var = hist_acc_dec_var / (hist_prob - 1 + 1e-20) 
    hist_acc_radec_var = sp.stats.binned_statistic_dd([tab_G,q_pix],np.transpose(prob_parallax) * (mean_acc_ra.T - tab_acc_ra) * (mean_acc_dec.T - tab_acc_dec),
                                                      bins=[bins_G,bins_pix],statistic='sum')[0] #sum first in each bin
    hist_acc_radec_var = hist_acc_radec_var / (hist_prob - 1 + 1e-20) 
    
    ### set to nan bins where there are too few stars
    hist_acc_ra[hist_prob < th_count] = np.nan; hist_acc_dec[hist_prob < th_count] = np.nan
    hist_acc_ra_var[hist_prob < th_count] = np.nan; hist_acc_dec_var[hist_prob < th_count] = np.nan; hist_acc_radec_var[hist_prob < th_count] = np.nan

    if return_tab==False: # returns the data frame with the statistics computed using tab
        ###  filler for generalized bins indices
        hist_bins_pix = np.ones(np.shape(hist_prob)) * bins_pix[:-1]
        hist_bins_G = np.transpose(np.transpose(np.ones(np.shape(hist_prob)),axes=[0,2,1]) * bins_G[:-1],axes=[0,2,1])
        hist_bins_parallax = np.transpose(np.transpose(np.ones(np.shape(hist_prob)),axes=[2,1,0]) * bins_parallax[:-1],axes=[2,1,0])

        ###  collect data and output
        data = np.transpose([hist_bins_pix, hist_bins_G, hist_bins_parallax, hist_prob, hist_acc_ra, hist_acc_dec, hist_acc_ra_var, hist_acc_dec_var, hist_acc_radec_var],axes=[1,2,3,0])
        data = data.reshape(-1, data.shape[-1])
        return pd.DataFrame(data,columns=['pix','G_bin','parallax_bin','number','mean_acc_ra','mean_acc_dec','var_acc_ra','var_acc_dec','var_acc_radec'])
    
    else: # returns tab where the acc outliers more than n_sigma_out away from zero have been removed
        ### For each star, get the acc mean and variance of the corresponding bin (after excluding the low count bins)
        mean_acc_ra = hist_acc_ra[:, q_binG, q_binpix].T; mean_acc_dec = hist_acc_dec[:, q_binG, q_binpix].T
        var_acc_ra = hist_acc_ra_var[:, q_binG, q_binpix].T; var_acc_dec = hist_acc_dec_var[:, q_binG, q_binpix].T; var_acc_radec = hist_acc_radec_var[:, q_binG, q_binpix].T;    

        ###  Get the mean and var for each star
        tab_sum_pw = np.sum(prob_parallax, axis=1, where=(~np.isnan(mean_acc_ra)))  # sum of the parallax weights for each star using only bins with enough statistics 
        tab_mean_acc_ra = np.sum(np.nan_to_num(mean_acc_ra*prob_parallax), axis=1)/(tab_sum_pw + 1e-20)
        tab_mean_acc_dec = np.sum(np.nan_to_num(mean_acc_dec*prob_parallax), axis=1)/(tab_sum_pw + 1e-20)
        tab_var_acc_ra = np.sum(np.nan_to_num(var_acc_ra*prob_parallax), axis=1)/(tab_sum_pw + 1e-20)
        tab_var_acc_dec = np.sum(np.nan_to_num(var_acc_dec*prob_parallax), axis=1)/(tab_sum_pw + 1e-20)
        tab_var_acc_radec = np.sum(np.nan_to_num(var_acc_radec*prob_parallax), axis=1)/(tab_sum_pw + 1e-20)        
        
        ### Replace the effective variance with the measurement errors for stars that have 0 mean (fall into empty bins)
        tab_var_acc_ra[tab_var_acc_ra==0] = (tab['accel_ra_error'].to_numpy()[tab_var_acc_ra==0])**2
        tab_var_acc_dec[tab_var_acc_dec==0] = (tab['accel_dec_error'].to_numpy()[tab_var_acc_dec==0])**2
        tab_var_acc_radec[tab_var_acc_radec==0] = (np.zeros(len(tab))*tab['accel_ra_error'].to_numpy()*tab['accel_dec_error'].to_numpy())[tab_var_acc_radec==0]
        
        ### subtracted acc and inverse covariance for outlier removal
        acc_sub = np.array([tab['accel_ra'].to_numpy()-tab_mean_acc_ra, tab['accel_dec'].to_numpy()-tab_mean_acc_dec]).T
        inv_cov_acc = np.linalg.inv(np.array([[tab_var_acc_ra, tab_var_acc_radec], [tab_var_acc_radec, tab_var_acc_dec]]).T)
        acc_over_sigma_sq = inv_cov_acc[:, 0, 0]*acc_sub[:, 0]**2 + inv_cov_acc[:, 1, 1]*acc_sub[:, 1]**2 + 2*inv_cov_acc[:, 0, 1]*acc_sub[:, 0]*acc_sub[:, 1]
        
        return tab.iloc[acc_over_sigma_sq < n_sigma_out**2]

In [8]:
np.zeros(3)

array([0., 0., 0.])