In [3]:
import numpy as np
import pandas as pd

import healpy as hp
from tqdm import tqdm, tqdm_notebook
import scipy as sp
from scipy.special import erf

import sys
from my_units import * 

from angular_fn import *
from template_fn import *
from cov_matrix_fn import *

import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Project root; every other directory below is derived from it.
HomeDir = './'
### Set this to the directory where you store your data files (see below how to download them)
DataDir = HomeDir #+ 'code/'
# Directories for the histogram-statistics lists and for the figures.
ListDir = f'{HomeDir}lists/hist_stats/'
FigDir = f'{HomeDir}figures/'

# Read in files

In [2]:
# Background-star table of the pairs catalogue; only the listed columns are loaded.
df_back = pd.read_csv(
    DataDir + 'pairs_background_ruwe_w_st.csv',
    usecols=[
        'ra', 'dec',
        'pmra', 'pmdec',
        'pmra_error', 'pmdec_error', 'pmra_pmdec_corr',
        'pmra_eff_error', 'pmdec_eff_error', 'pmra_pmdec_eff_corr',
        'parallax', 'parallax_error',
        'pmra_sub', 'pmdec_sub',
        'phot_g_mean_mag', 'l', 'b', 'ruwe',
    ],
)
print('Backgrounds read in.')

# Foreground-star table of the pairs catalogue.
# NOTE(review): later cells pair the two tables row by row, so they are assumed
# to have the same length and ordering - confirm against how the files are produced.
df_fore = pd.read_csv(
    DataDir + 'pairs_foreground_ruwe_w_st.csv',
    usecols=[
        'ra', 'dec',
        'pmra', 'pmdec',
        'pmra_error', 'pmdec_error',
        'parallax', 'parallax_error',
        'phot_g_mean_mag', 'l', 'b', 'ruwe',
    ],
)
print('Foregrounds read in.')

Backgrounds read in.
Foregrounds read in.


# Binning in parallax, $G$-magnitude, and radial distance

## Bin definitions

In [4]:
# Parallax bin edges: ten log-spaced edges from 0.05 to 2, with wide outer
# edges at -1000 and +1000 on either side.
bins_parallax = np.concatenate((
    [-1000],
    np.logspace(np.log10(0.05), np.log10(2), 10),
    [1000],
))

# G-magnitude bin edges: integers 10, 11, ..., 23
bins_G = np.arange(10, 24)

# Radial-separation bin edges
start = 0.5 # Starting bin in arcsec
step = 0.1 # Size of bin
end = 3 + 2 * step # Ending bin in arcsec
# NOTE(review): np.arange with a float step can gain or lose the last edge through
# rounding; the recorded output (27 edges) shows the intended length here.
bins_bil = np.arange(start, end, step)

# Number of edges in each binning (the number of bins is one less)
len(bins_parallax), len(bins_G), len(bins_bil)

(12, 14, 27)

In [5]:
# Display the parallax bin edges
bins_parallax

array([-1.00000000e+03,  5.00000000e-02,  7.53315095e-02,  1.13496727e-01,
        1.70997595e-01,  2.57630139e-01,  3.88153345e-01,  5.84803548e-01,
        8.81082680e-01,  1.32746577e+00,  2.00000000e+00,  1.00000000e+03])

## Bin assignments

In [4]:
# --- G-magnitude bin of every background star ---
# digitize() - 1 is the 0-based bin index; values outside bins_G give -1 or len(bins_G) - 1
bg_G = df_back['phot_g_mean_mag'].to_numpy()
q_binG = np.digitize(bg_G, bins_G) - 1

# --- radial-separation bin of every foreground/background pair ---
## sky coordinates of both members of each pair
fg_ra, fg_dec = (df_fore[col].to_numpy() for col in ('ra', 'dec'))
bg_ra, bg_dec = (df_back[col].to_numpy() for col in ('ra', 'dec'))
## separation vector of each pair (first two components used) and its squared norm
l_bilvec = fn_angular_sep(fg_ra*degree, fg_dec*degree, bg_ra*degree, bg_dec*degree)
l_bilnormsq = l_bilvec[:, 0]**2 + l_bilvec[:, 1]**2
## separation expressed in units of `arcsec`, then the 0-based bin index as above
q_bil = np.sqrt(l_bilnormsq)/arcsec
q_bin_bil = np.digitize(q_bil, bins_bil) - 1

In [14]:
# Display the parallax bin edges (these are the edges used for the probabilistic assignment below)
bins_parallax

array([-1.00000000e+03,  5.00000000e-02,  7.53315095e-02,  1.13496727e-01,
        1.70997595e-01,  2.57630139e-01,  3.88153345e-01,  5.84803548e-01,
        8.81082680e-01,  1.32746577e+00,  2.00000000e+00,  1.00000000e+03])

In [5]:
# Probabilistic assignment to parallax bins: each star's parallax is treated as a
# Gaussian (mean = parallax, sigma = parallax_error) and prob_parallax[:, i] is the
# Gaussian probability mass between edges i and i+1.
bg_parallax = df_back['parallax'].to_numpy()
bg_parallax_error = df_back['parallax_error'].to_numpy()
prob_parallax = np.full((len(df_back), len(bins_parallax)-1), np.nan)
for i in tqdm(range(prob_parallax.shape[1])):
    # standardised distances of the lower / upper edge, scaled by 1/sqrt(2) for erf
    x1_list = (bins_parallax[i]-bg_parallax)/bg_parallax_error/np.sqrt(2)
    x2_list = (bins_parallax[i+1]-bg_parallax)/bg_parallax_error/np.sqrt(2)
    prob_parallax[:, i] = (erf(x2_list)-erf(x1_list))*0.5

100%|██████████| 11/11 [00:40<00:00,  3.71s/it]


In [6]:
# Shape check: one row per background star, one column per parallax bin
prob_parallax.shape

(61138843, 11)

# Testing

In [7]:
bg_pmra, bg_pmdec = (df_back[col].to_numpy() for col in ('pmra', 'pmdec'))

### summed parallax-bin probabilities in each (parallax, G, separation) bin
### (one set of values per parallax bin, hence the transposed weights)
hist_prob = sp.stats.binned_statistic_dd(
    [bg_G, q_bil], prob_parallax.T,
    bins=[bins_G, bins_bil], statistic='sum',
).statistic

### probability-weighted mean pmra: weighted sum per bin divided by the summed weights
### (the 1e-20 keeps empty bins from dividing by zero)
hist_pmra = sp.stats.binned_statistic_dd(
    [bg_G, q_bil], prob_parallax.T * bg_pmra,
    bins=[bins_G, bins_bil], statistic='sum',
).statistic / (hist_prob + 1e-20)

### same for pmdec
hist_pmdec = sp.stats.binned_statistic_dd(
    [bg_G, q_bil], prob_parallax.T * bg_pmdec,
    bins=[bins_G, bins_bil], statistic='sum',
).statistic / (hist_prob + 1e-20)

In [8]:
### Look up, for every star, the mean pm of its (G, separation) bin in each parallax bin;
### the result has one row per star and one column per parallax bin
mean_pmra = np.transpose(hist_pmra[:, q_binG, q_bin_bil])
mean_pmdec = np.transpose(hist_pmdec[:, q_binG, q_bin_bil])

In [9]:
### pm (co)variances per bin, weighted by the parallax-bin probabilities:
### weighted sum of squared residuals about the bin mean, divided by (summed weights - 1)
### (the -1 makes this match e.g. var() computed with pandas' groupby)
hist_pmra_var = sp.stats.binned_statistic_dd(
    [bg_G, q_bil], prob_parallax.T * (mean_pmra.T - bg_pmra)**2,
    bins=[bins_G, bins_bil], statistic='sum',
).statistic / (hist_prob - 1 + 1e-20)

hist_pmdec_var = sp.stats.binned_statistic_dd(
    [bg_G, q_bil], prob_parallax.T * (mean_pmdec.T - bg_pmdec)**2,
    bins=[bins_G, bins_bil], statistic='sum',
).statistic / (hist_prob - 1 + 1e-20)

### pmra-pmdec covariance, same normalisation
hist_pmradec_var = sp.stats.binned_statistic_dd(
    [bg_G, q_bil], prob_parallax.T * (mean_pmra.T - bg_pmra) * (mean_pmdec.T - bg_pmdec),
    bins=[bins_G, bins_bil], statistic='sum',
).statistic / (hist_prob - 1 + 1e-20)

In [10]:
# Minimum summed probability ("number of stars") a bin needs for its statistics to be kept
th_count = 3

### bins below the threshold get nan for the means ...
hist_pmra[hist_prob < th_count] = np.nan
hist_pmdec[hist_prob < th_count] = np.nan
### ... and for the variances / covariance
hist_pmra_var[hist_prob < th_count] = np.nan
hist_pmdec_var[hist_prob < th_count] = np.nan
hist_pmradec_var[hist_prob < th_count] = np.nan


## Actual function

In [11]:
def fn_pm_stats(df_fore, df_back, th_count=3, return_tab=False, n_sigma_out = 3):
    """Proper-motion statistics of the background stars in (parallax, G, separation) bins.

    Every background star is placed in one G-magnitude bin (`bins_G`) and one
    radial-separation bin (`bins_bil`), and is spread over the parallax bins
    (`bins_parallax`) with Gaussian weights built from `parallax` and
    `parallax_error`.  Per bin, the weighted mean and (co)variance of
    `pmra` / `pmdec` are computed.  The three bin-edge arrays, `fn_angular_sep`,
    `degree` and `arcsec` are read from the enclosing (notebook) namespace.

    Parameters
    ----------
    df_fore : DataFrame
        Foreground stars; columns `ra`, `dec` are used.  Paired row by row with
        `df_back`.
    df_back : DataFrame
        Background stars; columns `ra`, `dec`, `pmra`, `pmdec`, `parallax`,
        `parallax_error`, `phot_g_mean_mag` are used, plus `pmra_error`,
        `pmdec_error`, `pmra_pmdec_corr` when `return_tab=True`.
    th_count : float
        Bins whose summed weight is below this value get nan statistics.
    return_tab : bool
        Selects the return value (see Returns).
    n_sigma_out : float
        Not used inside the function; kept so existing calls keep working.  The
        outlier cut itself is applied by the caller on the returned array.

    Returns
    -------
    DataFrame (return_tab=False)
        One row per (parallax, G, separation) bin with columns `bil_bin`, `G_bin`,
        `parallax_bin` (lower bin edges), `number` (summed weight), `mean_pmra`,
        `mean_pmdec`, `var_pmra`, `var_pmdec`, `var_pmradec`.
    ndarray (return_tab=True)
        One value per background star: the squared proper-motion significance
        `pm_sub^T C^-1 pm_sub`, with `pm_sub` the proper motion minus the star's
        weighted bin mean and `C` its weighted bin covariance.

    Notes
    -----
    Stars that fall outside the (G, separation) grid do not contribute to any bin.
    With `return_tab=True` they are treated like stars in empty bins (zero mean,
    measurement-error covariance).  Bin indices are taken from the same
    `binned_statistic_dd` call that fills the histograms; the previous
    `np.digitize(...) - 1` indices wrapped around to the last bin for values below
    the grid and raised IndexError for values at or above the last edge.
    """
    from scipy.special import erf
    from scipy.stats import binned_statistic_dd

    def _binned_sum(values):
        # Sum `values` (one row per parallax bin) inside every (G, separation) bin.
        return binned_statistic_dd([bg_G, q_bil], values, bins=[bins_G, bins_bil], statistic='sum')[0]

    def _per_star(hist):
        # Per-star lookup of a binned quantity: shape (n_stars, n_parallax_bins).
        return hist[:, q_binG, q_bin_bil].T

    # G magnitude of the background stars
    bg_G = df_back['phot_g_mean_mag'].to_numpy()

    # radial separation of each pair, in units of `arcsec`
    fg_ra = df_fore['ra'].to_numpy()
    fg_dec = df_fore['dec'].to_numpy()
    bg_ra = df_back['ra'].to_numpy()
    bg_dec = df_back['dec'].to_numpy()
    l_bilvec = fn_angular_sep(fg_ra*degree, fg_dec*degree, bg_ra*degree, bg_dec*degree)
    q_bil = np.sqrt(l_bilvec[:, 0]**2 + l_bilvec[:, 1]**2)/arcsec

    # probabilistic assignment to parallax bins: Gaussian probability mass between
    # consecutive edges.  Each edge's erf is evaluated once and reused as the lower
    # limit of the next bin.
    bg_parallax = df_back['parallax'].to_numpy(); bg_parallax_error = df_back['parallax_error'].to_numpy()
    n_parallax_bins = len(bins_parallax)-1
    prob_parallax = np.nan * np.ones((len(df_back), n_parallax_bins))
    erf_lo = erf((bins_parallax[0]-bg_parallax)/bg_parallax_error/np.sqrt(2))
    for i in range(n_parallax_bins):
        erf_hi = erf((bins_parallax[i+1]-bg_parallax)/bg_parallax_error/np.sqrt(2))
        prob_parallax[:, i] = 0.5*(erf_hi-erf_lo)
        erf_lo = erf_hi
    weights = prob_parallax.T  # view with shape (n_parallax_bins, n_stars)

    bg_pmra = df_back['pmra'].to_numpy(); bg_pmdec = df_back['pmdec'].to_numpy()

    ### histogram of summed probabilities, plus the per-dimension bin number of every star
    stat_prob = binned_statistic_dd([bg_G, q_bil], weights, bins=[bins_G, bins_bil],
                                    statistic='sum', expand_binnumbers=True)
    hist_prob = stat_prob[0]
    n_G, n_bil = hist_prob.shape[1:]
    # binnumber is 1-based inside the grid; 0 and n_bins+1 flag values outside it
    q_binG, q_bin_bil = stat_prob[2] - 1
    in_grid = (q_binG >= 0) & (q_binG < n_G) & (q_bin_bil >= 0) & (q_bin_bil < n_bil)
    # Out-of-grid stars get a valid placeholder index and zero weight, so the
    # lookups below stay in bounds and these stars never pick up bin statistics.
    q_binG = np.where(in_grid, q_binG, 0)
    q_bin_bil = np.where(in_grid, q_bin_bil, 0)
    prob_parallax[~in_grid] = 0.0

    ### probability-weighted mean pm per bin (1e-20 avoids 0/0 in empty bins)
    hist_pmra = _binned_sum(weights * bg_pmra) / (hist_prob + 1e-20)
    hist_pmdec = _binned_sum(weights * bg_pmdec) / (hist_prob + 1e-20)

    ### residual of every star about the mean of its own bin, per parallax bin
    res_pmra = hist_pmra[:, q_binG, q_bin_bil] - bg_pmra
    res_pmdec = hist_pmdec[:, q_binG, q_bin_bil] - bg_pmdec

    ### pm (co)variance per bin; the estimator has a -1 in the denominator
    ### (this matches for example var() computed with pandas' groupby)
    dof = hist_prob - 1 + 1e-20
    hist_pmra_var = _binned_sum(weights * res_pmra**2) / dof
    hist_pmdec_var = _binned_sum(weights * res_pmdec**2) / dof
    hist_pmradec_var = _binned_sum(weights * res_pmra * res_pmdec) / dof
    del res_pmra, res_pmdec

    ### set to nan bins where there are too few stars
    too_few = hist_prob < th_count
    for hist in (hist_pmra, hist_pmdec, hist_pmra_var, hist_pmdec_var, hist_pmradec_var):
        hist[too_few] = np.nan

    if not return_tab: # data frame with the per-bin statistics
        ### lower bin edges broadcast to the (parallax, G, separation) histogram shape
        shape = np.shape(hist_prob)
        hist_bins_bil = np.ones(shape) * np.asarray(bins_bil)[:-1]
        hist_bins_G = np.ones(shape) * np.asarray(bins_G)[None, :-1, None]
        hist_bins_parallax = np.ones(shape) * np.asarray(bins_parallax)[:-1, None, None]

        ### collect data and output: one row per bin, one column per quantity
        data = np.transpose([hist_bins_bil, hist_bins_G, hist_bins_parallax, hist_prob, hist_pmra, hist_pmdec,
                             hist_pmra_var, hist_pmdec_var, hist_pmradec_var], axes=[1,2,3,0])
        data = data.reshape(-1, data.shape[-1])
        return pd.DataFrame(data,columns=['bil_bin','G_bin','parallax_bin','number','mean_pmra','mean_pmdec','var_pmra','var_pmdec','var_pmradec'])

    # return_tab=True: per-star squared significance of the mean-subtracted proper motion

    ### sum of the parallax weights of each star, using only bins with enough statistics
    tab_sum_pw = np.sum(prob_parallax, axis=1, where=~np.isnan(_per_star(hist_pmra)))

    def _star_average(hist):
        # Weighted average over parallax bins; nan (low-count) bins contribute zero.
        return np.sum(np.nan_to_num(_per_star(hist)*prob_parallax), axis=1)/(tab_sum_pw + 1e-20)

    tab_mean_pmra = _star_average(hist_pmra)
    tab_mean_pmdec = _star_average(hist_pmdec)
    tab_var_pmra = _star_average(hist_pmra_var)
    tab_var_pmdec = _star_average(hist_pmdec_var)
    tab_var_pmradec = _star_average(hist_pmradec_var)

    ### Replace the effective variance with the measurement errors for stars whose
    ### variance is exactly 0 (they only fall into empty / low-count bins)
    pmra_error = df_back['pmra_error'].to_numpy()
    pmdec_error = df_back['pmdec_error'].to_numpy()
    zero_ra = tab_var_pmra == 0
    tab_var_pmra[zero_ra] = pmra_error[zero_ra]**2
    zero_dec = tab_var_pmdec == 0
    tab_var_pmdec[zero_dec] = pmdec_error[zero_dec]**2
    zero_radec = tab_var_pmradec == 0
    tab_var_pmradec[zero_radec] = (df_back['pmra_pmdec_corr'].to_numpy()*pmra_error*pmdec_error)[zero_radec]

    ### subtracted pm and inverse covariance for outlier removal
    pm_sub = np.array([bg_pmra-tab_mean_pmra, bg_pmdec-tab_mean_pmdec]).T
    # the transpose turns the (2, 2, n_stars) stack into n_stars symmetric 2x2 matrices
    inv_cov_pm = np.linalg.inv(np.array([[tab_var_pmra, tab_var_pmradec], [tab_var_pmradec, tab_var_pmdec]]).T)
    mu_over_sigma_sq = inv_cov_pm[:, 0, 0]*pm_sub[:, 0]**2 + inv_cov_pm[:, 1, 1]*pm_sub[:, 1]**2 + 2*inv_cov_pm[:, 0, 1]*pm_sub[:, 0]*pm_sub[:, 1]

    return mu_over_sigma_sq

In [12]:
# Iterative sigma-clipping of proper-motion outliers: recompute the per-star squared
# significance on the current sample, keep only pairs below n_sigma_out**2, and repeat
# until n_iter passes are done or fewer than 1e-5 of the stars are removed in a pass.
# Note: df_back / df_fore are overwritten in place, so re-running this cell continues
# clipping the already-clipped tables.
i = 0
out_frac = 1
n_iter = 7 # each iteration around 5 minutes
n_sigma_out = 3

while i < n_iter and out_frac > 1E-5 and len(df_back) > 0:
    mu_sig_sq = fn_pm_stats(df_fore, df_back, th_count=3, return_tab=True, n_sigma_out=n_sigma_out)
    # nan significances compare False and are therefore dropped together with the outliers
    keep = mu_sig_sq < n_sigma_out**2
    i += 1; len_bg = len(df_back)
    # the same mask is applied to both tables so the pairs stay aligned row by row
    df_back = df_back.iloc[keep]
    df_fore = df_fore.iloc[keep]
    out_frac = 1 - len(df_back)/len_bg
    # fixed-point formatting: slicing str(value) could cut off a scientific-notation exponent
    print('Iter '+str(i)+' -- fraction of outliers removed: '+'{:.5f}'.format(out_frac*100)+' %')


Iter 1 -- fraction of outliers removed: 2.52360 %
Iter 2 -- fraction of outliers removed: 1.48025 %
Iter 3 -- fraction of outliers removed: 0.56995 %
Iter 4 -- fraction of outliers removed: 0.21454 %
Iter 5 -- fraction of outliers removed: 0.08275 %
Iter 6 -- fraction of outliers removed: 0.03317 %
Iter 7 -- fraction of outliers removed: 0.01408 %


In [13]:
# Per-bin proper-motion statistics (data-frame output) for the current df_fore / df_back
df_pm_stats = fn_pm_stats(df_fore=df_fore, df_back=df_back, return_tab=False)
#df_pm_stats.to_csv('bg_pm_hist.csv') #write to file

In [17]:
# Shape check: one row per (parallax, G, separation) bin, nine columns
df_pm_stats.shape

(3718, 9)