In [None]:
# ------------------------------------------------------------------------
#
# TITLE - clean_data.ipynb
# AUTHOR - James Lane
# PROJECT - ges-mass
#
# ------------------------------------------------------------------------
#
# Docstrings and metadata:
'''Clean the Gaia & APOGEE data and calculate kinematic quantities. Then 
plot and mask based on abundances / kinematics.
'''

__author__ = "James Lane"

In [None]:
### Imports

## Basic
import numpy as np, pdb, sys, os, dill as pickle
import matplotlib.pyplot as plt
import astropy.units as apu
from tqdm.notebook import tqdm

## galpy
from galpy import orbit
from galpy import potential
from galpy import actionAngle as aA

sys.path.append('../../src/')
from ges_mass import util as putil
from ges_mass import ssf as pssf
from ges_mass import plot as pplot

### Notebook setup

# Render matplotlib figures inline in the notebook output
%matplotlib inline
plt.style.use('../../src/mpl/project.mplstyle') # This must be exactly here
# Request high-resolution ('retina') inline figures
%config InlineBackend.figure_format = 'retina'
# Reload imported modules automatically so that edits to the project source
# are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

### Keywords, Pathing, Loading, Data Preparation

In [None]:
# %load ../../src/nb_modules/keywords_pathing_loading_data_prep.py
## Keywords
# Read the project configuration and extract the entries used by this
# notebook. The values are unpacked positionally, so the names on the left
# must stay in the same order as the entries of `keywords`.
cdict = putil.load_config_to_dict()
keywords = [
    'BASE_DIR', 'APOGEE_DR', 'APOGEE_RESULTS_VERS', 'GAIA_DR', 'NDMOD',
    'DMOD_MIN', 'DMOD_MAX', 'LOGG_MIN', 'LOGG_MAX', 'FEH_MIN', 'FEH_MAX',
    'FEH_MIN_GSE', 'FEH_MAX_GSE', 'DF_VERSION', 'KSF_VERSION', 'NPROCS',
    'RO', 'VO', 'ZO',
]
(base_dir, apogee_dr, apogee_results_vers, gaia_dr, ndmod, dmod_min, dmod_max,
 logg_min, logg_max, feh_min, feh_max, feh_min_gse, feh_max_gse, df_version,
 ksf_version, nprocs, ro, vo, zo) = putil.parse_config_dict(cdict, keywords)

# [min, max] ranges assembled from the individual configuration values
logg_range = [logg_min, logg_max]
feh_range = [feh_min, feh_max]
feh_range_gse = [feh_min_gse, feh_max_gse]
feh_range_all = [feh_min, feh_max_gse]
# feh_range_fit = copy.deepcopy( # Need to choose here


## Pathing
# Directories for this combination of data releases and model versions
fit_paths = putil.prepare_paths(
    base_dir, apogee_dr, apogee_results_vers, gaia_dr, df_version, ksf_version
)
(data_dir, version_dir, ga_dir, gap_dir,
 df_dir, ksf_dir, fit_dir) = fit_paths

In [None]:
## Potential and action angle object
# MWPotential2014 with physical output switched on, using the configured
# ro and vo
mwpot = potential.MWPotential2014
potential.turn_physical_on(mwpot, ro=ro, vo=vo)
# Potential evaluated at R=1e10, z=0
# NOTE(review): presumably a stand-in for the potential at infinity -- confirm
phi0 = potential.evaluatePotentials(mwpot, 1e10, 0).value
aAS = aA.actionAngleStaeckel(
    pot=mwpot, delta=0.4, ro=ro, vo=vo, zo=zo, c=True
)

## Decide how to sample 6D kinematics / calculate kinematic quantities
sample_kinematics = True
save_kinematic_samples = True
force_sampling = False # Force re-sampling even if file exists
force_kinematics = False # Force re-calculation even if file exists

# Output filenames depend on whether the observables are MC-sampled.
# n_samples and the sample filenames are only defined when sampling.
if not sample_kinematics:
    input_kinematics_filename = gap_dir+'input_kinematics_no_sample.npy'
    vxvv_no_sample_filename = gap_dir+'vxvv_no_sample.npy'
else:
    n_samples = 100
    input_kinematics_filename = gap_dir+'input_kinematics_sampled.npy'
    input_kinematics_samples_filename = gap_dir+'_input_kinematics_raw_samples.npy'
    sampled_obs_filename = gap_dir+'sampled_obs.npy'
    sampled_obs_psd_mask_filename = gap_dir+'sampled_obs_psd_mask.npy'

## Get data

In [None]:
# Load APOGEE data
allstar_filename = ga_dir+'apogee_allstar.npy'
print('APOGEE data release is: '+apogee_dr+', and results version is: '+apogee_results_vers)
print('Loading APOGEE from '+allstar_filename)
allstar = np.load(allstar_filename)
print(str(len(allstar))+' stars in total sample.')

# load APOGEE statistical sample index
# The index is summed below to count stars, so it is treated as a boolean
# mask over the rows of allstar.
apogee_stat_indx_filename = ga_dir+'apogee_statIndx.npy'
# Report the configured data release rather than a hard-coded one
print('\nLoading APOGEE '+apogee_dr+' statistical sample from '+apogee_stat_indx_filename)
apogee_stat_indx = np.load(apogee_stat_indx_filename)
print(str(np.sum(apogee_stat_indx))+' stars in statistical sample.')

# Gaia data and Gaia-APOGEE match index
gaia_data_filename = ga_dir+'gaia_data.npy'
gaia_apogee_matches_filename = ga_dir+'gaia_apogee_matches.npy'
print('\nGaia data release is: '+gaia_dr)
print('Loading Gaia catalog from '+gaia_data_filename)
gaia_data = np.load(gaia_data_filename, allow_pickle=True)
print('Loading Gaia-APOGEE matches from '+gaia_apogee_matches_filename)
gaia_apogee_matches_indx = np.load(gaia_apogee_matches_filename)

# Apply the statistical sample index and Gaia-APOGEE matching index
allstar_gaia = allstar[apogee_stat_indx][gaia_apogee_matches_indx]

# The same boolean masks are applied to both catalogues further down, so
# they must be row-aligned. Fail early with a clear message if they are not.
assert len(gaia_data) == len(allstar_gaia),\
    'Gaia catalog ('+str(len(gaia_data))+' rows) and matched APOGEE '+\
    'catalog ('+str(len(allstar_gaia))+' rows) have different lengths'

## Clean data
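Apply the following cuts:
- data finiteness cuts (sky position, proper motions, distance, radial velocity)
- quality cuts (fractional distance error, defined [Fe/H] and [Al/Fe], $\log g$ error)
- remove bulge fields
- remove globular cluster fields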

In [None]:
# Should have defined 6D kinematics for eccentricities
# NOTE(review): upper-case 'RA'/'DEC' are read from gaia_data here, while the
# orbits below are built from lower-case 'ra'/'dec' -- confirm that both sets
# of fields exist and which one is intended.
omask_finite = np.isfinite(gaia_data['RA']) &\
               np.isfinite(gaia_data['DEC']) &\
               np.isfinite(gaia_data['pmra']) &\
               np.isfinite(gaia_data['pmdec']) &\
               np.isfinite(allstar_gaia['weighted_dist']) &\
               np.isfinite(allstar_gaia['VHELIO_AVG'])

# Quality cuts. omask_quality2 cuts high fractional distance uncertainty,
# undefined Fe/H and undefined Al/Fe. omask_quality additionally cuts high
# log(g) uncertainty; it is derived from omask_quality2 so that the shared
# cuts cannot drift apart.
_frac_dist_error = allstar_gaia['weighted_dist_error']/\
                   allstar_gaia['weighted_dist']
omask_quality2 = (
    (_frac_dist_error < 0.2) &
    # (allstar_gaia['MG_FE'] > -9999) &
    (allstar_gaia['FE_H'] > -9999) &
    (allstar_gaia['AL_FE'] > -9999)
)
omask_quality = omask_quality2 & (allstar_gaia['LOGG_ERR'] < 0.1)

# Cut bulge fields. Within 20 degrees of the galactic center
omask_bulge = ~(((allstar_gaia['GLON'] > 340.) |\
                 (allstar_gaia['GLON'] < 20.)) &\
                (np.fabs(allstar_gaia['GLAT']) < 20.)
               )

# Remove globular cluster fields. See corresponding notebook
gc_locids = pssf.get_globular_cluster_fields()
omask_gc = ~np.isin(allstar_gaia['LOCATION_ID'],gc_locids)

omask = omask_finite & omask_quality & omask_bulge & omask_gc

allstar_input = allstar_gaia[omask]
gaia_input = gaia_data[omask]
print(str(np.sum(omask))+' sources after masking')

# Make orbits
# Columns are [ra, dec, distance, pmra, pmdec, line-of-sight velocity].
# weighted_dist is divided by 1e3 here and compared in pc below, i.e. it is
# converted from pc to kpc.
vxvv_input = np.array([gaia_input['ra'],
                       gaia_input['dec'],
                       allstar_input['weighted_dist']/1e3,
                       gaia_input['pmra'],
                       gaia_input['pmdec'],
                       allstar_input['VHELIO_AVG']
                      ]).T
orbs_input = orbit.Orbit(vxvv_input, radec=True, ro=ro, vo=vo, zo=zo)

# Sanity: the orbits must reproduce the input observables. The absolute
# difference is compared so that deviations of either sign are caught; a
# signed difference would let arbitrarily large negative offsets pass.
_sanity_tol = 1e-8
_sanity_checks = [
    ('ra', orbs_input.ra().to(apu.deg).value, gaia_input['ra']),
    ('dec', orbs_input.dec().to(apu.deg).value, gaia_input['dec']),
    ('pmra', orbs_input.pmra().to(apu.mas/apu.yr).value, gaia_input['pmra']),
    ('pmdec', orbs_input.pmdec().to(apu.mas/apu.yr).value, gaia_input['pmdec']),
    ('dist', orbs_input.dist().to(apu.pc).value, allstar_input['weighted_dist']),
    ('vlos', orbs_input.vlos().to(apu.km/apu.s).value, allstar_input['VHELIO_AVG']),
]
for _label, _orbit_value, _catalog_value in _sanity_checks:
    assert np.all(np.abs(_orbit_value-_catalog_value) < _sanity_tol),\
        'Orbit '+_label+' does not match the input catalog'

In [None]:
# Number of stars in the Gaia-matched APOGEE statistical sample, before any
# of the cleaning masks are applied
len(allstar_gaia)

In [None]:
# Finiteness of pmdec for the stars that fail the finite-data cut but pass
# the quality, bulge and globular cluster cuts
np.isfinite(gaia_data['pmdec'][~omask_finite & omask_quality & omask_bulge & omask_gc])

In [None]:
# Number of stars removed by the bulge cut alone, i.e. stars that pass the
# finite-data, quality and globular cluster cuts but lie in the bulge region
np.sum(omask_finite & omask_quality & ~omask_bulge & omask_gc)

In [None]:
# Compare the adopted mask (omask) with the variant built from
# omask_quality2, both for all stars and within 1 < log(g) < 3
omask2 = omask_finite & omask_quality2 & omask_bulge & omask_gc
logg_mask = (allstar_gaia['LOGG'] > 1) & (allstar_gaia['LOGG'] < 3)

# Star counts for each mask, with and without the log(g) selection
_n_omask = np.sum(omask)
_n_omask2 = np.sum(omask2)
_n_omask_logg = np.sum(omask & logg_mask)
_n_omask2_logg = np.sum(omask2 & logg_mask)

print(_n_omask2)
print(_n_omask/_n_omask2)
print(_n_omask_logg)
print(_n_omask2_logg)
print(_n_omask_logg/_n_omask2_logg)
print(_n_omask2_logg/_n_omask_logg)


In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(111)

# ax.scatter(allstar_input['weighted_dist']/1000., allstar_input['old_weighted_dist']/1000., 
#            s=1., alpha=0.1, color='Black', zorder=2)
# ax.plot([0.,100.], [0.,100.], color='Red', linestyle='dashed', zorder=3)
# ax.set_xlabel('AstroNN distance DR16 [kpc]')
# ax.set_ylabel('AstroNN distance DR14 [kpc]')
# ax.set_xlim(0,100)
# ax.set_ylim(0,100)

# fig.show()

In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(111)

# ax.scatter(orbs_input.dist().to(apu.kpc).value, 1./gaia_input['parallax'], 
#            s=1., alpha=0.1, color='Black', zorder=2)
# ax.plot([0.,100.], [0.,100.], color='Red', linestyle='dashed', zorder=3)
# ax.set_xlabel('AstroNN distance [kpc]')
# ax.set_ylabel('Inverted Gaia parallax [kpc]')
# ax.set_xlim(0,100)
# ax.set_ylim(0,100)

# fig.show()

## Make kinematics

In [None]:
## Do MC sampling of kinematic parameters
# For each star, draw n_samples realisations of the six observables
# (ra, dec, distance, pmra, pmdec, RV) from a multivariate normal built from
# the catalog values, their uncertainties and the Gaia correlation
# coefficients. The samples are cached on disk and re-used unless
# force_sampling is set.
# NOTE(review): no random seed is set in this cell, so the samples differ
# between runs whenever they are regenerated.

if sample_kinematics and \
    (not os.path.exists(sampled_obs_filename) or force_sampling):
    # Observables
    ra = gaia_input['ra']
    dec = gaia_input['dec']
    dist = allstar_input['weighted_dist']/1e3
    pmra = gaia_input['pmra']
    pmdec = gaia_input['pmdec']
    RV = allstar_input['VHELIO_AVG']

    # Errors
    # NOTE(review): ra_e/dec_e are used directly as the uncertainties on
    # ra/dec, so they need to be in the same units as ra/dec -- confirm
    # against the units of the Gaia error columns.
    ra_e = gaia_input['ra_error']
    dec_e = gaia_input['dec_error']
    dist_e = allstar_input['weighted_dist_error']/1e3
    pmra_e = gaia_input['pmra_error']
    pmdec_e = gaia_input['pmdec_error']
    RV_e = allstar_input['VERR']

    # Correlations, if using AstroNN then set distance correlations to 0.
    radec = gaia_input['ra_dec_corr']
    radist = 0.
    rapmra = gaia_input['ra_pmra_corr']
    rapmdec = gaia_input['ra_pmdec_corr']
    decdist = 0.
    decpmra = gaia_input['dec_pmra_corr']
    decpmdec = gaia_input['dec_pmdec_corr']
    distpmra = 0.
    distpmdec = 0.
    pmrapmdec = gaia_input['pmra_pmdec_corr']

    # Covariance matrix for observables, hardcode RV covariance to be 0.
    # Element (i,j) is sigma_i*sigma_j*corr_ij. The upper triangle is filled
    # row by row and then mirrored into the lower triangle.
    zarr = np.zeros(len(allstar_input))
    cov = np.zeros([len(allstar_input),6,6])
    # The (ra,pmdec) element uses the pmdec uncertainty; it previously used
    # dec_e, which gave the wrong covariance.
    cov[:,0] = np.dstack([ra_e**2, ra_e*dec_e*radec, ra_e*dist_e*radist, 
                          ra_e*pmra_e*rapmra, ra_e*pmdec_e*rapmdec, zarr])[0]
    cov[:,1,1:] = np.dstack([dec_e**2, dec_e*dist_e*decdist, dec_e*pmra_e*decpmra, 
                             dec_e*pmdec_e*decpmdec, zarr])[0]
    cov[:,2,2:] = np.dstack([dist_e**2, dist_e*pmra_e*distpmra, 
                             dist_e*pmdec_e*distpmdec, zarr])[0]
    cov[:,3,3:] = np.dstack([pmra_e**2, pmra_e*pmdec_e*pmrapmdec, zarr])[0]
    cov[:,4,4:] = np.dstack([pmdec_e**2, zarr])[0]
    cov[:,5,5] = RV_e**2
    # Symmetric:
    cov[:,:,0] = cov[:,0]
    cov[:,1:,1] = cov[:,1,1:]
    cov[:,2:,2] = cov[:,2,2:]
    cov[:,3:,3] = cov[:,3,3:]
    cov[:,4:,4] = cov[:,4,4:]

    # Means
    mean = np.dstack([ra,dec,dist,pmra,pmdec,RV])[0]

    assert np.all(np.isfinite(mean)), 'Non-finite elements in mean array'
    assert np.all(np.isfinite(cov)), 'Non-finite elements in covariance array'

    # Mask for positive-semidefiniteness (required for multivariate normal sampling)
    omask_psd = np.ones((len(allstar_input)),dtype=bool)

    # Samples
    # Initialise with NaN so that rows for stars whose sampling fails hold a
    # well-defined value rather than uninitialised memory when saved.
    sampled_obs = np.full((len(allstar_input),n_samples,6),np.nan)
    for i in tqdm(range(len(allstar_input))):
        try:
            sampled_obs[i] = np.random.multivariate_normal(mean[i], cov[i],
                                                           n_samples, 
                                                           check_valid='raise')
        except ValueError:
            # Raised by check_valid='raise' when the covariance matrix is
            # not positive-semidefinite; flag the star so it can be skipped.
            print('Sampling failed on star i='+str(i))
            omask_psd[i] = False
    
    # Save the samples
    print('Saving sampled kinematics to '+sampled_obs_filename)
    np.save(sampled_obs_filename,sampled_obs,allow_pickle=True)
    np.save(sampled_obs_psd_mask_filename,omask_psd,allow_pickle=True)
elif sample_kinematics and os.path.exists(sampled_obs_filename):
    print('Loading sampled kinematics from '+sampled_obs_filename)
    sampled_obs = np.load(sampled_obs_filename,allow_pickle=True)
    omask_psd = np.load(sampled_obs_psd_mask_filename,allow_pickle=True)
    # Guard against a stale cache made with a different input selection;
    # the samples are indexed star-by-star against allstar_input later on.
    assert len(sampled_obs) == len(allstar_input) and \
        len(omask_psd) == len(allstar_input),\
        'Cached samples do not match the current input sample, '+\
        'set force_sampling = True to regenerate them'

In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(111)

# i=5
# off = (np.median(sampled_obs[omask_psd,:,i],axis=1) - mean[omask_psd,i]) /\
#     np.abs(mean[omask_psd,i])
# off_lim = 1e-3
# ax.hist(off,bins=50,range=(-off_lim,off_lim))
# ax.axvline(0.,color='Black')
# fig.show()

# print(str(np.sum(np.abs(off)<off_lim))+'/'+str(len(off)))

In [None]:
## Calculate kinematic parameters
# Compute deltas, eELzs, accs and orbextr for every star in the input sample
# with putil.calculate_accs_eELzs_orbextr_Staeckel, or load them from the
# cache file if it exists and force_kinematics is not set.

# Per-sample outputs only exist when MC sampling is used: n_samples and
# input_kinematics_samples_filename are only defined in that case, so asking
# for them without sampling would otherwise raise a NameError.
_save_samples = save_kinematic_samples and sample_kinematics

if force_kinematics or not os.path.exists(input_kinematics_filename):
    
    # Per-star outputs. Stars skipped in the sampling loop keep these zeros.
    deltas_input = np.zeros((len(allstar_input)))
    eELzs_input = np.zeros((3,len(allstar_input)))
    accs_input = np.zeros((3,len(allstar_input)))
    orbextr_input = np.zeros((3,len(allstar_input)))
    
    if _save_samples:
        # Un-medianed values for every individual sample
        _deltas_input_nomed = np.zeros((len(allstar_input),n_samples))
        _eELzs_input_nomed = np.zeros((3,len(allstar_input),n_samples))
        _accs_input_nomed = np.zeros((3,len(allstar_input),n_samples))
        _orbextr_input_nomed = np.zeros((3,len(allstar_input),n_samples))
    
    if sample_kinematics:
        print('Calculating kinematics using sampling')
        for i in tqdm(range(len(allstar_input))):
            # No valid samples exist for stars that failed the sampling
            if not omask_psd[i]: continue

            _orbs = orbit.Orbit(sampled_obs[i,:,:], radec=True, ro=ro, vo=vo, zo=zo)
            deltas,eELzs,accs,orbextr = putil.calculate_accs_eELzs_orbextr_Staeckel(
                _orbs,mwpot,aAS)
            
            # Adopt the NaN-ignoring median over the samples for each star
            deltas_input[i] = np.nanmedian(deltas)
            eELzs_input[:,i] = np.nanmedian(eELzs,axis=1)
            accs_input[:,i] = np.nanmedian(accs,axis=1)
            orbextr_input[:,i] = np.nanmedian(orbextr,axis=1)
            
            if _save_samples:
                _deltas_input_nomed[i,:] = deltas
                _eELzs_input_nomed[:,i,:] = eELzs
                _accs_input_nomed[:,i,:] = accs
                _orbextr_input_nomed[:,i,:] = orbextr
    else:
        print('Calculating kinematics without sampling')
        _vxvv = np.array([gaia_input['ra'],
                          gaia_input['dec'],
                          allstar_input['weighted_dist']/1e3,
                          gaia_input['pmra'],
                          gaia_input['pmdec'],
                          allstar_input['VHELIO_AVG']
                         ]).T
        _orbs = orbit.Orbit(_vxvv, radec=True, ro=ro, vo=vo, zo=zo)
        deltas_input,eELzs_input,accs_input,orbextr_input = \
            putil.calculate_accs_eELzs_orbextr_Staeckel(_orbs,mwpot,aAS)
        np.save(vxvv_no_sample_filename,_vxvv,allow_pickle=True)

    print('Saving deltas, eELz, actions, extrema to '+input_kinematics_filename)
    with open(input_kinematics_filename,'wb') as f:
        pickle.dump([deltas_input,eELzs_input,accs_input,orbextr_input],f)
    if _save_samples:
        print('Saving deltas, eELz, actions, extrema samples to '+\
              input_kinematics_samples_filename)
        with open(input_kinematics_samples_filename,'wb') as f:
            pickle.dump([_deltas_input_nomed,_eELzs_input_nomed,
                         _accs_input_nomed,_orbextr_input_nomed],f)
else:
    print('Loading deltas, eELz, actions, extrema from '+input_kinematics_filename)
    # Unpickling can execute arbitrary code: only load cache files that were
    # written by this notebook.
    with open(input_kinematics_filename,'rb') as f:
        deltas_input,eELzs_input,accs_input,orbextr_input = \
            pickle.load(f)

### Apply mask based on kinematics
- Just on eccentricity
- Also on positive-semidefiniteness of covariance matrix if sampling was done

In [None]:
# Keep stars whose first eELzs entry lies strictly between 0 and 1
omask_kin = (eELzs_input[0,:] < 1.) &\
            (eELzs_input[0,:] > 0.)

# Build the final mask as a new array. Assigning omask_kin and then using &=
# would modify omask_kin in place through the shared reference.
if sample_kinematics:
    # Also drop stars whose covariance matrix could not be sampled
    omask_final = omask_kin & omask_psd
else:
    omask_final = omask_kin.copy()

# Apply the final mask to the catalogs, orbits and kinematic quantities
gaia_omask = gaia_input[omask_final]
allstar_omask = allstar_input[omask_final]
os_omask = orbs_input[omask_final]
eELzs_omask = eELzs_input[:,omask_final]
accs_omask = accs_input[:,omask_final]
orbextr_omask = orbextr_input[:,omask_final]

print('Final number of stars: '+str(len(os_omask)))

In [None]:
# Output file for the cleaned sample; the name records whether the
# kinematics were computed from MC samples
omask_kinematics_filename = gap_dir + (
    'clean_kinematics_sampled.npy' if sample_kinematics
    else 'clean_kinematics_no_sample.npy'
)

# Masked catalogs, orbits and kinematic quantities, saved together as a list
_clean_products = [gaia_omask, allstar_omask, os_omask,
                   eELzs_omask, accs_omask, orbextr_omask]
with open(omask_kinematics_filename, 'wb') as _outfile:
    pickle.dump(_clean_products, _outfile)

Note that this sample only has the cleaning cuts above applied; no $\log g$ cuts have been applied yet. This is so that the $\log g$ range can be modified without having to re-run the sampling.