In [18]:
from my_units import * 
from angular_fn import *
from data_cleaning_fn import *
from sky_patch_class import *

# Repository root relative to this notebook, plus the folders for saved lists and figures.
HomeDir = '../../'
ListDir, FigDir = HomeDir + 'lists/', HomeDir + 'figs/'

# Folder holding the downloaded data files -- set this to wherever you store them
# (the download links are given further down, next to the sky patch definition).
DataDir = '../../../../data/'

# Example of first data cleaning

## Preamble

In [2]:
### Define the patch on the sky where the analysis is done. Currently only works with a circle selection on the sphere.
### Exactly one of the two definitions below should be active; move the leading '#' to switch target.
### NOTE(review): the positional arguments look like (ra, dec, radius, distance, data file name, mean proper motion) -- confirm against sky_patch_class.
### Later cells read sky_p.disc_center, sky_p.disc_radius, sky_p.distance, sky_p.pm_esc and sky_p.data_file_name.
#sky_p = sky_patch(81.28, -69.78, 5*degree, 50*kpc, 'LMC_disc_5', np.array([1.871, 0.391]), pm_esc=0.2) ### For the LMC
sky_p = sky_patch(12.80, -73.15, 4*degree, 60*kpc, 'SMC_disc_4', np.array([0.686, -1.237]), pm_esc=0.2) ### For the SMC

### Download the input csv files here (the loader reads DataDir + sky_p.data_file_name + '.csv'):
### LMC @ https://www.dropbox.com/s/33ivjtdr0k0u45l/LMC_disc_5.csv?dl=0
### SMC @ https://www.dropbox.com/s/79sc5gq8euz4qwy/SMC_disc_4.csv?dl=0

In [4]:
### Data-cleaning parameters

# First clump removal: gaussian kernel size, and the density threshold
# expressed as a multiple of the average density field.
beta_kernel_clump = 0.1*degree
f_clump = 2.5

# Background subtraction: gaussian kernel size.
beta_kernel_sub_0 = 0.1*degree

# Outlier removal: number of sigmas.
n_sigma_out_0 = 5

In [3]:
### Load the catalogue belonging to the selected sky patch
patch_csv_path = DataDir + sky_p.data_file_name + '.csv'
data = pd.read_csv(patch_csv_path)
data.shape  # number of rows: LMC 16709625, SMC 3318945

(3318945, 18)

In [6]:
# For SMC only: cut on the pm to remove stars from the foreground globular clusters.
# Keeps rows with pmra within +/-2 of 0.685, pmdec within +/-2 of -1.230,
# and parallax/parallax_error < 5.
# NOTE(review): the centre used here (0.685, -1.230) differs slightly from the
# np.array([0.686, -1.237]) passed to sky_patch for the SMC -- confirm this is intended.
#
# The star count is taken before the branch so the kept fraction below is defined
# for every patch (it is simply 1.0 when no cut is applied). Assigning it inside the
# `if` left it undefined -- or stale from an earlier run -- for non-SMC patches.
orig_len = len(data)
if sky_p.data_file_name == 'SMC_disc_4':
    data = data[(data['pmra'] < 0.685 + 2) & (data['pmra'] > 0.685 - 2) &
                (data['pmdec'] < -1.230 + 2) & (data['pmdec'] > -1.230 - 2) &
                (data['parallax']/data['parallax_error'] < 5)]
len(data)/orig_len  # fraction of stars kept by the cut

0.7763349498108586

## Execution

In [7]:
### First compute the average pm and parallax fields using a gaussian distance kernel of size beta_kernel_sub_0 = 0.1 deg
### Computing the avg field
### Step 1: fn_prepare_back_sub returns the three objects (disc_pix, nb_pixel_list, n) that the next call consumes.
disc_pix, nb_pixel_list, n = fn_prepare_back_sub(data, sky_p.disc_center, sky_p.disc_radius, beta_kernel_sub_0)
### Step 2: the return value is discarded here, so any result must be stored on `data` itself.
### NOTE(review): presumably this adds the averaged-field columns to `data` in place, and sub=False
### means the fields are computed without subtracting them -- confirm in data_cleaning_fn.
fn_back_field_sub(data, disc_pix, nb_pixel_list, n, beta_kernel=beta_kernel_sub_0, sub=False)

100%|██████████| 61991/61991 [00:16<00:00, 3674.97it/s]


In [8]:
### Clump removal: drop the clumps that are f_clump = 2.5 times denser than the average, at angular scale beta_kernel_clump = 0.1 deg
n_stars_before = len(data)
data, clumps = fn_remove_clumps(
    data,
    sky_p.disc_center,
    sky_p.disc_radius,
    beta_kernel=beta_kernel_clump,
    f_clumps=f_clump,
)
if sky_p.data_file_name == 'SMC_disc_4':
    # Shift the first clump coordinate down by 360 wherever it exceeds 300
    # (presumably an RA wrap across 0 deg for the SMC patch -- confirm).
    first_coord, second_coord = clumps[0], clumps[1]
    clumps = first_coord - 360*np.heaviside(first_coord - 300, 0), second_coord
print(data.shape)
percent_removed = round(100*(1-len(data)/n_stars_before), 5)
print('Stars removed from the clumps', percent_removed, '%')
#np.save(ListDir+sky_p.data_file_name+'_clumps', clumps)

Linear pixel size =  0.014314526715905858  degree


100%|██████████| 246575/246575 [01:43<00:00, 2371.75it/s]


(2536757, 22)
Stars removed from the clumps 1.54684 %


In [9]:
# Consistency check: shape of the first row of the saved SMC clump list vs. the one just computed.
old_smc_clumps_path = ListDir + 'SMC_clumps.npy'
old_clumps_SMC = np.load(old_smc_clumps_path)
(old_clumps_SMC[0].shape, clumps[0].shape)

((9969,), (9969,))

In [17]:
# Load the two saved LMC clump lists and show the shape of one row from each.
# NOTE(review): row 1 is inspected for the first file and row 0 for the second -- confirm this is intended.
old_clumps_1, old_clumps_2 = (
    np.load(ListDir + 'LMC_clumps_1.npy'),
    np.load(ListDir + 'LMC_clumps_2.npy'),
)
(old_clumps_1[1].shape, old_clumps_2[0].shape)

In [10]:
### Removal of pm and parallax outliers at more than n_sigma_out_0 = 5 sigma
### Notice: for the SMC this has changed slightly because I was using a distance of 50 kpc instead of 60 kpc. Before, the fraction of outliers removed was 0.57896 %; now it is 0.58184 %
### fn_rem_outliers returns the filtered table and the removed fraction f_out (converted to percent below).
data, f_out = fn_rem_outliers(data, sky_p.pm_esc, sky_p.distance/kpc, n_sigma_out_0)
# Fixed-point formatting with 5 decimals. Slicing str(f_out*100)[:7] mis-renders values that
# Python prints in scientific notation (e.g. '5.8184e') and drops decimals for values >= 10.
print('Fraction of outliers removed: ' + format(f_out*100, '.5f') + ' %')

Fraction of outliers removed: 0.58184 %


In [11]:
### Additional cuts based on the quality of the astrometric fit
old_len = len(data)
data = data[(data['ruwe'] < 1.4) & (data['ipd_gof_harmonic_amplitude'] < 0.4) & 
            (data['ipd_frac_multi_peak'] < 40) & (data['ipd_frac_odd_win'] < 40)]  ### Not using the cut on the parallax
len(data)/old_len

0.9591407126971205

In [39]:
# Exporting the cleaned data file as a csv file
#data.to_csv(DataDir+sky_p.data_file_name+'_clean.csv', index=False)

### Notice: for the SMC, the number of stars in the file SMC_disc_4_clean has changed because the earlier outlier cut on the parallax was wrong. The earlier file has 2419001 stars (this is the one currently used for the analysis and the sim).
### With the correct parallax cut, the number of stars should be 2418950.

In [None]:
### Export the cleaned data for the simulation - saving to a npy file makes it much faster to upload.
#columns_to_keep = ['ra', 'dec', 'pmra', 'pmdec', 'parallax', 'pmra_error', 'pmdec_error', 'parallax_error', 'phot_g_mean_mag', 'ecl_lon', 'ecl_lat', 'pmra_sub', 'pmdec_sub', 'parallax_sub']
#np.save(DataDir+sky_p.data_file_name+'_clean', data[columns_to_keep].to_numpy())

In [14]:
# Columns kept for the simulation export (same list as in the commented-out export cell above).
columns_to_keep = [
    'ra', 'dec', 'pmra', 'pmdec', 'parallax',
    'pmra_error', 'pmdec_error', 'parallax_error',
    'phot_g_mean_mag', 'ecl_lon', 'ecl_lat',
    'pmra_sub', 'pmdec_sub', 'parallax_sub',
]
new_data = data[columns_to_keep].to_numpy()
new_data.shape

(2418950, 14)

In [15]:
# Load the previously exported cleaned array so its shape can be compared with new_data.
old_clean_npy_path = DataDir + sky_p.data_file_name + '_clean.npy'
old_data_file = np.load(old_clean_npy_path)
old_data_file.shape

(2419001, 14)

In [17]:
# Load the previously exported cleaned csv file and show its shape.
old_clean_csv_path = DataDir + sky_p.data_file_name + '_clean.csv'
data_csv = pd.read_csv(old_clean_csv_path)
data_csv.shape

(2419001, 22)