In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
from astropy.table import Table
from tqdm import tqdm
import numpy as np

try:
    import fitsio
    fitsread= fitsio.read
    fitswrite=fitsio.write
    headerread=fitsio.read_header
    _FITSIO_LOADED = True
    
except ImportError:
    import astropy.io.fits as pyfits
    fitsread= pyfits.getdata
    fitswrite=pyfits.writeto
    headerread=pyfits.getheader
    _FITSIO_LOADED = False

cat = Table.read("/data/jdli/sdss/dr17/allStar-dr17-synspec_rev1.fits")
# df = cat.to_pandas()

# print(cat.info)




In [2]:
names = [name for name in cat.colnames if len(cat[name].shape) <= 1]

df = cat[names].to_pandas()

In [3]:
gdf = df.copy()
# gdf = gdf.merge(df_raw[['GAIAEDR3_SOURCE_ID', 'tmass_id']])

mask_labels = (gdf['M_H']>-2.5) & (gdf['M_H']<0.6) &\
            (gdf['TEFF']>3000) & (gdf['TEFF']<7000) &\
            (gdf['LOGG']>0.) & (gdf['LOGG']<5.5) &\
            (gdf['ALPHA_M']>-1) & (gdf['ALPHA_M']<1) & (gdf['ASPCAPFLAG']==0) &\
            (gdf['LOGG_ERR']<0.1) & (gdf['TEFF_ERR']<50) &\
            (gdf['ALPHA_M_ERR']<0.02) & (gdf['M_H_ERR']<0.015)
        
# mask_photo = (gdf['J']>6) & (gdf['J']<14.) &\
#             (gdf['H']>6) & (gdf['H']<14.) &\
#             (gdf['K']>6) & (gdf['K']<14.) &\
#             (gdf['Qfl_J']=='A') & (gdf['Qfl_H']=='A') & (gdf['Qfl_K']=='A')&\
#             (gdf['W1mag']>6) & (gdf['W1mag']<13.) &\
#             (gdf['W2mag']>6) & (gdf['W2mag']<13.) &\
#             (gdf['qph_w1']=='A') & (gdf['qph_w2']=='A')

# mask_spec = (gdf['snr_bp10']>100.) & (gdf['snr_bp55']>1.) &\
#             (gdf['snr_rp10']>100.) & (gdf['snr_rp55']>1.)

# mask  = mask_labels & mask_photo & mask_spec
mask = mask_labels

# print(gdf.shape, gdf[mask_labels].shape, gdf[mask_photo].shape, gdf[mask_spec].shape)
print(gdf[mask].shape)


(332904, 215)


In [6]:
# gdf[mask].to_csv("/data/jdli/sdss/dr17/ap17_lbcut.csv", index=False)

adf = gdf[mask].copy(deep=True)

def convert_string_uft8(series):
    return [s.decode('utf-8').strip() for s in series]

adf['APOGEE_ID'] = convert_string_uft8(adf['APOGEE_ID'])
adf['TELESCOPE'] = convert_string_uft8(adf['TELESCOPE'])
adf['FIELD']     = convert_string_uft8(adf['FIELD'])

print(adf['TELESCOPE'].unique(), "\n", adf['FIELD'].unique())

['apo25m' 'lco25m' 'apo1m'] 
 ['120+12' '116-04' 'N7789' ... 'K2_C12_083-66_btx' 'sgr_tidal10'
 'K2_C12_089-63_btx']


In [None]:
import multiprocessing
import urllib.request

spec_dir = "/data/jdli/sdss/dr17/spec_ap17_lbcut/"
n_split = 115370

def download_spectrum(telescope, field, apogee_id, save_dir=spec_dir):
    
    # if not os.path.exists(save_dir):
    #     print("make dir %s"%save_dir)
    #     os.makedirs(save_dir)
    
    url = f"https://data.sdss.org/sas/dr17/apogee/spectro/aspcap/dr17/synspec_rev1/{telescope}/{field}/aspcapStar-dr17-{apogee_id}.fits"
    filename = f"{apogee_id}.fits"
    filepath = save_dir+filename
    
    if not os.path.exists(filepath):
        try: 
            urllib.request.urlretrieve(url, save_dir+filename)
        except Exception:
            print(f"no such a spectra of APOGEE_ID = {apogee_id}")
    

# Number of parallel processes
num_processes = 64

# Create a pool of processes
with multiprocessing.Pool(processes=num_processes) as pool:
    
    pool.starmap(download_spectrum, tqdm(
        zip(adf['TELESCOPE'].values[n_split:], 
            adf['FIELD'].values[n_split:], 
            adf['APOGEE_ID'].values[n_split:],
           ), total=len(adf)-n_split))


 25%|████████████████                                                | 54400/217534 [00:11<00:00, 540677.28it/s]

In [9]:

spec_dir = "/data/jdli/sdss/dr17/spec_ap17_lbcut/"
ava_spec_names = os.listdir(spec_dir)
print(len(ava_spec_names))

cat_spec_names = [f"{i}.fits" for i in adf['APOGEE_ID'].values]
print(len(cat_spec_names))

res_spec_names = list(set(cat_spec_names) - set(ava_spec_names))
print(len(res_spec_names))

115370
332904
195631


In [11]:
def read_spec(fname):
    pri_header = headerread(fname, 0, memmap=False)
    flux = fitsread(fname, 1, memmap=False)
    e_flux = fitsread(fname, 2, memmap=False)
    mask = fitsread(fname, 3, memmap=False)
    
    if (len(flux.shape)>1)&(len(e_flux.shape)>1)&(len(mask.shape)>1):
        flux, e_flux, mask = flux[0], e_flux[0], mask[0] 

    return {'tmass_id':pri_header['OBJID'], 'flux':flux, 'fluxerr':e_flux, 'mask':mask, }


# read_spec(fname_lst[0])

In [None]:
normspecDict_trlst = []
normspecDict_vallst= []

for i in tqdm(range(len(specDict_lst))):
    d = specDict_lst[i]
    
    # try:
    spec = d['flux']
    spec_err = d['fluxerr']
    spec_mask = d['mask']

    norm_spec, norm_spec_err = apogee_continuum(
        spec, spec_err, bitmask=spec_mask, dr=14
    )

    ind = df['2MASS_ID'].str.decode("utf-8")==d['tmass_id']

    prlx, e_prlx = df[ind]['Gaia_parallax'].values, df[ind]['Gaia_parallax_err'].values
    prlx_hogg, e_prlx_hogg = df[ind]['spec_parallax'].values, df[ind]['spec_parallax_err'].values
    mags = df[['Jmag', 'Hmag', 'Kmag', 'W1mag', 'W2mag']][ind].values

    data = {'norm_spec':norm_spec, 'norm_spec_err':norm_spec_err, 
            'mag':mags, 
            'tmass_id':d['tmass_id'], 
            'Gaia_parallax':prlx,  'Gaia_parallax_err':e_prlx, 
            'spec_parallax':prlx_hogg, 'spec_parallax_err':e_prlx_hogg,}

    if df[ind]['training_set'].values==1:
        normspecDict_trlst.append(data)
    elif df[ind]['training_set'].values==0:
        normspecDict_vallst.append(data)
    else:
        print(df['training_set'][ind].values)
            
    # except:
    #     print("normalization failed")
        
print(len(normspecDict_trlst), len(normspecDict_vallst))