In [1]:
import pandas as pd

In [2]:
# Root folders (relative paths) for catalogue tables and observational data.
ctlg_dir = '../../../data/catalogues'
obs_dir = '../../../data/observations'

# Both files of this step live in the same Gaia-SDSS catalogue folder:
# the cross-match table that is read, and the URL list that is written.
_gaia_sdss_dir = '{}/gaia-sdss'.format(ctlg_dir)
read_path = _gaia_sdss_dir + '/cross-match.csv'
save_path = _gaia_sdss_dir + '/sdss-urls.txt'

In [3]:
# Load the Gaia-SDSS cross-match table; each row supplies the identifiers
# (source_id_gaia, MJD_sdss, PLATE_sdss, FIBERID_sdss) used below to build one spectrum URL.
cross_match = pd.read_csv(read_path)

In [4]:
# URL prefixes of the SDSS DR15 spectra directories, keyed by the label used
# in this notebook:
#   'legacy', 'segue2', 'boss' -- selected from the plate number in get_urls()
#   'stellar'                  -- used by get_urls_out() for spectra that were
#                                 not found after the first download pass
# Every entry carries an explicit https:// scheme so that all generated URL
# lists are uniform and usable by clients that reject scheme-less URLs.
base_url = {
    'boss': 'https://data.sdss.org/sas/dr15/eboss/spectro/redux/v5_10_0/spectra/',
    'legacy': 'https://data.sdss.org/sas/dr15/sdss/spectro/redux/26/spectra/',
    'segue2': 'https://data.sdss.org/sas/dr15/sdss/spectro/redux/104/spectra/',
    'stellar': 'https://data.sdss.org/sas/dr15/sdss/spectro/redux/103/spectra/',
}

In [5]:
def get_urls(cross_match, base_urls=None):
    """Build one SDSS spectrum URL per row of the cross-match table.

    The URL prefix is chosen from the plate number:

    * 266 <= plate <= 2974   -> ``base_urls['legacy']``
    * 3000 <= plate <= 3509  -> ``base_urls['segue2']``
    * plate >= 3510          -> ``base_urls['boss']``

    and is followed by ``{plate}/spec-{plate}-{mjd}-{fiber}.fits`` with plate
    and fiber zero-padded to four digits.

    Parameters
    ----------
    cross_match : pandas.DataFrame
        Must provide the columns ``source_id_gaia``, ``PLATE_sdss``,
        ``MJD_sdss`` and ``FIBERID_sdss``.
    base_urls : mapping, optional
        Prefixes keyed by ``'legacy'``, ``'segue2'`` and ``'boss'``.
        Defaults to the notebook-level ``base_url`` table.

    Returns
    -------
    list of str
        URLs in the same order as the rows of ``cross_match``.

    Raises
    ------
    ValueError
        If a plate falls outside the ranges above (below 266, 2975-2999, or
        NaN), or if MJD / fiber cannot be converted to an integer.
    """
    if base_urls is None:
        base_urls = base_url
    if len(cross_match) == 0:
        return []

    urls = []
    # Iterate over the columns directly instead of iterrows(): iterrows() builds
    # a Series per row and does not preserve the column dtypes.
    rows = zip(
        cross_match['source_id_gaia'],
        cross_match['PLATE_sdss'],
        cross_match['MJD_sdss'],
        cross_match['FIBERID_sdss'],
    )
    for id_, plate, mjd, fiber in rows:
        if 266 <= plate <= 2974:
            prefix = base_urls['legacy']
        elif 3000 <= plate <= 3509:
            prefix = base_urls['segue2']
        elif plate >= 3510:
            prefix = base_urls['boss']
        else:
            raise ValueError('Unknown URL assigned to gaia id {}, plate {}'.format(id_, plate))

        # The identifiers may arrive as floats (e.g. 266.0); cast to int so the
        # path reads '0266', not '266.0'.
        plate, mjd, fiber = int(plate), int(mjd), int(fiber)
        urls.append(f'{prefix}{plate:04d}/spec-{plate:04d}-{mjd}-{fiber:04d}.fits')
    return urls

In [6]:
# One download URL per cross-match row, in row order.
urls = get_urls(cross_match)

# Persist the list as a bare single-column file: no header row, no index column.
urls_df = pd.DataFrame(data=urls)
urls_df.to_csv(path_or_buf=save_path, index=False, header=False)

### Check download

In [7]:
import numpy as np
import os

In [8]:
# Folder whose contents are compared against the expected spectrum file names.
spectra_dir = obs_dir + '/spectra/sdss/xmatch/'
# Destination for the URLs of the spectra that are not present in that folder.
remain_path = './sdss-remaining.txt'

In [9]:
def get_urls_out(df, base_urls=None):
    """Build fallback spectrum URLs under the ``'stellar'`` prefix.

    Unlike ``get_urls`` the plate number is not inspected: every row is
    mapped to ``base_urls['stellar']`` followed by
    ``{plate}/spec-{plate}-{mjd}-{fiber}.fits`` with plate and fiber
    zero-padded to four digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``PLATE_sdss``, ``MJD_sdss`` and
        ``FIBERID_sdss``.
    base_urls : mapping, optional
        Must contain the key ``'stellar'``. Defaults to the notebook-level
        ``base_url`` table.

    Returns
    -------
    list of str
        URLs in the same order as the rows of ``df``; empty for an empty
        frame.

    Raises
    ------
    ValueError
        If plate, MJD or fiber cannot be converted to an integer (e.g. NaN).
    """
    if base_urls is None:
        base_urls = base_url
    if len(df) == 0:
        return []

    prefix = base_urls['stellar']
    urls = []
    # Iterate over the columns directly instead of iterrows(): iterrows() does
    # not preserve dtypes, and the identifiers may arrive as floats. Casting
    # to int keeps the path as '0266' rather than '266.0'.
    for plate, mjd, fiber in zip(df['PLATE_sdss'], df['MJD_sdss'], df['FIBERID_sdss']):
        plate, mjd, fiber = int(plate), int(mjd), int(fiber)
        urls.append(f'{prefix}{plate:04d}/spec-{plate:04d}-{mjd}-{fiber:04d}.fits')
    return urls

In [10]:
# File name expected on disk for each URL (the text after the last '/').
urls = np.array(urls)
files = [address.rsplit('/', 1)[-1] for address in urls]

# Boolean mask over the cross-match rows whose spectrum file is absent from spectra_dir.
files_in = os.listdir(spectra_dir)
index_out = np.isin(files, files_in, invert=True)

# Rebuild URLs for the absent spectra under the 'stellar' prefix.
entries_out = cross_match.iloc[index_out]
urls_out = pd.DataFrame(get_urls_out(entries_out))

In [11]:
# Write the URLs of the still-missing spectra, one per line, without header or index.
urls_out.to_csv(remain_path, index=False, header=False)

In [12]:
# Display the URLs of spectra not found in spectra_dir; an empty array means none are missing.
urls_out.values

array([], shape=(0, 0), dtype=float64)

In [13]:
# (rows, columns) of the cross-match table.
cross_match.shape

(4607, 11)