In [1]:
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.io import fits
from glob import glob
import numpy as np
import os
import pandas as pd
from time import time
from tqdm import tqdm

In [2]:
# Output: CSV that receives the spectrum <-> time-series cross-match table.
keys_path = "./keys.csv"
# Input: directory of spectrum files opened with astropy.io.fits.
spec_dir = "../spectra/"
# Input: catalog read with pandas and indexed by 'Numerical_ID' to look up 'label'.
ts_catalog_path = "../../css/CSDR1/catalog.txt"
# Input: photometry CSV, grouped by its 'id' column.
ts_path = "../../css/CSDR1/photometry.csv"
# Output: directory that receives one CSV of photometry per matched id.
ts_dir = "../ts/"

## X-Match table

In [3]:
def get_spec_coords(spec_dir, num_specs = None):
    """Collect coordinates and telescope name from the primary header of each spectrum.

    Parameters
    ----------
    spec_dir : str
        Directory whose entries are each opened as a FITS file. A trailing
        path separator is optional.
    num_specs : int, optional
        If given, only the first ``num_specs`` entries of the directory
        listing are read; ``None`` (default) reads every entry. The listing
        comes from ``os.listdir``, whose order is not guaranteed.

    Returns
    -------
    spec_names, ra_all, dec_all, tel_all : numpy.ndarray
        Parallel arrays holding, per file, its name and the ``PLUG_RA``,
        ``PLUG_DEC`` and ``TELESCOP`` values of HDU 0's header.

    Raises
    ------
    KeyError
        If a file's primary header lacks one of the three keywords.
    """
    spec_names = os.listdir(spec_dir)
    if num_specs is not None:
        spec_names = spec_names[:num_specs]

    ra_all, dec_all, tel_all = [], [], []
    for name in spec_names:
        # The context manager closes each file once its header has been read;
        # without it one handle per spectrum stays open for the whole loop.
        with fits.open(os.path.join(spec_dir, name)) as spec:
            header = spec[0].header
            ra_all.append(header['PLUG_RA'])
            dec_all.append(header['PLUG_DEC'])
            tel_all.append(header['TELESCOP'])

    return (np.array(spec_names), np.array(ra_all),
            np.array(dec_all), np.array(tel_all))

In [4]:
def x_match_df(spec_dir, ts_catalog_path, ts_path, max_sep_arcsec=2):
    """Cross-match spectra against time-series objects by sky position.

    Each spectrum is paired with its nearest time-series object on the sky
    (``SkyCoord.match_to_catalog_sky``); pairs separated by more than
    ``max_sep_arcsec`` are dropped.

    Parameters
    ----------
    spec_dir : str
        Directory of spectrum files, passed to ``get_spec_coords``.
    ts_catalog_path : str
        CSV-readable catalog with ``Numerical_ID`` and ``label`` columns.
    ts_path : str
        CSV of photometry with ``id``, ``ra`` and ``dec`` columns.
    max_sep_arcsec : float, optional
        Largest accepted separation in arcseconds (default 2, the value that
        was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``sloan_file``, ``css_num_ID`` (as str) and ``label``, one row
        per spectrum that has a match within the radius.
    """
    spec_ids, spec_ra, spec_dec, _ = get_spec_coords(spec_dir)
    spec_coords = SkyCoord(ra=spec_ra*u.degree, dec=spec_dec*u.degree)

    # One position per object: the mean of its per-observation coordinates.
    # Only the three needed columns are read and averaged, so non-numeric
    # columns in the photometry file cannot break .mean().
    ts_pos = (
        pd.read_csv(ts_path, usecols=['id', 'ra', 'dec'])
        .groupby('id')[['ra', 'dec']]
        .mean()
    )
    ts_ids = ts_pos.index.values
    ts_coords = SkyCoord(ra=ts_pos['ra'].values*u.degree,
                         dec=ts_pos['dec'].values*u.degree)

    ts_catalog = pd.read_csv(ts_catalog_path).set_index('Numerical_ID')

    # idx[i] is the position in ts_coords of the nearest neighbour of spectrum i.
    idx, d2d, _ = spec_coords.match_to_catalog_sky(ts_coords)
    keep = d2d.arcsecond <= max_sep_arcsec

    matched_ts_ids = ts_ids[idx][keep]
    # Build the frame column by column so each keeps its own dtype instead of
    # being coerced to strings through a single mixed numpy array.
    x_match = pd.DataFrame({
        'sloan_file': spec_ids[keep],
        'css_num_ID': matched_ts_ids,
        'label': ts_catalog['label'].loc[matched_ts_ids].values,
    })
    x_match['css_num_ID'] = x_match['css_num_ID'].astype(str)
    return x_match

In [5]:
# Build the cross-match table and persist it (with header row, without the index).
x_match = x_match_df(
    spec_dir=spec_dir,
    ts_catalog_path=ts_catalog_path,
    ts_path=ts_path,
)
x_match.to_csv(keys_path, index=False)

In [6]:
# Reload the saved table; css_num_ID is cast back to str so it is comparable
# with the in-memory x_match, whose IDs are stored as strings.
check = pd.read_csv(keys_path).astype({'css_num_ID': str})
check.head()

Unnamed: 0,sloan_file,css_num_ID,label
0,spec-2112-53534-0521.fits,1132062052528,RRc
1,spec-3307-54970-0075.fits,1001072050924,EW
2,spec-2477-54058-0261.fits,1121054053106,RRab
3,spec-2502-54180-0319.fits,1118059048329,RRc
4,spec-2474-54564-0169.fits,1126077024700,RRc


In [7]:
# Round-trip check: the table reloaded from keys_path should equal the in-memory x_match.
check.equals(x_match)

True

In [8]:
# Table shape next to the number of distinct CSS IDs; fewer distinct IDs than
# rows means several spectra were matched to the same CSS object.
check.shape, len(np.unique(check.css_num_ID))

((3296, 3), 2683)

## Gather Time Series

In [9]:
# Make sure the output directory exists. exist_ok=True makes this safe to
# re-run and avoids the gap between a separate existence check and creation.
os.makedirs(ts_dir, exist_ok=True)

In [10]:
def gather_ts(ts_path, keys, out_dir=None):
    """Write one CSV of photometry per requested object id.

    Parameters
    ----------
    ts_path : str
        CSV of photometry with an ``id`` column.
    keys : iterable
        Object ids to export. Each is compared, as a string, against the
        stringified ``id`` column.
    out_dir : str, optional
        Directory that receives ``<id>.csv`` files. Defaults to the
        notebook-level ``ts_dir``; it is created if missing.

    Raises
    ------
    KeyError
        If a requested id has no rows in the photometry file.
    """
    if out_dir is None:
        out_dir = ts_dir
    os.makedirs(out_dir, exist_ok=True)

    t0 = time()
    ts = pd.read_csv(ts_path)
    ts.id = ts.id.astype(str)
    t1 = time()
    print('Time to read: ', str(np.round(t1-t0, decimals=2)),'s.')

    keys = [str(k) for k in keys]
    # Group only the rows that will be exported rather than every object in the file.
    groups = ts[ts.id.isin(keys)].groupby('id')
    # Iterate over the keys themselves: zipping the groupby with the keys
    # built every group only to discard it, and would silently stop early if
    # there were fewer groups than keys. A list also gives tqdm a total.
    for k in tqdm(keys):
        groups.get_group(k).to_csv(os.path.join(out_dir, k + '.csv'), index=False)
    t2 = time()
    print('Time to split and save: ', str(np.round(t2-t1, decimals=2)), 's.')

In [11]:
# Export each matched CSS object once, however many spectra point to it.
keys = np.unique(x_match['css_num_ID'])
gather_ts(ts_path=ts_path, keys=keys)

0it [00:00, ?it/s]

Time to read:  18.47 s.


2683it [00:12, 208.27it/s]


Time to split and save:  13.03 s.
