In [1]:
from utils import SpecReader
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

%matplotlib inline

### SDSS (VAE) dataset

In [2]:
# Directory that receives every artefact written by this notebook.
root = './data/'

# Inputs: the SDSS (VAE) key table and the directory passed to the spectrum reader.
spec_dir = '../../sdss/VAE-dataset/spectra/'
keys_path = '../../sdss/VAE-dataset/keys.csv'

# Outputs: the full database plus its train / test partitions, all under ``root``.
spec_save, train_save, test_save = (
    os.path.join(root, fname)
    for fname in ('specs.npy', 'specs-train.npy', 'specs-test.npy')
)

In [3]:
# Load the SDSS key table into memory. Note that build_db() re-reads the same
# CSV from keys_path itself rather than using this DataFrame.
keys = pd.read_csv(keys_path)

In [4]:
# Create the output directory (and any missing parents). exist_ok=True makes the
# call idempotent on re-runs and avoids the check-then-create race that a
# separate os.path.exists() test has.
os.makedirs(root, exist_ok=True)

In [5]:
def get_labels(names, keys):
    """Return the one-character class label of every spectrum in ``names``.

    Each name is expected to look like ``<dir>/<prefix>-<plate>-<mjd>-<fiberid>.fits``.
    The three identifiers are matched against the ``plate``, ``mjd`` and
    ``fiberid`` columns of ``keys``; the label is the first character of the
    ``subclass`` value of the first matching row.

    Parameters
    ----------
    names : iterable of str
        Spectrum file names or paths ('/'-separated).
    keys : pandas.DataFrame
        Key table with ``plate``, ``mjd``, ``fiberid`` and ``subclass`` columns.

    Returns
    -------
    numpy.ndarray
        One label per entry of ``names``, in the same order.

    Raises
    ------
    IndexError
        If a name does not contain three '-'-separated identifiers after its
        prefix, or if no row of ``keys`` matches it.
    """
    id_columns = ('plate', 'mjd', 'fiberid')
    suffix = '.fits'

    def coerce(column, text):
        # The identifiers parsed from a file name are strings (possibly
        # zero-padded), while a key table read from CSV typically holds
        # integers. Convert explicitly instead of relying on pandas to coerce
        # mismatched types inside the comparison - NOTE(review): recent pandas
        # releases appear not to do that coercion in ``isin``; confirm against
        # the installed version.
        dtype = keys[column].dtype
        if pd.api.types.is_integer_dtype(dtype):
            return int(text)
        if pd.api.types.is_numeric_dtype(dtype):
            return float(text)
        return text

    parsed = []
    for name in names:
        stem = name.split('/')[-1]
        # Remove the extension as a suffix; str.rstrip('.fits') would strip any
        # trailing run of the characters '.', 'f', 'i', 't', 's' instead.
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
        parts = stem.split('-')[1:]
        if len(parts) < len(id_columns):
            raise IndexError(
                'cannot parse plate/mjd/fiberid from spectrum name %r' % (name,))
        parsed.append((name, parts))

    labels = []
    for name, parts in tqdm(parsed):
        cond = pd.Series(True, index=keys.index)
        for column, text in zip(id_columns, parts):
            try:
                value = coerce(column, text)
            except ValueError:
                # Non-numeric text can never match a numeric column.
                cond = pd.Series(False, index=keys.index)
                break
            cond = cond & (keys[column] == value)

        subclass = keys.loc[cond, 'subclass'].values
        if len(subclass) == 0:
            raise IndexError('no entry in keys matches spectrum %r' % (name,))
        # First matching row, first character of its subclass string.
        labels.append(subclass[0][0])

    return np.array(labels)

In [6]:
def build_db(spec_dir, keys_path):
    """Read the spectra under ``spec_dir`` and assemble a labelled database.

    The spectra are loaded through ``SpecReader.build_db`` with ``n_specs=-1``,
    ``normed=True`` and ``down_sample=True`` (what those flags do is defined by
    ``SpecReader``, which is not part of this notebook). Labels are then looked
    up in the key table stored at ``keys_path``.

    Returns a dict with the entries ``'flux'``, ``'wlen_same'``, ``'names'``
    and ``'labels'``.
    """
    # Fail before the slow read if the key table is missing.
    assert os.path.exists(keys_path)

    reader = SpecReader(root=root)
    reader.build_db(spec_dir, n_specs=-1, normed=True, down_sample=True)

    # Every row of the wavelength array must equal the first one.
    assert (reader.wlen == reader.wlen[0]).all()

    key_table = pd.read_csv(keys_path)
    spec_labels = get_labels(reader.names, key_table)

    return dict(
        flux=reader.flux,
        wlen_same=reader.wlen_same,
        names=reader.names,
        labels=spec_labels,
    )

In [7]:
# Build the full SDSS database, persist the resulting dict with np.save, and
# show the shapes of the flux array and the shared wavelength grid.
db = build_db(spec_dir, keys_path)
np.save(spec_save, db)
db['flux'].shape, db['wlen_same'].shape

100%|██████████| 20949/20949 [02:56<00:00, 118.41it/s]


Time to read:  177.31545090675354


100%|██████████| 20602/20602 [00:00<00:00, 73315.45it/s]
100%|██████████| 20602/20602 [00:09<00:00, 2181.38it/s]


Time to down sample:  12.478596925735474


100%|██████████| 20602/20602 [00:39<00:00, 525.71it/s]


((20602, 3794), (3794,))

In [8]:
# Number of dropped spectra: compare the number of files present in spec_dir
# with the number of entries that made it into the database.
spec_files = os.listdir(spec_dir)
spec_N = len(spec_files)
db_N = db['labels'].shape[0]
# Displayed as (in database, on disk, dropped, dropped fraction).
db_N, spec_N, spec_N - db_N, 1-db_N/spec_N

(20602, 20950, 348, 0.016610978520286368)

In [9]:
def _pack_split(records, wlen_same):
    """Turn a list of (flux, label, name) records back into a column-wise dict."""
    columns = list(map(np.array, zip(*records)))
    packed = dict(zip(['flux', 'labels', 'names'], columns))
    packed['wlen_same'] = wlen_same
    return packed


def split_save(spec_save, train_save, test_save, test_size=0.1, random_state=None):
    """Split a saved spectrum database into train and test files.

    Parameters
    ----------
    spec_save : str
        Path of the ``.npy`` file holding the database dict (entries
        ``'flux'``, ``'labels'``, ``'names'`` and ``'wlen_same'``).
    train_save, test_save : str
        Destination paths for the two partitions. Each is written as a dict
        with the same four entries; ``'wlen_same'`` is copied unchanged.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.1, the value that was
        previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an integer to make the split
        reproducible. The default ``None`` keeps the previous unseeded
        behaviour.
    """
    # The database is a dict stored as a pickled 0-d object array, which
    # np.load refuses to read unless allow_pickle=True (the default became
    # False in NumPy 1.16.3).
    # NOTE: unpickling can execute arbitrary code - only load files produced
    # by this notebook or another trusted source.
    db = np.load(spec_save, allow_pickle=True).item()
    wlen_same = db['wlen_same']

    # Keep flux, label and name of a spectrum together while shuffling.
    records = list(zip(db['flux'], db['labels'], db['names']))
    train, test = train_test_split(records,
                                   shuffle=True,
                                   test_size=test_size,
                                   random_state=random_state)

    np.save(train_save, _pack_split(train, wlen_same))
    np.save(test_save, _pack_split(test, wlen_same))

In [10]:
# Write the train / test partitions (10% held out for testing) of the saved
# SDSS database to train_save and test_save.
split_save(spec_save, train_save, test_save)

### CSS-SDSS (cross-matched) dataset

In [11]:
# Inputs for the CSS-SDSS cross-match: spectra directory, key table and the
# previously saved parameter file.
spec_dir_x = '../../css-sdss/spectra/'
keys_path_x = '../../css-sdss/X-match/keys.csv'
param_path = './data/params.npy'

# Output file, written next to the other artefacts under ``root``.
spec_save_x = os.path.join(root, 'specs-x-match.npy')

In [12]:
def get_labels_x(name, keys):
    """Return the label of one cross-matched spectrum, or ``None`` if unknown.

    The final '/'-separated component of ``name`` is compared with the
    ``sloan_file`` column of ``keys``; the ``label`` of the first matching row
    is returned. When nothing matches, the file name is printed and ``None``
    is returned.
    """
    file_name = name.split('/')[-1]
    matches = keys.loc[keys['sloan_file'] == file_name, 'label']
    if len(matches) > 0:
        return matches.values[0]

    # No entry for this file: report it and let the caller decide what to do.
    print(file_name)
    return None

def build_db_x(spec_dir, keys_path, w_min, w_max):
    """Read the cross-matched spectra and assemble a labelled database.

    The spectra are loaded through ``SpecReader.build_db`` with ``n_specs=-1``,
    ``normed=True``, ``down_sample=True`` and ``threshold=[w_min, w_max]``
    (the meaning of these options is defined by ``SpecReader``). Each spectrum
    name is then looked up in the key table at ``keys_path`` via
    ``get_labels_x``; names without an entry get the label ``None``.

    Returns a dict with the entries ``'flux'``, ``'wlen_same'``, ``'names'``
    and ``'labels'``.
    """
    reader = SpecReader(root=root)
    reader.build_db(spec_dir,
                    n_specs=-1,
                    normed=True,
                    down_sample=True,
                    threshold=[w_min, w_max])

    key_table = pd.read_csv(keys_path)
    spec_labels = np.array(
        [get_labels_x(spec_name, key_table) for spec_name in reader.names]
    )

    return dict(
        flux=reader.flux,
        wlen_same=reader.wlen_same,
        names=reader.names,
        labels=spec_labels,
    )

In [13]:
# Load the stored wavelength limits and use them as the threshold when building
# the cross-matched database, then save it and show the resulting shapes.
# The parameter file is read with .item() and indexed by key, i.e. it is
# expected to hold a pickled dict; np.load only reads such files when
# allow_pickle=True (the default became False in NumPy 1.16.3).
# NOTE: unpickling can execute arbitrary code - only load trusted files.
params = np.load(param_path, allow_pickle=True).item()
w_min = params['w_min']
w_max = params['w_max']
db_x = build_db_x(spec_dir_x, keys_path_x, w_min, w_max)
np.save(spec_save_x, db_x)
db_x['flux'].shape, db_x['wlen_same'].shape

100%|██████████| 3296/3296 [00:25<00:00, 127.54it/s]
100%|██████████| 3228/3228 [00:00<00:00, 83951.10it/s]
 35%|███▌      | 1132/3228 [00:00<00:00, 8934.06it/s]

Time to read:  26.214890956878662


100%|██████████| 3228/3228 [00:00<00:00, 7985.02it/s]


Time to down sample:  1.6185717582702637


((3228, 3794), (3794,))