In [11]:
%%configure -f
{"name": "brian-load-spaxels", "executorMemory": "30G", "numExecutors": 14, "executorCores": 2,
 "conf": {"spark.yarn.appMasterEnv.PYSPARK_PYTHON":"python3", "spark.sql.shuffle.partitions": 300}}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
288,application_1588740809550_0272,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
288,application_1588740809550_0272,pyspark,idle,Link,Link,✔


In [13]:
import sys
import os
import subprocess
from io import BytesIO
from gzip import GzipFile
from pyspark.sql import Row
import glob
import time
import numpy as np
from itertools import repeat
from collections import OrderedDict

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
def get_fits_module():
    if 'astropy' not in sys.modules:
        stdout = subprocess.check_output(
            sys.executable + ' -m pip install astropy',
            stderr=subprocess.STDOUT,
            shell=True).decode('utf-8')
    from astropy.io import fits
    return fits

def getfitslocal(path):
    fits = get_fits_module()
    fits_obj = fits.open(path)
    return fits_obj

# Spark dataframe cannot do numpy types
def typeconv(i):
    try:
        return i.item()
    except:
        return i
    
def createPerPixelSpaxelRow(fits):
    ''' old and deprecated method '''
    n_wave, n_y, n_x = fits['FLUX'].data.shape
    
    hdr = fits[0].header
    flux = np.ravel(fits['FLUX'].data)
    ivar = np.ravel(fits['IVAR'].data)
    mask = np.ravel(fits['MASK'].data)
    model = np.ravel(fits['MODEL'].data)
    emline = np.ravel(fits['EMLINE'].data)
    emline_base = np.ravel(fits['EMLINE_BASE'].data)
    emline_mask = np.ravel(fits['EMLINE_MASK'].data)
    
    wave = fits['WAVE'].data
    
    binid = np.ravel(fits['BINID'].data[0])

    # ravel indexing
    # rav_idx = (wave_idx * n_y * n_y) + (y_idx * n_x) + x_idx

    # and back
    # wave_idx = int(rav_idx / (n_y * n_y))
    # spaxel_idx = rav_idx - wave_idx * (n_y * n_y)
    # y_idx = int(spaxel_idx / n_y)
    # x_idx = spaxel_idx - y_idx * n_y
    
    rows = []
    for ravel_idx, flux_val in enumerate(flux):
        wave_idx = int(ravel_idx / (n_y * n_y))
        spaxel_idx = ravel_idx - wave_idx * (n_y * n_y)
        y_idx = int(spaxel_idx / n_y)
        x_idx = spaxel_idx - y_idx * n_y
        
        # skip no good spaxels
        if binid[spaxel_idx] == -1:
            continue

        row = {'plateifu': typeconv(hdr.get('PLATEIFU')),
               'mangaid': typeconv(hdr.get('MANGAID')),
               'ra': typeconv(hdr.get('OBJRA')),
               'dec': typeconv(hdr.get('OBJDEC')),
               'daptype': typeconv(hdr.get('DAPTYPE')),
               'spaxel_idx': spaxel_idx,
               'x': x_idx,
               'y': y_idx,
               'wave': typeconv(wave[wave_idx]),
               'flux': typeconv(flux_val),
               'ivar': typeconv(ivar[ravel_idx]),
               'mask': typeconv(mask[ravel_idx]),
               'model': typeconv(model[ravel_idx]),
               'emline': typeconv(emline[ravel_idx]),
               'emline_base': typeconv(emline_base[ravel_idx]),
               'emline_mask': typeconv(emline_mask[ravel_idx]),
               'binid_binned_spectra': typeconv(binid[spaxel_idx]),
               'binid_stellar_cont': typeconv(np.ravel(fits['BINID'].data[1])[spaxel_idx]),
               'binid_emline_moments': typeconv(np.ravel(fits['BINID'].data[2])[spaxel_idx]),
               'binid_emline_models': typeconv(np.ravel(fits['BINID'].data[3])[spaxel_idx]),
               'binid_spectral_indices': typeconv(np.ravel(fits['BINID'].data[4])[spaxel_idx])
              }
            
        rows.append(Row(**row))
    return rows

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
from itertools import repeat

def createPerSpaxelRow(fits):
    n_wave, n_y, n_x = fits['FLUX'].data.shape
    n_spax = np.int(n_y * n_x)
     
    mapRow = Row('plateifu', 'mangaid', 'ra', 'dec', 'spaxel_idx', 'x', 'y', 
                 'wave', 'flux', 'ivar', 'mask', 'disp', 'predisp', 'specres', 
                 'specresd', 'prespecres', 'prespecresd')
    
    hdr = fits[0].header
    plateifu = repeat(hdr.get('PLATEIFU'))
    mangaid = repeat(hdr.get('MANGAID'))
    ra = repeat(hdr.get('OBJRA'))
    dec = repeat(hdr.get('OBJDEC'))

    flux = np.reshape(fits['FLUX'].data, (n_wave, n_spax))
    ivar = np.reshape(fits['IVAR'].data, (n_wave, n_spax))
    mask = np.reshape(fits['MASK'].data, (n_wave, n_spax))
    disp = np.reshape(fits['DISP'].data, (n_wave, n_spax))
    predisp = np.reshape(fits['PREDISP'].data, (n_wave, n_spax))

    wave = fits['WAVE'].data.tolist()
    specres = fits['SPECRES'].data.tolist()
    specresd = fits['SPECRESD'].data.tolist()
    prespecres = fits['PRESPECRES'].data.tolist()
    prespecresd = fits['PRESPECRESD'].data.tolist()
    
    # ravel indexing
    # rav_idx = (wave_idx * n_y * n_y) + (y_idx * n_x) + x_idx

    # and back
    # wave_idx = int(rav_idx / (n_y * n_y))
    # spaxel_idx = rav_idx - wave_idx * (n_y * n_y)
    # y_idx = int(spaxel_idx / n_y)
    # x_idx = spaxel_idx - y_idx * n_y
    
    rows = []
    for spaxel_idx in range(n_spax):
        y_idx = int(spaxel_idx / n_y)
        x_idx = spaxel_idx - y_idx * n_y
        x = repeat(x_idx)
        y = repeat(y_idx)
        

#         row = {'plateifu': plateifu,
#                'mangaid': mangaid,
#                'ra': ra,
#                'dec': dec,
#                'daptype': daptype,
#                'spaxel_idx': repeat(spaxel_idx),
#                'x': x,
#                'y': y,
#                'wave': wave,
#                'flux': flux[:,spaxel_idx].tolist(),
#                'ivar': ivar[:,spaxel_idx].tolist(),
#                'mask': mask[:,spaxel_idx].tolist(),
#                'disp': disp[:,spaxel_idx].tolist(),
#                'predisp': predisp[:,spaxel_idx].tolist(),
#                'specres': specres,
#                'specres': specresd,
#                'prespecres': specres,
#                'prespecres': specresd
#               }

        #row_gen = zip(*row.values())

        row_gen = zip(plateifu, mangaid, ra, dec, repeat(spaxel_idx),
                      x, y, wave, flux[:, spaxel_idx].tolist(), ivar[:, spaxel_idx].tolist(),
                      mask[:, spaxel_idx].tolist(), disp[:, spaxel_idx].tolist(),
                      predisp[:, spaxel_idx].tolist(), specres, specresd, prespecres, prespecresd)

        _rows = [mapRow(*i) for i in row_gen]
        rows.extend(_rows)
    return rows

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
base_dir = '/sciserver/vc/manga/vc/sas/dr15/manga/spectro/redux/v2_4_3/'

stack_dir = lambda x: base_dir + x + '/stack'
cube_dirs = [stack_dir(i) for i in
            os.listdir(base_dir)
            if os.path.isdir(stack_dir(i))
           ]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
files_rdd = sc.parallelize(cube_dirs).flatMap(lambda d: glob.glob(d+'/*-LOGCUBE.fits.gz'))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
files_stats = files_rdd.map(
    lambda x: (os.stat(x).st_size/1024/1024, 1, os.stat(x).st_size/1024/1024)
).reduce(
    lambda x,y: (x[0]+y[0], x[1]+y[1], max(x[2],y[2]))
)
print('N files: {1}. Total Size: {0:0.0f}MB, average size: {2:0.0f}'.format(*files_stats))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

N files: 4857. Total Size: 597114MB, average size: 214

In [19]:
n_part = int(files_stats[1])
table_data = files_rdd.repartition(n_part).map(getfitslocal).flatMap(createPerSpaxelRow)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
table_data.first()

In [20]:
hdfs_dir = 'hdfs:///manga/brian-test/dr15/v2_4_3/cubes'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
t = time.time()
table = spark.createDataFrame(table_data)
table.write.mode('overwrite').parquet(hdfs_dir)
print(time.time()-t)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
tmp = spark.read.parquet('hdfs:///manga/brian-test/dr15/v2_4_3/cubes')

In [None]:
tmp.first()

In [None]:
tmp.describe()