In [1]:
%%configure -f
{"name": "brian-load-modelcubes", "executorMemory": "25G", "numExecutors": 8, "executorCores": 2,
 "conf": {"spark.yarn.appMasterEnv.PYSPARK_PYTHON":"python3"}}

In [2]:
import sys
import os
import subprocess
from io import BytesIO
from gzip import GzipFile
from pyspark.sql import Row
import glob
import time
import numpy as np
from itertools import repeat

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
61,application_1588740809550_0063,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def get_fits_module():
    if 'astropy' not in sys.modules:
        stdout = subprocess.check_output(
            sys.executable + ' -m pip install astropy',
            stderr=subprocess.STDOUT,
            shell=True).decode('utf-8')
    from astropy.io import fits
    return fits

def get_wcs_module():
    if 'astropy' not in sys.modules:
        stdout = subprocess.check_output(
            sys.executable + ' -m pip install astropy',
            stderr=subprocess.STDOUT,
            shell=True).decode('utf-8')
    from astropy import wcs
    return wcs

def getfitslocal(path):
    fits = get_fits_module()
    fits_obj = fits.open(path)
    return fits_obj

# Spark dataframe cannot do numpy types
def typeconv(i):
    try:
        return i.item()
    except:
        return i
    
def createPerPixelSpaxelRow(fits):
    wcs = get_wcs_module()
    wc = wcs.WCS(fits['FLUX'].header)
    n_wave, n_y, n_x = fits['FLUX'].data.shape
    
    hdr = fits[0].header
    flux = np.ravel(fits['FLUX'].data)
    ivar = np.ravel(fits['IVAR'].data)
    mask = np.ravel(fits['MASK'].data)
    model = np.ravel(fits['MODEL'].data)
    emline = np.ravel(fits['EMLINE'].data)
    emline_base = np.ravel(fits['EMLINE_BASE'].data)
    emline_mask = np.ravel(fits['EMLINE_MASK'].data)
    
    wave = fits['WAVE'].data
    
    binid = np.ravel(fits['BINID'].data[0])

    # ravel indexing
    # rav_idx = (wave_idx * n_y * n_y) + (y_idx * n_x) + x_idx

    # and back
    # wave_idx = int(rav_idx / (n_y * n_y))
    # spaxel_idx = rav_idx - wave_idx * (n_y * n_y)
    # y_idx = int(spaxel_idx / n_y)
    # x_idx = spaxel_idx - y_idx * n_y
    
    rows = []
    for ravel_idx, flux_val in enumerate(flux):
        wave_idx = int(ravel_idx / (n_y * n_y))
        spaxel_idx = ravel_idx - wave_idx * (n_y * n_y)
        y_idx = int(spaxel_idx / n_y)
        x_idx = spaxel_idx - y_idx * n_y
        
        # skip no good spaxels
        if binid[spaxel_idx] == -1:
            continue

        row = {'plateifu': typeconv(hdr.get('PLATEIFU')),
               'mangaid': typeconv(hdr.get('MANGAID')),
               'ra': typeconv(hdr.get('OBJRA')),
               'dec': typeconv(hdr.get('OBJDEC')),
               'daptype': typeconv(hdr.get('DAPTYPE')),
               'spaxel_idx': spaxel_idx,
               'x': x_idx,
               'y': y_idx,
               'wave': typeconv(wave[wave_idx]),
               'flux': typeconv(flux_val),
               'ivar': typeconv(ivar[ravel_idx]),
               'mask': typeconv(mask[ravel_idx]),
               'model': typeconv(model[ravel_idx]),
               'emline': typeconv(emline[ravel_idx]),
               'emline_base': typeconv(emline_base[ravel_idx]),
               'emline_mask': typeconv(emline_mask[ravel_idx]),
               'binid_binned_spectra': typeconv(binid[spaxel_idx]),
               'binid_stellar_cont': typeconv(np.ravel(fits['BINID'].data[1])[spaxel_idx]),
               'binid_emline_moments': typeconv(np.ravel(fits['BINID'].data[2])[spaxel_idx]),
               'binid_emline_models': typeconv(np.ravel(fits['BINID'].data[3])[spaxel_idx]),
               'binid_spectral_indices': typeconv(np.ravel(fits['BINID'].data[4])[spaxel_idx])
              }
            
        rows.append(Row(**row))
    return rows
    

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
base_dir = '/sciserver/vc/manga/vc/sas/dr15/manga/spectro/analysis/v2_4_3/2.2.1/HYB10-GAU-MILESHC/'
dirs1 = [base_dir + i for i in os.listdir(base_dir)]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
files_rdd = sc.parallelize(dirs1, len(dirs1)).flatMap(lambda x: [x+'/'+i for i in os.listdir(x)]).flatMap(lambda d: glob.glob(d+'/*-LOGCUBE-*.fits.gz'))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
files_stats = files_rdd.map(
    lambda x: (os.stat(x).st_size/1024/1024, 1, os.stat(x).st_size/1024/1024)
).reduce(
    lambda x,y: (x[0]+y[0], x[1]+y[1], max(x[2],y[2]))
)
print('N files: {1}. Total Size: {0:0.0f}MB, average size: {2:0.0f}'.format(*files_stats))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

N files: 4718. Total Size: 333908MB, average size: 184

In [7]:
n_part = int(files_stats[1])
table_data = files_rdd.repartition(n_part).map(getfitslocal).flatMap(createPerPixelSpaxelRow)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
hdfs_dir = 'hdfs:///manga/brian-test/dr15/v2_4_3/modelcubes'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
t = time.time()
table = spark.createDataFrame(table_data)
table.write.mode('overwrite').parquet(hdfs_dir)
print(time.time()-t)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
Error sending http request and maximum retry encountered.
