In [1]:
%%configure -f
{"name": "brian-load-summaries", "executorMemory": "1G", "numExecutors": 4, "executorCores": 2,
 "conf": {"spark.yarn.appMasterEnv.PYSPARK_PYTHON":"python3"}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
178,application_1588740809550_0172,pyspark,idle,Link,Link,


In [2]:
import sys
import os
import subprocess
from io import BytesIO
from gzip import GzipFile
from pyspark.sql import Row
import glob
import time
import numpy as np
from itertools import repeat
from collections import OrderedDict

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
179,application_1588740809550_0173,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
def get_fits_module():
    """Return the ``astropy.io.fits`` module, installing astropy on demand.

    The import is attempted first; ``pip install astropy`` is only run with
    the current interpreter when that import fails. Checking ``sys.modules``
    instead would only tell us whether astropy had already been imported in
    this process, so pip would be re-run in every fresh Python process even
    when the package is installed.

    Returns:
        The imported ``astropy.io.fits`` module.

    Raises:
        subprocess.CalledProcessError: if the pip installation fails; pip's
            combined stdout/stderr is available on the exception's ``output``.
        ImportError: if astropy still cannot be imported after installation.
    """
    try:
        from astropy.io import fits
    except ImportError:
        import importlib

        # An argument list (no shell) keeps this working when the interpreter
        # path contains spaces or shell metacharacters.
        subprocess.check_output(
            [sys.executable, '-m', 'pip', 'install', 'astropy'],
            stderr=subprocess.STDOUT)
        # Make the import system notice the freshly installed package.
        importlib.invalidate_caches()
        from astropy.io import fits
    return fits

def getfitslocal(path):
    """Open the FITS file at ``path`` and return the opened object.

    The ``fits`` module is obtained through ``get_fits_module``. The object
    returned by ``fits.open`` is handed back as-is; it is not closed here.
    """
    return get_fits_module().open(path)

# Spark dataframe cannot do numpy types
def typeconv(i):
    """Convert a numpy scalar to its native Python equivalent.

    Calls ``i.item()`` and returns the result. Values for which that call
    fails (no ``item`` method, or ``item`` raising an ordinary error) are
    returned unchanged.

    Only ``Exception`` subclasses are treated as "not convertible", so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate instead of being
    silently swallowed.
    """
    try:
        return i.item()
    except Exception:
        return i

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
def expand_column_names(name, cols=None):
    """Build one flat column name per suffix for an array-valued column.

    Args:
        name: base column name; it is lower-cased.
        cols: iterable of suffix strings. When omitted or empty, the default
            suffixes ``F, N, u, g, r, i, z`` are used.

    Returns:
        A list of ``"<name>_<suffix>"`` strings, all lower-case, in the order
        of ``cols``.
    """
    suffixes = cols if cols else ['F', 'N', 'u', 'g', 'r', 'i', 'z']
    prefix = name.lower()
    return [f'{prefix}_{suffix.lower()}' for suffix in suffixes]

def get_column_names(data):
    """Return the flattened list of column names for a table.

    The first row of ``data`` is inspected: every column whose value there is
    a numpy array is expanded into seven names (suffixes F, N, u, g, r, i, z)
    through ``expand_column_names``; every other column keeps its original
    name unchanged.

    Args:
        data: table object exposing ``columns.names`` and row access by
            integer index, with rows indexable by column name.
    """
    band_suffixes = ['F', 'N', 'u', 'g', 'r', 'i', 'z']
    field_names = data.columns.names
    first_row = data[0]
    flat_names = []
    for field in field_names:
        if isinstance(first_row[field], np.ndarray):
            flat_names += expand_column_names(field, cols=band_suffixes)
        else:
            flat_names.append(field)
    return flat_names

def convert_row(row):
    """Flatten one table row into a list of values.

    Array-valued entries contribute the elements of ``ndarray.tolist()``;
    every other entry is passed through ``typeconv`` and appended as a single
    value. The order of the input row is preserved.
    """
    flat_values = []
    for value in list(row):
        if isinstance(value, np.ndarray):
            flat_values += value.tolist()
        else:
            flat_values.append(typeconv(value))
    return flat_values
    
def create_drp_row(hdu):
    """Convert the table in ``hdu[1]`` into a list of Spark ``Row`` objects.

    Column names come from ``get_column_names`` and each record is flattened
    with ``convert_row``, so array-valued columns become several scalar
    fields in the resulting rows.
    """
    table = hdu[1].data
    make_row = Row(*get_column_names(table))
    return [make_row(*convert_row(record)) for record in table]

# Suffix lists used by get_dap_column_names to name the individual elements of
# array-valued columns. The order of each list is assumed to match the element
# order of the corresponding FITS array column -- TODO confirm against the
# DAPall data model.

# Suffixes for columns whose name contains 'specindex'.
specindex_channels = ['cn1', 'cn2', 'ca4227', 'g4300', 'fe4383', 'ca4455', 'fe4531', 'c24668', 'hb', 'fe5015', 
                      'mg1', 'mg2', 'mgb', 'fe5270', 'fe5335', 'fe5406', 'fe5709', 'fe5782', 'nad', 'tio1', 'tio2', 
                      'hdeltaa', 'hgammaa', 'hdeltaf', 'hgammaf', 'cahk', 'caii1', 'caii2', 'caii3', 'pa17', 'pa14', 
                      'pa12', 'mgicvd', 'naicvd', 'mgiir', 'fehcvd', 'nai', 'btio', 'atio', 'cah1', 'cah2', 
                      'naisdss', 'tio2sdss', 'd4000', 'dn4000', 'tiocvd']

# Suffixes for columns whose name contains 'emline'; one of the two leading
# channel names below (oii / oiid) is prepended to this list.
emline_channels = ['oii_3729', 'hthe_3798', 'heta_3836', 'neiii_3869', 'hzet_3890', 'neiii_3968', 'heps_3971', 
                   'hdel_4102', 'hgam_4341', 'heii_4687', 'hb_4862', 'oiii_4960', 'oiii_5008', 'hei_5877', 
                   'oi_6302', 'oi_6365', 'nii_6549', 'ha_6564', 'nii_6585', 'sii_6718', 'sii_6732']

oii = 'oii_3727'    # leading channel for every 'emline' column except sflux/sew
oiid = 'oiid_3728'  # leading channel for 'emline' columns containing 'sflux' or 'sew'
snr_channels = ['g', 'r', 'i', 'z'] # suffixes intended for snr_med and snr_ring
binr_channels = ['5', '10', '20'] # suffixes intended for bin_r_n and bin_r_snr


def get_dap_column_names(data):
    """Return the flattened, lower-cased column names for a DAPall table.

    The first row of ``data`` is inspected. Scalar columns keep their
    (lower-cased) name. Array-valued columns are expanded into one name per
    element, with suffixes chosen from the column name:

    * contains ``'bin_r_'``    -> ``binr_channels``
    * contains ``'snr'``       -> ``snr_channels``
    * contains ``'specindex'`` -> ``specindex_channels``
    * contains ``'emline'``    -> ``[oiid] + emline_channels`` for names
      containing ``'sflux'`` or ``'sew'``, otherwise
      ``[oii] + emline_channels``
    * anything else            -> positional suffixes ``0 .. len(col) - 1``

    ``'bin_r_'`` is tested before ``'snr'`` because ``bin_r_snr`` contains
    both substrings and must use the bin-radius suffixes; naming it with the
    longer SNR suffix list would misalign every later column with its values.

    Args:
        data: table object exposing ``columns.names`` and row access by
            integer index, with rows indexable by column name.

    Returns:
        A list of column-name strings in table order.
    """
    cols = []
    names = data.columns.names
    row = data[0]
    for name in names:
        col = row[name]
        name = name.lower()
        if not isinstance(col, np.ndarray):
            cols.append(name)
            continue

        if 'bin_r_' in name:
            colnames = binr_channels
        elif 'snr' in name:
            colnames = snr_channels
        elif 'specindex' in name:
            colnames = specindex_channels
        elif 'emline' in name:
            if 'sflux' in name or 'sew' in name:
                colnames = [oiid] + emline_channels
            else:
                colnames = [oii] + emline_channels
        else:
            # Unknown array column: never reuse the previous column's suffixes
            # (or hit an unbound variable); name the elements by position.
            cols.extend('{0}_{1}'.format(name, k) for k in range(len(col)))
            continue
        cols.extend(expand_column_names(name, cols=colnames))
    return cols

def create_dap_row(hdu):
    """Convert the table in ``hdu[1]`` into a list of Spark ``Row`` objects.

    Column names come from ``get_dap_column_names`` and each record is
    flattened with ``convert_row``, so array-valued columns become several
    scalar fields in the resulting rows.
    """
    table = hdu[1].data
    make_row = Row(*get_dap_column_names(table))
    return [make_row(*convert_row(record)) for record in table]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Filesystem paths of the two summary FITS files to load. Each is wrapped in a
# single-element list so it can be handed to sc.parallelize.
# NOTE(review): these are opened via getfitslocal inside rdd.map, so the paths
# presumably must be readable from every executor -- confirm.
drpall = ['/sciserver/vc/manga/vc/sas/dr15/manga/spectro/redux/v2_4_3/drpall-v2_4_3.fits']
dapall = ['/sciserver/vc/manga/vc/sas/dr15/manga/spectro/analysis/v2_4_3/2.2.1/dapall-v2_4_3-2.2.1.fits']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Build an RDD of Rows from the DRPall file: parallelize the path list, open
# each path with getfitslocal, then flatten each opened file into one Row per
# table record with create_drp_row.
rdd = sc.parallelize(drpall)
drptable = rdd.map(getfitslocal).flatMap(create_drp_row)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Sanity check: fetch and display the first converted DRPall Row.
drptable.first()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(plate=8935, ifudsgn='12701', plateifu='8935-12701', mangaid='1-404249', versdrp2='v2_4_3', versdrp3='v2_4_3', verscore='v1_6_2', versutil='v5_5_32', versprim='v2_5', platetyp='APOGEE-2&MaNGA', srvymode='MaNGA dither', objra=194.523423939, objdec=29.0173526307, ifuglon=85.4656882782, ifuglat=87.6076553974, ifura=194.52342, ifudec=29.017353, ebvgal=0.00884234, nexp=9, exptime=8100.82, drp3qual=0, bluesn2=26.412, redsn2=54.6103, harname='ma087', frlplug=22, cartid='4', designid=9555, cenra=195.773, cendec=28.079836, airmsmin=1.00379, airmsmed=1.0207, airmsmax=1.05363, seemin=1.4388, seemed=1.6464, seemax=1.77775, transmin=0.778568, transmed=0.894813, transmax=0.908259, mjdmin=57809, mjdmed=57809, mjdmax=57813, gfwhm=2.77918, rfwhm=2.69495, ifwhm=2.66897, zfwhm=2.65623, mngtarg1=4288, mngtarg2=0, mngtarg3=0, catidnum=1, plttarg='plateTargets-1.par', manga_tileid=6460, nsa_iauname='J125805.61+290102.5', ifudesignsize=127, ifutargetsize=127, ifudesignwrongsize=0, z=0.0256766, zmin=0.0154

In [15]:
# Build an RDD of Rows from the DAPall file, same pipeline as for DRPall but
# using create_dap_row for the conversion. Note that this rebinds ``rdd``.
rdd = sc.parallelize(dapall)
daptable = rdd.map(getfitslocal).flatMap(create_dap_row)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Sanity check: fetch and display the first converted DAPall Row.
daptable.first()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(plate=7443, ifudesign=3702, plateifu='7443-3702', mangaid='12-84670', drpallindx=5639, mode='CUBE', daptype='VOR10-GAU-MILESHC', dapdone=True, objra=230.5983428955078, objdec=43.367774963378906, ifura=230.5983428955078, ifudec=43.367774963378906, mngtarg1=2084, mngtarg2=0, mngtarg3=0, z=0.11045099794864655, ldist_z=358.3685607910156, adist_z=290.6237487792969, nsa_z=0.11045099794864655, nsa_zdist=0.11082600057125092, ldist_nsa_z=358.3685607910156, adist_nsa_z=290.6237487792969, nsa_elpetro_ba=0.9506450295448303, nsa_elpetro_phi=129.19700622558594, nsa_elpetro_th50_r=2.6250100135803223, nsa_sersic_ba=0.9509689807891846, nsa_sersic_phi=119.18900299072266, nsa_sersic_th50=3.677690029144287, nsa_sersic_n=6.0, versdrp2='v2_4_3', versdrp3='v2_4_3', verscore='v1_6_2', versutil='v5_5_32', versdap='2.2.1', drp3qual=0, dapqual=0, rdxqakey='SNRG', binkey='VOR10', sckey='GAU-MILESHC', elmkey='EMOMM', elfkey='EFITM', sikey='INDXEN', bintype='voronoi', binsnr=10.0, tplkey='MILESHC', datedap='201

In [17]:
# HDFS output directories for the Parquet copies of the two summary tables.
hdfs_drp_dir = 'hdfs:///manga/brian-test/dr15/v2_4_3/drpall'
hdfs_dap_dir = 'hdfs:///manga/brian-test/dr15/v2_4_3/dapall'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Convert both Row RDDs to DataFrames and write them to HDFS as Parquet,
# timing the whole step. mode('overwrite') replaces any existing output in
# the target directories.
t = time.time()

# DRPall summary table.
table = spark.createDataFrame(drptable)
table.write.mode('overwrite').parquet(hdfs_drp_dir)

# DAPall summary table (``table`` is rebound to the second DataFrame).
table = spark.createDataFrame(daptable)
table.write.mode('overwrite').parquet(hdfs_dap_dir)

# Elapsed wall-clock time in seconds.
print(time.time()-t)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

186.46995854377747