In [71]:
%%configure -f
{"name": "brian-load-dapmaps-write", "executorMemory": "8G", "numExecutors": 29, "executorCores": 4,
 "conf": {"spark.yarn.appMasterEnv.PYSPARK_PYTHON":"python3"}}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
262,application_1588740809550_0250,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
262,application_1588740809550_0250,pyspark,idle,Link,Link,✔


In [72]:
import sys
import os
import re
import subprocess
from io import BytesIO
from gzip import GzipFile
from pyspark.sql import Row
import glob
import time
import numpy as np
from itertools import repeat
from collections import OrderedDict

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [73]:
def get_fits_module():
    if 'astropy' not in sys.modules:
        stdout = subprocess.check_output(
            sys.executable + ' -m pip install astropy',
            stderr=subprocess.STDOUT,
            shell=True).decode('utf-8')
    from astropy.io import fits
    return fits

def getfitslocal(path):
    fits = get_fits_module()
    fits_obj = fits.open(path)
    return fits_obj

# Spark dataframe cannot do numpy types
def typeconv(i):
    try:
        return i.item()
    except:
        return i

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [74]:
def get_channels(hdu):
    ''' get map channel names'''
    columns = []
    for ext in hdu:
        extname = ext.name
        if extname == 'PRIMARY':
            continue
        data = ext.data
        ctype = 'real' if data.dtype == np.dtype('>f') else 'integer'
        hdr=ext.header
        channels = [{int(key[1:]):val} for key, val in hdr.items() if re.search('^C[0-9]', key)]
        if not channels:
            channels = [None]
        for i, channel in enumerate(channels):
            chantext = channel[i+1] if channel else None
            if chantext:
                newchan = re.sub('[-]{2,}', '-', chantext).lower().replace('-', '_').replace(' ', '_').replace('.', '').replace('/', '_')
                colname = '{0}_{1}'.format(extname.lower(), newchan)
            else:
                colname = '{0}'.format(extname.lower())
            columns.append(colname)
    return columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [75]:
def reshape_data(hdu):
    ''' reshape the data into n_channel by n_spaxel '''
    reshaped=[]
    for ext in hdu:
        if ext.name == 'PRIMARY':
            continue
        extname = ext.name
        data = ext.data
        if data.ndim == 3:
            nchan, ny, nx = data.shape
            red = data.reshape(nchan, ny*nx)
        else:
            ny, nx = data.shape
            nchan = None
            red = data.reshape(ny*nx)
        reshaped.append(red)
    return reshaped

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [76]:
def collapse_spaxel(data, spaxel_idx):
    ''' collapse all channel parameters values into a single array '''
    params = []
    for d in data:
        if d.ndim == 2:
            pp=[typeconv(i) for i in d[:,spaxel_idx].tolist()]
        else:
            pp=[typeconv(d[spaxel_idx])]
        params.extend(pp)
    return params

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [77]:
from itertools import repeat

def createPerSpaxelRow(fits):
    n_y, n_x = fits['SPX_SNR'].data.shape
    n_spax = np.int(n_y * n_x)
    
    base_cols = ('plateifu', 'mangaid', 'ra', 'dec', 'daptype', 'spaxel_idx', 'x', 'y')
    channels = get_channels(fits)
    columns = base_cols + tuple(channels)
    mapRow = Row(*columns)
    
    hdr = fits[0].header
    plateifu = typeconv(hdr.get('PLATEIFU'))
    mangaid = typeconv(hdr.get('MANGAID'))
    ra = typeconv(hdr.get('OBJRA'))
    dec = typeconv(hdr.get('OBJDEC'))
    daptype = typeconv(hdr.get('DAPTYPE'))
    
    reshaped_params = reshape_data(fits)
    
    rows = []
    for spaxel_idx in range(n_spax):
        y_idx = int(spaxel_idx / n_y)
        x_idx = spaxel_idx - y_idx * n_y
        x = (x_idx)
        y = (y_idx)
        
        # params a list of channel values for a given spaxel in order of channels 
        params = collapse_spaxel(reshaped_params, spaxel_idx)
        
        row_gen = [mapRow(plateifu, mangaid, ra, dec, daptype, (spaxel_idx),
                      x, y, *params)]
        #_rows = [mapRow(*i) for i in row_gen]
        rows.extend(row_gen)
    return rows
    

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [78]:
base_dir = '/sciserver/vc/manga/vc/sas/dr15/manga/spectro/analysis/v2_4_3/2.2.1/HYB10-GAU-MILESHC/'
dirs1 = [base_dir + i for i in os.listdir(base_dir)]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [79]:
files_rdd = sc.parallelize(dirs1, len(dirs1)).flatMap(lambda x: [x+'/'+i for i in os.listdir(x)]).flatMap(lambda d: glob.glob(d+'/*-MAPS-*.fits.gz'))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [80]:
files_stats = files_rdd.map(
    lambda x: (os.stat(x).st_size/1024/1024, 1, os.stat(x).st_size/1024/1024)
).reduce(
    lambda x,y: (x[0]+y[0], x[1]+y[1], max(x[2],y[2]))
)
print('N files: {1}. Total Size: {0:0.0f}MB, average size: {2:0.0f}'.format(*files_stats))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

N files: 4718. Total Size: 9897MB, average size: 4

In [81]:
n_part = int(files_stats[1])
table_data = files_rdd.repartition(n_part).map(getfitslocal).flatMap(createPerSpaxelRow)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [82]:
table_data.first()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(plateifu='8145-6104', mangaid='1-152567', ra=116.511679886, dec=28.878071563, daptype='HYB10-GAU-MILESHC', spaxel_idx=0, x=0, y=0, spx_skycoo_on_sky_x=13.499882698059082, spx_skycoo_on_sky_y=-13.49868106842041, spx_ellcoo_elliptical_radius=23.674753189086914, spx_ellcoo_r_re=6.374599456787109, spx_ellcoo_elliptical_azimuth=115.72105407714844, spx_mflux=0.0, spx_mflux_ivar=0.0, spx_snr=0.0, binid_binned_spectra=-1, binid_stellar_continua=-1, binid_em_line_moments=-1, binid_em_line_models=-1, binid_spectral_indices=-1, bin_lwskycoo_lum_weighted_on_sky_x=0.0, bin_lwskycoo_lum_weighted_on_sky_y=0.0, bin_lwellcoo_lum_weighted_elliptical_radius=0.0, bin_lwellcoo_r_re=0.0, bin_lwellcoo_lum_weighted_elliptical_azimuth=0.0, bin_area=0.0, bin_farea=0.0, bin_mflux=0.0, bin_mflux_ivar=0.0, bin_mflux_mask=1073741827, bin_snr=0.0, stellar_vel=0.0, stellar_vel_ivar=0.0, stellar_vel_mask=1073741827, stellar_sigma=0.0, stellar_sigma_ivar=0.0, stellar_sigma_mask=1073741827, stellar_sigmacorr=0.0, st

In [83]:
hdfs_dir = 'hdfs:///manga/brian-test/dr15/v2_4_3/maps'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [84]:
t = time.time()
table = spark.createDataFrame(table_data)
table.write.mode('overwrite').parquet(hdfs_dir)
print(time.time()-t)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

467.857421875

In [90]:
    data = spark.read.parquet('hdfs:///manga/brian-test/dr15/v2_4_3/maps')
    data.createOrReplaceTempView('maps')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [91]:
spark.sql('''describe maps''').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|            plateifu|   string|   null|
|             mangaid|   string|   null|
|                  ra|   double|   null|
|                 dec|   double|   null|
|             daptype|   string|   null|
|          spaxel_idx|   bigint|   null|
|                   x|   bigint|   null|
|                   y|   bigint|   null|
| spx_skycoo_on_sky_x|   double|   null|
| spx_skycoo_on_sky_y|   double|   null|
|spx_ellcoo_ellipt...|   double|   null|
|     spx_ellcoo_r_re|   double|   null|
|spx_ellcoo_ellipt...|   double|   null|
|           spx_mflux|   double|   null|
|      spx_mflux_ivar|   double|   null|
|             spx_snr|   double|   null|
|binid_binned_spectra|   bigint|   null|
|binid_stellar_con...|   bigint|   null|
|binid_em_line_mom...|   bigint|   null|
|binid_em_line_models|   bigint|   null|
+--------------------+---------+-------+
only showing top

In [92]:
spark.sql('''select count(*) from maps''').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+
|count(1)|
+--------+
|16028280|
+--------+

In [96]:
spark.sql('''select plateifu, count(plateifu) from maps group by plateifu''').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------------+
|  plateifu|count(plateifu)|
+----------+---------------+
|8984-12705|           5476|
|8626-12701|           5476|
|8942-12704|           5476|
| 8952-3702|           1936|
|7990-12702|           5476|
|8947-12705|           5476|
| 8603-6104|           2916|
| 8458-3701|           1936|
| 8552-3702|           1936|
|9184-12703|           5776|
|9036-12703|           5776|
| 9883-6101|           2916|
|8146-12705|           5776|
|8601-12701|           5476|
| 8611-3701|           1936|
| 9510-1901|           1156|
| 7815-6102|           2704|
| 9678-6102|           2916|
|9512-12705|           5476|
| 8937-1902|           1156|
+----------+---------------+
only showing top 20 rows

In [97]:
spark.sql('''select count(plateifu) from (select plateifu, count(plateifu) from maps group by plateifu)''').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+
|count(plateifu)|
+---------------+
|           4718|
+---------------+

In [102]:
    cdata = spark.read.parquet('hdfs:///manga/arik-test/dr15/v2_4_3/logcube')
    cdata.createOrReplaceTempView('logcube')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [105]:
spark.sql('''select plateifu, count(emline_gflux_ha_6564) from maps where emline_gflux_ha_6564 > 25 group by plateifu''').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------------------------+
|  plateifu|count(emline_gflux_ha_6564)|
+----------+---------------------------+
| 8552-3702|                          1|
| 8458-3701|                         47|
| 8262-1901|                         27|
| 8150-6101|                         17|
| 8616-1901|                         33|
| 8982-9101|                        120|
| 8553-9102|                          4|
|9674-12703|                         74|
| 8259-3704|                         19|
| 8333-1902|                         32|
| 8262-3702|                         76|
| 8952-6104|                         52|
|9674-12701|                         58|
| 9038-9101|                         11|
| 8453-6102|                         38|
| 8261-1901|                         22|
| 8156-3702|                         19|
|9675-12703|                       2436|
| 9184-3704|                         30|
| 8615-3701|                         37|
+----------+---------------------------+
only showing top

In [108]:
spark.sql('''select plateifu, emline_gflux_ha_6564, x, y from maps where emline_gflux_ha_6564 > 25''').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+---+---+
| plateifu|emline_gflux_ha_6564|  x|  y|
+---------+--------------------+---+---+
|8551-3701|  26.905492782592773| 18| 20|
|8551-3701|  31.592317581176758| 19| 20|
|8551-3701|  33.098114013671875| 20| 20|
|8551-3701|  31.165016174316406| 21| 20|
|8551-3701|  27.630207061767578| 22| 20|
|8551-3701|   25.37577247619629| 17| 21|
|8551-3701|  33.406036376953125| 18| 21|
|8551-3701|   40.30801010131836| 19| 21|
|8551-3701|    42.7925910949707| 20| 21|
|8551-3701|   43.42845916748047| 21| 21|
|8551-3701|   39.72331237792969| 22| 21|
|8551-3701|  34.885719299316406| 23| 21|
|8551-3701|  30.181264877319336| 24| 21|
|8551-3701|  26.739524841308594| 25| 21|
|8551-3701|  27.167835235595703| 17| 22|
|8551-3701|  36.835113525390625| 18| 22|
|8551-3701|   41.56124496459961| 19| 22|
|8551-3701|  44.282310485839844| 20| 22|
|8551-3701|   45.60519790649414| 21| 22|
|8551-3701|  44.445220947265625| 22| 22|
+---------+--------------------+---+---+
only showing top

In [None]:
spark.sql('''select plateifu, emline_gflux_ha_6564, x, y from maps where emline_gflux_ha_6564 > 25 and binid_binned_spectra != -1 and binid_stellar_continua != -1 and binid_em_line_models != -1 and binid_em_line_moments != -1 and binid_spectral_indices != -1''').count()