In [1]:
import gc
import glob
import operator
import sys
from datetime import datetime
from functools import reduce
from itertools import zip_longest

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
#from netCDF4 import Dataset
from scipy.spatial import cKDTree
from tqdm.notebook import tqdm

In [2]:
# Plot defaults: serif text in Times New Roman, base font size 16, STIX math glyphs.
plt.rc('font', family='serif', serif='Times New Roman')
plt.rcParams.update({
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

In [3]:
# PySpark packages
from pyspark import SparkContext   
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W


# Create (or reuse) the YARN-backed session with the driver/executor sizing used in this notebook.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "14g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "60")
    .getOrCreate()
)

# Keep a handle on the SparkContext and point checkpoints at HDFS.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://sohnic:54310/tmp/checkpoints")

# Runtime SQL options: longer plan strings for debugging, Arrow for pandas conversion.
spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/24 05:16:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/24 05:16:29 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [4]:
sc.getConf().getAll()[:10]

[('spark.driver.memory', '32g'),
 ('spark.executorEnv.PYTHONPATH',
  '/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip:/usr/local/spark/python/:<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.9.7-src.zip'),
 ('spark.driver.maxResultSize', '32g'),
 ('spark.app.startTime', '1700802987948'),
 ('spark.ui.proxyBase', '/proxy/application_1700802742761_0001'),
 ('spark.driver.port', '36069'),
 ('spark.master', 'yarn'),
 ('spark.app.id', 'application_1700802742761_0001'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.host', 'sohnic')]

In [13]:
h5dir = '/mnt/raid5/lshin/'

In [14]:
# List the HDF5 snapshots under h5dir by bare file name (directory prefix stripped), so
# later cells can join them back onto h5dir. Globbing for *.hdf5 keeps unrelated files in
# the directory out of the list and avoids repeating the path in a shell command.
flist = [path[len(h5dir):] for path in sorted(glob.glob(h5dir + '*.hdf5'))]

In [15]:
flist

['snap_099.0.0.0.hdf5']

In [16]:
h5list = flist#[2:]

In [17]:
numh5list = len(h5list)
print(numh5list)

1


In [18]:
h5dir+h5list[0]

'/mnt/raid5/lshin/snap_099.0.0.0.hdf5'

In [19]:
h5py.is_hdf5(h5dir+h5list[0])

True

In [20]:
# Open the first snapshot read-only. On failure, report the problem and re-raise:
# every later cell needs `h5f`, so continuing would only surface as a confusing
# NameError further down.
try:
    h5f = h5py.File(h5dir+h5list[0], "r")
except IOError as e:
    print("Error opening HDF5 file:", str(e))
    raise
# Don't forget h5f.close() when done!

In [21]:
h5f.keys()

<KeysViewHDF5 ['PartType0', 'PartType1', 'PartType4', 'PartType5']>

In [22]:
# Walk the top-level members of the file and report whether each is a group or a dataset.
for key in h5f:
    item = h5f[key]

    if isinstance(item, h5py.Dataset):
        # Datasets additionally report their element dtype.
        print(f"Dataset: {key}, dtype: {item.dtype}")
        continue

    print(f"Group: {key}" if isinstance(item, h5py.Group) else f"Unknown item: {key}")

Group: PartType0
Group: PartType1
Group: PartType4
Group: PartType5


In [27]:
h5f['PartType4'].keys()

<KeysViewHDF5 ['Coordinates', 'GFM_StellarFormationTime', 'Masses']>

In [28]:
# Same walk for the members of the PartType4 group, also showing each dataset's shape.
for key in h5f['PartType4']:
    item = h5f['PartType4'][key]

    if isinstance(item, h5py.Dataset):
        # Datasets report both element dtype and array shape.
        print(f"Dataset: {key}, dtype: {item.dtype}, shape: {item.shape}")
        continue

    print(f"Group: {key}" if isinstance(item, h5py.Group) else f"Unknown item: {key}")

Dataset: Coordinates, dtype: float32, shape: (206055, 3)
Dataset: GFM_StellarFormationTime, dtype: float32, shape: (206055,)
Dataset: Masses, dtype: float32, shape: (206055,)


# Save selected features as a Parquet dataset

In [55]:
# Flat output schema: one nullable float column per stored quantity, in write order.
# (An earlier draft typed each column as ArrayType(FloatType()) instead.)
schema = T.StructType([
    T.StructField(column_name, T.FloatType(), True)
    for column_name in (
        'gas_coord', 'gas_mass',
        'dm_coord',
        'star_coord', 'star_form_time', 'star_mass',
        'bh_coord', 'bh_mass',
    )
])

In [43]:
h5f['PartType4/Coordinates'].shape

(206055, 3)

In [41]:
h5f['PartType4/Coordinates'][:5]

array([[1315.3317, 4634.5044, 1751.5077],
       [1315.0211, 4634.4673, 1751.346 ],
       [1314.794 , 4634.2354, 1751.6212],
       [1315.2739, 4634.2817, 1751.7683],
       [1315.1174, 4634.261 , 1751.265 ]], dtype=float32)

In [44]:
print(len(h5f['PartType4/Coordinates'][:,:].flatten().tolist()))
h5f['PartType4/Coordinates'][:,:].flatten().tolist()[:5]

618165


[1315.3316650390625,
 4634.50439453125,
 1751.5076904296875,
 1315.0211181640625,
 4634.46728515625]

In [73]:
def _read_flat(path):
    """Return all values of the HDF5 dataset at `path` as a flat list of Python floats.

    The dataset is read from `h5f` in a single call and flattened in row-major order.
    Iterating an h5py Dataset directly (``for i in dataset``) fetches one element per
    step, which is far slower for large arrays and yields the same values.
    """
    return h5f[path][...].ravel().tolist()


# Coordinates are stored as (N, 3) arrays (see the PartType4 listing above), so each
# *_coord list holds three consecutive values per particle, while the per-particle
# quantities (masses, formation time) hold one value per particle.
gas_coord = _read_flat('PartType0/Coordinates')
gas_mass = _read_flat('PartType0/Masses')
dm_coord = _read_flat('PartType1/Coordinates')
star_coord = _read_flat('PartType4/Coordinates')
star_form_time = _read_flat('PartType4/GFM_StellarFormationTime')
star_mass = _read_flat('PartType4/Masses')
bh_coord = _read_flat('PartType5/Coordinates')
bh_mass = _read_flat('PartType5/Masses')

In [74]:
%%time
sparkdf = spark.createDataFrame(zip(gas_coord, gas_mass, dm_coord, star_coord, star_form_time, star_mass, bh_coord, bh_mass),schema)

CPU times: user 444 ms, sys: 7.45 ms, total: 452 ms
Wall time: 485 ms


In [75]:
outdir = 'hdfs://sohnic:54310/user/lshin/'

In [77]:
outname = outdir+h5list[0].replace("hdf5","parquet.snappy")
print(outname)

hdfs://sohnic:54310/user/lshin/snap_099.0.0.0.parquet.snappy


In [78]:
%%time
sparkdf.write.option("compression", "snappy") \
    .mode("overwrite") \
    .save(outname)

                                                                                

CPU times: user 15.9 ms, sys: 4 ms, total: 19.9 ms
Wall time: 30.5 s


# Check the parquet

In [79]:
sparkdf.printSchema()

root
 |-- gas_coord: float (nullable = true)
 |-- gas_mass: float (nullable = true)
 |-- dm_coord: float (nullable = true)
 |-- star_coord: float (nullable = true)
 |-- star_form_time: float (nullable = true)
 |-- star_mass: float (nullable = true)
 |-- bh_coord: float (nullable = true)
 |-- bh_mass: float (nullable = true)



In [80]:
%%time
print(sparkdf.count())

23/11/24 06:41:51 WARN TaskSetManager: Lost task 0.0 in stage 11.0 (TID 692) (node5 executor 25): FetchFailed(BlockManagerId(19, node7, 45669, None), shuffleId=2, mapIndex=51, mapId=631, reduceId=0, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:437)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1232)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:971)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:86)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$10.nextCur(Iterator.scala:587)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:576)
	at org.apache.spark.util.CompletionIterator.hasNext(C

Py4JJavaError: An error occurred while calling o161.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: ResultStage 11 (count at NativeMethodAccessorImpl.java:0) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:437)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1232)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:971)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:86)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$10.nextCur(Iterator.scala:587)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:576)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:576)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.hashAgg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.io.IOException: Failed to connect to node5/192.168.0.135:42879
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:284)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:214)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:130)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:173)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.lambda$initiateRetry$0(RetryingBlockTransferor.java:206)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	... 1 more
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: node5/192.168.0.135:42879
Caused by: java.net.ConnectException: Connection refused
	at java.base/sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
	at java.base/sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:777)
	at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:337)
	at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:334)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:776)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:650)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562)
	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	at java.base/java.lang.Thread.run(Thread.java:829)

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.immutable.List.foreach(List.scala:333)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1961)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2978)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:382)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:3459)
	at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:3458)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:3458)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
