In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py

from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pqw

from functools import reduce
import operator
import gc

# Dimensionless Hubble parameter. Below it multiplies aperture radii given in
# kpc (e.g. (50*h)**2) before they are compared with distances computed from
# the catalogue positions, which the notebook describes as being in ckpc/h.
# NOTE(review): presumably the TNG300 cosmology value -- confirm.
h = 0.6774

In [2]:
# Global matplotlib defaults: serif (Times New Roman) text at 16 pt and
# STIX glyphs for math text.
plt.rc('font', family='serif', serif='Times New Roman')
plt.rcParams.update({
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

In [3]:
from pyspark import SparkContext   
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
from pyspark.sql.functions import broadcast, col, sqrt, pow, floor, monotonically_increasing_id, abs, pmod, least, row_number
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

# Create (or reuse) the Spark session on the standalone cluster master.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("spark://sohnic:7077")
    .config("spark.driver.memory", "100g")
    .getOrCreate()
)

# Context handle plus the HDFS directory used for RDD/DataFrame checkpoints.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://sohnic:54310/tmp/checkpoints")

# Session-level SQL options: longer plan/schema strings in debug output and
# Arrow-based conversion for toPandas().
spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/19 21:48:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 1. Reading the particle and subhalo data

In [4]:
%%time

outname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/snap099_sorted.parquet.snappy'
rawdf = spark.read.option("header","true").option("recursiveFileLookup","true").parquet(outname)

24/09/19 21:02:11 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

CPU times: user 8.94 ms, sys: 0 ns, total: 8.94 ms
Wall time: 10 s


In [8]:
# Show the column names and types of the particle table.
rawdf.printSchema()

root
 |-- px: double (nullable = true)
 |-- py: double (nullable = true)
 |-- pz: double (nullable = true)
 |-- vx: double (nullable = true)
 |-- vy: double (nullable = true)
 |-- vz: double (nullable = true)
 |-- mass: double (nullable = true)



In [9]:
%%time
rawdf.limit(4).toPandas().T



CPU times: user 8.05 ms, sys: 5.76 ms, total: 13.8 ms
Wall time: 2.62 s


                                                                                

Unnamed: 0,0,1,2,3
px,31709.944511,31632.233604,31723.905726,31635.570883
py,149516.276701,149433.240081,149425.764975,149433.2444
pz,109856.814293,109874.111223,109871.943071,109855.779448
vx,23.879442,3.904811,229.96228,-39.1357
vy,-108.89532,-332.08118,294.82867,-180.44347
vz,-87.56024,-242.37239,-85.344345,-138.83029
mass,0.000591,0.000551,0.000556,0.000289


In [7]:
%%time
# Summary statistics for every particle column, printed as a Spark text table.
# This is an action over the whole particle table.
rawdf.describe().show() 



+-------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+
|summary|                px|               py|                pz|                vx|                vy|                  vz|                mass|
+-------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+
|  count|         711967480|        711967480|         711967480|         711967480|         711967480|           711967480|           711967480|
|   mean|100688.61828005507|99059.36895091053|105857.77590178407|1.2704478306706097|1.0880226031171296|-0.30910798969782094|5.280344892323565E-4|
| stddev| 58078.24131182921|57695.00416267924|59449.995547022576| 376.9752051507219|390.00232101241244|  379.19046823402056|1.424400845340057...|
|    min| 1.136348000727E-4| 8.96728655789E-4|   6.5989815512E-4|        -4127.3325|        -3954.2883|          -4355.7554|

                                                                                

In [8]:
%%time
rawdf.describe().toPandas().T 



CPU times: user 175 ms, sys: 44.8 ms, total: 220 ms
Wall time: 45.3 s


                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
px,711967480,100688.6182800552,58078.24131182934,1.136348000727E-4,204999.9994621104
py,711967480,99059.36895091052,57695.00416267917,8.96728655789E-4,204999.9977547349
pz,711967480,105857.77590178445,59449.995547022656,6.5989815512E-4,204999.9998752371
vx,711967480,1.2704478306706155,376.97520515072165,-4127.3325,4329.7563
vy,711967480,1.0880226031171272,390.0023210124123,-3954.2883,4393.859
vz,711967480,-0.3091079896978168,379.1904682340204,-4355.7554,5640.448
mass,711967480,5.280344892323571E-4,1.4244008453400582E-4,8.154293E-6,0.0014874495


In [9]:
%%time
#subhalo table
subname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/subhalo_trim.parquet.snappy'
subdf = spark.read.option("header","true").option("recursiveFileLookup","true").parquet(subname)

#adding a column for the subhalo id
#subdf = subdf.withColumn("sub_id", F.monotonically_increasing_id())
#windowSpec = W.orderBy("sub_id")
#subdf = subdf.withColumn("sub_id", F.row_number().over(windowSpec) - 1)

#changing the column name for cross joining
subdf = subdf.withColumnRenamed("px", "sub_px")
subdf = subdf.withColumnRenamed("py", "sub_py")
subdf = subdf.withColumnRenamed("pz", "sub_pz")
subdf = subdf.withColumnRenamed("vx", "sub_vx")
subdf = subdf.withColumnRenamed("vy", "sub_vy")
subdf = subdf.withColumnRenamed("vz", "sub_vz")
subdf = subdf.withColumnRenamed("mass", "sub_mass")

subdf.limit(4).toPandas()
#subdf.describe().toPandas().T 

CPU times: user 15.1 ms, sys: 0 ns, total: 15.1 ms
Wall time: 598 ms


Unnamed: 0,sub_px,sub_py,sub_pz,sub_vx,sub_vy,sub_vz,sub_mass,sub_id
0,85427.59375,2383.967529,55310.355469,-131.656403,479.287964,-101.148598,1035811000.0,2172
1,187141.890625,113344.992188,185398.46875,-272.443634,-723.376648,126.204926,1130842000.0,2173
2,187014.40625,112437.914062,185575.78125,988.073975,-460.934631,550.105957,1118096000.0,2174
3,187099.734375,112477.179688,185824.390625,697.318298,-98.391518,102.688042,1146507000.0,2175


In [5]:
# Assign every subhalo to a cubic grid cell by flooring its position divided
# by 100 (the cell WIDTH in position units). With positions spanning
# 0..205000 this gives 2050 cells per axis, not 100 -- the cell count matters
# for the periodic wrap-around test in the extraction step.
for axis in ("x", "y", "z"):
    subdf = subdf.withColumn(f"sub_i{axis}", F.floor(F.col(f"sub_p{axis}") / 100))

# Preview only: fetch five rows instead of collecting the whole catalogue to
# the driver and then slicing it in pandas.
subdf.limit(5).toPandas()

Unnamed: 0,sub_px,sub_py,sub_pz,sub_vx,sub_vy,sub_vz,sub_mass,sub_id,sub_ix,sub_iy,sub_iz
0,99173.578125,40274.546875,107897.390625,-67.104362,-151.3302,167.907593,349215300000.0,103713,991,402,1078
1,120705.28125,69941.101562,157908.03125,269.752045,-320.683807,128.344238,328744200000.0,103714,1207,699,1579
2,204545.296875,170294.8125,87957.867188,363.489685,148.479889,-54.548824,345324800000.0,103715,2045,1702,879
3,173194.890625,158155.453125,162955.359375,83.774239,-126.974121,218.970139,393756000000.0,103716,1731,1581,1629
4,34154.796875,35974.179688,149377.25,559.149902,597.246765,-471.29715,386957400000.0,103717,341,359,1493


In [11]:
%%time
#dividing the data into 100*100*100 boxes and adding the column for the box id
rawdf = rawdf.withColumn("ix", floor(F.col("px") / 100))
rawdf = rawdf.withColumn("iy", floor(F.col("py") / 100))
rawdf = rawdf.withColumn("iz", floor(F.col("pz") / 100))
rawdf.limit(4).toPandas()



CPU times: user 21.2 ms, sys: 2.46 ms, total: 23.7 ms
Wall time: 2.58 s


                                                                                

Unnamed: 0,px,py,pz,vx,vy,vz,mass,ix,iy,iz
0,70037.914989,122711.870105,78395.454087,-115.07028,-183.36308,164.8555,0.000487,700,1227,783
1,70050.889402,122702.683213,78380.293789,-295.58307,-240.30115,57.16462,0.00068,700,1227,783
2,70052.049477,122687.959709,78388.525112,-357.15543,-156.07643,215.15742,0.000389,700,1226,783
3,70044.449632,122716.085783,78394.4034,-302.51917,-30.818964,586.6003,0.000632,700,1227,783


# 2. Saving the particle data within the 50kpc spheres centered at each subhalo

In [6]:
# Squared aperture radius: 50 (kpc) scaled by h, then squared so it can be
# compared directly with squared distances. Shown as the cell output.
aperture_radius = 50 * h
radius_sq = aperture_radius ** 2
radius_sq

1147.1769

In [None]:
# Extract, for every subhalo, the particles inside a 50 kpc sphere around its
# centre (periodic box) and write them to parquet partitioned by sub_id.
#
# Requires the grid-index columns ix/iy/iz on rawdf and sub_ix/sub_iy/sub_iz
# on subdf, both built as floor(position / 100) in the cells above.
radius_sq = (50*h)**2  # squared 50 kpc aperture in position units (ckpc/h)
cell_size = 100        # grid cell width; must match the floor(p / 100) indexing
box_size = 205000      # box size in the ckpc/h unit
# Number of grid cells per axis (2050). The periodic wrap of the cell indices
# must use this count: with the previous value of 100 the coarse filter let
# through every pair whose index difference was >= 99.
id_size = (box_size + cell_size - 1) // cell_size

# Looking only at the +-1 neighbouring cells is valid while the aperture is
# no larger than one cell.
assert radius_sq <= cell_size ** 2, "aperture exceeds one grid cell; widen the coarse filter"


def _periodic_sep(col_a, col_b, period):
    """Minimum-image separation |col_a - col_b| on an axis of length `period`."""
    delta = F.abs(F.col(col_a) - F.col(col_b))
    return F.least(delta, F.lit(period) - delta)


# Broadcast the (small) subhalo table to all workers.
broadcast_subdf = F.broadcast(subdf)

# Step 1: coarse filter -- keep particle/subhalo pairs whose grid cells are
# identical or adjacent on every axis, including across the periodic boundary.
filtered_df = rawdf.crossJoin(broadcast_subdf).filter(
    (_periodic_sep("ix", "sub_ix", id_size) <= 1) &
    (_periodic_sep("iy", "sub_iy", id_size) <= 1) &
    (_periodic_sep("iz", "sub_iz", id_size) <= 1)
)

# Squared minimum-image distance between the particle and the subhalo centre.
filtered_df = filtered_df.withColumn(
    "dist_cent_sq",
    F.pow(_periodic_sep("px", "sub_px", box_size), 2) +
    F.pow(_periodic_sep("py", "sub_py", box_size), 2) +
    F.pow(_periodic_sep("pz", "sub_pz", box_size), 2)
)

# Step 2: fine filter -- keep only particles strictly inside the aperture.
filtered_df = filtered_df.filter(F.col("dist_cent_sq") < radius_sq)

# Select relevant columns (including sub_id for identification).
result_df = filtered_df.select("sub_id", "px", "py", "pz", "vx", "vy", "vz", "mass", "dist_cent_sq")

subname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/extracted_region_240918.parquet.snappy'
result_df.write.option("compression", "snappy").mode("overwrite").partitionBy("sub_id").parquet(subname)

In [7]:
# Sanity check: read the extracted particles back and summarise the rows
# belonging to a single subhalo (sub_id 10000).
subname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/extracted_region_240918.parquet.snappy'
df = spark.read.parquet(subname)

df_filtered = df.where(F.col("sub_id") == 10000)
df_filtered.describe().toPandas()

                                                                                

Unnamed: 0,summary,px,py,pz,vx,vy,vz,mass,dist_cent_sq,sub_id
0,count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
1,mean,86539.98217092815,115001.8123021865,144899.63376864442,-173.17213001875004,-162.7695493062501,201.533828775,0.0005594609641875001,52.50703274436383,10000.0
2,stddev,4.35862232808732,3.330779277000689,4.676035647780895,39.405089898886125,39.32917348856376,38.71986463333312,0.00014510780084495814,133.96340006846228,0.0
3,min,86517.54089495828,114987.06916561146,144874.48056289784,-298.88245,-250.64078,100.80679,0.00025858832,0.3857580077748698,10000.0
4,max,86558.38979516154,115009.8895705654,144911.07695180256,-54.3939,-47.9957,280.2506,0.00087464123,1050.5479423833408,10000.0


# 3. Calculating stellar velocity dispersion

We measure the stellar velocity dispersion of ~10$^5$ subhalos. We first select stellar particles within subhalo-centered spheres with a certain radius. We then calculate the mass-weighted stellar velocity dispersion based on the 3D velocity of selected stellar particles. 

### $M = \Sigma m_i $
### $\overline{v_x} = \frac{\Sigma m_i v_{i,x}}{M}$, $\, \overline{v_y} = \frac{\Sigma m_i v_{i,y}}{M}$, $\, \overline{v_z} = \frac{\Sigma m_i v_{i,z}}{M}$ 
### $\sigma_x^2 = \frac{\Sigma m_i (v_{i,x} - \overline{v_{x}})^2}{M}$, $\, \sigma_y^2 = \frac{\Sigma m_i (v_{i,y} - \overline{v_{y}})^2}{M}$, $\, \sigma_z^2 = \frac{\Sigma m_i (v_{i,z} - \overline{v_{z}})^2}{M}$
### $\therefore \sigma_{3D} = \sqrt{\sigma_x^2 + \sigma_y^2 + \sigma_z^2}$
We repeat the above calculation for six different radii (3, 5, 10, 20, 30, and 50 kpc).

In [4]:
# Re-open the extracted particle table and summarise the rows of sub_id 0.
subname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/extracted_region_240918.parquet.snappy'
df = spark.read.parquet(subname)

df_filtered = df.where(F.col("sub_id") == 0)
df_filtered.describe().toPandas()

24/09/19 21:48:45 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

Unnamed: 0,summary,px,py,pz,vx,vy,vz,mass,dist_cent_sq,sub_id
0,count,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0
1,mean,41840.65205640344,49050.24049195815,146618.59384127078,491.1804762142857,2141.1263542857128,-734.4352684999993,0.0005289361012142856,33.778025427646135,0.0
2,stddev,3.434179745897391,2.646253888959987,3.830843719144255,47.42457028605087,43.16277394559926,46.21973330251511,0.00013347494347952017,69.45086123003283,0.0
3,min,41821.05056670708,49042.46942823968,146605.40289966715,330.69318,2015.9958,-865.76483,0.00029745823,2.034020586918334e-07,0.0
4,max,41854.11205584526,49061.58165936835,146634.89777236278,636.5443,2252.1292,-622.19794,0.00087916595,445.53822808873207,0.0


In [5]:
# Count how many different subhalos have at least one extracted particle.
distinct_ids = df.select("sub_id").distinct()
subhalo_count = distinct_ids.count()
print(f"Number of distinct subhalo IDs: {subhalo_count}")



Number of distinct subhalo IDs: 104290


                                                                                

In [6]:
%%time
from pyspark.sql import Window as W
import pyspark.sql.functions as F

# Define subhalo window for partitioning by "sub_id"
subhalo_window = W.partitionBy("sub_id")

# Initial weighted velocity and dispersion calculations for all particles
df = df.withColumn("vx_weighted", F.col("mass") * F.col("vx"))
df = df.withColumn("vy_weighted", F.col("mass") * F.col("vy"))
df = df.withColumn("vz_weighted", F.col("mass") * F.col("vz"))

df = df.withColumn("mass_sum", F.sum("mass").over(subhalo_window))

df = df.withColumn("vx_avg", F.sum("vx_weighted").over(subhalo_window) / F.col("mass_sum"))
df = df.withColumn("vy_avg", F.sum("vy_weighted").over(subhalo_window) / F.col("mass_sum"))
df = df.withColumn("vz_avg", F.sum("vz_weighted").over(subhalo_window) / F.col("mass_sum"))

df = df.withColumn("vx_disp", (F.col("vx") - F.col("vx_avg"))**2)
df = df.withColumn("vy_disp", (F.col("vy") - F.col("vy_avg"))**2)
df = df.withColumn("vz_disp", (F.col("vz") - F.col("vz_avg"))**2)

df = df.withColumn("dispersion_weighted", F.col("mass") * (F.col("vx_disp") + F.col("vy_disp") + F.col("vz_disp")))

# First aperture (50 kpc)
velocity_dispersion_df = df.groupBy("sub_id").agg(
    (F.sqrt(F.sum("dispersion_weighted") / F.sum("mass"))).alias("mass_weighted_velocity_dispersion_50")
)

# Iterate through different apertures (30, 20, 10, 5, 3 kpc)
for apert in [30, 20, 10, 5, 3]:
    # Reset the DataFrame for each aperture
    distance_limit = (apert * h) ** 2
    df_aperture = df.filter(F.col("dist_cent_sq") <= distance_limit)

    df_aperture = df_aperture.withColumn("mass_sum", F.sum("mass").over(subhalo_window))
    
    df_aperture = df_aperture.withColumn("vx_avg", F.sum("vx_weighted").over(subhalo_window) / F.col("mass_sum"))
    df_aperture = df_aperture.withColumn("vy_avg", F.sum("vy_weighted").over(subhalo_window) / F.col("mass_sum"))
    df_aperture = df_aperture.withColumn("vz_avg", F.sum("vz_weighted").over(subhalo_window) / F.col("mass_sum"))
    
    df_aperture = df_aperture.withColumn("vx_disp", (F.col("vx") - F.col("vx_avg"))**2)
    df_aperture = df_aperture.withColumn("vy_disp", (F.col("vy") - F.col("vy_avg"))**2)
    df_aperture = df_aperture.withColumn("vz_disp", (F.col("vz") - F.col("vz_avg"))**2)
    
    df_aperture = df_aperture.withColumn("dispersion_weighted", F.col("mass") * (F.col("vx_disp") + F.col("vy_disp") + F.col("vz_disp")))

    velocity_dispersion_tmp = df_aperture.groupBy("sub_id").agg(
        (F.sqrt(F.sum("dispersion_weighted") / F.sum("mass"))).alias("mass_weighted_velocity_dispersion_" + str(apert))
    )
    
    # Combine the results for each aperture
    velocity_dispersion_df = velocity_dispersion_df.join(velocity_dispersion_tmp, "sub_id")

# Show the final combined DataFrame
velocity_dispersion_df.show()

[Stage 23:>                                                         (0 + 1) / 1]026]]

+------+------------------------------------+------------------------------------+------------------------------------+------------------------------------+-----------------------------------+-----------------------------------+
|sub_id|mass_weighted_velocity_dispersion_50|mass_weighted_velocity_dispersion_30|mass_weighted_velocity_dispersion_20|mass_weighted_velocity_dispersion_10|mass_weighted_velocity_dispersion_5|mass_weighted_velocity_dispersion_3|
+------+------------------------------------+------------------------------------+------------------------------------+------------------------------------+-----------------------------------+-----------------------------------+
|   148|                  60.353250511042376|                   61.38776748254439|                   62.38146856184683|                   65.64963521122424|                  70.08732553224418|                  68.72902382988421|
|   463|                   350.0666538753183|                   174.3795225514606|  

                                                                                

In [8]:
# Display the dispersion row of one chosen subhalo.
sub_id_to_find = 10000
filtered_df = velocity_dispersion_df.where(F.col("sub_id") == sub_id_to_find)
filtered_df.show()

+------+------------------------------------+------------------------------------+------------------------------------+------------------------------------+-----------------------------------+-----------------------------------+
|sub_id|mass_weighted_velocity_dispersion_50|mass_weighted_velocity_dispersion_30|mass_weighted_velocity_dispersion_20|mass_weighted_velocity_dispersion_10|mass_weighted_velocity_dispersion_5|mass_weighted_velocity_dispersion_3|
+------+------------------------------------+------------------------------------+------------------------------------+------------------------------------+-----------------------------------+-----------------------------------+
| 10000|                   67.54407056722899|                   67.25768361639068|                   67.79019897412951|                   68.88366311763865|                  72.93970174994601|                  73.40487757132327|
+------+------------------------------------+------------------------------------+--

In [9]:
# Persist the per-subhalo velocity dispersions as snappy-compressed parquet,
# replacing any previous output at the same path.
subname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/vdisp_240918_snappy.parquet'
(
    velocity_dispersion_df.write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet(subname)
)

24/09/19 21:56:56 WARN TaskSetManager: Lost task 28.0 in stage 72.0 (TID 140423) (192.168.0.132 executor 7): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:50)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:555)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:172)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.window.WindowExec$$anon$1.fetchNextRow(Windo

Py4JJavaError: An error occurred while calling o481.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: ResultStage 72 (parquet at NativeMethodAccessorImpl.java:0) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.FetchFailedException: Error in reading FileSegmentManagedBuffer[file=/tmp/spark-b42d301b-d18d-461a-ab35-082de6c55ddf/executor-8c095244-f677-4236-97d1-fd2df17a7fea/blockmgr-daa2c3ed-6e40-4748-b072-b4488231ec77/19/shuffle_21_85348_0.data,offset=293333,length=26503]
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:437)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1232)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:890)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:86)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$10.nextCur(Iterator.scala:587)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:576)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:576)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.sort_addToSorter_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.window.WindowExec$$anon$1.fetchNextRow(WindowExec.scala:118)
	at org.apache.spark.sql.execution.window.WindowExec$$anon$1.<init>(WindowExec.scala:127)
	at org.apache.spark.sql.execution.window.WindowExec.$anonfun$doExecute$3(WindowExec.scala:107)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:853)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:853)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.io.IOException: Error in reading FileSegmentManagedBuffer[file=/tmp/spark-b42d301b-d18d-461a-ab35-082de6c55ddf/executor-8c095244-f677-4236-97d1-fd2df17a7fea/blockmgr-daa2c3ed-6e40-4748-b072-b4488231ec77/19/shuffle_21_85348_0.data,offset=293333,length=26503]
	at org.apache.spark.network.buffer.FileSegmentManagedBuffer.createInputStream(FileSegmentManagedBuffer.java:112)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:863)
	... 65 more
Caused by: java.io.FileNotFoundException: /tmp/spark-b42d301b-d18d-461a-ab35-082de6c55ddf/executor-8c095244-f677-4236-97d1-fd2df17a7fea/blockmgr-daa2c3ed-6e40-4748-b072-b4488231ec77/19/shuffle_21_85348_0.data (No such file or directory)
	at java.base/java.io.FileInputStream.open0(Native Method)
	at java.base/java.io.FileInputStream.open(FileInputStream.java:219)
	at java.base/java.io.FileInputStream.<init>(FileInputStream.java:157)
	at org.apache.spark.network.buffer.FileSegmentManagedBuffer.createInputStream(FileSegmentManagedBuffer.java:101)
	... 66 more

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.immutable.List.foreach(List.scala:333)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1961)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2978)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:382)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:789)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


24/09/19 22:01:39 WARN TaskSetManager: Lost task 81.0 in stage 72.3 (TID 203967) (192.168.0.134 executor 14): FetchFailed(BlockManagerId(0, 192.168.0.138, 46871, None), shuffleId=21, mapIndex=394, mapId=184928, reduceId=81, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:437)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1232)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:971)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:86)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$10.nextCur(Iterator.scala:587)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:576)
	at org.apache.spark.util.Com

In [17]:
# Read the velocity-dispersion table back and summarise the row of sub_id 0.
# The path is set explicitly (same location the write cell uses) so this cell
# does not depend on whatever value `subname` last held in the kernel.
# If the preceding write failed, the directory holds no parquet files and
# this read raises UNABLE_TO_INFER_SCHEMA.
subname = 'hdfs://sohnic:54310/data/TNG300/snap99/parquet/vdisp_240918_snappy.parquet'
df = spark.read.parquet(subname)

df_filtered = df.filter(df.sub_id == 0)
df_filtered.describe().toPandas()

AnalysisException: [UNABLE_TO_INFER_SCHEMA] Unable to infer schema for Parquet. It must be specified manually.