In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py
#from netCDF4 import Dataset
import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pqw

from functools import reduce
import operator
import gc

In [2]:
h0 = 0.6774
LM = 0.05
RM = 0.98
BM = 0.17
TM = 0.97

# plot settings
plt.rc('font', family='serif') 
plt.rc('font', serif='Times New Roman') 
plt.rcParams.update({'font.size': 16})
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rc('axes',  labelsize=30)
plt.rc('xtick', labelsize=25)
plt.rc('ytick', labelsize=25)

In [3]:
# PySpark packages
from pyspark import SparkContext   
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

spark = SparkSession.builder \
    .appName("MyApp") \
    .master("spark://sohnic:7077") \
    .config("spark.driver.memory", "32g") \
    .config("spark.executor.memory", "32g") \
    .config("spark.jars.packages", "graphframes:graphframes:0.7.0-spark2.4-s_2.11") \
    .getOrCreate()

sc = spark.sparkContext
sc.setCheckpointDir("hdfs://sohnic:54310/tmp/checkpoints")

spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/lshin/.ivy2/cache
The jars for the packages stored in: /home/lshin/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d940e9f1-415e-4f05-a122-29127ada04d3;1.0
	confs: [default]
	found graphframes#graphframes;0.7.0-spark2.4-s_2.11 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 97ms :: artifacts dl 3ms
	:: modules in use:
	graphframes#graphframes;0.7.0-spark2.4-s_2.11 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	------------------------------------

In [5]:
print(f"Total Cores: {sc.defaultParallelism}")
print(f"Total Executors: {sc._jsc.sc().getExecutorMemoryStatus().size()}")
sc.getConf().getAll()[:10]

Total Cores: 192
Total Executors: 9


[('spark.driver.memory', '32g'),
 ('spark.executor.memory', '32g'),
 ('spark.master', 'spark://sohnic:7077'),
 ('spark.jars',
  'file:///home/lshin/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,file:///home/lshin/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.pyFiles',
  '/home/lshin/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,/home/lshin/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.driver.host', 'sohnic'),
 ('spark.app.submitTime', '1724157227775'),
 ('spark.app.initial.file.urls',
  'spark://sohnic:43937/files/org.slf4j_slf4j-api-1.7.16.jar,spark://sohnic:43937/files/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar'),
 ('spark.submit.deployMode', 'client')]

In [6]:
%%time
tngdir = 'hdfs://sohnic:54310/data/TNG300/snap99/'
catdir = './'
subdir = './'

#filename = 'snap099_sorted_x0_y0_z0.parquet.snappy'
#
#newsparkdf = spark.read.option("header","true").option("recursiveFileLookup","true").parquet(tngdir + filename)
#newsparkdf.printSchema()

CPU times: user 3 µs, sys: 4 µs, total: 7 µs
Wall time: 14.3 µs


In [7]:
# Read the boundary of the hdf5 file that contains star particles around the target subhalos
# Define dimensions 

## Number of original HDF files 
nfile = 600

## Number of Bins along with each axes 
xarr = 10
yarr = 10
zarr = 6
#hdrlist = 'x' + str(np.arange(xarr)) +'_y' + str(np.arange(yarr)) + '_z' + str(np.arange(zarr))

## Boundaries of sorted data cubes
box_size = 205000.
xbin = (box_size / xarr)
ybin = (box_size / yarr)
zbin = (box_size / zarr)

xmins = 0. + xbin * np.arange(xarr)
xmaxs = xmins + xbin
ymins = 0. + ybin * np.arange(yarr)
ymaxs = ymins + ybin
zmins = 0. + zbin * np.arange(zarr)
zmaxs = zmins + zbin

filenum_x = [0,1,2,3,4,5,6,7,8,9,0,9] #filenum for the case when the index is 10 or -1
filenum_y = [0,1,2,3,4,5,6,7,8,9,0,9] #filenum for the case when the index is 10 or -1
filenum_z = [0,1,2,3,4,5,0,5] #filenum for the case when the index is 10 or -1

In [8]:
# Prepare a header list
hdrlist = []
for x in range(xarr):
    for y in range(yarr):
        for z in range(zarr):
            thdrlist = 'x' + str(x) +'_y' + str(y) + '_z' + str(z)
            hdrlist.append(thdrlist)

print(hdrlist[:5])

['x0_y0_z0', 'x0_y0_z1', 'x0_y0_z2', 'x0_y0_z3', 'x0_y0_z4']


In [9]:
#few seconds
t300   = pd.read_csv(subdir + 'TNG300_quiescent_subhalos.csv')
alist  = list(t300.keys())
lenarr = len(t300['ID'])

t300.head()

Unnamed: 0.1,Unnamed: 0,ID,GrNr,Posx,Posy,Posz,CMx,CMy,CMz,Velx,...,vdisp_dm_pos_sph_rh_z_bi,vdisp_dm_pos_sph_rh_z_mweight,vdisp_dm_pos_sph_rh_z_lweight,vdisp_dm_pos_sph_rh_3d_std,vdisp_dm_pos_sph_rh_3d_bi,vdisp_dm_pos_sph_rh_3d_mweight,vdisp_dm_pos_sph_rh_3d_lweight,central,quiescent,edge
0,1,394,0,41839.938,49050.41,146618.52,41841.72,49049.5,146618.42,505.39545,...,56.230135,0.0,0.0,93.595999,94.790011,0.0,0.0,0,1,1
1,2,403,0,45247.9,51453.363,145176.55,45247.434,51453.367,145176.25,484.588,...,61.195377,0.0,0.0,108.910434,110.497556,0.0,0.0,0,1,1
2,3,421,0,43345.83,50114.844,148136.55,43344.582,50116.066,148136.56,2266.7646,...,57.70442,0.0,0.0,100.625152,102.887322,0.0,0.0,0,1,1
3,4,423,0,43848.816,51581.773,145344.84,43851.332,51581.54,145348.3,-1288.3748,...,49.692601,0.0,0.0,90.353464,92.019996,0.0,0.0,0,1,1
4,5,447,0,44503.348,47543.66,146977.92,44503.926,47543.1,146978.66,-157.55678,...,57.70497,0.0,0.0,102.185058,104.080819,0.0,0.0,0,1,1


In [10]:
# Define apertures 
#aprs = [3., 5., 10., 20., 30., 50.]
#aprt = ['3kpc', '5kpc', '10kpc', '20kpc', '30kpc', '50kpc']
#axes = ['x', 'y', 'z', '3d']

aprs = [50, 30, 20, 10, 5, 3]
aprt = ['50kpc', '30kpc', '20kpc', '10kpc', '5kpc', '3kpc']
axes = ['x', 'y', 'z', '3d']

In [11]:
## Prepare new velocity dispersion arrays 
#few seconds
t300['hdr'] = np.repeat('NNNNNNNN', lenarr)

for i, ihdr in tqdm(enumerate(hdrlist)):
    x = int(ihdr[1:ihdr.index('_')])  # extract '0' after 'x'
    y = int(ihdr[ihdr.index('_')+2:ihdr.rindex('_')])  # extract '0' after 'y'
    z = int(ihdr[ihdr.rindex('_')+2:])  # extract '0' after 'z'
    
    sel = ((t300['Posx'] >= xmins[x]) & (t300['Posx'] <= xmaxs[x]) & 
           (t300['Posy'] >= ymins[y]) & (t300['Posy'] <= ymaxs[y]) & 
           (t300['Posz'] >= zmins[z]) & (t300['Posz'] <= zmaxs[z]))
    
    dum = t300['Posx'][sel]
#    print(i, x, y, z, len(dum))
    
    if len(dum) > 0:
        t300['hdr'][sel] = ihdr

0it [00:00, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t300['hdr'][sel] = ihdr
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t300['hdr'][sel] = ihdr
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t300['hdr'][sel] = ihdr
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t300['hdr'][sel] = ihdr
A value is trying to be set on a copy of a slice from a DataFrame

See t

In [12]:
t300['hdr'][-5:]

104285    x2_y0_z2
104286    x1_y4_z3
104287    x9_y5_z5
104288    x4_y1_z4
104289    x2_y2_z4
Name: hdr, dtype: object

In [20]:
len(t300)

104290

In [17]:
#%%time
#x = 0; y = 4; z = 5
#
#pdirs = []
#for i_x in [-1,0,1]:
#    for i_y in [-1,0,1]:
#        for i_z in [-1,0,1]:
#            tname = 'x' + str(filenum_x[x+i_x]) +'_y' + str(filenum_y[y+i_y]) + '_z' + str(filenum_z[z+i_z]) + '.parquet.snappy'
#            pdirs.append(tngdir +'snap099_sorted_'+ tname)
#
#df = spark.read.option("recursiveFileLookup", "true").parquet(*pdirs)
#print(df.count())



46519292
CPU times: user 10.2 ms, sys: 0 ns, total: 10.2 ms
Wall time: 2.19 s


                                                                                

In [47]:
#df['px']*2%2

Column<'((px * 2) % 2)'>

In [52]:
#total_mass = filtered_pat.agg(F.sum('mass')).collect()[0][0]
#filtered_pat = filtered_pat.withColumn('m_v_x', df['mass'] * df['px'])
#mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_x') / total_mass).collect()[0][0]
#filtered_pat = filtered_pat.withColumn('squared_diff_x', F.pow(df['px'] - mean_mass_weighted_velocity, 2))
#filtered_pat = filtered_pat.withColumn('mass_weighted_variance_x', df['squared_diff_x'] * df['mass'])
#total_mass_weighted_variance = filtered_pat.agg(F.sum('mass_weighted_variance_x') / total_mass).collect()[0][0]
#mass_weighted_velocity_dispersion_x = total_mass_weighted_variance ** 0.5

1

In [61]:
'''# Compute the velocity dispersions based on the particle selection
#slow?
for i, ihdr in tqdm(enumerate(hdrlist)):
    pass
    print(i)
#    if (i % 5) == 0:
#        print(i, datetime.datetime.now())
        
## GET the HEADER Information
    x = int(ihdr[1:ihdr.index('_')])  # extract '0' after 'x'
    y = int(ihdr[ihdr.index('_')+2:ihdr.rindex('_')])  # extract '0' after 'y'
    z = int(ihdr[ihdr.rindex('_')+2:])  # extract '0' after 'z'

## READ particle information
    pdirs = []
    for i_x in [-1,0,1]:
        for i_y in [-1,0,1]:
            for i_z in [-1,0,1]:
                tname = 'x' + str(filenum_x[x+i_x]) +'_y' + str(filenum_y[y+i_y]) + '_z' + str(filenum_z[z+i_z]) + '.parquet.snappy'
                pdirs.append(tngdir +'snap099_sorted_'+ tname)
    
    pat = spark.read.option("recursiveFileLookup", "true").parquet(*pdirs)

## FIND matched subhalos 
    mat  = (t300['hdr'] == ihdr)
    mlen = len(t300['hdr'][mat])

        
## Define new arraies 
    upd = {}
    upd['ID'] = (t300['ID'][mat]).to_numpy()
    upd['PX'] = (t300['Posx'][mat]).to_numpy()
    upd['PY'] = (t300['Posy'][mat]).to_numpy()
    upd['PZ'] = (t300['Posz'][mat]).to_numpy()
    upd['VX'] = (t300['Velx'][mat]).to_numpy()
    upd['VY'] = (t300['Vely'][mat]).to_numpy()
    upd['VZ'] = (t300['Velz'][mat]).to_numpy()
    
    for apr in aprt:
        #for axis in axes:
        #upd['vdisp_star_cyl_' + apr + '_' + axis + '_mweight'] = np.repeat(0., mlen)
        upd['vdisp_star_sph_' + apr + '_3d_mweight'] = np.repeat(0., mlen)
        upd['vdisp_star_sph_' + apr + '_z_mweight'] = np.repeat(0., mlen)    
    ## Compute the velocity dispersions     
        for j in range(mlen):
            print(j, datetime.datetime.now())
    ### Compute the relative information
#            dvx =  pat['vx'] - upd['VX'][j]
#            dvy =  pat['vy'] - upd['VY'][j]
#            dvz =  pat['vz'] - upd['VZ'][j]
            
            pat = pat.withColumn('r3d_sq',    
        F.pow(
            ((F.col('px') - upd['PX'][j] + box_size / 2) % box_size - box_size / 2), 2
        ) +
        F.pow(
            ((F.col('py') - upd['PY'][j] + box_size / 2) % box_size - box_size / 2), 2
        ) +
        F.pow(
            ((F.col('pz') - upd['PZ'][j] + box_size / 2) % box_size - box_size / 2), 2
        ) )
                                 
    ### Select particles within apertures
            for k, apr1 in enumerate(aprs):
                rsq_sel = (apr1**2)*(h0**2)
                filtered_pat = pat.filter(F.col("r3d_sq") < rsq_sel)
                total_mass = filtered_pat.agg(F.sum('mass')).collect()[0][0]
                
                filtered_pat = filtered_pat.withColumn('m_v_x', filtered_pat['mass'] * filtered_pat['px'])
                mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_x') / total_mass).collect()[0][0]
                filtered_pat = filtered_pat.withColumn('squared_diff_x', F.pow(filtered_pat['px'] - mean_mass_weighted_velocity, 2))
                filtered_pat = filtered_pat.withColumn('mass_weighted_variance_x', filtered_pat['squared_diff_x'] * filtered_pat['mass'])
                total_mass_weighted_variance_x = filtered_pat.agg(F.sum('mass_weighted_variance_x') / total_mass).collect()[0][0]

                filtered_pat = filtered_pat.withColumn('m_v_y', filtered_pat['mass'] * filtered_pat['py'])
                mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_y') / total_mass).collect()[0][0]
                filtered_pat = filtered_pat.withColumn('squared_diff_y', F.pow(filtered_pat['py'] - mean_mass_weighted_velocity, 2))
                filtered_pat = filtered_pat.withColumn('mass_weighted_variance_y', filtered_pat['squared_diff_y'] * filtered_pat['mass'])
                total_mass_weighted_variance_y = filtered_pat.agg(F.sum('mass_weighted_variance_y') / total_mass).collect()[0][0]

                filtered_pat = filtered_pat.withColumn('m_v_z', filtered_pat['mass'] * filtered_pat['pz'])
                mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_z') / total_mass).collect()[0][0]
                filtered_pat = filtered_pat.withColumn('squared_diff_z', F.pow(filtered_pat['pz'] - mean_mass_weighted_velocity, 2))
                filtered_pat = filtered_pat.withColumn('mass_weighted_variance_z', filtered_pat['squared_diff_z'] * filtered_pat['mass'])
                total_mass_weighted_variance_z = filtered_pat.agg(F.sum('mass_weighted_variance_z') / total_mass).collect()[0][0]
    
                upd['vdisp_star_sph_' + apr + '_3d_mweight'][j] = np.sqrt(total_mass_weighted_variance_x + total_mass_weighted_variance_y + total_mass_weighted_variance_z)
                upd['vdisp_star_sph_' + apr + '_z_mweight'][j] = np.sqrt(total_mass_weighted_variance_z)  
                
        if i == 0:
            cdf = pd.DataFrame(upd)
            
        if i > 0:
            ndf = pd.DataFrame(upd)
            cdf = pd.concat([cdf, ndf])
    
        cdf.to_csv(subdir + 'TNG300/TNG300_quiescent_subhalos_hadoop.csv', index=False)  '''      

0it [00:00, ?it/s]

0
0 2024-08-15 22:55:20.403918


                                                                                

1 2024-08-15 22:56:15.528713


                                                                                

2 2024-08-15 22:57:10.623175


                                                                                

3 2024-08-15 22:58:05.998407


                                                                                

4 2024-08-15 22:59:01.229102


                                                                                

5 2024-08-15 22:59:56.424689


                                                                                

6 2024-08-15 23:00:51.717516


ERROR:root:KeyboardInterrupt while sending command.>           (146 + 43) / 189]
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
                                                                                

KeyboardInterrupt: 

# spark df subhalo by subhalo

In [48]:
%%time
#subhalo by subhalo

crop_cut = 50*h0
vds = []
vds_z = []
for i in [138]:#range(len(t300)):
    cent_x = t300['Posx'][i]
    cent_y = t300['Posy'][i]
    cent_z = t300['Posz'][i]
    
    box_num_x = list(set([filenum_x[int((cent_x-crop_cut)/(box_size/10))], filenum_x[int((cent_x+crop_cut)/(box_size/10))]]))
    box_num_y = list(set([filenum_y[int((cent_y-crop_cut)/(box_size/10))], filenum_y[int((cent_y+crop_cut)/(box_size/10))]]))
    box_num_z = list(set([filenum_z[int((cent_z-crop_cut)/(box_size/6))], filenum_z[int((cent_z+crop_cut)/(box_size/6))]]))

    pdirs = []
    for i_x in box_num_x:
        for i_y in box_num_y:
            for i_z in box_num_z:
                tname = 'x' + str(i_x) +'_y' + str(i_y) + '_z' + str(i_z) + '.parquet.snappy'
                pdirs.append(tngdir +'snap099_sorted_'+ tname)

    pat = spark.read.option("recursiveFileLookup", "true").parquet(*pdirs)

#    pat = pat.withColumn('r3d_sq',    
#        F.pow((F.col('px') - cent_x ) , 2) +
#        F.pow((F.col('py') - cent_y ) , 2) +
#        F.pow((F.col('pz') - cent_z ) , 2))    
    
#periodic?
    pat = pat.withColumn('r3d_sq',    
        F.pow(( (F.col('px') - cent_x + box_size / 2) % box_size - box_size / 2), 2) +
        F.pow(( (F.col('py') - cent_y + box_size / 2) % box_size - box_size / 2), 2) +
        F.pow(( (F.col('pz') - cent_z + box_size / 2) % box_size - box_size / 2), 2))


    
    tmp = []
    tmp_z = []
    for k, apr1 in enumerate(aprs):
        rsq_sel = (apr1**2)*(h0**2)
        if k ==0:
            filtered_pat = pat.filter(F.col("r3d_sq") < rsq_sel)
        else:
            filtered_pat = filtered_pat.filter(F.col("r3d_sq") < rsq_sel)
            
        total_mass = filtered_pat.agg(F.sum('mass')).collect()[0][0]
        filtered_pat = filtered_pat.withColumn('m_v_x', filtered_pat['mass'] * filtered_pat['vx'])
        mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_x')).collect()[0][0] / total_mass
        filtered_pat = filtered_pat.withColumn('squared_diff_x', F.pow(filtered_pat['vx'] - mean_mass_weighted_velocity, 2))
        filtered_pat = filtered_pat.withColumn('mass_weighted_variance_x', filtered_pat['squared_diff_x'] * filtered_pat['mass'])
        total_mass_weighted_variance_x = filtered_pat.agg(F.sum('mass_weighted_variance_x')).collect()[0][0] / total_mass

        filtered_pat = filtered_pat.withColumn('m_v_y', filtered_pat['mass'] * filtered_pat['vy'])
        mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_y')).collect()[0][0]  / total_mass
        filtered_pat = filtered_pat.withColumn('squared_diff_y', F.pow(filtered_pat['vy'] - mean_mass_weighted_velocity, 2))
        filtered_pat = filtered_pat.withColumn('mass_weighted_variance_y', filtered_pat['squared_diff_y'] * filtered_pat['mass'])
        total_mass_weighted_variance_y = filtered_pat.agg(F.sum('mass_weighted_variance_y')).collect()[0][0]  / total_mass

        filtered_pat = filtered_pat.withColumn('m_v_z', filtered_pat['mass'] * filtered_pat['vz'])
        mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_z')).collect()[0][0]  / total_mass
        filtered_pat = filtered_pat.withColumn('squared_diff_z', F.pow(filtered_pat['vz'] - mean_mass_weighted_velocity, 2))
        filtered_pat = filtered_pat.withColumn('mass_weighted_variance_z', filtered_pat['squared_diff_z'] * filtered_pat['mass'])
        total_mass_weighted_variance_z = filtered_pat.agg(F.sum('mass_weighted_variance_z')).collect()[0][0]  / total_mass

        tmp.append(np.sqrt(total_mass_weighted_variance_x + total_mass_weighted_variance_y + total_mass_weighted_variance_z))
        tmp_z.append(np.sqrt(total_mass_weighted_variance_z) )
    vds.append(tmp)
    vds_z.append(tmp_z)    

                                                                                

CPU times: user 215 ms, sys: 42.9 ms, total: 258 ms
Wall time: 14.7 s


In [37]:
vds

[[1.4438853163639858,
  1.8910700497925066,
  3.421572756896383,
  4.637784926700523,
  5.250260402258049,
  5.780369619825705]]

# pandas df subhalo by subhalo

In [22]:
%%time
#calculation time test
#subhalo by subhalo

crop_cut = 50*h0
vds = []
vds_z = []
for i in [138]:#range(len(t300)):
    cent_x = t300['Posx'][i]
    cent_y = t300['Posy'][i]
    cent_z = t300['Posz'][i]
    
    box_num_x = list(set([filenum_x[int((cent_x-crop_cut)/(box_size/10))], filenum_x[int((cent_x+crop_cut)/(box_size/10))]]))
    box_num_y = list(set([filenum_y[int((cent_y-crop_cut)/(box_size/10))], filenum_y[int((cent_y+crop_cut)/(box_size/10))]]))
    box_num_z = list(set([filenum_z[int((cent_z-crop_cut)/(box_size/6))], filenum_z[int((cent_z+crop_cut)/(box_size/6))]]))

    pat = pd.DataFrame()
    df_list = []
    for i_x in box_num_x:
        for i_y in box_num_y:
            for i_z in box_num_z:
                npat = pd.read_csv('./TNG300/snap99/snap099_sorted_' + 'x' + str(i_x) +'_y' + str(i_y) + '_z' + str(i_z) + '.csv')
                df_list.append(npat)
                
    pat  = pd.concat(df_list, ignore_index = True)
    pat['r3d_sq'] = ((pat['px']- cent_x + box_size / 2) % box_size - box_size / 2)**2 + ((pat['py']- cent_y + box_size / 2) % box_size - box_size / 2)**2 + ((pat['pz']- cent_z + box_size / 2) % box_size - box_size / 2)**2   

    a = []
    for k, apr1 in enumerate(aprs):
        rsq_sel = (apr1**2)*(h0**2)
        if k ==0:
            filtered_pat = pat[pat["r3d_sq"] < rsq_sel]
        else:
            filtered_pat = filtered_pat[filtered_pat["r3d_sq"] < rsq_sel]

        total_mass = np.sum(filtered_pat['mass'])
        filtered_pat['m_v_x'] = filtered_pat['mass'] * filtered_pat['vx']
        mean_mass_weighted_velocity = np.sum(filtered_pat['m_v_x']) / total_mass
        filtered_pat['squared_diff_x'] =  (filtered_pat['vx'] - mean_mass_weighted_velocity)**2
        filtered_pat['mass_weighted_variance_x'] = filtered_pat['squared_diff_x'] * filtered_pat['mass']
        total_mass_weighted_variance_x = np.sum(filtered_pat['mass_weighted_variance_x'])/ total_mass

        total_mass = np.sum(filtered_pat['mass'])
        filtered_pat['m_v_x'] = filtered_pat['mass'] * filtered_pat['vx']
        mean_mass_weighted_velocity = np.sum(filtered_pat['m_v_x']) / total_mass
        filtered_pat['squared_diff_x'] =  (filtered_pat['vx'] - mean_mass_weighted_velocity)**2
        filtered_pat['mass_weighted_variance_x'] = filtered_pat['squared_diff_x'] * filtered_pat['mass']
        total_mass_weighted_variance_x = np.sum(filtered_pat['mass_weighted_variance_x'])/ total_mass

        total_mass = np.sum(filtered_pat['mass'])
        filtered_pat['m_v_x'] = filtered_pat['mass'] * filtered_pat['px']
        mean_mass_weighted_velocity = np.sum(filtered_pat['m_v_x']) / total_mass
        filtered_pat['squared_diff_x'] =  (filtered_pat['px'] - mean_mass_weighted_velocity)**2
        filtered_pat['mass_weighted_variance_x'] = filtered_pat['squared_diff_x'] * filtered_pat['mass']
        total_mass_weighted_variance_x = np.sum(filtered_pat['mass_weighted_variance_x'])/ total_mass

        a.append(total_mass_weighted_variance_x)


CPU times: user 2.6 s, sys: 71.2 ms, total: 2.67 s
Wall time: 2.67 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

# all at once?

In [23]:
aprt

['50kpc', '30kpc', '20kpc', '10kpc', '5kpc', '3kpc']

In [31]:
%%time
# Compute the velocity dispersions based on the particle selection
for i, ihdr in tqdm(enumerate(hdrlist[0:1])):
    pass
    print(i)
#    if (i % 5) == 0:
#        print(i, datetime.datetime.now())
        
## GET the HEADER Information
    x = int(ihdr[1:ihdr.index('_')])  # extract '0' after 'x'
    y = int(ihdr[ihdr.index('_')+2:ihdr.rindex('_')])  # extract '0' after 'y'
    z = int(ihdr[ihdr.rindex('_')+2:])  # extract '0' after 'z'
    cent = np.array([xmins[x]+xmaxs[x], ymins[y]+ymaxs[y], zmins[z]+zmaxs[z]])/2
    
## READ particle information
    pdirs = []
    for i_x in [-1,0,1]:
        for i_y in [-1,0,1]:
            for i_z in [-1,0,1]:
                tname = 'x' + str(filenum_x[x+i_x]) +'_y' + str(filenum_y[y+i_y]) + '_z' + str(filenum_z[z+i_z]) + '.parquet.snappy'
                pdirs.append(tngdir +'snap099_sorted_'+ tname)
    
    pat = spark.read.option("recursiveFileLookup", "true").parquet(*pdirs)
    pat = pat.filter( (((F.col('px') - cent[0] + box_size / 2) % box_size - box_size / 2) > - xbin - 50*h0)
                    & (((F.col('px') - cent[0] + box_size / 2) % box_size - box_size / 2) <   xbin + 50*h0)
                    & (((F.col('py') - cent[1] + box_size / 2) % box_size - box_size / 2) > - ybin - 50*h0)
                    & (((F.col('py') - cent[1] + box_size / 2) % box_size - box_size / 2) <   ybin + 50*h0)
                    & (((F.col('pz') - cent[2] + box_size / 2) % box_size - box_size / 2) > - zbin - 50*h0)
                    & (((F.col('pz') - cent[2] + box_size / 2) % box_size - box_size / 2) <   zbin + 50*h0)
                    )
    
## FIND matched subhalos 
    mat  = (t300['hdr'] == ihdr)
    mlen = len(t300['hdr'][mat])

## Define new arraies 
    upd = {}
    upd['ID'] = (t300['ID'][mat]).to_numpy()
    upd['PX'] = (t300['Posx'][mat]).to_numpy()
    upd['PY'] = (t300['Posy'][mat]).to_numpy()
    upd['PZ'] = (t300['Posz'][mat]).to_numpy()
    upd['VX'] = (t300['Velx'][mat]).to_numpy()
    upd['VY'] = (t300['Vely'][mat]).to_numpy()
    upd['VZ'] = (t300['Velz'][mat]).to_numpy()
    
    for apr in aprt:
        print(apr)
        #for axis in axes:
        #upd['vdisp_star_cyl_' + apr + '_' + axis + '_mweight'] = np.repeat(0., mlen)
        upd['vdisp_star_sph_' + apr + '_3d_mweight'] = np.repeat(0., mlen)
        upd['vdisp_star_sph_' + apr + '_z_mweight'] = np.repeat(0., mlen)    
        rsq_sel = (int(apr.rstrip('kpc'))**2)*(h0**2)

    ## Compute the velocity dispersions  

        for j in range(mlen):
            #print(j, datetime.datetime.now())          
            pat = pat.withColumn('r3d_sq',    
            F.pow(
            ((F.col('px') - upd['PX'][j] + box_size / 2) % box_size - box_size / 2), 2
            ) +
            F.pow(
            ((F.col('py') - upd['PY'][j] + box_size / 2) % box_size - box_size / 2), 2
            ) +
            F.pow(
            ((F.col('pz') - upd['PZ'][j] + box_size / 2) % box_size - box_size / 2), 2
            ) )
                                 
            filtered_pat = pat.filter(F.col("r3d_sq") < rsq_sel)
            #total_mass = filtered_pat.agg(F.sum('mass')).collect()[0][0]
            total_mass = filtered_pat.select(_sum('mass')).collect()[0][0]
#            
#            filtered_pat = filtered_pat.withColumn('m_v_x', filtered_pat['mass'] * filtered_pat['vx'])
#            mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_x') / total_mass).collect()[0][0]
#            filtered_pat = filtered_pat.withColumn('squared_diff_x', F.pow(filtered_pat['vx'] - mean_mass_weighted_velocity, 2))
#            filtered_pat = filtered_pat.withColumn('mass_weighted_variance_x', filtered_pat['squared_diff_x'] * filtered_pat['mass'])
#            total_mass_weighted_variance_x = filtered_pat.agg(F.sum('mass_weighted_variance_x') / total_mass).collect()[0][0]
#            
#            filtered_pat = filtered_pat.withColumn('m_v_y', filtered_pat['mass'] * filtered_pat['vy'])
#            mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_y') / total_mass).collect()[0][0]
#            filtered_pat = filtered_pat.withColumn('squared_diff_y', F.pow(filtered_pat['vy'] - mean_mass_weighted_velocity, 2))
#            filtered_pat = filtered_pat.withColumn('mass_weighted_variance_y', filtered_pat['squared_diff_y'] * filtered_pat['mass'])
#            total_mass_weighted_variance_y = filtered_pat.agg(F.sum('mass_weighted_variance_y') / total_mass).collect()[0][0]
#            
#            filtered_pat = filtered_pat.withColumn('m_v_z', filtered_pat['mass'] * filtered_pat['vz'])
#            mean_mass_weighted_velocity = filtered_pat.agg(F.sum('m_v_z') / total_mass).collect()[0][0]
#            filtered_pat = filtered_pat.withColumn('squared_diff_z', F.pow(filtered_pat['vz'] - mean_mass_weighted_velocity, 2))
#            filtered_pat = filtered_pat.withColumn('mass_weighted_variance_z', filtered_pat['squared_diff_z'] * filtered_pat['mass'])
#            total_mass_weighted_variance_z = filtered_pat.agg(F.sum('mass_weighted_variance_z') / total_mass).collect()[0][0]
#    
#            upd['vdisp_star_sph_' + apr + '_3d_mweight'][j] = np.sqrt(total_mass_weighted_variance_x + total_mass_weighted_variance_y + total_mass_weighted_variance_z)
#            upd['vdisp_star_sph_' + apr + '_z_mweight'][j] = np.sqrt(total_mass_weighted_variance_z)  
#                
#        if i == 0:
#            cdf = pd.DataFrame(upd)
#            
#        if i > 0:
#            ndf = pd.DataFrame(upd)
#            cdf = pd.concat([cdf, ndf])
#    
#        cdf.to_csv(subdir + 'TNG300/TNG300_quiescent_subhalos_hadoop.csv', index=False)      

0it [00:00, ?it/s]

0
50kpc


Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)
ERROR:root:KeyboardInterrupt while sending command.            (17 + 172) / 189]
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
Keyb

KeyboardInterrupt: 

                                                                                