In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy
import scipy
import matplotlib
import matplotlib.dates as mpd
import pylab as plt
import datetime as dtm
import pytz
import multiprocessing as mpp
import pickle
import os
import time
import numba
#
import pyspark
#
# TODO: phase out unreferenced hpc_lib calls...
import hpc_lib

#
#data_file_name = 'data/mazama_usage_20200506_tool8.out'
#data_file_name = 'data/sacct_sherlock_out_serc2020_05_08.out'
data_file_name = 'data/serc_usage_20200914.out'
#
pkl_name = "{}.pkl".format(os.path.splitext(data_file_name)[0])
h5_name = "{}.h5".format(os.path.splitext(data_file_name)[0])
#

### PySpark tutorial and examples
- PySpark implementation of HPC_analytics SACCT data parsing script
- Discussion of different reading methods and data reading/management classes
    - Vanilla `RDD`
    - `PySpark DataFrames` object (and `SQLContext` context methods).

#### Brief sumary:
PySpark is the Python implementation of Spark, which is a distributed data processing infrastructure. Spark should parallelize across multiple nodes, and so should be a better multi-processing option than `python.multiprocessing`. The syntax is not Pythonic. At all, so it's basically like writing another language in Python, and the workflow strategies are very different as well, so if you're a Python person, be prepared to pivot a bit.

The biggest problem I'm still having is the final act of reading a very large data file into memory or transfering into a new (disk based) container. This, of course, should be simple, since it is really the fundamental and primary purpose of Spark, but alas... The problem arises when the distributed (Java) VMs exceed memory limitations. But we'll get there...

#### 1. A quick look at the inpuit data:

In [3]:
# Take a quick look at the input data:
#
with open(data_file_name, 'r') as fin:
    k=0
    for rw in fin:
        print('** ', rw)
        k+=1
        if k>15: break

**  User|Group|GID|JobName|JobID|JobIDRaw|Partition|State|Timelimit|NCPUS|NNodes|Submit|Eligible|Start|End|Elapsed|SystemCPU|UserCPU|TotalCPU|NTasks|CPUTimeRAW|Suspended|

**  saipb|oneillm|328022|hovmuller|62339523|62339523|serc|COMPLETED|4-00:00:00|16|1|2020-03-01T00:10:36|2020-03-01T00:10:36|2020-03-01T00:11:24|2020-03-01T00:12:05|00:00:41|00:02.369|00:17.315|00:19.685||656|00:00:00|

**  |||batch|62339523.batch|62339523.batch||COMPLETED||16|1|2020-03-01T00:11:24|2020-03-01T00:11:24|2020-03-01T00:11:24|2020-03-01T00:12:05|00:00:41|00:02.367|00:17.315|00:19.683|1|656|00:00:00|

**  |||extern|62339523.extern|62339523.extern||COMPLETED||16|1|2020-03-01T00:11:24|2020-03-01T00:11:24|2020-03-01T00:11:24|2020-03-01T00:12:05|00:00:41|00:00.001|00:00:00|00:00.001|1|656|00:00:00|

**  pjwomble|gorelick|26961|6dff71d6eaf0c|62339657_0|62339659|serc|COMPLETED|00:59:00|1|1|2020-03-01T00:17:07|2020-03-01T00:17:08|2020-03-01T00:17:14|2020-03-01T00:39:21|00:22:07|00:10.982|21:27.659|21:38.642||1327|

#### 2. Instantiate and configure some context handler objects.
- There are a few...
- The "spark" and "sql" variants seem to come from different branches of the project, or source projects, that have since merged, albeit perhaps not entirely gracefully.

In [4]:
n_cpu = 8
#
# .config("spark.driver.memory", "15g")
#conf = pyspark.SparkConf('local[*]').set("spark.cores.max", "6").set("spark.executor.instances", "4").set("spark.executor.cores","2")
conf = pyspark.SparkConf('local[{}]'.format(n_cpu)).set("spark.driver.memory", "15g")
#conf = conf.set("spark.executor.memory", "4g").set("spark.executor.pyspark.memory", "3g")
sc   = pyspark.SparkContext(conf=conf)
#

# also build a SQL context?
sc_sql = pyspark.SQLContext(sc)
spark = pyspark.sql.SparkSession.builder.appName('HPC_loader').master('local[{}]'.format(n_cpu)).config("spark.driver.memory", "15g").getOrCreate()
#spark = pyspark.sql.SparkSession.builder.appName('HPC_loader').config(conf).getOrCreate()




Pull some data structures, handler functions, etc. from relevant modules (ie `hpc_lib`). Note that eventually, we'll want to consolicate the `process_row()` function.

In [5]:
types_dict = hpc_lib.SACCT_data_handler.default_types_dict
print('** typex_dict: ', types_dict)

@numba.jit
def process_row(rw, delim='|'):
    # use this with MPP processing:
    # ... but TODO: it looks like this is 1) inefficient and 2) breaks with large data inputs because I think it pickles the entire
    #  class object... so we need to move the MPP object out of class.
    #
    # use this for MPP processing:
    rws = rw.split(delim)
    #
    # NOTE: SACCT returns data with a terminal delimiter. We can assume this or trap for it. Here, we assume
    #. it and exclude the last value in each row. Note that this also handles the terminal `\n` character.
    #. a more rigorous solution would be to confirm that the `\n` is being treated like an actual character
    #  (ie, not automatically parsed since it's an EoL delimeter), and then check the rw[-2] character.
    #. We probably want to avoid diagnosing this on a row-by-row basis, for performance. We could add 
    #. an input pram to beter handle this.
    #
    #return [None if vl=='' else self.types_dict.get(col,str)(vl)
    #            for k,(col,vl) in enumerate(zip(self.headers, rw.split(self.delim)[:-1]))]
    return [None if vl=='' else types_dict.get(col,str)(vl)
                for k,(col,vl) in enumerate(zip(self.headers, rws[:-1]))] + [rws[self.RH['JobID']].split('.')[0]]


** typex_dict:  {'User': <class 'str'>, 'JobID': <class 'str'>, 'JobName': <class 'str'>, 'Partition': <class 'str'>, 'State': <class 'str'>, 'JobID_parent': <class 'str'>, 'Timelimit': <function elapsed_time_2_day at 0x7fa38eb05550>, 'Start': <function str2date_num at 0x7fa38eb05430>, 'End': <function str2date_num at 0x7fa38eb05430>, 'Submit': <function str2date_num at 0x7fa38eb05430>, 'Eligible': <function str2date_num at 0x7fa38eb05430>, 'Elapsed': <function elapsed_time_2_day at 0x7fa38eb05550>, 'MaxRSS': <class 'str'>, 'MaxVMSize': <class 'str'>, 'NNodes': <class 'int'>, 'NCPUS': <class 'int'>, 'MinCPU': <class 'str'>, 'SystemCPU': <function elapsed_time_2_day at 0x7fa38eb05550>, 'UserCPU': <function elapsed_time_2_day at 0x7fa38eb05550>, 'TotalCPU': <function elapsed_time_2_day at 0x7fa38eb05550>, 'NTasks': <class 'int'>}


### Most Direct Approach:
- Use a sequence of RDD operations to (pseudo-)directly compute jobs_summary from the raw input

In [6]:
delim = '|'
# Define one or more row processing functions.
def f_rw(rw, header_names, RH_index, types_dict=types_dict, delim='|'):
    rws = rw[:-1].split(delim)
    #
    #if not len(rws)==0:
    #    return rws
    return [None if s=='' else types_dict.get(h,str)(s) for h,s in zip(header_names,rws)] + [rws[RH_index['JobID']].split('.')[0] ]
    #return [str(s) for h,s in zip(header_names,rws)]
    
    

#### Basic RDD operations:

In [7]:
lines = sc.textFile(data_file_name)
header_names = (lines.take(1)[0])[:-1].split(delim)
RH_index = {s:k for k,s in enumerate(header_names) }
print('** Headers: ', header_names)

** Headers:  ['User', 'Group', 'GID', 'JobName', 'JobID', 'JobIDRaw', 'Partition', 'State', 'Timelimit', 'NCPUS', 'NNodes', 'Submit', 'Eligible', 'Start', 'End', 'Elapsed', 'SystemCPU', 'UserCPU', 'TotalCPU', 'NTasks', 'CPUTimeRAW', 'Suspended']


### Excluding header row from data collection:
- This is surprisingly harder than it looks, and searching for solutions seems to be elusive.
- A very direct approach like: read first row, then read the rest of the file, skipping the first row, etc. do not really seem to be an option
- We have to find a _sparkonic_ way
- But there are a few:
    - Using the most common (I think) and direct approach to read text data, `textFile()`, a filter() can be used. This is also a good option when reading multiple files. For example, a script to consilidate many small data files into a single (HDF5) container might ust this, since the header row will be encountered many times.
    - *(spark) dataframes:* Use a syntactical variant of `spark.read.format('CSV')` method below to read the data into a dataframe.


In [8]:
# is there a smart way to skip the header row? I"m not finding it, and this was actually recommended.
#. obviously, it's expensive and will be part of the reason to reorganize this to start with the \
#. spark DF clase.
# that said, if we are taking multiple files, the filer() approach tentatively makes a lot of sense.
#
rows = lines.filter(lambda s: not s.startswith('User') ).map(lambda x: f_rw(x, header_names, RH_index) )
#
for rw in rows.take(10):
    print('** ', rw)

**  ['saipb', 'oneillm', '328022', 'hovmuller', '62339523', '62339523', 'serc', 'COMPLETED', 4.0, 16, 1, 737485.0073611111, 737485.0073611111, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 2.7418981481481484e-05, 0.0002004050925925926, 0.00022783564814814812, None, '656', '00:00:00', '62339523']
**  [None, None, None, 'batch', '62339523.batch', '62339523.batch', None, 'COMPLETED', None, 16, 1, 737485.0079166667, 737485.0079166667, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 2.7395833333333333e-05, 0.0002004050925925926, 0.0002278125, 1, '656', '00:00:00', '62339523']
**  [None, None, None, 'extern', '62339523.extern', '62339523.extern', None, 'COMPLETED', None, 16, 1, 737485.0079166667, 737485.0079166667, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 1.1574074074074074e-08, 0.0, 1.1574074074074074e-08, 1, '656', '00:00:00', '62339523']
**  ['pjwomble', 'gorelick', '26961', '6dff71d6eaf0c', '62339657_0', '62339659', 'serc', 'COMPLETED', 

#### Dataframes:
- using the sql context, we can read the data into a dataframe
- Nominally fast and easy, but I think really for well behaved data.
- Getting the header row is not too tough, but I'm not so sure about excluding a false terminal column, resulting from a terminal delimeter (row string ending in a delimiter).
- In fact, we seem to get some weird behavior from this
- ... to the point that I would probably just err on the side of having more control and maybe burning some cycles on the filter() option (which i expect is pretty well optimized on the back end).
- HOWEVER: Preliminary assessments just doing a `.count()` suggests that DF might be much, much faster than the standard RDD methods... Though that may also be because the `DataFramds` methods are using a context or session that is not CPU limited -- which would make sense.


In [9]:
# Another way to read the file with headers. This will give an effective array of (val,ky) tuples.
rows_2 = spark.read.format('CSV').option('header', 'true').option('sep', '|').load(data_file_name)
print('** type: ', type(rows_2))
#

for rw in rows_2.take(10):
    print('** ', rw[:])

** type:  <class 'pyspark.sql.dataframe.DataFrame'>
**  ('saipb', 'oneillm', '328022', 'hovmuller', '62339523', '62339523', 'serc', 'COMPLETED', '4-00:00:00', '16', '1', '2020-03-01T00:10:36', '2020-03-01T00:10:36', '2020-03-01T00:11:24', '2020-03-01T00:12:05', '00:00:41', '00:02.369', '00:17.315', '00:19.685', None, '656', '00:00:00', None)
**  (None, None, None, 'batch', '62339523.batch', '62339523.batch', None, 'COMPLETED', None, '16', '1', '2020-03-01T00:11:24', '2020-03-01T00:11:24', '2020-03-01T00:11:24', '2020-03-01T00:12:05', '00:00:41', '00:02.367', '00:17.315', '00:19.683', '1', '656', '00:00:00', None)
**  (None, None, None, 'extern', '62339523.extern', '62339523.extern', None, 'COMPLETED', None, '16', '1', '2020-03-01T00:11:24', '2020-03-01T00:11:24', '2020-03-01T00:11:24', '2020-03-01T00:12:05', '00:00:41', '00:00.001', '00:00:00', '00:00.001', '1', '656', '00:00:00', None)
**  ('pjwomble', 'gorelick', '26961', '6dff71d6eaf0c', '62339657_0', '62339659', 'serc', 'COMPLETED'

In [10]:
# Another syntax to load directly into a spark dataframe (via .sql):
#
df_rows = spark.read.csv(data_file_name, header=True, sep='|')
#
print('** type: ', type(df_rows))
print('** dypes: ', df_rows.dtypes)
print('** header: {}'.format( df_rows.schema.names ) )
#
print('\n*** *** ')
for rw in df_rows.take(5):
    print('** ', rw[:])
    #print('* * ', rw.head)

** type:  <class 'pyspark.sql.dataframe.DataFrame'>
** dypes:  [('User', 'string'), ('Group', 'string'), ('GID', 'string'), ('JobName', 'string'), ('JobID', 'string'), ('JobIDRaw', 'string'), ('Partition', 'string'), ('State', 'string'), ('Timelimit', 'string'), ('NCPUS', 'string'), ('NNodes', 'string'), ('Submit', 'string'), ('Eligible', 'string'), ('Start', 'string'), ('End', 'string'), ('Elapsed', 'string'), ('SystemCPU', 'string'), ('UserCPU', 'string'), ('TotalCPU', 'string'), ('NTasks', 'string'), ('CPUTimeRAW', 'string'), ('Suspended', 'string'), ('_c22', 'string')]
** header: ['User', 'Group', 'GID', 'JobName', 'JobID', 'JobIDRaw', 'Partition', 'State', 'Timelimit', 'NCPUS', 'NNodes', 'Submit', 'Eligible', 'Start', 'End', 'Elapsed', 'SystemCPU', 'UserCPU', 'TotalCPU', 'NTasks', 'CPUTimeRAW', 'Suspended', '_c22']

*** *** 
**  ('saipb', 'oneillm', '328022', 'hovmuller', '62339523', '62339523', 'serc', 'COMPLETED', '4-00:00:00', '16', '1', '2020-03-01T00:10:36', '2020-03-01T00:10

In [11]:
# We can count rows like:
#
# NOTE: the DF.count() instances may be much faster (if they are) because they are not
#. configured with CPU constraints (which would be good news that the cpu constraints are working)
# for rr in (rows, rows_2, df_rows, rows, rows_2, df_rows):
#     t0 = time.time()
#     n_rws = rr.count()
#     print('** time: {}'.format(time.time()-t0))
# #


In [12]:
# this is how to fetch all the rows, but it always breaks for a large array.
#all_rows = rows.collect()
print('** ', rows.getStorageLevel(), rows.partitionBy(20))

#grouped = rows.filter(lambda rw:rw[0]!='User').groupBy(lambda rw: rw[0])

**  Serialized 1x Replicated MapPartitionsRDD[34] at mapPartitions at PythonRDD.scala:133


### (Pseudo-) production(ish) PySpark prodcessing framework

In [13]:
print('*** ', df_rows.schema)
#
print('*** ', df_rows.dtypes)
#print('** ', set())

***  StructType(List(StructField(User,StringType,true),StructField(Group,StringType,true),StructField(GID,StringType,true),StructField(JobName,StringType,true),StructField(JobID,StringType,true),StructField(JobIDRaw,StringType,true),StructField(Partition,StringType,true),StructField(State,StringType,true),StructField(Timelimit,StringType,true),StructField(NCPUS,StringType,true),StructField(NNodes,StringType,true),StructField(Submit,StringType,true),StructField(Eligible,StringType,true),StructField(Start,StringType,true),StructField(End,StringType,true),StructField(Elapsed,StringType,true),StructField(SystemCPU,StringType,true),StructField(UserCPU,StringType,true),StructField(TotalCPU,StringType,true),StructField(NTasks,StringType,true),StructField(CPUTimeRAW,StringType,true),StructField(Suspended,StringType,true),StructField(_c22,StringType,true)))
***  [('User', 'string'), ('Group', 'string'), ('GID', 'string'), ('JobName', 'string'), ('JobID', 'string'), ('JobIDRaw', 'string'), ('Par

In [14]:
re_typer={numpy.float64:float, numpy.float128:float}



x1 = rows.take(2)[1]
print('** ', x1)
print('** ', type(x1[4])(x1[4]))
#
x1_prime = [None if x is None else re_typer.get(type(x), type(x))(x) for x in x1]
#
print('** x1_prime: ', x1_prime)
print('** x2_p types: ', [type(x) for x in x1_prime])

**  [None, None, None, 'batch', '62339523.batch', '62339523.batch', None, 'COMPLETED', None, 16, 1, 737485.0079166667, 737485.0079166667, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 2.7395833333333333e-05, 0.0002004050925925926, 0.0002278125, 1, '656', '00:00:00', '62339523']
**  62339523.batch
** x1_prime:  [None, None, None, 'batch', '62339523.batch', '62339523.batch', None, 'COMPLETED', None, 16, 1, 737485.0079166667, 737485.0079166667, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 2.7395833333333333e-05, 0.0002004050925925926, 0.0002278125, 1, '656', '00:00:00', '62339523']
** x2_p types:  [<class 'NoneType'>, <class 'NoneType'>, <class 'NoneType'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'NoneType'>, <class 'str'>, <class 'NoneType'>, <class 'int'>, <class 'int'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'int'>, <class 'str'>, <class '

In [15]:
#
#

# raw, preliminary access to data:
delim = '|'
lines = sc.textFile(data_file_name)
#
n_terminal = 0
header_string = lines.take(1)[0]
#for c in header_string:
# for c in (lines.take(2)[1])[:-1]:
#     print('{}: [{}]'.format(c, ord(c)))

while header_string[-1] in ('\n', delim):
    header_string=header_string[:-1]
    n_terminal += 1
print('** n_terminal: ', n_terminal)
if n_terminal>0:
    lines = lines.map(lambda ln: ln[:-n_terminal])
#
header_names = header_string.split(delim) + ['JobID_parent']
RH_index = {s:k for k,s in enumerate(header_names) }
print('** Headers[{}]: '.format(len(header_names), header_names) )
#
# for c in (lines.take(2)[1]):
#     print('{}: [{}]'.format(c, ord(c)))

** n_terminal:  1
** Headers[23]: 


In [16]:
# use this to filter header rows:
n_startswith = 15
header_start = header_string[0:n_startswith]
#
rows = lines.filter(lambda s: not s.startswith(header_start) ).map(lambda x: f_rw(x, header_names,
                                            types_dict=types_dict, RH_index=RH_index) )
rows = rows.map(lambda rw: [None if (x is None or x=='') else re_typer.get(type(x), type(x))(x) for x in rw])
#
#sample_row = f_rw(lines.take(2)[1], header_names, RH_index)
#print('*** ', [(s, type(s) )for s in sample_row])

#my_dtypes = 
# we can either group and reduce using the RH{} index, or convert to a DF first:
rows_df = spark.createDataFrame( rows, header_names ).sort('JobID')

In [17]:
print('** rows_df schema[{}]: {}'.format(len(rows_df.schema), rows_df.schema))
print('** dytpes[{}]: {}'.format(len(rows_df.dtypes), rows_df.dtypes))
print('** header_names[{}]: {}'.format(len(header_names), header_names))

** rows_df schema[23]: StructType(List(StructField(User,StringType,true),StructField(Group,StringType,true),StructField(GID,StringType,true),StructField(JobName,StringType,true),StructField(JobID,StringType,true),StructField(JobIDRaw,StringType,true),StructField(Partition,StringType,true),StructField(State,StringType,true),StructField(Timelimit,DoubleType,true),StructField(NCPUS,LongType,true),StructField(NNodes,LongType,true),StructField(Submit,DoubleType,true),StructField(Eligible,DoubleType,true),StructField(Start,DoubleType,true),StructField(End,DoubleType,true),StructField(Elapsed,DoubleType,true),StructField(SystemCPU,DoubleType,true),StructField(UserCPU,DoubleType,true),StructField(TotalCPU,DoubleType,true),StructField(NTasks,LongType,true),StructField(CPUTimeRAW,StringType,true),StructField(Suspended,StringType,true),StructField(JobID_parent,StringType,true)))
** dytpes[23]: [('User', 'string'), ('Group', 'string'), ('GID', 'string'), ('JobName', 'string'), ('JobID', 'string'),

In [None]:
#rows_df.show()
print(rows_df.rdd.getNumPartitions())
rows_df = rows_df.repartition('JobID_parent').orderBy('Submit')
print(rows_df.rdd.getNumPartitions())
#

for rw in rows_df.take(10):
    print('** ', rw[:])

In [None]:
df_summary = df_rows.withColumn("row",row_number.over(w2))
      .where($"row" === 1).drop("row")

#### The end (of usefulness?)
- And this might be where usefullness ends.
- Looks like a real-programming language approach, of grouping/aggegating/reducing by grouping, pulling the first row by default, then substituting and aggregate for selected columns (but not really caring if some of the other columns are not identical in value) is not possible
- Looks like it needs to be much more SQL `group by` like. Can we write our own functions? Can we pass functions programmatically? Not seeing that so far...
- *Maybe!* I think we want to do it this way: https://sparkbyexamples.com/spark/spark-dataframe-how-to-select-the-first-row-of-each-group/
- Which I think is the DF method. There are ways to (maybe) do it in an RDD with or without converting to a key-value array (like {JobID_parent: full_row} ), but those are supposed to be much less compute efficient.


In [38]:
# Here, let's try a RDD_pair, then reduce_by_key() function:
# (this appears to work, but needs to be validated )
row_pairs = rows.map(lambda x: (x[RH_index['JobID_parent']], list(x[:])))
#
group_py_functions = {'End':numpy.nanmax, 'Start':numpy.nanmin, 'NCPUS':numpy.nanmax, 'NNodes':numpy.nanmax}
def f_reduce_row(r1, r2):
     return tuple([group_py_functions.get(hdr, lambda x: x[0] )([x1, x2]) 
             for k, (hdr,x1,x2) in enumerate(zip(header_names, r1, r2)) ] )
summary_rdd = row_pairs.reduceByKey(f_reduce_row)
#summary_rdd = row_pairs.reduceByKey(lambda x1,x2: x1)

In [39]:
for rw in summary_rdd.take(10):
    print('** ', rw)

**  ('62339657_34', ('pjwomble', 'gorelick', '26961', '6dff71d6eaf0c', '62339657_34', '62339848', 'serc', 'COMPLETED', 0.04097222222222222, 1, 1, 737485.0118865741, 737485.0118981481, 737485.0134027777, 737485.0287037037, 0.015300925925925926, 0.00012969907407407407, 0.014705127314814813, 0.014834837962962963, None, '1322', '00:00:0', '62339657_34'))
**  ('62339657_252', ('pjwomble', 'gorelick', '26961', '6dff71d6eaf0c', '62339657_252', '62341000', 'serc', 'COMPLETED', 0.04097222222222222, 1, 1, 737485.0118865741, 737485.0118981481, 737485.0169097222, 737485.0321759259, 0.015243055555555555, 0.0001260648148148148, 0.014826122685185187, 0.014952199074074073, None, '1317', '00:00:0', '62339657_252'))
**  ('62339657_381', ('pjwomble', 'gorelick', '26961', '6dff71d6eaf0c', '62339657_381', '62342165', 'serc', 'COMPLETED', 0.04097222222222222, 1, 1, 737485.0118865741, 737485.0118981481, 737485.0274652778, 737485.0428356482, 0.015370370370370371, 0.00012980324074074074, 0.014864444444444444, 

In [28]:
for rw in row_pairs.take(10):
    print('** ', rw)

**  ('62339523', ['saipb', 'oneillm', '328022', 'hovmuller', '62339523', '62339523', 'serc', 'COMPLETED', 4.0, 16, 1, 737485.0073611111, 737485.0073611111, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 2.7418981481481484e-05, 0.0002004050925925926, 0.00022783564814814812, None, '656', '00:00:0', '62339523'])
**  ('62339523', [None, None, None, 'batch', '62339523.batch', '62339523.batch', None, 'COMPLETED', None, 16, 1, 737485.0079166667, 737485.0079166667, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 2.7395833333333333e-05, 0.0002004050925925926, 0.0002278125, 1, '656', '00:00:0', '62339523'])
**  ('62339523', [None, None, None, 'extern', '62339523.extern', '62339523.extern', None, 'COMPLETED', None, 16, 1, 737485.0079166667, 737485.0079166667, 737485.0079166667, 737485.0083912037, 0.00047453703703703704, 1.1574074074074074e-08, 0.0, 1.1574074074074074e-08, 1, '656', '00:00:0', '62339523'])
**  ('62339657_0', ['pjwomble', 'gorelick', '26961', '6dff71d6e

In [29]:
print('** ', RH_index)
print('** ', header_names)

**  {'User': 0, 'Group': 1, 'GID': 2, 'JobName': 3, 'JobID': 4, 'JobIDRaw': 5, 'Partition': 6, 'State': 7, 'Timelimit': 8, 'NCPUS': 9, 'NNodes': 10, 'Submit': 11, 'Eligible': 12, 'Start': 13, 'End': 14, 'Elapsed': 15, 'SystemCPU': 16, 'UserCPU': 17, 'TotalCPU': 18, 'NTasks': 19, 'CPUTimeRAW': 20, 'Suspended': 21, 'JobID_parent': 22}
**  ['User', 'Group', 'GID', 'JobName', 'JobID', 'JobIDRaw', 'Partition', 'State', 'Timelimit', 'NCPUS', 'NNodes', 'Submit', 'Eligible', 'Start', 'End', 'Elapsed', 'SystemCPU', 'UserCPU', 'TotalCPU', 'NTasks', 'CPUTimeRAW', 'Suspended', 'JobID_parent']


In [None]:
# some examples using group(). Basically, not very useful for what we want to do, since we want to keep the
#.  non-grouped rows and we can't guarantee uniqueness of the extra col values.

# dummy(-ish) functions to handle grouping operations.
def grp_first(X):
    return X[0]
#
# can we use regular numpy functions, or do we need to use PySpark SQL functions?
import pyspark.sql.functions as psf
group_functions = {'End':psf.max, 'Start':psf.min, 'NCPUS':psf.max, 'NNodes':psf.max}
group_function_names = {'End':'max', 'Start':'min', 'NCPUS':'max', 'NNodes':'max'}
#group_functions = {'End':numpy.nanmax, 'Start':numpy.nanmin, 'NCPUS':numpy.nanmax, 'NNodes':numpy.nanmax}
#
# maybe this:?:
#jobs_summary =  rows_df.groupBy('JobID_parent').agg({cl:group_functions.get(cl, grp_first)
#                                                          for cl in header_names})

# jobs_summary =  rows_df.groupBy('JobID_parent').agg({cl:group_functions.get(cl, grp_first)
#                                                           for cl in header_names})
jobs_summary =  rows_df.groupBy('JobID_parent').agg({cl:f for cl,f in group_function_names.items()})
#jobs_summary = rows_df.groupBy('JobID_parent').agg({'End':'max'})
#


In [None]:
js_group = rows_df.groupBy('JobID_parent')

In [None]:
for rw in js_group.count().take(5) :
    print('*** ', rw)

In [None]:
collected = js_group.agg(psf.collect_list('JobID_parent'))

In [None]:
for rw in collected.take(5):
    print(rw)

In [None]:
# print('** lines: ')
# for ln in lines.take(10):
# 	print('** **: ', ln)
#
print('rows: ')
for rw in rows.take(10):
	print('** **: ', rw)
#
#
# print('** rows again: ')
# for rw in rows.take(10):
#         print('** **: ', rw)
#
print('** groups: ')
for rw in grouped.take(10):
    print('* *: ', rw)

In [None]:
print('begin all_rows[]: ')
t0 = time.time()
#
all_rows = rows.collect()
#
print('** time: {}'.format(time.time()-t0))

In [None]:
print('begin all_lines: ')
t0 = time.time()
#
all_lines = lines.collect()
#
print('** time: {}'.format(time.time()-t0))

In [None]:
print('** sc: ', sc.defaultParallelism)