In [43]:
for e1, e2 in zip([1,2,3,4], [1,2,3,4]):
                   print(e1,e2)
        
l=[(x, y) for x in [1,2,3,4] for y in [1,2,3,4]]
for e in l:
    print(e)


(1, 1)
(2, 2)
(3, 3)
(4, 4)
(1, 1)
(1, 2)
(1, 3)
(1, 4)
(2, 1)
(2, 2)
(2, 3)
(2, 4)
(3, 1)
(3, 2)
(3, 3)
(3, 4)
(4, 1)
(4, 2)
(4, 3)
(4, 4)


In [None]:
### Compute Spectrogram

import hdf5storage
from joblib import Parallel, delayed
import numpy as np
from scipy.signal import detrend
import nitime.algorithms as tsa

def compute_spec_each_seg(eeg_seg, NW, Fs):
    """
    Input:
    eeg_seg: numpy.array (point_num x channel_num)
    NW: Time-halfbandwidth product, 2 or 3 or 4 or ... #taper = 2NW-1
    Fs: sampling frequency in Hz

    Output:
    mt_pxx: the spectrum (freq_num x channel_num)
    freqs: frequencies (freq_num,)
    """
    point_num, channel_num = eeg_seg.shape
    # point_num, channel_num = eeg_seg.shape
    nfft = max(1 << (point_num - 1).bit_length(), point_num)
    freqs = np.arange(0, Fs, Fs * 1.0 / nfft)[:nfft // 2 + 1]  # list of frequencies

    mt_pxx = np.zeros((len(freqs), channel_num))  # create the array to contain the spectrum

    eeg_seg = detrend(eeg_seg, axis=0)  # remove the overall trend of the signal
    for chi in range(channel_num):
        _, pxx, _ = tsa.multi_taper_psd(eeg_seg[:, chi], Fs=Fs, NW=NW, adaptive=True, jackknife=False, low_bias=True,
                                        NFFT=nfft)
        mt_pxx[:, chi] = pxx

    return mt_pxx, freqs


def mtspecgram_shq(eeg, movingwin, fpass, NW, Fs):
    """
    Input:
    eeg: numpy.array (signal_length x channel_num)
    movingwin: [window length, window step] in seconds
    fpass: [low, higher] in Hz
    NW: Time-halfbandwidth product, 2 or 3 or 4 or ... #taper = 2NW-1
    Fs: sampling frequency in Hz

    Output:
    spect: the spectrogram (window_num x freq_num x channel_num)
    stimes: window starting times (window_num,)
    sfreqs: frequencies (freq_num,)
    """
    print(eeg.shape)
    # signal_length, channel_num = eeg.shape
    signal_length = eeg.shape[0]
    window_length = int(round(movingwin[0] * Fs))
    window_step = int(round(movingwin[1] * Fs))


    window_start = np.arange(0, signal_length - window_length + 1, window_step)  # starting point of each segment
    window_num = len(window_start)
    print('%d windows'%window_num)
    stimes = window_start * 1. / Fs

    # Parallel
    # n_jobs = 1  # number of cpus for parallel computing, -1 is all cpus
    n_jobs = -1  # number of cpus for parallel computing, -1 is all cpus
    verbose = 10  # verbosity in parallel computing
    # res = Parallel(n_jobs=n_jobs, verbose=verbose)(
    #    delayed(compute_spec_each_seg)(eeg[window_start[wi]:window_start[wi] + window_length, :], NW, Fs) for wi in range(window_num))

    #res = Parallel(n_jobs=n_jobs, verbose=verbose)(
    #   delayed(compute_spec_each_seg)(eeg[window_start[wi]:window_start[wi] + window_length].reshape(len(eeg[window_start[wi]:window_start[wi] + window_length]),1), NW, Fs) for wi in range(window_num))


    # Iteration
    res = []
    for wi in range(window_num):
        if wi%10==0:
            print(wi)
        #res.append(compute_spec_each_seg(eeg[window_start[wi]:window_start[wi] + window_length, :], NW, Fs))
        na=np.array(eeg[window_start[wi]:window_start[wi] + window_length])
        na=na.reshape(len(na), 1)
        res.append(compute_spec_each_seg(na, NW, Fs))

    sfreqs = res[0][1]
    freq_good_ids = np.logical_and(sfreqs >= fpass[0], sfreqs < fpass[1])
    sfreqs = sfreqs[freq_good_ids]

    spect = np.array([rr[0][freq_good_ids] for rr in res])  # the spectrogram

    return spect, stimes, sfreqs


# Subfunc
class load_mat():
    def __init__(self, filename, mode='r'):
        self.filename = filename
        self.mode = mode

    def __enter__(self):
        print('open')
        # self.open_file = open(self.filename, self.mode)
        return 0

    def __exit__(self, *args):
        # self.open_file.close()
        print('close')
        
def load(fp):
    matfile = hdf5storage.loadmat(fp)
    # print('Loading from %s' % fp)
    return matfile



def step1_resample_spectrogram(fileName0):
    #### 1.0 Loading data ####

    # - input -
    # dataPath = '/Data/'
    # print('dataPath',dataPath)

    # - output -
    # targetFolder = catstr('/Output/')

    #fileName0 = 'Case1_seg12.mat'

    print('Loading', fileName0)
    # fp=dataPath +fileName0
    fp = fileName0
    with load_mat(fp):
        matfile = load(fp)
        Fs = matfile['Fs']
        start_time = matfile['start_time']
        channels = matfile['channels']

        eeg_data = matfile['data']

    print('data', eeg_data.shape, Fs, start_time, channels)


    # New: Resample to 200 Hz!!! %
    # import scipy
    # print(Fs)
    # P=200
    # Q=int(Fs)
    # data=scipy.signal.resample(eeg_data.T, int(len(eeg_data.T) * P / Q))
    # # data = np.resample(data, 200, Fs)
    # eeg_data = data.T
    Fs = 200

    #### 1.1 Compute spectrograms ####
    # just a toy example
    #    eeg = np.random.rand(10000,6)
    #print('resampled',eeg_data.shape)
    spect, stimes, sfreqs = mtspecgram_shq(eeg_data[0,:].T, [2, 2], [0.5, 20], 2, Fs)
    print('spect',spect.shape,spect)
    print('stimes',stimes.shape,stimes)
    
    print('sfreqs',sfreqs.shape,sfreqs)
    return [1]


if __name__ == '__main__':
    fp="Case5/Case5_seg4.mat"
    #loadData()
    step1_resample_spectrogram(fp)

('Loading', 'Case5/Case5_seg4.mat')
open
close
('data', (32, 9999999), array([[ 255.99775]]), array([[u'01-30-2012 22:28:28']], 
      dtype='<U19'), array([[u'C3  '],
       [u'C4  '],
       [u'CZ  '],
       [u'F3  '],
       [u'F4  '],
       [u'F7  '],
       [u'F8  '],
       [u'FZ  '],
       [u'FP1 '],
       [u'FP2 '],
       [u'FPZ '],
       [u'O1  '],
       [u'O2  '],
       [u'P3  '],
       [u'P4  '],
       [u'PZ  '],
       [u'T3  '],
       [u'T4  '],
       [u'T5  '],
       [u'T6  '],
       [u'AUX1'],
       [u'EKG '],
       [u'CII '],
       [u'AUX4'],
       [u'AUX5'],
       [u'LOC '],
       [u'ROC '],
       [u'AUX8'],
       [u'T1  '],
       [u'T2  '],
       [u'A1  '],
       [u'A2  ']], 
      dtype='<U4'))
(9999999,)
24999 windows
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
59



23580
23590
23600
23610
23620
23630
23640
23650
23660
23670
23680
23690
23700
23710
23720
23730
23740
23750
23760
23770
23780
23790
23800
23810
23820
23830
23840
23850
23860
23870
23880
23890
23900
23910
23920
23930
23940
23950
23960
23970
23980
23990
24000
24010
24020
24030
24040
24050
24060
24070
24080
24090
24100
24110
24120
24130
24140
24150
24160
24170
24180
24190
24200
24210
24220
24230
24240
24250
24260
24270
24280
24290
24300
24310
24320
24330
24340
24350
24360
24370
24380
24390
24400
24410
24420
24430
24440
24450
24460
24470
24480
24490
24500
24510
24520
24530
24540
24550
24560
24570
24580
24590
24600
24610
24620
24630
24640
24650
24660
24670
24680
24690
24700
24710
24720
24730
24740
24750
24760
24770
24780
24790
24800
24810
24820
24830
24840
24850
24860
24870
24880
24890
24900
24910
24920
24930
24940
24950
24960
24970
24980
24990
('spect', (24999, 50, 1), array([[[  7.19973528e+01],
        [  7.75632802e+01],
        [  4.04923230e+01],
        ..., 
        [  7.99193389e-0

In [29]:
def loadFile(fileName0):
    print('loading',fileName0)
    r=fileName0
    mat = hdf5storage.loadmat(fileName0) 
    #spect, stimes, sfreqs = mtspecgram_shq(mat['data'], [2,2], [0.5, 20], 2, 200)
    #print mat['data']
    sampleRate = np.round(mat['Fs'])
    #print sampleRate
    #os.remove(os.path.basename(fileName0))
    #transpose
    rawDat = pd.DataFrame(mat['data'])
    rawDat = rawDat.T
    #label columns
    #add time column
    #calculate time column
    fileNum = r[:len(r)-4]
    fileNum = re.search('(\d+)$', fileNum).group(0)
    #print fileNum
    rawDat.reset_index(inplace=True)
    #print rawDat
    #rawDat['index']=rawDat.index
    rawDat['time'] = rawDat['index'].apply(lambda e: timeStamp(e,fileNum,sampleRate))
    caseNum = int(re.search(r'\d+', fileName0).group())
    rawDat['case'] = caseNum

    return rawDat

def timeStamp(ind,fileNum,sampleRate):
    start = (int(fileNum) - 1) * 99999
    row = start + int(ind)
    time = row*int(sampleRate)
    return time

import pandas as pd
import re
if __name__ == '__main__':
    fp="Case5/Case5_seg4.mat"
    #loadData()
    print(loadFile(fp))

('loading', 'Case5/Case5_seg4.mat')
           index          0          1          2          3          4  \
0              0  43.065867  27.115546  43.331706  41.736674  29.773933   
1              1  46.787609  31.900642  48.914318  46.787609  33.495674   
2              2  45.458415  28.976417  50.775189  44.926738  31.634804   
3              3  44.395061  28.178901  49.445996  40.407480  29.773933   
4              4  44.926738  28.178901  47.053447  41.736674  30.305610   
5              5  45.458415  30.039772  50.243512  44.926738  34.293191   
6              6  46.787609  29.508094  54.496931  46.787609  32.963997   
7              7  41.736674  25.786353  49.445996  39.078287  27.913062   
8              8  36.685739  22.064611  43.331706  35.356545  25.254675   
9              9  36.154061  23.127966  42.002512  36.685739  26.052191   
10            10  38.546610  25.786353  45.724254  39.875803  27.115546   
11            11  35.356545  21.267095  43.863383  32.963997  24

In [44]:
# Load

data = ["Case5/Case5_seg4.mat","Case5/Case5_seg3.mat"]
distData = sc.parallelize(data)
distData.repartition(len(data))
distData.cache()

ParallelCollectionRDD[61] at parallelize at PythonRDD.scala:475

In [45]:
justData3 = distData.flatMap(step1_resample_spectrogram)
justData3.take(2)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 10 times, most recent failure: Lost task 0.9 in stage 11.0 (TID 119, yp-spark-dal09-env5-0021, executor 18213d4d-31bb-4ac8-b442-3904b593af90): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
    process()
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/src/spark21master/spark/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-36-9cb729501f9e>", line 130, in step1_resample_spectrogram
  File "<ipython-input-36-9cb729501f9e>", line 108, in load
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/hdf5storage/__init__.py", line 1768, in loadmat
    with h5py.File(filename, mode='r') as f:
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 271, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 101, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2840)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2798)
  File "h5py/h5f.pyx", line 78, in h5py.h5f.open (/tmp/pip-rdtLFq-build/h5py/h5f.c:2117)
IOError: Unable to open file (Unable to open file: name = 'case5/case5_seg4.mat', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:326)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:290)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1153)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.lang.Thread.run(Thread.java:785)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1442)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1430)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1429)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1429)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1657)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1612)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at java.lang.Thread.getStackTrace(Thread.java:1117)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:629)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1957)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:95)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:55)
	at java.lang.reflect.Method.invoke(Method.java:507)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:785)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
    process()
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/src/spark21master/spark/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-36-9cb729501f9e>", line 130, in step1_resample_spectrogram
  File "<ipython-input-36-9cb729501f9e>", line 108, in load
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/hdf5storage/__init__.py", line 1768, in loadmat
    with h5py.File(filename, mode='r') as f:
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 271, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 101, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2840)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2798)
  File "h5py/h5f.pyx", line 78, in h5py.h5f.open (/tmp/pip-rdtLFq-build/h5py/h5f.c:2117)
IOError: Unable to open file (Unable to open file: name = 'case5/case5_seg4.mat', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:326)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:290)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1153)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [34]:
justData3 = distData.flatMap(loadFile)
justData3.cache()
print(justData3)
justData3.take(2)

PythonRDD[53] at RDD at PythonRDD.scala:48


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 10 times, most recent failure: Lost task 0.9 in stage 10.0 (TID 109, yp-spark-dal09-env5-0027, executor bf8add15-869e-4583-b14b-28161ca67e25): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
    process()
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-29-f8665c8abae2>", line 4, in loadFile
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/hdf5storage/__init__.py", line 1768, in loadmat
    with h5py.File(filename, mode='r') as f:
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 271, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 101, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2840)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2798)
  File "h5py/h5f.pyx", line 78, in h5py.h5f.open (/tmp/pip-rdtLFq-build/h5py/h5f.c:2117)
IOError: Unable to open file (Unable to open file: name = 'case5/case5_seg4.mat', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:326)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:339)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:337)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:980)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:955)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:895)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:955)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:701)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:337)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:326)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:290)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1153)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.lang.Thread.run(Thread.java:785)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1442)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1430)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1429)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1429)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:803)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:803)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1657)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1612)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at java.lang.Thread.getStackTrace(Thread.java:1117)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:629)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1957)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:95)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:55)
	at java.lang.reflect.Method.invoke(Method.java:507)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:785)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
    process()
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/src/spark21master/spark-2.1.0-bin-2.7.3/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-29-f8665c8abae2>", line 4, in loadFile
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/hdf5storage/__init__.py", line 1768, in loadmat
    with h5py.File(filename, mode='r') as f:
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 271, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
  File "/gpfs/fs01/user/seff-34c2f0d3dcc620-a916a00b641d/.local/lib/python2.7/site-packages/h5py/_hl/files.py", line 101, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2840)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-rdtLFq-build/h5py/_objects.c:2798)
  File "h5py/h5f.pyx", line 78, in h5py.h5f.open (/tmp/pip-rdtLFq-build/h5py/h5f.c:2117)
IOError: Unable to open file (Unable to open file: name = 'case5/case5_seg4.mat', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:326)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:339)
	at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:337)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:980)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:955)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:895)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:955)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:701)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:337)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:326)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:290)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1153)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [15]:
!ls Case3/Case3_seg4.mat

Case3/Case3_seg4.mat
