In [1]:
import numpy as np
import io
import urllib
import requests


In [2]:
# simple test function that retrieves a complete file, prints its size and returns its content
# can switch between urllib and requests library
def get_url(url,use_requests=False):
    """Download the complete resource at `url`, print its size and return it.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    use_requests : bool, optional
        If True the download goes through the `requests` library,
        otherwise (default) through `urllib.request`.

    Returns
    -------
    bytes
        The full body of the response.

    Raises
    ------
    urllib.error.URLError
        From the urllib back end (unreachable host, HTTP error status, ...).
    requests.exceptions.RequestException
        From the requests back end (connection problems, HTTP error status).
    """
    if use_requests:
        resp=requests.get(url)
        # urlopen raises on HTTP error codes; do the same here so that both
        # back ends fail in the same situations instead of returning an error page
        resp.raise_for_status()
        data=resp.content
    else:
        # a plain `import urllib` does not load the `request` submodule
        import urllib.request
        # the context manager closes the connection once the body has been read
        with urllib.request.urlopen(url) as resp:
            data=resp.read()
    print("%s : \nsize of data [bytes] : %d"%(url,len(data)))
    return data

In [3]:
# listing in HTML
# Fetch the top-level URL of the data server. Per the note above the answer is
# expected to be an HTML listing -- TODO confirm; the recorded run of this cell
# failed because the host name could not be resolved.
u='http://virgo02/turbdb'
# %time reports how long the download takes; get_url itself prints the size in bytes
%time r=get_url(u)
# bare expression: show the returned data as the cell output
r

URLError: <urlopen error [Errno -2] Name or service not known>

In [4]:
%%time
u='http://virgo02/turbdb/turbdb101_0.bin'
%time d=get_url(u)
n=int(len(d)/4)
%time _arr=np.frombuffer(d,dtype=np.float32,count=n)
d=None
%time print("# cells", n,", average value:",np.average(_arr))



URLError: <urlopen error [Errno -2] Name or service not known>

# Cutting chunks

In [55]:
# single chunk is easy
ncells=512
ranges=[(0,ncells*3*4-1)]
# url of a file
url='http://virgo02/turbdb/turbdb101_30.bin'
# define the Range HTTP header
headers={"Range":"bytes=%s" % ",".join("%d-%d" % (g) for g in ranges)}
# retrieve the data
resp=requests.get(url,headers=headers,stream=True)
if resp.status_code != 206:
    print("error in response code ",resp.status_code)
d=resp.raw.read()
n=int(len(d)/4)
%time _arr=np.frombuffer(d,dtype=np.float32,count=n)
%time print("# cells", n,", average value:",np.average(_arr))


CPU times: user 39.1 ms, sys: 282 ms, total: 321 ms
Wall time: 353 ms
# cells 1536 , average value: -0.470825
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 982 µs


In [56]:
# when >1 ranges present, more complex
# assuming the data are organized in 12-byte triples, of 3 floats

def _split_byteranges(content_type, body):
    """Cut a multipart/byteranges response body into its individual parts.

    Parameters
    ----------
    content_type : str
        Value of the Content-Type response header; it must carry the
        `boundary` parameter that separates the parts.
    body : bytes
        The complete response body.

    Returns
    -------
    tuple
        `(delimiter, parts)`: `delimiter` is the boundary with its leading
        "--" as bytes, `parts` is a list of `((first, last), payload)` in the
        order the server sent them. `first`/`last` are the inclusive byte
        offsets taken from the part's own Content-Range header and `payload`
        holds exactly `last-first+1` bytes.

    Raises
    ------
    ValueError
        If the Content-Type has no boundary, a Content-Range header cannot be
        parsed, or a part is shorter than its Content-Range announces.
    """
    boundary = None
    for param in content_type.split(";")[1:]:
        # split on the first "=" only: the boundary itself may contain "="
        key, _sep, value = param.strip().partition("=")
        if key.strip().lower() == "boundary":
            # the value may legally be wrapped in double quotes
            boundary = value.strip().strip('"')
    if not boundary:
        raise ValueError("no multipart boundary in Content-Type %r" % content_type)
    delimiter = ("--%s" % boundary).encode()

    parts = []
    for chunk in body.split(delimiter):
        # part headers and payload are separated by one empty line
        head, sep, payload = chunk.partition(b"\r\n\r\n")
        if not sep:
            # preamble before the first boundary or the closing "--" marker
            continue
        span = None
        for line in head.split(b"\r\n"):
            name, _sep, value = line.partition(b":")
            if name.strip().lower() == b"content-range":
                # value looks like b"bytes 0-11/1234"
                fields = value.split()
                if fields:
                    span = fields[-1].split(b"/")[0]
        if span is None:
            continue
        first, _sep, last = span.partition(b"-")
        first, last = int(first), int(last)
        size = last - first + 1
        if len(payload) < size:
            raise ValueError("part %d-%d is truncated: got %d of %d bytes"
                             % (first, last, len(payload), size))
        # slicing to the announced size also drops the CRLF in front of the next boundary
        parts.append(((first, last), payload[:size]))
    return delimiter, parts

# define a few ranges that will retrieve these triples
ranges=[(0,11), (12,23),(24,47),(144,155)]
# url of a file
url='http://virgo02/turbdb/turbdb101_30.bin'
# define the Range HTTP header
headers={"Range":"bytes=%s" % ",".join("%d-%d" % (g) for g in ranges)}
# retrieve the data
resp=requests.get(url,headers=headers,stream=True)
# anything but 206 (Partial Content) means the ranges were not honoured
if resp.status_code != 206:
    raise RuntimeError("error in response code %d" % resp.status_code)
# if there are multiple ranges, the result is a complex structure defined by the Content-Type header
# which defines a boundary separating the byte streams of the individual ranges
ct=resp.headers['Content-Type']
boundary,parts=_split_byteranges(ct,resp.raw.read())
print("boundary=",boundary)

# a server may merge or reorder the requested ranges, so each part is interpreted
# by its own Content-Range rather than by its position in `ranges`
returned=[rng for rng,_payload in parts]
if returned != ranges:
    print("note: server returned ranges",returned,"instead of",ranges)

count=0
for _range,chunk in parts:
    # the chunk is a pure array of bytes whose length is given by its range
    # define the length of the array of 4-byte floats
    n=(_range[1]-_range[0]+1)//4
    # read the array
    narr=np.frombuffer(chunk,dtype=np.float32,count=n)
    print(narr)
    count+=1

boundary= b'--<q1w2e3r4t5y6u7i8o9p0zaxscdvfbgnhmjklkl>'
[-0.65512633 -1.03012335 -0.11621436]
[-0.63719344 -1.04078746 -0.12809123]
[-0.61377287 -1.0589366  -0.12283614 -0.58266103 -1.05651009 -0.10981002]
[-0.55073839 -1.02768779 -0.03384927]
