# First example with HDF5

In [1]:
import h5py
import ujson
import gzip
import ast
from pandas import DataFrame
from toolz import dissoc
from toolz import dissoc, partition_all
from castra import Castra
import time
import datetime
import dask.dataframe as dd
import dask.bag as db
from dask.diagnostics import ProgressBar
import numpy as np
import pandas as pd
from pandas import HDFStore

## 1) Creating with h5py and reading with Dask: conflict between h5py and Dask?

In [53]:
# Scratch HDF5 file used by the h5py experiments in this section.
path_to_file = "/home/ec2-user/amazon_dataset/data.hdf5"
chunks = 10 #writing 10 lines each time
df = pd.DataFrame(np.random.randn(chunks, 5))
# Starting from 0, pointing to the first "free" position. Derived from the
# frame that is actually written so it cannot drift from the data.
current_index = len(df)

# "w" starts from an empty file; maxshape=(None,None) leaves both axes
# unbounded so the dataset can be resized by later cells.
with h5py.File(path_to_file, "w") as hf:
    hf.create_dataset('amazon', data=df,maxshape=(None,None))

In [54]:
# Re-open the file read-only and report which datasets it contains.
with h5py.File(path_to_file, "r") as hf:
    print 'List of arrays in this file: \n', hf.keys()
    data = hf.get('amazon')
    # Copy the on-disk dataset into an in-memory NumPy array before asking
    # for its shape.
    np_data = np.array(data)
    print'Shape of the array amazon: \n', np_data.shape

List of arrays in this file: 
[u'amazon']
Shape of the array amazon: 
(10, 5)


Trying to resize the dataset and append more rows to the HDF5 file:

In [55]:
with h5py.File(path_to_file, "r+") as hf:
    data = hf.get('amazon')
    df = pd.DataFrame(np.random.randn(10, 5)) #New dataframe to add
    data.resize((data.shape[0]+chunks,data.shape[1]))
    data[current_index:current_index+chunks] = df
    print 'List of arrays in this file: \n', hf.keys()
    print 'Hierarchy:',data.name
    np_data = np.array(data)
    print'Shape of the array amazon: \n', np_data.shape
    print 'Data:\n',data[:]

List of arrays in this file: 
[u'amazon']
Hierarchy: /amazon
Shape of the array amazon: 
(20, 5)
Data:
[[ 0.33615614  0.92148885  0.80791913 -0.07501982  1.50430268]
 [ 0.7479168   0.65582071 -0.71301003  0.9000312   2.12067986]
 [ 0.28266367 -0.93705573 -0.93001321 -0.04424039  0.37498966]
 [ 0.69727914  0.88918464 -2.12194092  1.03283473 -0.23717406]
 [-0.22126272 -0.18595924  0.70276079 -0.25828817  0.03022614]
 [-0.25548966 -0.50407185 -0.68975554  0.67881699  0.375861  ]
 [ 1.68238172 -0.94102896 -0.91619115 -0.22347315 -0.48222473]
 [ 0.44037292  0.1177223   0.73702812 -0.71398744  1.0187634 ]
 [-0.99370112 -0.366224   -0.56279895  0.26389202 -0.59253603]
 [-0.71683573  1.2921071  -1.59845757  0.719606   -0.8612614 ]
 [ 0.72463843  0.07863037  0.00382157 -1.39810272 -1.81331261]
 [-0.45035738  0.7919192  -0.40131715  0.12347294 -0.81692247]
 [ 0.07909288  1.13137146  1.43555496 -1.92644076  1.05954108]
 [ 0.68715044 -0.55307139 -1.49701966 -1.61766041  0.59391732]
 [-2.79255144 -

In [27]:
# Try to load the dataset written with h5py through dask. As the recorded
# output shows, this call raises ValueError.
# NOTE(review): presumably dask/pandas only understand nodes written in the
# pandas (PyTables) layout -- confirm.
dd.read_hdf(path_to_file,'/amazon')

ValueError: Input must be a list longer than 0

## 2) Creating HDF5 with Pandas

In [10]:
#First DF: this call creates the '/amazon' node. format='table' is what
#makes the node appendable afterwards.
df = pd.DataFrame(np.random.randn(10, 5))
df.to_hdf(path_to_file,'/amazon',format='table')

#Second and third DF: same shape, appended to the existing table.
for batch_number in range(2):
    df = pd.DataFrame(np.random.randn(10, 5))
    df.to_hdf(path_to_file,'/amazon',format='table',append=True)

#Check status
# The inspection only reads, so the file is opened read-only.
with h5py.File(path_to_file, "r") as hf:
    data = hf.get('amazon/table')
    print(data.shape)
    print(data[:])

(30,)
[ (0, [1.4563122393852164, -0.24982073624775278, -0.3293450140017712, -2.772855602530441, -0.058142051682005445])
 (1, [0.7574309530244978, 0.24523064187019356, 1.410097766976578, -1.3332376792617133, -0.08484400847375759])
 (2, [0.38521326209434337, -0.7607452854170671, -1.138418831321935, -0.6150874524000192, 0.8428532223471128])
 (3, [0.721336005389751, -1.1442249074591915, 1.1684689945463134, -2.16217405435812, -1.004274068492681])
 (4, [0.6050078944205708, -0.6643434801552311, -1.762136984961981, 0.7668257407776377, -1.2093949801044968])
 (5, [0.15287912986610439, -0.8987083600616191, 0.5651140207231614, -0.43412461537983404, 0.12622315269492265])
 (6, [0.4849829072057437, -0.30284917884973395, 0.08285398467388008, 0.24550263975627878, 1.9310003355850383])
 (7, [0.14906799301316268, -0.40163177754128215, 0.8621989270922575, -1.29392807633283, -1.146961931116345])
 (8, [-0.3692064759819252, 0.6833280151980384, -0.27567742013271074, -0.20314684750770887, -0.3284001586766101])


In [11]:
# NOTE: the two triple-quoted blocks in this cell are disabled experiments
# kept as string literals; they are not executed. The second one is the last
# expression of the cell, which is why the notebook echoes it as the output.
'''df = pd.DataFrame(np.random.randn(10, 5))
df.to_hdf(path_to_file,'/amazon',format='table')'''
ddf = dd.read_hdf(path_to_file,key='/amazon') #Dask can correctly read an HDF5 file created with Pandas
'''with h5py.File(path_to_file, "r+") as hf:
    data = hf.get('amazon/table')
    print data[:]
    df = pd.DataFrame(np.random.randn(10, 5))
    print data.shape
    data.resize((data.shape[0]+chunks,))
    print data.shape
    data[current_index:current_index+chunks] = df
    print 'Data:\n',data[:]'''

'with h5py.File(path_to_file, "r+") as hf:\n    data = hf.get(\'amazon/table\')\n    print data[:]\n    df = pd.DataFrame(np.random.randn(10, 5))\n    print data.shape\n    data.resize((data.shape[0]+chunks,))\n    print data.shape\n    data[current_index:current_index+chunks] = df\n    print \'Data:\n\',data[:]'

In [12]:
# Evaluate the lazy dask dataframe and display the resulting table.
ddf.compute()

Unnamed: 0,0,1,2,3,4
0,1.456312,-0.249821,-0.329345,-2.772856,-0.058142
1,0.757431,0.245231,1.410098,-1.333238,-0.084844
2,0.385213,-0.760745,-1.138419,-0.615087,0.842853
3,0.721336,-1.144225,1.168469,-2.162174,-1.004274
4,0.605008,-0.664343,-1.762137,0.766826,-1.209395
5,0.152879,-0.898708,0.565114,-0.434125,0.126223
6,0.484983,-0.302849,0.082854,0.245503,1.931
7,0.149068,-0.401632,0.862199,-1.293928,-1.146962
8,-0.369206,0.683328,-0.275677,-0.203147,-0.3284
9,-0.571848,-0.977001,-0.046565,1.113768,1.912524


## 3) DataFrames with data from JSON files

In [2]:
# Directory holding the raw Amazon dump, the HDF5 file built from it, and the
# JSON file that is converted in this section.
path_to = "/home/ec2-user/amazon_dataset/"
path_to_file = path_to + "data.hdf5"
f = 'reviews_Musical_Instruments_5.json'

# Column order used when building DataFrames from review / metadata records.
reviews_columns = [
    'asin', 'reviewerID', 'reviewerName', 'overall',
    'summary', 'reviewText', 'reviewTime', 'unixReviewTime',
]
metadata_columns = [
    'asin', 'title', 'price', 'imUrl', 'related', 'also_bought',
    'also_viewed', 'bought_together', 'salesRank', 'brand', 'categories',
]

# Variable-length string dtype, used as the dtype of the h5py dataset.
str_dt = h5py.special_dtype(vlen=str)
#dtypes = np.dtype([('asin',str_dt),('reviewerID',str_dt),('reviewerName',str_dt),('overall',float),('summary',str_dt),
#                  ('reviewText',str_dt),('reviewTime',str_dt),('unixReviewTime',int)])

chunksize = 5000   # number of JSON lines converted per batch
current_index = 0  # first free row of the dataset

#Convert a line of JSON into a dict.
def to_json(line):
    """Parse one line of JSON text into a dict.

    Only text (unicode) input is encoded to UTF-8. Lines read from a file
    opened in mode 'r' under Python 2 are already byte strings; calling
    ``.encode('utf8')`` on those triggers an implicit ASCII decode and raises
    UnicodeDecodeError as soon as a line contains a non-ASCII character, so
    byte strings are handed to the parser unchanged.

    :param line: one JSON document as a byte string or unicode string.
    :return: the decoded object as returned by ``ujson.loads``.
    """
    if not isinstance(line, bytes):
        line = line.encode('utf8')
    return ujson.loads(line)

#Convert a line that is not valid JSON (it uses single quotes) into a dict.
def fix_json(line):
    """Evaluate ``line`` as a Python literal and return the resulting object.

    ``ast.literal_eval`` only accepts literals (dicts, lists, strings,
    numbers, ...), so no arbitrary code from the input is executed.
    """
    record = ast.literal_eval(line)
    return record

#Convert a list of JSON strings into a DataFrame
def to_df(batch,filename):
    """Parse every line of ``batch`` and assemble the records into a DataFrame.

    :param batch: iterable of raw text lines, one record per line.
    :param filename: base name of the source file. The value 'metadata'
        selects the single-quote parser and the metadata column list; any
        other value selects the JSON parser and the review column list.
    :return: DataFrame with one row per line of ``batch``.
    """
    if filename != 'metadata':
        records = [to_json(raw_line) for raw_line in batch]
        return DataFrame.from_records(records, columns=reviews_columns)
    records = [fix_json(raw_line) for raw_line in batch]
    return DataFrame.from_records(records, columns=metadata_columns)

def create_new(path_to_file,df, current_index):
    """Create the HDF5 file and store ``df`` as the resizable 'amazon' dataset.

    :param path_to_file: path of the HDF5 file; an existing file is replaced.
    :param df: first batch of rows to store (values are written as
        variable-length strings via the module-level ``str_dt``).
    :param current_index: row index before this write.
    :return: ``(hf, new_index)``. ``hf`` is the file handle, which is already
        closed when this function returns; ``new_index`` is ``current_index``
        advanced by the number of rows written.
    """
    with h5py.File(path_to_file, "w") as hf:
        hf.create_dataset('amazon', data=df,maxshape=(None,None),dtype=str_dt)
    # Advance by the rows actually written, not by the global batch size, so
    # the index stays correct when the first batch is shorter than a chunk.
    current_index += len(df)
    return hf, current_index

def extend_hdf5(path_to_file,df, current_index, chunksize):
    """Grow the 'amazon' dataset and write ``df`` starting at ``current_index``.

    :param path_to_file: path of an existing HDF5 file holding 'amazon'.
    :param df: rows to write.
    :param current_index: first row of the dataset to write to.
    :param chunksize: nominal batch size; a shorter ``df`` reduces the number
        of rows added to ``len(df)``.
    :return: the row index just past the rows that were written.
    """
    with h5py.File(path_to_file, "r+") as hf:
        dataset = hf.get('amazon')
        rows_to_write = min(len(df), chunksize)
        next_index = current_index + rows_to_write
        # Make room along the first axis only; the column count is unchanged.
        dataset.resize((dataset.shape[0] + rows_to_write, dataset.shape[1]))
        dataset[current_index:next_index] = df
        return next_index

def create_hdf5(fullpath,chunksize,current_index):
    """Convert a line-delimited JSON file into the HDF5 file, batch by batch.

    The first batch creates the file through ``create_new``; every following
    batch is added through ``extend_hdf5``. The output location is the
    module-level ``path_to_file``, not a parameter of this function.

    :param fullpath: path of the input file; its base name (without
        extension) is passed to ``to_df`` to choose the parser.
    :param chunksize: number of input lines per batch.
    :param current_index: row index at which writing starts.
    """
    filename = fullpath.split('/')[-1].split('.')[0]
    with open(fullpath,'r') as f:
        # ``store`` stays None until the first batch has created the file.
        store = None
        for batch in partition_all(chunksize, f):
            df = to_df(batch,filename)
            if store is None:
                store, current_index = create_new(path_to_file,df,current_index)
            else:
                current_index = extend_hdf5(path_to_file,df, current_index, chunksize)

In [3]:
# Build the HDF5 file from the reviews JSON file configured above.
create_hdf5(path_to+f,chunksize,current_index)

In [4]:
# Read the whole dataset back. The file is opened read-only inside a context
# manager so the handle is released before other libraries open the same
# file, and the global `f` (the JSON file name) is no longer overwritten.
with h5py.File(path_to_file, "r") as hf:
    amazon_rows = hf['amazon'][:]
amazon_rows

array([['1384719342', 'A2IBPI20UZIR0U',
        'cassandra tu "Yeah, well, that\'s just like, u...', ...,
        "Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,",
        '02 28, 2014', '1393545600'],
       ['1384719342', 'A14VAT5EAX3D9S', 'Jake', ...,
        "The product does exactly as it should and is quite affordable.I did not realized it was double screened until it arrived, so it was even better than I had expected.As an added bonus, one of the screens carries a small hint of the smell of an old grape candy I used to buy, so for reminiscent's sake, I cannot stop putting the pop filter next to my nose and smelling it after recording. :DIf you needed a pop filter, this will work just as well as the expensive ones, and it may even come with a pleasing aroma like

In [5]:
# Try to load the h5py-written string dataset through dask. The recorded
# output shows a "variable length strings are not supported yet" warning
# followed by ValueError, so this file cannot be read this way.
ddf = dd.read_hdf(path_to_file,'/amazon')


  variable length strings are not supported yet

The leaf will become an ``UnImplemented`` node.
  % (self._g_join(childname), exc))


ValueError: Input must be a list longer than 0

## Second Version

In [4]:
# Directory holding the raw Amazon dump, the HDF5 file built from it, and the
# JSON file that is converted in this section.
path_to = "/home/ec2-user/amazon_dataset/"
path_to_file = path_to + "data.hdf5"
f = 'reviews_Musical_Instruments_5.json'

# Column order for review records; 'reviewerName' is not part of this version.
reviews_columns = [
    'asin', 'reviewerID', 'overall', 'summary',
    'reviewText', 'reviewTime', 'unixReviewTime',
]
metadata_columns = [
    'asin', 'title', 'price', 'imUrl', 'related', 'also_bought',
    'also_viewed', 'bought_together', 'salesRank', 'brand', 'categories',
]
#str_dt = h5py.special_dtype(vlen=unicode)
#dtypes = np.dtype([('asin',str_dt),('reviewerID',str_dt),('reviewerName',str_dt),('overall',float),('summary',str_dt),
 #                  ('reviewText',str_dt),('reviewTime',str_dt),('unixReviewTime',int)])

chunksize = 5000   # number of JSON lines converted per batch
current_index = 0  # running row counter

#Convert a line of JSON into a cleaned up dict.
def to_json(line):
    """Parse one line of JSON text and drop the 'reviewerName' field.

    Only text (unicode) input is encoded to UTF-8. Lines read from a file
    opened in mode 'r' under Python 2 are already byte strings; calling
    ``.encode('utf8')`` on those triggers an implicit ASCII decode and raises
    UnicodeDecodeError as soon as a line contains a non-ASCII character, so
    byte strings are handed to the parser unchanged.

    :param line: one JSON document as a byte string or unicode string.
    :return: the decoded dict without its 'reviewerName' key.
    """
    if not isinstance(line, bytes):
        line = line.encode('utf8')
    return dissoc(ujson.loads(line),'reviewerName')

#Convert a line that is not valid JSON (it uses single quotes) into a dict.
def fix_json(line):
    """Evaluate ``line`` as a Python literal and return the resulting object.

    ``ast.literal_eval`` only accepts literals (dicts, lists, strings,
    numbers, ...), so no arbitrary code from the input is executed.
    """
    record = ast.literal_eval(line)
    return record

#Convert a list of JSON strings into a DataFrame
def to_df(batch,filename):
    """Parse every line of ``batch`` and assemble the records into a DataFrame.

    :param batch: iterable of raw text lines, one record per line.
    :param filename: base name of the source file. The value 'metadata'
        selects the single-quote parser and the metadata column list; any
        other value selects the JSON parser and the review column list.
    :return: DataFrame with one row per line of ``batch``.
    """
    if filename != 'metadata':
        records = [to_json(raw_line) for raw_line in batch]
        return DataFrame.from_records(records, columns=reviews_columns)
    records = [fix_json(raw_line) for raw_line in batch]
    return DataFrame.from_records(records, columns=metadata_columns)

def create_new(path_to_file,df, current_index):
    """Write the first batch to '/amazon' as an appendable pandas table.

    Columns whose inferred type is 'unicode' are converted to ``str`` in
    place on ``df`` before writing, and the inferred column types are printed
    afterwards as a sanity check.

    :param path_to_file: path of the HDF5 file.
    :param df: first batch of rows; modified in place (see above).
    :param current_index: row counter before this write.
    :return: ``('blabla', new_index)``. The string is only a non-None
        placeholder for the caller; ``new_index`` is ``current_index``
        advanced by the number of rows written.
    """
    types = df.apply(lambda x: pd.lib.infer_dtype(x.values))
    for col in types[types=='unicode'].index:
        # NOTE(review): under Python 2 astype(str) presumably fails on
        # non-ASCII text -- confirm against the real data.
        df[col] = df[col].astype(str)
    print(df.apply(lambda x: pd.lib.infer_dtype(x.values)))
    # min_itemsize fixes the stored width of the two free-text columns when
    # the table is created, so later batches with longer strings still fit.
    df.to_hdf(path_to_file,'/amazon',format='table',data_columns=True,min_itemsize={'summary':150,'reviewText':12000})
    # Advance by the rows actually written rather than the global batch size.
    current_index += len(df)
    return 'blabla', current_index

def extend_hdf5(path_to_file,df, current_index, chunksize):
    """Append ``df`` to the existing '/amazon' pandas table.

    Columns whose inferred type is 'unicode' are converted to ``str`` in
    place on ``df`` before writing.

    :param path_to_file: path of the HDF5 file holding the table.
    :param df: rows to append; modified in place (see above).
    :param current_index: row counter before this write.
    :param chunksize: nominal batch size. Kept for interface compatibility;
        the counter is advanced by the real batch length instead.
    :return: ``current_index`` advanced by the number of rows appended.
    """
    types = df.apply(lambda x: pd.lib.infer_dtype(x.values))
    for col in types[types=='unicode'].index:
        df[col] = df[col].astype(str)
    df.to_hdf(path_to_file,'/amazon',format='table',data_columns=True,append=True)
    # The last batch of a file is usually shorter than ``chunksize``; count
    # the rows actually appended so the counter matches the table length.
    current_index += len(df)
    return current_index

def create_hdf5(fullpath,chunksize,current_index):
    """Convert a line-delimited JSON file into the HDF5 table, batch by batch.

    The first batch creates the table through ``create_new``; every following
    batch is appended through ``extend_hdf5``. The output location is the
    module-level ``path_to_file``, not a parameter of this function.

    :param fullpath: path of the input file; its base name (without
        extension) is passed to ``to_df`` to choose the parser.
    :param chunksize: number of input lines per batch.
    :param current_index: row counter at which writing starts.
    """
    filename = fullpath.split('/')[-1].split('.')[0]
    with open(fullpath,'r') as f:
        # ``store`` stays None until the first batch has created the table.
        store = None
        for batch in partition_all(chunksize, f):
            df = to_df(batch,filename)
            if store is None:
                store, current_index = create_new(path_to_file,df,current_index)
            else:
                current_index = extend_hdf5(path_to_file,df, current_index, chunksize)

In [5]:
# Build the HDF5 table from the reviews JSON file configured above. The
# column types printed below come from create_new.
create_hdf5(path_to+f,chunksize,current_index)

asin                string
reviewerID          string
overall           floating
summary             string
reviewText          string
reviewTime          string
unixReviewTime     integer
dtype: object


be ready to see PyTables asking for *lots* of memory and possibly slow
I/O.  You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the *main*
dimension of this leave.  Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it.


In [6]:
#Check status
# Look at the raw table node that pandas created. The inspection only reads,
# so the file is opened read-only.
with h5py.File(path_to_file, "r") as hf:
    data = hf.get('amazon/table')
    print(data.shape)
    print(data[:10])

(10261,)
[ (0, '1384719342', 'A2IBPI20UZIR0U', 5.0, 'good', "Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,", '02 28, 2014', 1393545600)
 (1, '1384719342', 'A14VAT5EAX3D9S', 5.0, 'Jake', "The product does exactly as it should and is quite affordable.I did not realized it was double screened until it arrived, so it was even better than I had expected.As an added bonus, one of the screens carries a small hint of the smell of an old grape candy I used to buy, so for reminiscent's sake, I cannot stop putting the pop filter next to my nose and smelling it after recording. :DIf you needed a pop filter, this will work just as well as the expensive ones, and it may even come with a pleasing aroma like mine did!Buy this product! :]", '03 16, 2013', 1363392000)
 (2, '1384719342

In [7]:
# Load the pandas-written table through dask. Note that `df` now names a dask
# dataframe, not the pandas frames used in earlier cells.
df = dd.read_hdf(path_to_file,'/amazon')

In [14]:
# Among reviews rated below 5, average per product (asin) and keep the 10
# products with the highest mean 'overall'. `where` masks the non-matching
# rows instead of dropping them.
result = df.where(df['overall']<5).groupby('asin').mean().nlargest(10,columns='overall')
# As the recorded traceback shows, this raised MemoryError while the table
# was being read.
# NOTE(review): the table was written with min_itemsize reviewText=12000, so
# every row is very wide -- presumably that drives the allocation; confirm.
result.compute()

MemoryError: 

Traceback
---------
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/dask/async.py", line 267, in execute_task
    result = _execute_task(task, data)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/dask/async.py", line 249, in _execute_task
    return func(*args2)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/dask/dataframe/io.py", line 523, in _pd_read_hdf
    result = pd.read_hdf(path, key, **kwargs)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 360, in read_hdf
    return store.select(key, auto_close=auto_close, **kwargs)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 724, in select
    return it.get_result()
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 1423, in get_result
    results = self.func(self.start, self.stop, where)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 717, in func
    columns=columns, **kwargs)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 4087, in read
    if not self.read_axes(where=where, **kwargs):
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 3288, in read_axes
    values = self.selection.select()
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py", line 4686, in select
    return self.table.table.read(start=self.start, stop=self.stop)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/tables/table.py", line 2008, in read
    arr = self._read(start, stop, step, field, out)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/tables/table.py", line 1898, in _read
    result = self._get_container(nrows)
  File "/home/ec2-user/anaconda2/lib/python2.7/site-packages/tables/table.py", line 1005, in _get_container
    return numpy.empty(shape=shape, dtype=self._v_dtype)
