In [2]:
import happybase

In [84]:
# Build the connection object for the Thrift server on localhost.
# autoconnect=False means the socket is not opened here; that is done
# by the explicit open() call below.
connection = happybase.Connection('localhost', autoconnect=False)

# before first use:
connection.open()

In [79]:
# Show the names of the tables reachable through this connection.
print(connection.tables())

['mytable', 'test']


# Working with tables

In [106]:
# One-liner: build a connection and fetch a handle to the 'test' table
# in a single expression.
# NOTE(review): the Connection created here is not kept in a variable, so
# it cannot be closed explicitly later - consider reusing `connection`.
table = happybase.Connection('localhost').table('test')

In [85]:
# Get a handle to the 'test' table from the already opened `connection`.
table = connection.table('test')

In [70]:
# Store a single cell: row key 'row4', column 'cf:d', value 'value4'.
table.put('row4', {'cf:d': 'value4'})

In [72]:
# Fetch one row; the result is indexed by column name, so the cell
# written above can be read back with its 'cf:d' key.
row = table.row('row4')
print(row['cf:d'])

value4


In [73]:
# rows() takes a list of row keys and yields one (row key, columns) pair
# per row that was found.
for row_key, columns in table.rows(['row1', 'row2']):
    print(row_key, columns)

('row1', {'cf:a': 'value1'})
('row2', {'cf:b': 'value2'})


In [74]:
# Scan only the rows whose key begins with 'row'.
for row_key, columns in table.scan(row_prefix=b'row'):
    print(row_key, columns)

('row1', {'cf:a': 'value1'})
('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})


In [82]:
# rows() returns (row key, columns) pairs, so the result can be fed
# straight into dict() to get a mapping keyed by row key.
rows_as_dict = dict(table.rows(['row1', 'row2']))
# Call form of print, matching the other cells; prints the same text on
# Python 2 and is also valid on Python 3.
print(rows_as_dict)

{'row1': {'cf:a': 'value1'}, 'row2': {'cf:b': 'value2'}}


In [88]:
from collections import OrderedDict

# Same idea as the plain dict above, but OrderedDict keeps the pairs in
# the order in which rows() returned them.
rows_as_ordered_dict = OrderedDict(table.rows([b'row1', b'row2']))
# Call form of print, matching the other cells; prints the same text on
# Python 2 and is also valid on Python 3.
print(rows_as_ordered_dict)

OrderedDict([('row1', {'cf:a': 'value1'}), ('row2', {'cf:b': 'value2'})])


# Making more fine-grained selections

In [89]:
# Fresh handle to the 'test' table for the fine-grained selection examples.
table = happybase.Connection('localhost').table('test')

In [91]:
# Add a second column, 'cf:a0', to the existing row 'row1'.
table.put('row1', {'cf:a0': 'value0'})

In [92]:
# Limit the lookup to two named columns of 'row1' and print each value.
# NOTE(review): the lookup keys use a b'' prefix while the column names
# passed in do not; confirm this mix is intended if run on Python 3.
row = table.row('row1', columns=['cf:a','cf:a0'])
print(row[b'cf:a'])
print(row[b'cf:a0'])

value1
value0


In [95]:
# Restrict the result to the single column 'cf:a'; the returned dict
# then holds only that column.
row = table.row('row1', columns=['cf:a'])
# Call form of print, matching the other cells; prints the same text on
# Python 2 and is also valid on Python 3.
print(row)

{'cf:a': 'value1'}


In [102]:
# With include_timestamp=True each entry is a (value, timestamp) pair
# instead of a bare value, so it can be unpacked directly.
row = table.row('row1', columns=['cf:a'], include_timestamp=True)
value, timestamp = row[b'cf:a']
# Call form of print, matching the other cells; prints the same text on
# Python 2 and is also valid on Python 3.
print(value)
print(timestamp)

value1
1485924383141


In HBase, each cell has a timestamp attached to it. In case you don’t want to work with the latest version of data stored in HBase, the methods that retrieve data from the database, e.g. Table.row(), all accept a timestamp argument that specifies that the results should be restricted to values with a timestamp up to the specified timestamp:

In [104]:
# Only cells with a timestamp up to the given one are returned. The value
# used here is larger than the cell timestamp printed in the previous
# example (1485924383141), so every column of 'row1' comes back.
row = table.row('row1', timestamp=2485924383141)
# Call form of print, matching the other cells; prints the same text on
# Python 2 and is also valid on Python 3.
print(row)

{'cf:a': 'value1', 'cf:a0': 'value0'}


HBase supports storing multiple versions of the same cell. This can be configured for each column family. To retrieve all versions of a column for a given row, Table.cells() can be used. This method returns an ordered list of cells, with the most recent version coming first. The versions argument specifies the maximum number of versions to return. Just like the methods that retrieve rows, the include_timestamp argument determines whether timestamps are included in the result. Example:

In [107]:
# Request at most two stored versions of the cell row1 / cf:a and print
# each one.
cell_versions = table.cells(b'row1', b'cf:a', versions=2)
for cell_value in cell_versions:
    print("Cell data: {}".format(cell_value))

Cell data: value1


In [108]:
# Same lookup, up to three versions, this time asking for the timestamp
# of every version as well: each item is a (value, timestamp) pair.
cells = table.cells(b'row1', b'cf:a', versions=3, include_timestamp=True)
for cell_value, cell_ts in cells:
    print("Cell data at {}: {}".format(cell_ts, cell_value))

Cell data at 1485924383141: value1


# Scanning over rows in a table

In [109]:
# Fresh handle to the 'test' table for the scanning examples.
table = happybase.Connection('localhost').table('test')

In [110]:
# No arguments: walk over every row in the table.
for row_key, columns in table.scan():
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})


In [115]:
# Begin the scan at 'row2' (the start row itself is part of the result).
for row_key, columns in table.scan(row_start='row2'):
    print(row_key, columns)

('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})


To iterate over all rows from the start of the table up to, but not including, a given row key (row3 in this example), use row_stop:

In [114]:
# Scan from the first row and stop before 'row3' (the stop row itself is
# not part of the result).
for row_key, columns in table.scan(row_stop='row3'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b': 'value2'})


In [116]:
# Both bounds given: rows from 'row1' up to, but not including, 'row3'.
for row_key, columns in table.scan(row_start='row1', row_stop='row3'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b': 'value2'})


An alternative is to use a key prefix. For example, to iterate over all rows whose key starts with ro:

In [119]:
# Select rows by key prefix instead of by explicit start/stop keys.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})


# Storing data

In [122]:
# Fresh handle to the 'test' table for the storing examples.
table = happybase.Connection('localhost').table('test')

In [123]:
# One put() call can write several columns of the same row at once:
# the dict maps each column name to the value to store.
table.put('row5', {'cf:e0': b'value5-0',
                   'cf:e1': b'value5-1'})
table.put('row6', {'cf:f0': b'value6-0',
                   'cf:f1': b'value6-1'})

In [140]:
# Single-column put with the column name and value written as bytes
# literals.
table.put('row7', {b'cf:g': b'value8'})

In [141]:
# Show the table contents after the puts above.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})
('row5', {'cf:e1': 'value5-1', 'cf:e0': 'value5-0'})
('row6', {'cf:f0': 'value6-0', 'cf:f1': 'value6-1'})
('row7', {'cf:g': 'value8'})
('row8', {'cf:h0': 'value8-0', 'cf:h1': 'value8-1'})
('row9', {'cf:i1': 'value9-1', 'cf:i0': 'value9-0'})


# Deleting data

In [127]:
# Fresh handle to the 'test' table for the deletion example.
table = happybase.Connection('localhost').table('test')

In [128]:
# Remove the whole row 'row7', then list what is left to confirm it.
table.delete('row7')
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})
('row5', {'cf:e1': 'value5-1', 'cf:e0': 'value5-0'})
('row6', {'cf:f0': 'value6-0', 'cf:f1': 'value6-1'})


# Performing batch mutations

The Table.put() and Table.delete() methods both issue a command to the HBase Thrift server immediately. This means that using these methods is not very efficient when storing or deleting multiple values. It is much more efficient to aggregate a bunch of commands and send them to the server in one go. This is exactly what the Batch class, created using Table.batch(), does. A Batch instance has put and delete methods, just like the Table class, but the changes are sent to the server in a single round-trip using Batch.send():

In [231]:
# Fresh handle to the 'test' table for the batch examples.
table = happybase.Connection('localhost').table('test')

In [173]:
# Collect several mutations in a Batch; they are only transmitted when
# send() is called.
b = table.batch()
try:
    b.put('row8', {'cf:h0': 'value8-0', 'cf:h1': 'value8-1'})
    b.put('row9', {'cf:i0': 'value9-0', 'cf:i1': 'value9-1'})
    b.put('row10', {'cf:j0': 'value10-0'})
    b.put('rowx', {'cf:j0': 'value10-0'})

    b.delete(b'rowx')
    # Simulated failure: because of it the else branch (and so send())
    # is skipped and none of the mutations above reach HBase.
    raise ValueError("Something went wrong!")
except ValueError:
    # error handling goes here; nothing will be sent to HBase
    # (the exception object is not needed, so it is not bound to a name)
    pass
else:
    # no exceptions; send data
    b.send()

In [174]:
# List the table contents after the batch cell above.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row10', {'cf:j0': 'value10-0', 'cf:i0': 'value9-0'})
('row2', {'cf:b': 'value2'})
('row3', {'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})
('row5', {'cf:e1': 'value5-1', 'cf:e0': 'value5-0'})
('row6', {'cf:f0': 'value6-0', 'cf:f1': 'value6-1'})
('row7', {'cf:g': 'value8'})
('row8', {'cf:h0': 'value8-0', 'cf:h1': 'value8-1'})
('row9', {'cf:i1': 'value9-1', 'cf:i0': 'value9-0'})


Batch instances can be used as context managers, which are most useful in combination with Python’s with construct. The example above can be simplified to read:

In [189]:
# Batch used as a context manager: the queued puts are sent when the
# with block ends, so no explicit send() call is needed.
with table.batch(transaction=True) as batch:
    batch.put('row2', {'cf:b0': 'value2-0'})
    batch.put('row3', {'cf:b0': 'value3-0'})

In [190]:
# 'row2' and 'row3' should now carry the extra 'cf:b0' column.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row10', {'cf:j0': 'value10-0', 'cf:i0': 'value9-0'})
('row2', {'cf:b0': 'value2-0', 'cf:b': 'value2'})
('row3', {'cf:b0': 'value3-0', 'cf:c': 'value3'})
('row4', {'cf:d': 'value4'})
('row5', {'cf:e1': 'value5-1', 'cf:e0': 'value5-0'})
('row6', {'cf:f0': 'value6-0', 'cf:f1': 'value6-1'})
('row7', {'cf:g': 'value8'})
('row8', {'cf:h0': 'value8-0', 'cf:h1': 'value8-1'})
('row9', {'cf:i1': 'value9-1', 'cf:i0': 'value9-0'})


In [225]:
# Transactional batch wrapped in try/except so a failure inside the with
# block can be handled by the caller.
try:
    with table.batch(transaction=True) as b:
        b.put('row2', {'cf:b3': 'value2-4'})
        b.put('row3', {'cf:c3': 'value3-4'})
        # Uncomment the next line to simulate a failure inside the block:
        #raise ValueError("Something went wrong!")
except ValueError:
    # error handling goes here; nothing is sent to HBase
    pass

# when no error occurred, the transaction succeeded

In [226]:
# Verify that the transactional batch added 'cf:b3' and 'cf:c3'.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row10', {'cf:j0': 'value10-0', 'cf:i0': 'value9-0'})
('row2', {'cf:b0': 'value2-0', 'cf:b1': 'value2-1', 'cf:b3': 'value2-4', 'cf:b': 'value2'})
('row3', {'cf:b0': 'value3-0', 'cf:c': 'value3', 'cf:c3': 'value3-4', 'cf:c2': 'value3-3'})
('row4', {'cf:d': 'value4'})
('row5', {'cf:e1': 'value5-1', 'cf:e0': 'value5-0'})
('row6', {'cf:f0': 'value6-0', 'cf:f1': 'value6-1'})
('row7', {'cf:g': 'value8'})
('row8', {'cf:h0': 'value8-0', 'cf:h1': 'value8-1'})
('row9', {'cf:i1': 'value9-1', 'cf:i0': 'value9-0'})


As you may have imagined already, a Batch keeps all mutations in memory until the batch is sent, either by calling Batch.send() explicitly, or when the with block ends. This doesn’t work for applications that need to store huge amounts of data, since it may result in batches that are too big to send in one round-trip, or in batches that use too much memory. For these cases, the batch_size argument can be specified. The batch_size acts as a threshold: a Batch instance automatically sends all pending mutations when there are more than batch_size pending operations. For example, storing 1200 rows of two cells each with batch_size=1000 would result in three round-trips to the server (two batches with 1000 cells, and one with the remaining 400). The cell below uses a deliberately tiny batch_size=2 with 20 two-cell puts, so the pending mutations are flushed to the server many times while the loop runs:

In [241]:
# batch_size sets the threshold at which pending mutations are sent
# automatically; whatever is still pending is sent when the block ends.
with table.batch(batch_size=2) as b:
    for i in range(20):
        # this put() will result in two mutations (two cells)
        # NOTE(review): '%2d' pads with a space, not a zero, so the keys are
        # 'row 0'..'row 9' and 'row10'..'row19'; 'row10' already exists in
        # the table and simply gains the two columns. Confirm that '%02d'
        # was not intended.
        b.put(b'row%2d'%(i), {'cf:x': 'valuex', 'cf:y': 'valuey'})

In [242]:
# Show the rows written by the batch_size example.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row 0', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 1', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 2', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 3', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 4', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 5', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 6', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 7', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 8', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 9', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row%04d', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row10', {'cf:j0': 'value10-0', 'cf:i0': 'value9-0', 'cf:x': 'valuex', 'cf:y': 'valuey'})
('row11', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row12', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row13', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row14', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row15', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row16', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row17', {'cf:y': 'valuey', 'cf:x': 'value

In [243]:
# Clean up with a batch of deletes, using the same key pattern as the
# put loop that created the rows.
with table.batch(batch_size=2) as b:
    for i in range(20):
        # this delete() removes the whole row with that key
        # NOTE(review): the key pattern also matches 'row10', which held
        # data before the put loop ran - it is deleted here as well.
        b.delete(b'row%2d'%(i))

In [244]:
# Confirm which rows remain after the batched deletes.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row%04d', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row1', {'cf:a': 'value1', 'cf:a0': 'value0'})
('row2', {'cf:b0': 'value2-0', 'cf:b1': 'value2-1', 'cf:b3': 'value2-4', 'cf:b': 'value2'})
('row3', {'cf:b0': 'value3-0', 'cf:c': 'value3', 'cf:c3': 'value3-4', 'cf:c2': 'value3-3'})
('row4', {'cf:d': 'value4'})
('row5', {'cf:e1': 'value5-1', 'cf:e0': 'value5-0'})
('row6', {'cf:f0': 'value6-0', 'cf:f1': 'value6-1'})
('row7', {'cf:g': 'value8'})
('row8', {'cf:h0': 'value8-0', 'cf:h1': 'value8-1'})
('row9', {'cf:i1': 'value9-1', 'cf:i0': 'value9-0'})


# Creating a new table

In [263]:
# Open an explicit connection (needed because autoconnect is off), then
# create the table 'mytable' with a single column family 'cf' and an
# empty options dict for that family.
connection = happybase.Connection('localhost', autoconnect=False)
connection.open()
connection.create_table('mytable', {'cf': {}})

In [264]:
# Reuse the connection opened in the previous cell rather than creating a
# second, anonymous connection that could never be closed explicitly.
table = connection.table('mytable')

In [268]:
# Fill the new table with ten rows through a size-limited batch.
with table.batch(batch_size=2) as b:
    for i in range(10):
        # this put() will result in two mutations (two cells)
        # '%2d' pads the number with a space, giving keys 'row 0'..'row 9'.
        b.put(b'row%2d'%(i), {'cf:x': 'valuex', 'cf:y': 'valuey'})

In [269]:
# List the rows that were just written to 'mytable'.
for row_key, columns in table.scan(row_prefix='ro'):
    print(row_key, columns)

('row 0', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 1', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 2', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 3', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 4', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 5', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 6', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 7', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 8', {'cf:y': 'valuey', 'cf:x': 'valuex'})
('row 9', {'cf:y': 'valuey', 'cf:x': 'valuex'})
