In [1]:
import ast
import logging
import sys
import copy
import collections.abc
import json

import dataset

# Route every record (DEBUG and above) to stdout so log lines show up in the
# cell output, formatted as "<LEVEL> | <message>".
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(levelname)s | %(message)s',
)
# Logger used by the classes defined further down in this notebook.
log = logging.getLogger(__name__)

In [2]:
import datalink

In [3]:
from pandas.util import hash_pandas_object
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the two random draws below are repeatable.
np.random.seed(42)
# First draw: a 3x4 array sampled from the three candidate values.
arr = np.random.choice(['foo', 'bar', 42], size=(3,4))
df = pd.DataFrame(arr)

df
# hash_pandas_object yields one uint64 hash per row, returned as a Series
# (see the recorded output of type(h) and h).
h = hash_pandas_object(df)
type(h)
h

# Second, independent draw: `arr` and `df` are rebound to new data here.
arr = np.random.choice(['foo', 'bar', 42], size=(3,4))
df = pd.DataFrame(arr)
df
h2 = hash_pandas_object(df)
type(h2)
h2

# Copy of the second hash Series, used for the equality check below.
h3 = h2.copy()

# Frames with different contents give different row hashes
# (False in the recorded output).
h.equals(h2)

# A copy of a hash Series compares equal to its source
# (True in the recorded output).
h2.equals(h3)

Unnamed: 0,0,1,2,3
0,42,foo,42,42
1,foo,foo,42,bar
2,42,42,42,42


pandas.core.series.Series

0     5559921529589760079
1    16825627446701693880
2     7171023939017372657
dtype: uint64

Unnamed: 0,0,1,2,3
0,foo,42,bar,foo
1,bar,bar,bar,bar
2,foo,foo,bar,bar


pandas.core.series.Series

0    13395664528142679341
1     4844743719075605538
2    13663166965737423489
dtype: uint64

False

True

In [4]:
# Smoke test of the logging setup: the recorded output shows an INFO record
# emitted from inside `datalink` appearing in the cell output.
datalink.test_output()

INFO | logging from datalink


In [5]:
from collections import namedtuple

# Two-field record type; the field names are given as a whitespace-separated
# spec. The class is bound to `p`, while its type name is 'Point'.
p = namedtuple('Point', 'x y')
# Two separately constructed but equal instances, built positionally (x, y).
n = p((1, 2, 3), 2)
n2 = p((1, 2, 3), 2)

In [6]:
# `n` and `n2` are distinct objects with equal, hashable contents; the
# recorded output shows that they produce the same hash value.
hash(n)
hash(n2)

5701640225614034726

5701640225614034726

In [7]:
class DataStoreDescriptor(object):
    """Descriptor exposing one key of the owner's ``_data`` mapping as an attribute.

    Reading the attribute returns ``instance._data[key]``. Writing stores the
    value under that key and, when the owner reports that its data changed
    (``instance._has_data_updated``), asks the owner to persist itself and
    refresh its stored hash.

    The owning instance must provide ``_data``, ``_has_data_updated``,
    ``_save_state()`` and ``_set_data_hash()``.
    """

    def __init__(self, key):
        """Remember which key of ``instance._data`` this descriptor fronts."""
        self.key = key

    def __get__(self, instance, owner=None):
        """Return the stored value, or the descriptor itself on class access.

        Raises:
            KeyError: if the key is absent from ``instance._data``.
        """
        # Class-level access (e.g. ``hasattr(cls, name)`` or introspection)
        # passes instance=None; returning the descriptor avoids an
        # AttributeError from dereferencing ``None._data``.
        if instance is None:
            return self
        return instance._data[self.key]

    def __set__(self, instance, value):
        """Store ``value`` and persist the owner if its data actually changed."""
        instance._data[self.key] = value
        # The owner decides whether anything changed; only then save and
        # record the new hash as the baseline for the next comparison.
        if instance._has_data_updated:
            instance._save_state()
            instance._set_data_hash()

class DataStore:
    """Class for a basic mapping datastore.

    The store keeps its values in the ``_data`` dict. Every key of the
    ``_data_fields`` template is exposed as an attribute through a
    ``DataStoreDescriptor`` installed on the class, so assigning to such an
    attribute writes into ``_data`` and persists the store through
    ``self.link`` whenever the hash of the data changed.
    """

    # Forwarded to ``datalink.UniqueLookup`` when ``link == 'unique'``.
    db_path = None
    table_name = None
    # Template of field name -> default value. It is copied per instance and
    # never mutated by instances.
    _data_fields = {}

    def __init__(self, link='unique', **kwargs):
        """Create the store, connect it to its link and load existing data.

        Args:
            link: ``'unique'`` to use ``datalink.UniqueLookup`` (built with
                this class's ``table_name``/``db_path``) or ``'metadata'`` to
                use ``datalink.NamespaceLookup``.
            **kwargs: forwarded unchanged to the link constructor.

        Raises:
            ValueError: if ``link`` is not one of the supported names.
        """
        self._hash_previous = None
        # Deep-copy the template: binding it directly would make every
        # instance (and the class itself) share and mutate a single dict.
        self._data = copy.deepcopy(self._data_fields or {})
        for key in self._data:
            if not hasattr(self.__class__, key):
                setattr(self.__class__, key, DataStoreDescriptor(key))
        # Computing the hash here raises early if the defaults cannot be
        # serialised. The result is not stored, so ``_hash_previous`` stays
        # None and the first assignment is always treated as a change.
        self._get_data_hash()

        # Establish link and attempt a load.
        if link == 'unique':
            self.link = datalink.UniqueLookup(table_name=self.table_name,
                                              db_path=self.db_path,
                                              **kwargs)
        elif link == 'metadata':
            self.link = datalink.NamespaceLookup(**kwargs)
        else:
            # Previously this fell through and failed later with an
            # AttributeError on the missing ``self.link``.
            raise ValueError(
                f"Unknown link type {link!r}; expected 'unique' or 'metadata'."
            )
        if self.link._loaded_data:
            self._format_loaded_data()

    # Properties for interfacing with the link to save, and to handle
    # translation between SQL friendly data and the python objects in
    # the data store.
    def _save_state(self):
        """Hand the SQL-friendly snapshot of the store to the link for saving."""
        log.debug('Call to _save_state.')
        self.link.save(self._sql_friendly_data)

    @property
    def _sql_friendly_data(self):
        """
        Property to return a version of the data store
        with data types supported by SQL.

        Non-string sequences (lists, tuples, ...) are replaced by their
        ``str()`` form, and the link's uuid is added under the ``'uuid'``
        key. The live ``_data`` dict is left untouched.
        """
        payload = copy.deepcopy(self._data)
        for key, val in payload.items():
            if (isinstance(val, collections.abc.Sequence)
                    and not isinstance(val, str)):
                payload[key] = str(val)
        # Add the uuid
        payload['uuid'] = self.link.uuid
        return payload

    def _format_loaded_data(self):
        """Replace ``_data`` with the first row provided by the link.

        The ``'id'`` and ``'uuid'`` columns are dropped. String values are
        passed through ``ast.literal_eval`` so values stored as text by
        ``_sql_friendly_data`` come back as Python objects; strings that are
        not valid literals are kept verbatim. When the link yields no rows
        the current defaults are kept.
        """
        results = list(self.link._loaded_data)
        if len(results) != 1:
            log.warning(f'Ambiguous uuid in loading of data,'
                        f' received {len(results)} results.')
        if not results:
            # Nothing to load; indexing results[0] would raise IndexError.
            return
        # Work on a copy so the row object owned by the link is not mutated.
        record = dict(results[0])
        record.pop('id', None)
        record.pop('uuid', None)
        for key, raw in record.items():
            # literal_eval rejects non-strings with ValueError, so they are
            # skipped up front and kept unchanged.
            if not isinstance(raw, str):
                continue
            try:
                record[key] = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                pass
        self._data = record

    # Properties for accessing and updating the data store.
    @property
    def data(self):
        """The live data dict (not a copy)."""
        return self._data

    @property
    def uuid(self):
        """The uuid reported by the link."""
        return self.link.uuid

    def update(self, config):
        """
        Update multiple properties at once.

        All values are written straight into the data dict, followed by a
        single change check, so at most one save call is made regardless of
        the order of the keys in ``config``. An empty ``config`` is a no-op.
        """
        if not config:
            return
        for key, value in config.items():
            self._data[key] = value
        if self._has_data_updated:
            self._save_state()
            self._set_data_hash()

    # Properties and methods for hashing data and detecting changes
    # in the internal data store state.
    @property
    def _hashable_data(self):
        """Return a copy of the data prepared for ``_get_data_hash``.

        Unhashable iterables are converted to tuples. Mappings are left
        as-is because ``tuple(mapping)`` would keep only the keys and hide
        changes to the values; ``json.dumps`` serialises them directly. Sets
        are ordered by ``repr`` so equal sets always serialise identically.
        """
        prepared = copy.deepcopy(self._data)
        for key, val in prepared.items():
            if isinstance(val, collections.abc.Hashable):
                continue
            if isinstance(val, collections.abc.Mapping):
                continue
            if isinstance(val, collections.abc.Set):
                prepared[key] = tuple(sorted(val, key=repr))
            elif isinstance(val, collections.abc.Iterable):
                prepared[key] = tuple(val)
        return prepared

    def _get_data_hash(self):
        """
        Creates a hash of the internal datastore, casting
        unhashable types to hashables where possible.

        The hash is taken over the key-sorted JSON dump of the data and is
        only compared against other hashes computed by this method.
        """
        return hash(json.dumps(self._hashable_data, sort_keys=True))

    def _set_data_hash(self):
        """Record the current data hash as the baseline for change detection."""
        self._hash_previous = self._get_data_hash()

    @property
    def _has_data_updated(self):
        """True when the current data hash differs from the recorded baseline."""
        return self._get_data_hash() != self._hash_previous

def datalink_factory(
    db_path=None, table_name=None, data_fields=None,
    ):
    """
    Factory function to produce a new class derived from DataStore.

    Args:
        db_path: stored on the new class as ``db_path``.
        table_name: stored on the new class as ``table_name``.
        data_fields: mapping of field name -> default value, stored on the
            new class as ``_data_fields``. ``None`` (the default) means
            "no fields" and yields an empty mapping.

    Returns:
        A fresh ``DataStore`` subclass carrying the three settings.
    """
    class NewClass(DataStore):
        pass
    NewClass.db_path = db_path
    NewClass.table_name = table_name
    # DataStore iterates over ``_data_fields`` on construction, so None must
    # become an empty mapping. The deep copy means later changes to the class
    # template never leak back into the caller's dict, and vice versa.
    NewClass._data_fields = (
        {} if data_fields is None else copy.deepcopy(data_fields)
    )
    return NewClass

In [8]:
# Build a concrete store class with three fields (a, b, c) and their default
# values, backed by the 'data' table of the database at ~/test.db.
MyClass = datalink_factory(db_path='~/test.db', table_name='data',
                           data_fields={'a': None, 'b': [2,4], 'c': 'a string'})


In [9]:
# Create a store with the default 'unique' link; the recorded log output
# shows the database file being created at this point.
d = MyClass()
# A new store starts out with the default field values.
d.data

DEBUG | Creating database: /home/sogilvy/test.db
INFO | - db created at path: /home/sogilvy/test.db


{'a': None, 'b': [2, 4], 'c': 'a string'}

In [10]:
# Assigning to a field goes through its descriptor: the value is written into
# the store and, because the data changed, _save_state is called (see the
# DEBUG line in the recorded output).
d.a = 12
d.data

DEBUG | Call to _save_state.




{'a': 12, 'b': [2, 4], 'c': 'a string'}

In [11]:
# Change several fields at once; the recorded output shows a single
# _save_state call for the whole update.
d.update({'a': 14, 'c': 'a new string'})
d.data

DEBUG | Call to _save_state.


{'a': 14, 'b': [2, 4], 'c': 'a new string'}

In [12]:
# Now let's get the uuid (it is included in every payload handed to the link
# for saving) and use it to load into a new instance from the SQL database.
d.uuid

'2d1b9160-a265-4d3b-a37e-6929090bed55'

In [13]:
# Extra keyword arguments are forwarded to the link; passing the uuid of `d`
# makes the new instance load the previously saved data
# ("Loading" in the recorded output).
d2 = MyClass(uuid=d.uuid)

Loading


In [14]:
# The reloaded store matches the last state of `d`; the list field `b`, which
# is saved in its str() form, comes back as a list.
d2.data

{'a': 14, 'b': [2, 4], 'c': 'a new string'}