In [1]:
from collections import OrderedDict
import datetime

import numpy as np
import pandas as pd

from IPython.display import HTML

import statsmodels.api as sm

from break4w.categorical import Categorical
from break4w.continous import Continous
from break4w.question import Question
from break4w.bool import Bool

I'm going to try to make a data dictionary object using columns from an example data dictionary and study I worked with a while ago. I'm going to start this by assuming we can convert a text documnt to a series of dictionaries to build off of. I'm going to use the data description from the [Statsmodels National Election DataSet](http://www.statsmodels.org/0.6.1/datasets/generated/anes96.html).

In [2]:
data_ = pd.DataFrame(sm.datasets.anes96.load().data)

In [3]:
columns = [
    {
        'name': 'popul',
        'description': 'Census place population in 1000s',
        'dtype': float,
        'units': 'people',
    },
    {
        'name': 'TVnews',
        'description': 'Number of times per week that respondent watches TV news.',
        'dtype': int,
        'units': 'views per week',
        'limits': [0, None]
    },
    {
        'name': 'PID',
        'description': 'Party identification of respondent',
        'dtype': int,
        'order': [0, 1, 2, 3, 4, 5, 6],
        'numeric_mapping': {0: 'Strong Democrat',
                            1: 'Weak Democrat', 
                            2: 'Independent-Democrat', 
                            3: 'Independent-Indpendent', 
                            4: 'Independent-Republican', 
                            5: 'Weak Republican', 
                            6: 'Strong Republican'}
    },
    {
        'name': 'vote',
        'description': 'Individual expected to vote for Bob Dole',
        'dtype': bool,
    },
    ]
types = ['continous', 'question', 'categorical']

In [4]:
type_lookup = {'continous': Continous,
               'categorical': Categorical,
               'multiple choice': Categorical,
               'ordinal': Categorical,
               'bool': Bool,
               'boolean': Bool,
               'yes/no': Bool,
               }

In [5]:
proto_dict = OrderedDict()
for col_, type_ in zip(*(columns, types)):
    question_type = type_lookup.get(type_.lower(), Question)
    proto_dict[col_['name']] = question_type(**col_)
#         proto_dict[col_['name']] = Continous(**col_)
#     elif type_ == 'categorical':
#         proto_dict[col_['name']] = Categorical(**col_)
#     else:
#         proto_dict[col_['name']] = Question(**col_)

In [6]:
proto_dict

OrderedDict([('popul', <break4w.continous.Continous at 0x11a5e9ef0>),
             ('TVnews', <break4w.question.Question at 0x11a5e9f60>),
             ('PID', <break4w.categorical.Categorical at 0x11a5e9fd0>)])

In [7]:
type_lookup = {'continous': Continous,
               'categorical': Categorical,
               'multiple choice': Categorical,
               'ordinal': Categorical,
               'bool': Bool,
               'boolean': Bool,
               'yes/no': Bool,
               }

In [8]:
proto_dict

OrderedDict([('popul', <break4w.continous.Continous at 0x11a5e9ef0>),
             ('TVnews', <break4w.question.Question at 0x11a5e9f60>),
             ('PID', <break4w.categorical.Categorical at 0x11a5e9fd0>)])

In [9]:
class DataDictionary:
    def __init__(self, columns, types):
        """Initializes the dictionary object
        """
        self.log = []
        self.columns = OrderedDict()
        for col_, type_ in zip(*(columns, types)):
            self.add_question(col_, type_, record=False,)
        self.columns = proto_dict

        self._update_log('initialize the dictionary')
    def _update_log(self, command, column=None,
                    transform_type=None, transformation=None):
        """Used for internal tracking of the columns and data
        """
        self.log.append({
            'timestamp': datetime.datetime.now(),
            'column': column,
            'command': command,
            'transform_type': transform_type,
            'transformation': transformation,
            })

    def add_question(self, question_data, question_type=None, check=True,
                     record=True):
        """
        Adds a new question object to the data dictionary
        """
        error = False

        # Converts the dict data to a Question object
        if isinstance(question_data, dict):
            question_object = type_lookup.get(question_type.lower(), Question)
            question_data = question_object(**question_data)
        name = question_data.name
        
        # Checks if the question is in the dictionary
        if (name in self.columns.keys()) and check:
            error = True
            message = '%s already has a dictionary entry.' % name
            transform_type = 'error'
        else:
            message = '%s was added to the dictionary' % name
            transform_type = None

        # Updates thd log
        if record:
            self._update_log('add column', column=name, transformation=message,
                             transform_type=transform_type)
    
        # Raises an error or updates the dictionary, as appropriate
        if error:
            raise ValueError(message)
        else:
            self.columns[name] = question_data

In [10]:
test = DataDictionary([], [])

In [11]:
test.add_question(columns[0], types[0])
test.add_question(Continous(**columns[1]))
test.add_question(columns[1], types[1])

ValueError: popul already has a dictionary entry.

In [22]:
list(test.columns.keys())

['popul', 'TVnews', 'PID']

In [25]:
columns = test.columns

In [26]:
columns

OrderedDict([('popul', <break4w.continous.Continous at 0x113e6d630>),
             ('TVnews', <break4w.question.Question at 0x113e6d6a0>),
             ('PID', <break4w.categorical.Categorical at 0x113e6d710>)])

In [30]:
del columns['popul']

KeyError: 'popul'

In [28]:
columns

OrderedDict([('TVnews', <break4w.question.Question at 0x113e6d6a0>),
             ('PID', <break4w.categorical.Categorical at 0x113e6d710>)])

In [29]:
test.columns

OrderedDict([('TVnews', <break4w.question.Question at 0x113e6d6a0>),
             ('PID', <break4w.categorical.Categorical at 0x113e6d710>)])

In [65]:
class DictTest(OrderedDict):
    def __init__(self, columns, types):
        for t, q in zip(*(types, columns)):
            question_object = type_lookup.get(t.lower(), Question)
            question = question_object(**q)
            self[question.name] = question
        
        self.log = []
        self._update_log('initialize the dictionary')
    
    def _update_log(self, command, column=None,
        transform_type=None, transformation=None):
        """Used for internal tracking of the columns and data

        Every time a Question acts on data, a record should be made of
        the transformation. (See break4w.question.Question._update_log).
        However, this also tracks the transformation on the dictionary
        level.

        Parameters
        ----------
        command : str
            A short textual description of the command performed. This
            may be the function name in text format.
        column : str, optional
            The column in the metadata being explored.
        transform_type: str, optional
            A more general description of the type of action that was
            performed. Ideally, this comes for a preset list of possible
            actions, and the descriptions are consistent.
        transformation: str, optional
            Explains exactly how values were changed.

        """
        self.log.append({
            'timestamp': datetime.datetime.now(),
            'column': column,
            'command': command,
            'transform_type': transform_type,
            'transformation': transformation,
            })
    
    def add_question(self, question_data, question_type=None, check=True,
                     record=True):
        """..."""
        error = False

        # Converts the dict data to a Question object
        if isinstance(question_data, dict):
            question_object = type_lookup.get(question_type.lower(), Question)
            question_data = question_object(**question_data)
        name = question_data.name
        
        # Checks if the question is in the dictionary
        if (name in self.keys()) and check:
            error = True
            message = '%s already has a dictionary entry.' % name
            transform_type = 'error'
        else:
            message = '%s was added to the dictionary' % name
            transform_type = None

        # Updates thd log
        if record:
            self._update_log('add column', column=name, transformation=message,
                             transform_type=transform_type)
    
        # Raises an error or updates the dictionary, as appropriate
        if error:
            raise ValueError(message)
        else:
            self[name] = question_data

In [66]:
test = DictTest(columns[:2], types[:2])

In [67]:
test.log

[{'column': None,
  'command': 'initialize the dictionary',
  'timestamp': datetime.datetime(2017, 6, 8, 16, 12, 30, 447266),
  'transform_type': None,
  'transformation': None}]

In [68]:
test.add_question(columns[2], types[2])

In [72]:
current = vars(test['popul'])

In [83]:
new = {'blanks': 'not applicable',
       'frog': 'Chowder'}

In [86]:
change_keys = {}
for k, v in new.items():
    if k in current:
        change_keys[k] = (current[k], v)
    else:
        change_keys[k] = ('add', v)
    setattr(test['popul'], k, v)

In [88]:
test['popul'].frog

'Chowder'

In [89]:
check = test['popul']

In [90]:
check.cat = 'None'

In [92]:
check.cat

'None'

In [93]:
test['popul'].cat

'None'

In [95]:
vars(check)

{'blanks': 'not applicable',
 'bound_lower': None,
 'bound_upper': None,
 'cat': 'None',
 'clean_name': 'Popul',
 'colormap': None,
 'derivative_columns': [],
 'description': 'Census place population in 1000s',
 'dtype': float,
 'free_response': False,
 'frog': 'Chowder',
 'log': [],
 'mimarks': False,
 'missing': {'missing: not collected',
  'missing: not provided',
  'missing: restricted',
  'not applicable',
  'not collected',
  'not provided',
  'restricted'},
 'name': 'popul',
 'notes': None,
 'ontology': None,
 'original_name': None,
 'other_properties': {},
 'outlier_lower': None,
 'outlier_upper': None,
 'sig_figs': None,
 'source_columns': [],
 'type': 'Continous',
 'units': 'people'}