### Environment

To run this notebook, open a database connection to warehouse.ofh.priv and set the following environment variables *in* the shell process running IPython:

``` shell
export PACIOLI_DB="postgresql"
export PACIOLI_DB_URI="?:?@localhost:2001/mellon"
export PACIOLI_DB_URL="postgresql://?:?@localhost:2001/mellon"
```

In [1]:
%matplotlib inline

In [2]:
from datetime import datetime
import numpy as np
import pandas as pd
from pandas.tslib import Timestamp
import seaborn as sns
from sqlalchemy import and_
import matplotlib.pyplot as plt

In [3]:
def join_related(df, join_target=None, join_on=None, columns=None):
    """Extend an existing dataframe with columns from a related table.

    Looks up the collection named ``join_target`` on the module-level ``db``
    handle, fetches the rows whose ``uid`` appears in ``df[join_on]`` and
    joins them onto ``df`` using ``join_on`` as the key.

    :type df: pandas.DataFrame
    :param df: existing dataframe to be extended.

    :type join_target: str
    :param join_target: the string name of the collection from which to
        collect additional data.

    :type join_on: str
    :param join_on: key on existing df with which to execute the join.

    :type columns: list of strings
    :param columns: optional list of columns on join_target to include,
        defaulting to all columns when none are specified.

    :rtype: pandas.DataFrame
    :returns: the result of ``df.join`` against the retrieved rows, which
        are indexed by their ``uid`` column.

    :raises ValueError: if ``join_on`` is not a column of ``df``, or if
        ``join_target`` is missing or does not name an attribute of ``db``.

    """
    # verify join_on
    if join_on not in df.columns:
        raise ValueError(
            'join_on ({}) must be name of column in df ({})'.format(
                join_on, df.columns))

    # identify target collection; an omitted join_target (None) is reported
    # like any other unknown name rather than tripping getattr's TypeError
    target = None if join_target is None else getattr(db, join_target, None)
    if target is None:
        raise ValueError(
            '{} is not a valid db collection'.format(join_target))

    # retrieve target data; the key set is computed once, up front
    keys = df[join_on].dropna().unique()

    def filter_clause(query):
        return query.filter(target.model.uid.in_(keys))

    join = target.search(filter_clause=filter_clause, dataframe=True)
    join = join.set_index('uid')
    if columns is not None:
        join = join[columns]

    # return merged dataframe
    return df.join(join, on=join_on)

In [4]:
def join_stepwise(df, join_steps):
    """Apply a sequence of ``join_related`` calls to a dataframe in order.

    :type df: pandas.DataFrame
    :param df: the dataframe to start from.

    :type join_steps: iterable of sequences
    :param join_steps: each entry is unpacked as the positional arguments
        that follow ``df`` in a ``join_related`` call.

    :returns: the output of the last step, or ``df`` itself when
        ``join_steps`` is empty.

    """
    result = df
    for step_args in join_steps:
        # each step consumes the previous step's output
        result = join_related(result, *step_args)
    return result

In [5]:
def make_cartesian_index(df):
    """Reindex a dataframe onto the full product of its index levels.

    :type df: pandas.DataFrame
    :param df: dataframe whose index exposes ``levels`` and ``names``
        (i.e. a MultiIndex).

    :rtype: pandas.DataFrame
    :returns: ``df`` reindexed so that every combination of level values
        appears as a row.

    """
    index = df.index
    full_index = pd.MultiIndex.from_product(index.levels, names=index.names)
    return df.reindex(full_index)

In [6]:
def datetime_comparator(dt):
    """Format ``dt`` as a ``YYYY-MM-DD HH:MM:SS`` string via ``strftime``."""
    layout = '%Y-%m-%d %H:%M:%S'
    return dt.strftime(layout)

In [7]:
def append_derived_column(df, column_name, func=lambda x: x):
    """Convenience function for appending derived column to a dataframe.

    :type df: pandas.DataFrame
    :param df: the dataframe to extend; it is modified in place.

    :type column_name: str
    :param column_name: name of the column to create (or overwrite).

    :type func: callable
    :param func: called once per row with that row; its result becomes the
        row's value in the new column.

    :rtype: pandas.DataFrame
    :returns: the same ``df`` object, with ``column_name`` set.

    :raises TypeError: if ``df`` is not a pandas DataFrame.

    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            '`append_derived_column` expected pd.DataFrame, got {}'.format(
                type(df)))
    if len(df.index) == 0:
        # No rows to derive from. Row-wise apply on an empty frame does not
        # reliably produce a Series, so add the empty column directly.
        df[column_name] = pd.Series(index=df.index, dtype=object)
        return df
    # Wrap func so apply always receives a plain per-row callable.
    df[column_name] = df.apply(lambda row: func(row), axis=1)
    return df


def append_full_name(df, first='legal_first_name', last='legal_last_name',
                     alias='full_name'):
    """Combine first and last name into a separate field.

    The new field holds the two source values joined by a single space.

    :type df: pandas.DataFrame
    :param df: the dataframe on which to operate

    :type first: str
    :param first: the name of the field to use as first name

    :type last: str
    :param last: the name of the field to use as last name

    :type alias: str
    :param alias: the string with which to name the new, combined field

    :returns: whatever ``append_derived_column`` returns for ``df`` with the
        ``alias`` field added.

    """
    def combine(row):
        return '{} {}'.format(row[first], row[last])

    # Hand the result back to the caller; previously it was bound to a local
    # and dropped, so the function returned None.
    return append_derived_column(df, alias, func=combine)

In [8]:
from pacioli import connect

In [9]:
# Module-level database handle; join_related looks collections up on it by
# attribute name. NOTE(review): presumably configured through the PACIOLI_DB*
# environment variables described in the Environment section -- confirm.
db = connect()