In [1]:
import json
from pathlib import Path
import os

import pandas as pd


#//****************************************************************
#//****************************************************************
#//*** Skipping S3 Cluster this week due to reliability issues
#//****************************************************************
#import s3fs
#def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
#    s3 = s3fs.S3FileSystem(
#        anon=True,
#        client_kwargs={
#            'endpoint_url': endpoint_url
#        }
#    )
#    return pd.read_csv(s3.open(file_path, mode='rb'))
#//****************************************************************
#//****************************************************************

#//*** Absolute path of the directory the notebook is running from
current_dir = Path.cwd()

#//*** <cwd>/results holds all generated output
results_dir = current_dir / 'results'

#//*** <cwd>/results/kvdb holds one JSON file per key-value store
kv_data_dir = results_dir / 'kvdb'

#//*** Create results/kvdb (and results/) when missing; no error if already present
kv_data_dir.mkdir(parents=True, exist_ok=True)

#//*** JSON file backing each store
people_json = kv_data_dir / 'people.json'
visited_json = kv_data_dir / 'visited.json'
sites_json = kv_data_dir / 'sites.json'
measurements_json = kv_data_dir / 'measurements.json'

In [2]:
class KVDB(object):
    """Small key-value store persisted as a single JSON file.

    JSON object keys are always strings, so every key segment is normalized
    with ``str()`` on both read and write. This keeps lookups consistent
    before and after a save/reload round-trip.

    A key is either a scalar (e.g. ``'619'`` or ``619``) or a tuple describing
    a path through nested dictionaries (e.g. ``('619', 'dyer', 'rad')``).
    """

    def __init__(self, db_path):
        """Open the store at ``db_path``, loading its contents if the file exists."""
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    def _load_db(self):
        """Replace the in-memory dict with the file's JSON content, when the file exists."""
        if self._db_path.exists():
            with open(self._db_path, encoding='utf-8') as f:
                self._db = json.load(f)

    @staticmethod
    def _as_path(key):
        """Return ``key`` as a tuple of string segments (a scalar becomes a 1-tuple)."""
        segments = key if isinstance(key, tuple) else (key,)
        return tuple(str(segment) for segment in segments)

    def get_value(self, key):
        """Return the value stored under ``key``, or None when it is absent.

        Args:
            key: A scalar key, or a tuple of keys followed one level at a time
                through nested dictionaries.

        Returns:
            The stored value. None is returned when any segment of the path is
            missing, when an intermediate value is not a dictionary, or when
            ``key`` is an empty tuple.
        """
        path = self._as_path(key)
        if not path:
            return None

        value = self._db
        for segment in path:
            #//*** Only dictionaries can be descended into; anything else ends the walk
            if not isinstance(value, dict) or segment not in value:
                return None
            value = value[segment]
        return value

    def set_value(self, key, value):
        """Store ``value`` under ``key``.

        Args:
            key: A scalar key, or a tuple path. For a tuple, missing
                intermediate dictionaries are created as needed.
            value: Any JSON-serializable value.

        Raises:
            KeyError: If ``key`` is an empty tuple.
            TypeError: If an intermediate segment of a tuple path already
                holds a non-dictionary value.
        """
        path = self._as_path(key)
        if not path:
            raise KeyError('key must contain at least one element')

        node = self._db
        for segment in path[:-1]:
            child = node.setdefault(segment, {})
            if not isinstance(child, dict):
                raise TypeError(
                    'cannot descend into non-dict value at key segment %r' % segment
                )
            node = child
        node[path[-1]] = value

    def save(self):
        """Write the in-memory dict to the backing file as indented JSON."""
        with open(self._db_path, 'w', encoding='utf-8') as f:
            json.dump(self._db, f, indent=2)

#//*** Open the measurements store and look up a single record with a
#//*** (visit_id, person_id, quantity) tuple key.
#//*** NOTE(review): this reads measurements.json from disk, but that file is written by
#//*** create_measurements_kvdb(), which is defined and called in later cells. On a fresh
#//*** run the file may not exist yet - confirm the intended cell order.
measurements = KVDB(measurements_json)
measurements.get_value(('619','dyer','rad'))
#measurements.get_value(("619",'dyer','sal'))
            

{'visit_id': 619, 'person_id': 'dyer', 'quantity': 'rad', 'reading': 9.82}

In [3]:
def create_sites_kvdb():
    """Build the sites store from ./source/site.csv and save it.

    Each distinct ``site_id`` becomes a key whose value is the first CSV row
    for that site, expressed as a column -> value dictionary.
    """
    index_column = 'site_id'
    store = KVDB(sites_json)

    #//*** The S3 cluster is skipped for now; the locally stored copy is read instead
    site_rows = pd.read_csv("./source/site.csv")

    #//*** One group per site_id; only the first row of each group is kept
    for site_key, rows in site_rows.groupby(index_column):
        first_record = rows.to_dict(orient='records')[0]
        store.set_value(site_key, first_record)

    store.save()


def create_people_kvdb():
    """Build the people store from ./source/person.csv and save it.

    Each distinct ``person_id`` becomes a key whose value is the first CSV row
    for that person, so later duplicate rows for the same id are dropped.
    """
    key_column = 'person_id'

    #//*** Store backed by the people_json path
    people_store = KVDB(people_json)

    #//*** Read the locally stored CSV
    people = pd.read_csv("./source/person.csv")

    #//*** Grouping yields one (person_id, rows) pair per unique id
    for person_key, person_rows in people.groupby(key_column):
        records = person_rows.to_dict(orient='records')
        people_store.set_value(person_key, records[0])

    people_store.save()


def create_visits_kvdb():
    """Build the visits store from ./source/visited.csv and save it.

    The store is a two-level mapping: ``visit_id`` -> ``site_id`` -> list of
    row dictionaries for that (visit, site) pair. Both key levels are
    converted to strings because JSON only supports string keys.
    """
    db = KVDB(visited_json)
    fields = ['visit_id','site_id']

    #//*** Load the locally stored file
    df = pd.read_csv("./source/visited.csv")

    #//**************************************************************************************************
    #//*** Each entry in fields is one level of JSON key. A single programmatic (probably recursive)
    #//*** routine could build any number of levels; here the two levels are hard-coded.
    #//*** NOTE(review): unique() also yields NaN when a key column has missing values, and the
    #//*** equality filter then matches no rows - confirm whether such rows should be skipped.
    #//**************************************************************************************************
    for key1 in df[fields[0]].unique():
        df1 = df[ df[fields[0]] == key1 ]
        out = {}
        for key2 in df1[fields[1]].unique():
            df2 = df1[ df1[fields[1]] == key2 ]
            out[str(key2)] = df2.to_dict(orient='records')

        db.set_value(str(key1),out)

    #//*** Persist to visited.json, matching the other create_*_kvdb functions
    db.save()


def create_measurements_kvdb():
    """Build the measurements store from ./source/measurements.csv and save it.

    The store is a three-level mapping: ``visit_id`` -> ``person_id`` ->
    ``quantity`` -> first row dictionary for that combination. Every key level
    is converted to a string because JSON only supports string keys.
    """
    db = KVDB(measurements_json)

    #//*** Fields to index, outermost key first
    fields = ['visit_id', 'person_id','quantity']

    #//*** Load the locally stored file
    df = pd.read_csv("./source/measurements.csv")

    #//******************************************************************************************************
    #//*** Each entry in fields is one level of JSON key. A single programmatic (probably recursive)
    #//*** routine could build any number of levels; here the three levels are hard-coded.
    #//*** NOTE(review): unique() also yields NaN when a key column has missing values, and the
    #//*** equality filter then matches no rows - confirm whether such rows should be skipped.
    #//******************************************************************************************************
    for key1 in df[fields[0]].unique():
        df1 = df[ df[fields[0]] == key1 ]
        out = {}
        for key2 in df1[fields[1]].unique():
            df2 = df1[ df1[fields[1]] == key2 ]
            by_quantity = {}

            for key3 in df2[fields[2]].unique():
                df3 = df2[ df2[fields[2]] == key3 ]
                #//*** str() keeps this level consistent with the two outer levels
                by_quantity[str(key3)] = df3.to_dict(orient='records')[0]

            out[str(key2)] = by_quantity

        db.set_value(str(key1),out)

    db.save()
        
#//*** NOTE(review): create_people_kvdb() is called again in the following cell together with
#//*** the other builders, so this call in the definitions cell looks redundant.
create_people_kvdb()

In [4]:
#//*** Build each store in turn: sites, people, visits, measurements
for build_store in (
    create_sites_kvdb,
    create_people_kvdb,
    create_visits_kvdb,
    create_measurements_kvdb,
):
    build_store()

In [5]:
#sites.get_values("DR-3")
