In [2]:
import json
from pathlib import Path
import os

import pandas as pd


#//****************************************************************
#//****************************************************************
#//*** Skipping S3 Cluster this week due to reliability issues
#//****************************************************************
#import s3fs
#def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
#    s3 = s3fs.S3FileSystem(
#        anon=True,
#        client_kwargs={
#            'endpoint_url': endpoint_url
#        }
#    )
#    return pd.read_csv(s3.open(file_path, mode='rb'))
#//****************************************************************
#//****************************************************************

#//*** Get Current Working Directory
current_dir = Path(os.getcwd()).absolute()

#//*** Build Path to Results directory
results_dir = current_dir.joinpath('results')

#//*** Build Path to Results/kvdb directory
kv_data_dir = results_dir.joinpath('kvdb')

#//*** Make kvdb folder as needed
kv_data_dir.mkdir(parents=True, exist_ok=True)

#//*** Build Filepaths
people_json = kv_data_dir.joinpath('people.json')
visited_json = kv_data_dir.joinpath('visited.json')
sites_json = kv_data_dir.joinpath('sites.json')
measurements_json = kv_data_dir.joinpath('measurements.json')

In [5]:
class KVDB(object):
    def __init__(self, db_path):
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    def _load_db(self):
        if self._db_path.exists():
            with open(self._db_path) as f:
                self._db = json.load(f)

    def get_value(self, key):
        return self._db.get(key)

    def set_value(self, key, value):
        self._db[key] = value

    def save(self):
        with open(self._db_path, 'w') as f:
            json.dump(self._db, f, indent=2)

In [23]:
def create_sites_kvdb():
    db = KVDB(sites_json)
    #//*** Skipping S3 Cluster this week due to reliability issues
    #df = read_cluster_csv('data/external/tidynomicon/site.csv')
    
    df = pd.read_csv("./source/site.csv")
    for site_id, group_df in df.groupby('site_id'):
        db.set_value(site_id, group_df.to_dict(orient='records')[0])
    db.save()


def create_people_kvdb():
    #//*** Initialize KVDB Object: people_json path
    db = KVDB(people_json)
    
    #//*** Load Locally Stored File 
    df = pd.read_csv("./source/person.csv")

    #//*** Single Field to index
    field = 'person_id'

    #//*** Group by field unique value. Could use Iterrows(). 
    #//*** This method only stores the first unique key. Duplicates are automatically Removed
    for field_name, group_df in df.groupby(field):
        db.set_value(field_name, group_df.to_dict(orient='records')[0])

    
    db.save()


def create_visits_kvdb():
    db = KVDB(visited_json)
    
    #//*** Load Locally Stored File 
    df = pd.read_csv("./source/visited.csv")
    
    print(df)
    
    #//*** index Field1, First Index tuple Key
    field1 = 'site_id'
    
    #//*** index Field2, Second Index tuple Key
    field2 = 'visit_id'
    
    
    #//*** Multiple Indexes require Groupby. Iterrows() Wouldn't work here
    
    #//*** Sort By site_id
    for field_name1, group_df1 in df.groupby(field1):
        
        #//*** Sort By
        for field_name2, group_df2 in group_df1.groupby(field2):
            print(field_name2,field_name1, list(group_df2['visit_date']))
        
        
        

    db.save()


def create_measurements_kvdb():
    db = KVDB(measurements_json)
    ## TODO: Implement code
    db.save()
    

create_visits_kvdb()

   visit_id site_id  visit_date
0       619    DR-1  1927-02-08
1       622    DR-1  1927-02-10
2       734    DR-3  1930-01-07
3       735    DR-3  1930-01-12
4       751    DR-3  1930-02-26
5       752    DR-3         NaN
6       837   MSK-4  1932-01-14
7       844    DR-1  1932-03-22
DR-1
619 DR-1 ['1927-02-08']
622 DR-1 ['1927-02-10']
844 DR-1 ['1932-03-22']
DR-3
734 DR-3 ['1930-01-07']
735 DR-3 ['1930-01-12']
751 DR-3 ['1930-02-26']
752 DR-3 [nan]
MSK-4
837 MSK-4 ['1932-01-14']


In [7]:
create_sites_kvdb()
create_people_kvdb()
create_visits_kvdb()
create_measurements_kvdb()