In [None]:
import requests,json,os,time
import pandas as pd
import markdown
from bs4 import BeautifulSoup

# The CKAN API key is taken from the environment so that it never has to be
# written into the notebook itself.
apiKey = os.environ.get('CKANAPIKEY')
if apiKey is None:
    # Fail early: the key is needed to build the Authorization header.
    raise EnvironmentError("Failed because {} is not set.".format('CKANAPIKEY'))

In [None]:
def createDataResource(apiEndpoint,headers,payload):
    """Send ``payload`` as a JSON body in an HTTP POST to ``apiEndpoint``.

    Parameters
    ----------
    apiEndpoint : str
        Full URL the request is sent to.
    headers : dict
        HTTP headers forwarded to ``requests``.
    payload : object
        JSON-serialisable request body.

    Returns
    -------
    The response object produced by ``requests``; its status is not checked
    here, so callers must inspect it themselves.
    """
    return requests.post(apiEndpoint, headers=headers, json=payload)
    
def getMetadata(filePath):
    """Extract per-section ``{name: description}`` metadata from a Markdown file.

    The file is rendered to HTML with ``markdown`` and parsed with
    BeautifulSoup. Every ``<li>`` that contains a ``<code>`` element is
    treated as one entry: the ``<code>`` text is the entry name and the item
    text, with ``"<name> - "`` removed, is its description. Entries are then
    assigned to sections purely by their position in the document, using the
    hard-coded entry counts in ``numFiles``.

    Parameters
    ----------
    filePath : str
        Path of the Markdown file to read.

    Returns
    -------
    dict
        Contains ``'Files'`` plus the text of every ``<h3>`` heading except
        ``'Useful Links'``. The keys ``'Files'``, ``'Sensor Data'``,
        ``'Node Metadata'``, ``'Sensor Metadata'`` and
        ``'Provenance Metadata'`` map to ``{name: description}`` dicts; any
        other heading keeps the value ``None``. Inside ``'Files'`` the key
        ``'data.csv.gz'`` is renamed to ``'data.csv'``.

    Raises
    ------
    KeyError
        If the ``'Files'`` entries contain no ``'data.csv.gz'`` key.
    """
    # Read through a context manager so the file handle is always closed.
    with open(filePath) as markdownFile:
        mk = markdownFile.read()
    soup = BeautifulSoup(markdown.markdown(mk), 'html.parser')

    # Section names: 'Files' is added by hand, the rest are the h3 headings.
    metadataTypesList = ['Files'] + [tag.text for tag in soup.find_all('h3')]

    # One (name, description) pair per list item that carries a <code> element,
    # kept in document order.
    entries = []
    for tag in soup.find_all('li'):
        code = tag.find('code')
        if code is not None:
            source = code.text
            entries.append((source, tag.text.replace(source + ' - ', '')))

    # Number of entries in each consecutive block of the document.
    # NOTE(review): these counts are tied to one specific file layout; a file
    # with a different number of list items will be split incorrectly.
    numFiles = [4,7,9,3,8,6]
    # offsets[i] is the index of the first entry belonging to block i.
    offsets = [sum(numFiles[:i]) for i in range(len(numFiles) + 1)]

    fullMetadata = {section: None for section in metadataTypesList
                    if section != 'Useful Links'}

    # (section name, block index); block 3 is deliberately not assigned to any
    # section ("skip three" in the original code).
    sectionBlocks = (
        ('Files', 0),
        ('Sensor Data', 1),
        ('Node Metadata', 2),
        ('Sensor Metadata', 4),
        ('Provenance Metadata', 5),
    )
    for section, block in sectionBlocks:
        fullMetadata[section] = dict(entries[offsets[block]:offsets[block + 1]])

    # The description is listed under 'data.csv.gz' but is looked up as
    # 'data.csv'.
    files = fullMetadata['Files']
    files['data.csv'] = files.pop('data.csv.gz')

    return fullMetadata
    
def getDataForCKAN(filePath,orient='records'):
    """Load a CSV file and return its contents as plain Python containers.

    Parameters
    ----------
    filePath : str
        Path of the CSV file to read.
    orient : str, optional
        Layout passed to ``DataFrame.to_dict``; the default ``'records'``
        yields a list with one ``{column: value}`` dict per row.

    Returns
    -------
    list or dict
        Whatever ``DataFrame.to_dict(orient=orient)`` produces.
    """
    # Read the file named by the argument; the previous code read the global
    # ``dataPath`` and therefore ignored ``filePath`` entirely.
    dataDF = pd.read_csv(filePath)
    return dataDF.to_dict(orient=orient)

def getFieldsTemplate():
    """Return a new, unfilled field description.

    The result has an ``'id'`` entry and a nested ``'info'`` dict with
    ``'label'`` and ``'notes'`` entries, all set to ``None``. A fresh dict
    (including a fresh ``'info'`` dict) is built on every call, so callers
    may fill it in without affecting other templates.
    """
    info = dict.fromkeys(("label", "notes"))
    return {"id": None, "info": info}

def getFieldsForCKAN(metadata):
    """Turn a ``{name: description}`` mapping into a list of field dicts.

    For every item of ``metadata`` one template from ``getFieldsTemplate()``
    is filled in: the key becomes both ``'id'`` and ``info['label']`` and the
    value becomes ``info['notes']``. The list follows the iteration order of
    ``metadata``.
    """
    def _describe(column, note):
        # Each field starts from its own fresh template.
        entry = getFieldsTemplate()
        entry.update(id=column)
        entry['info'].update(label=column, notes=note)
        return entry

    return [_describe(column, note) for column, note in metadata.items()]

In [None]:
# Local folder that holds the dataset directories.
# NOTE: this is an absolute, machine-specific path and must be adapted before
# the notebook is run elsewhere. The trailing slash matters because the path
# is later stripped from each directory path to form the dataset title.
mainDir = '/Users/iperezx/Documents/sage-commons/sage-commons-aot/'

# Base URL of the CKAN instance that receives the uploads.
sageCommonsURL = 'http://hotshot.sdsc.edu:5000'

# Organisation that will own the created datasets.
orgName = 'array-of-things'

# Every request authenticates with the API key read from the environment.
headers = dict(Authorization=apiKey)

In [None]:
# Collect the paths of all sub-directories of mainDir whose name starts with
# 'chicago'.
# NOTE: os.scandir yields entries in arbitrary order, while later cells select
# a directory by its position in this list - check the printed order.
directories = [
    entry.path
    for entry in os.scandir(mainDir)
    if entry.is_dir() and entry.name.startswith('chicago')
]
print(directories)

In [None]:
# Aggregation periods for which upload timings are collected.
timeNames = ['daily','weekly','monthly']

dataDir = directories[1]
# Names of the regular files directly inside dataDir, in sorted order.
dataFiles = sorted(
    entry for entry in os.listdir(dataDir)
    if os.path.isfile(os.path.join(dataDir, entry))
)

# timeData[period][file name] holds the elapsed upload time; every slot starts
# out as None until the corresponding upload has been timed.
timeData = {}
for val in timeNames:
    timeData[val] = dict.fromkeys(dataFiles)

In [None]:
# Names of the regular files directly inside dataDir, in sorted order.
dataFiles = sorted(
    entry for entry in os.listdir(dataDir)
    if os.path.isfile(os.path.join(dataDir, entry))
)

# The first file in sorted order is parsed as the Markdown metadata file.
# NOTE(review): presumably this is README.md - confirm for every data folder.
metaDataFilePath = os.path.join(dataDir, dataFiles[0])
metadata = getMetadata(metaDataFilePath)

# CSV file name -> name of the metadata section that describes its columns.
lookupFile = {
    'data.csv': 'Sensor Data',
    'sensors.csv': 'Sensor Metadata',
    'provenance.csv': 'Provenance Metadata',
    'nodes.csv': 'Node Metadata',
}

# print(json.dumps(metadata, indent=4, sort_keys=True))

## import chicago daily data

### create dataset

In [None]:
# CKAN action that registers a new dataset (package).
apiAction = '/api/3/action/package_create'
url = sageCommonsURL + apiAction

timeType = 'daily'
dataDir = directories[1]

# The title is the directory path with the mainDir prefix removed.
title = dataDir.replace(mainDir, '')
name = 'chicago-' + timeType

# Three fixed tags plus one for the aggregation period.
tags = [{'name': label} for label in ('csv', 'waggle', 'sensors', timeType)]

payload = dict(
    owner_org=orgName,
    title=title,
    name=name,
    notes='Description of dataset',
    tags=tags,
)

response = createDataResource(url, headers, payload)
# Kept for the next cell, which reads the new dataset id from it.
jsonResponseDS = response.json()
print(response)
print(json.dumps(jsonResponseDS, sort_keys=True, indent=4))

### create data source

In [None]:
# CKAN DataStore action that creates a resource and loads rows into it.
apiAction = '/api/3/action/datastore_create'
url = sageCommonsURL + apiAction
# Id of the dataset created by the preceding package_create call.
packageID = jsonResponseDS['result']['id']

# Every regular file in dataDir except the two that are skipped by name.
dataFiles = sorted(
    entry for entry in os.listdir(dataDir)
    if entry not in ('README.md', 'offsets.csv')
    and os.path.isfile(os.path.join(dataDir, entry))
)

for dataFile in dataFiles:
    dataPath = os.path.join(dataDir, dataFile)
    # The resource is named after the file without its extension.
    name = os.path.splitext(dataFile)[0]
    records = getDataForCKAN(dataPath)

    resource = dict(
        package_id=packageID,
        name=name,
        description=metadata['Files'][dataFile],
    )
    # Column descriptions come from the metadata section mapped to this file.
    fields = getFieldsForCKAN(metadata[lookupFile[dataFile]])
    payload = dict(resource=resource, fields=fields, records=records)

    # Time only the HTTP round trip of the upload.
    start = time.time()
    response = requests.post(url, headers=headers, json=payload)
    end = time.time()
    timeData[timeType][dataFile] = abs(end - start)
    # NOTE: the response is not inspected, so a failed upload still records a
    # timing; check response.text when debugging.

    print('Elapsed time: ' + str(timeData[timeType][dataFile]))

## import chicago weekly data

### create dataset

In [None]:
# CKAN action that registers a new dataset (package).
apiAction = '/api/3/action/package_create'
url = sageCommonsURL + apiAction

timeType = 'weekly'
dataDir = directories[2]

# The title is the directory path with the mainDir prefix removed.
title = dataDir.replace(mainDir, '')
name = 'chicago-' + timeType

# Three fixed tags plus one for the aggregation period.
tags = [{'name': label} for label in ('csv', 'waggle', 'sensors', timeType)]

payload = dict(
    owner_org=orgName,
    title=title,
    name=name,
    notes='Description of dataset',
    tags=tags,
)

# Same JSON POST as before, issued through the shared helper.
response = createDataResource(url, headers, payload)
# Kept for the next cell, which reads the new dataset id from it.
jsonResponseDS = response.json()
print(response)
print(json.dumps(jsonResponseDS, sort_keys=True, indent=4))

### create data source

In [None]:
# CKAN DataStore action that creates a resource and loads rows into it.
apiAction = '/api/3/action/datastore_create'
url = sageCommonsURL + apiAction
# Id of the dataset created by the preceding package_create call.
packageID = jsonResponseDS['result']['id']

# Every regular file in dataDir except the two that are skipped by name.
dataFiles = sorted(
    entry for entry in os.listdir(dataDir)
    if entry not in ('README.md', 'offsets.csv')
    and os.path.isfile(os.path.join(dataDir, entry))
)

for dataFile in dataFiles:
    dataPath = os.path.join(dataDir, dataFile)
    # The resource is named after the file without its extension.
    name = os.path.splitext(dataFile)[0]
    records = getDataForCKAN(dataPath)

    resource = dict(
        package_id=packageID,
        name=name,
        description=metadata['Files'][dataFile],
    )
    # Column descriptions come from the metadata section mapped to this file.
    fields = getFieldsForCKAN(metadata[lookupFile[dataFile]])
    payload = dict(resource=resource, fields=fields, records=records)

    # Time only the HTTP round trip of the upload.
    start = time.time()
    response = requests.post(url, headers=headers, json=payload)
    end = time.time()
    timeData[timeType][dataFile] = abs(end - start)
    # NOTE: the response is not inspected, so a failed upload still records a
    # timing; check response.text when debugging.

    print('Elapsed time: ' + str(timeData[timeType][dataFile]))

In [None]:
# Show all timings collected so far as indented JSON with sorted keys;
# periods that have not been uploaded yet still show null entries.
print(json.dumps(timeData, sort_keys=True, indent=4))

## import chicago monthly data

### create dataset

In [None]:
# CKAN action that registers a new dataset (package).
apiAction = '/api/3/action/package_create'
url = sageCommonsURL + apiAction

timeType = 'monthly'
dataDir = directories[0]

# The title is the directory path with the mainDir prefix removed.
title = dataDir.replace(mainDir, '')
name = 'chicago-' + timeType

# Three fixed tags plus one for the aggregation period.
tags = [{'name': label} for label in ('csv', 'waggle', 'sensors', timeType)]

payload = dict(
    owner_org=orgName,
    title=title,
    name=name,
    notes='Description of dataset',
    tags=tags,
)

# Same JSON POST as before, issued through the shared helper.
response = createDataResource(url, headers, payload)
# Kept for the next cell, which reads the new dataset id from it.
jsonResponseDS = response.json()
print(response)
print(json.dumps(jsonResponseDS, sort_keys=True, indent=4))

### create data source

In [None]:
# CKAN DataStore action that creates a resource and loads rows into it.
apiAction = '/api/3/action/datastore_create'
url = sageCommonsURL + apiAction
# Id of the dataset created by the preceding package_create call.
packageID = jsonResponseDS['result']['id']

# Every regular file in dataDir except the two that are skipped by name.
dataFiles = sorted(
    entry for entry in os.listdir(dataDir)
    if entry not in ('README.md', 'offsets.csv')
    and os.path.isfile(os.path.join(dataDir, entry))
)

for dataFile in dataFiles:
    dataPath = os.path.join(dataDir, dataFile)
    # The resource is named after the file without its extension.
    name = os.path.splitext(dataFile)[0]
    records = getDataForCKAN(dataPath)

    resource = dict(
        package_id=packageID,
        name=name,
        description=metadata['Files'][dataFile],
    )
    # Column descriptions come from the metadata section mapped to this file.
    fields = getFieldsForCKAN(metadata[lookupFile[dataFile]])
    payload = dict(resource=resource, fields=fields, records=records)

    # Time only the HTTP round trip of the upload.
    start = time.time()
    response = requests.post(url, headers=headers, json=payload)
    end = time.time()
    timeData[timeType][dataFile] = abs(end - start)
    # NOTE: the response is not inspected, so a failed upload still records a
    # timing; check response.text when debugging.

    print('Elapsed time: ' + str(timeData[timeType][dataFile]))