In [38]:
#Purpose: load collection records from the Mars service (https://mars.cyverse.org)
import requests
import json
import pandas as pd
import numpy as np

In [2]:
# Get the metadata of the collections (record counts per status and per authority).
url = 'https://mars.cyverse.org/thing'
# The timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() makes a failed fetch stop here instead of surfacing later
# as a JSON parse error or a KeyError on collectionsInfo.
res = requests.get(url, timeout=60)
res.raise_for_status()
collectionsInfo = json.loads(res.text)
collectionsInfo

{'status': [{'status': '200', 'count': 3027724},
  {'status': '500', 'count': 4},
  {'status': '503', 'count': 249}],
 'authority': [{'authority': 'GEOME', 'count': 228457},
  {'authority': 'OPENCONTEXT', 'count': 820272},
  {'authority': 'SESAR', 'count': 1784731},
  {'authority': 'SMITHSONIAN', 'count': 194517}]}

In [39]:
#Method: getID
#Purpose: get the identifiers of a collection
#Parameters: index -> an array of offsets
#            collection -> the name of the collection you want to retrieve
#            limit -> the maximum number of records to retrieve per offset
def getID(index, collection, limit, timeout=60):
    """Collect record identifiers for one collection, one request per offset.

    Parameters
    ----------
    index : iterable of int
        Offsets to request; each offset triggers one HTTP GET.
    collection : str
        Sent as the ``authority`` query parameter (e.g. ``'SESAR'``).
    limit : int
        Sent as the ``limit`` query parameter of every request.
    timeout : float, optional
        Seconds to wait for each response before that page is given up on.

    Returns
    -------
    list
        The ``id`` values of every page that was fetched and parsed, in
        request order. Failing pages are reported on stdout and skipped, so
        the result can be shorter than ``len(index) * limit``.
    """
    # The endpoint does not depend on the offset, so build it once.
    url = 'https://mars.cyverse.org/thing/'
    identifiers = []

    for offset in index:
        payload = {'offset': offset, 'limit': limit, 'status': 200, 'authority': collection}
        try:
            # The request is inside the try block so that a timeout or a
            # connection error on one page does not discard the pages that
            # were already collected.
            res = requests.get(url, params=payload, timeout=timeout)
            # Turn HTTP error responses into an exception that names the
            # status, instead of a bare JSON "Expecting value" message.
            res.raise_for_status()
            res_json = json.loads(res.text)
            identifiers += pd.json_normalize(res_json['data'])['id'].tolist()
        except Exception as e:
            # Deliberately best-effort: report the failing offset and move on.
            print('error:', e, f'(offset={offset})')

    return identifiers

In [40]:
#Method: getInfo
#Purpose: get the JSON data of a single record
#Parameters: method -> the record information format
#            identifier -> the identifier of the record you want to retrieve
def getInfo(method, identifier, timeout=60):
    """Fetch one record from the ``thing`` endpoint and return its parsed JSON.

    Parameters
    ----------
    method : str
        Sent as the ``format`` query parameter (the notebook uses
        ``'original'`` and ``'core'``).
    identifier : str
        Record identifier; it is appended verbatim to the endpoint URL.
    timeout : float, optional
        Seconds to wait for the response.

    Returns
    -------
    object or None
        The decoded JSON body, or ``None`` when the server answered with an
        HTTP error status or with the body ``'Internal Server Error'``.

    Raises
    ------
    json.JSONDecodeError
        If a non-error response body is not valid JSON.
    """
    payload = {'full': 'false', 'format': method}
    url = f'https://mars.cyverse.org/thing/{identifier}'

    res = requests.get(url, params=payload, timeout=timeout)
    # Treat every HTTP error status as "no record", not only the response
    # whose body is exactly 'Internal Server Error'.
    if not res.ok or res.text == 'Internal Server Error':
        return None
    return json.loads(res.text)

In [41]:
#Method: convertToDf
#Purpose: convert JSON data to a pandas data frame
#Parameters: id -> an array of identifiers
#            method -> the data format
#            collection -> the collection name
def convertToDf(id, method, collection):
    """Download the given records and stack them into one DataFrame.

    Parameters
    ----------
    id : iterable
        Record identifiers; each one is passed to ``getInfo``.
    method : str
        Record format forwarded to ``getInfo``.
    collection : str
        Collection name. It only affects the result when it is
        ``'OPENCONTEXT'`` and ``method`` is ``'core'``: a ``uri`` column
        holding the requested identifier is then added to each row.

    Returns
    -------
    pandas.DataFrame
        One flattened row per record that could be retrieved, with a fresh
        0..n-1 index. Empty when nothing could be retrieved. Records that
        fail are reported on stdout and skipped.
    """
    # Collect the per-record frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, where it would raise inside
    # the try block for every record and leave the result empty. Growing a
    # frame row by row is also quadratic.
    frames = []
    for i in id:
        try:
            record = getInfo(method, i)
            if record is None:
                # getInfo signals a server-side failure with None.
                print(f'no record returned for {i}')
                continue
            df = pd.json_normalize(record)
            if method == 'core' and collection == 'OPENCONTEXT':
                # NOTE(review): presumably this format does not carry the
                # identifier itself for OpenContext records - confirm.
                df['uri'] = i
            frames.append(df)
        except Exception as e:
            # Best-effort download: report the problem and keep going.
            print(e)

    if not frames:
        # pd.concat rejects an empty list, so return an empty frame directly.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False)

In [6]:
# SESAR records: draw 50 reproducible page offsets (multiples of 1000).
# The record count is looked up by authority name rather than by list position,
# so a change in the order of the metadata response cannot silently select
# another collection's count.
sesarCount = next(a['count'] for a in collectionsInfo['authority'] if a['authority'] == 'SESAR')
subsetIndex = pd.DataFrame([i*1000 for i in range(sesarCount // 1000)]).sample(n=50, random_state=1)[0].tolist()

In [7]:
# Fetch the identifiers at the sampled SESAR offsets (1000 per request),
# then keep a reproducible random subset of 5000 of them.
identifiers_5000 = getID(index=subsetIndex, collection='SESAR', limit=1000)
identifiers_5000 = (
    pd.DataFrame(identifiers_5000)
    .sample(n=5000, random_state=1)[0]
    .tolist()
)

In [10]:
# Download every sampled SESAR record in both formats: 'original' and 'core'.
SESAR_5000_original = convertToDf(id=identifiers_5000, method='original', collection='SESAR')
SESAR_5000_core = convertToDf(id=identifiers_5000, method='core', collection='SESAR')

In [40]:
# Persist both SESAR samples as CSV files.
SESAR_5000_original.to_csv(path_or_buf='../Collections_data/SESAR_5000_original.csv')
SESAR_5000_core.to_csv(path_or_buf='../Collections_data/SESAR_5000_core.csv')

In [10]:
# OpenContext data: draw 50 reproducible page offsets (multiples of 1000).
# The record count is looked up by authority name rather than by list position,
# so a change in the order of the metadata response cannot silently select
# another collection's count.
openContextCount = next(a['count'] for a in collectionsInfo['authority'] if a['authority'] == 'OPENCONTEXT')
openContextIndex = pd.DataFrame([i*1000 for i in range(openContextCount // 1000)]).sample(n=50, random_state=1)[0].tolist()

In [12]:
# Fetch the identifiers at the sampled OpenContext offsets (1000 per request).
openContext_5000 = getID(index=openContextIndex, collection='OPENCONTEXT', limit=1000)

error: Expecting value: line 1 column 1 (char 0)
error: Expecting value: line 1 column 1 (char 0)


In [15]:
# Keep a reproducible random subset of 5000 of the fetched identifiers.
openContext_5000 = (
    pd.DataFrame(openContext_5000)
    .sample(n=5000, random_state=1)[0]
    .tolist()
)

In [16]:
# Download the sampled OpenContext records in the 'original' format.
OPENCONTEXT_5000_original = convertToDf(id=openContext_5000, method='original', collection='OPENCONTEXT')


In [32]:
# Download the sampled OpenContext records in the 'core' format.
OPENCONTEXT_5000_core = convertToDf(id=openContext_5000, method='core', collection='OPENCONTEXT')







In [36]:
# Persist both OpenContext samples as CSV files.
OPENCONTEXT_5000_original.to_csv(path_or_buf='../Collections_data/OPENCONTEXT_5000_original.csv')
OPENCONTEXT_5000_core.to_csv(path_or_buf='../Collections_data/OPENCONTEXT_5000_core.csv')
