In [1]:
# Purpose: load collection records from the Mars service (https://mars.cyverse.org)
import requests
import json
import pandas as pd
import numpy as np

In [2]:
# Fetch the service-wide summary: record counts grouped by status and by authority.
url = 'https://mars.cyverse.org/thing'
res = requests.get(url)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
res.raise_for_status()
collectionsInfo = json.loads(res.text)
collectionsInfo

{'status': [{'status': '200', 'count': 3027724},
  {'status': '500', 'count': 4},
  {'status': '503', 'count': 249}],
 'authority': [{'authority': 'GEOME', 'count': 228457},
  {'authority': 'OPENCONTEXT', 'count': 820272},
  {'authority': 'SESAR', 'count': 1784731},
  {'authority': 'SMITHSONIAN', 'count': 194517}]}

In [3]:
def getID(index, collection):
    """Look up one record identifier per offset within a collection.

    For every offset in ``index`` a single-record page (``limit=1``) is
    requested from the Mars ``/thing/`` listing, filtered to records whose
    status is 200 and whose authority is ``collection``.

    Parameters
    ----------
    index : iterable of int
        Offsets into the filtered listing; one request is made per offset.
    collection : str
        Authority name sent as the ``authority`` query parameter
        (e.g. ``'SESAR'``).

    Returns
    -------
    list
        The ``id`` of the first record of each successful response, in
        request order. Offsets whose request or parsing failed are reported
        on stdout and skipped, so the result can be shorter than ``index``.
    """
    # The endpoint is identical for every offset, so build it once.
    url = 'https://mars.cyverse.org/thing/'
    identifiers = []

    for offset in index:
        payload = {'offset': offset, 'limit': 1, 'status': 200, 'authority': collection}

        try:
            # The request itself is inside the try block: a single network
            # failure must not abort the loop and throw away the identifiers
            # that were already collected.
            res = requests.get(url, params=payload)
            res_json = json.loads(res.text)
            identifiers.append(res_json['data'][0]['id'])
        except Exception as e:
            # Best effort: report the problem and move on to the next offset.
            print('error:', e)

    return identifiers

In [4]:
def getInfo(method, identifier):
    """Fetch a single record from the Mars service in the requested format.

    Parameters
    ----------
    method : str
        Value sent as the ``format`` query parameter (the notebook uses
        ``'original'`` and ``'core'``).
    identifier : str
        Record identifier appended to the ``/thing/`` endpoint.

    Returns
    -------
    object or None
        The decoded JSON body, or ``None`` when the server answers with a
        body that is not JSON (the plain ``Internal Server Error`` text, a
        gateway error page, ...).
    """
    payload = {'full': 'false', 'format': method}
    url = f'https://mars.cyverse.org/thing/{identifier}'

    res = requests.get(url, params=payload)
    if res.text == 'Internal Server Error':
        return None

    try:
        return json.loads(res.text)
    except ValueError as e:
        # json.JSONDecodeError subclasses ValueError. Any other non-JSON
        # error body (e.g. a 502 Bad Gateway page) is treated the same way
        # as the Internal Server Error text instead of crashing the caller.
        print('error:', identifier, e)
        return None

In [5]:
def convertToDf(id, method):
    """Download the given records and stack them into one DataFrame.

    Parameters
    ----------
    id : iterable
        Record identifiers; each one is fetched with ``getInfo``.
    method : str
        Format name forwarded to ``getInfo``.

    Returns
    -------
    pandas.DataFrame
        One flattened (``pd.json_normalize``) row block per record that could
        be fetched, with a fresh 0..n-1 index. Records for which ``getInfo``
        returned ``None`` are skipped. An empty DataFrame is returned when
        nothing could be fetched.
    """
    frames = []
    for i in id:
        info = getInfo(method, i)
        if info is None:
            # getInfo signals a server-side failure with None, which
            # json_normalize cannot handle; leave that record out.
            continue
        frames.append(pd.json_normalize(info))

    if not frames:
        # pd.concat raises on an empty list; keep the "empty frame" result.
        return pd.DataFrame()

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2.x and growing a frame inside the loop copies it every time.
    return pd.concat(frames, ignore_index=True)

In [6]:
#SESAR records
# Look the count up by authority name: the position of an authority inside
# the summary list is not guaranteed to stay the same between requests.
authorityCounts = {entry['authority']: entry['count'] for entry in collectionsInfo['authority']}
# Reproducibly (random_state=1) draw 5000 offsets from 0..count-1. np.arange
# avoids materialising a Python list with one element per record first.
subsetIndex = pd.DataFrame(np.arange(authorityCounts['SESAR'])).sample(n=5000, random_state=1)[0].tolist()

In [7]:
# Resolve every sampled offset to a SESAR record identifier.
identifiers_5000 = getID(index=subsetIndex, collection='SESAR')

In [10]:
# Download the sampled SESAR records once per format.
SESAR_5000_original = convertToDf(id=identifiers_5000, method='original')
SESAR_5000_core = convertToDf(id=identifiers_5000, method='core')

In [40]:
# Persist both SESAR tables next to the other collection exports.
SESAR_5000_original.to_csv(path_or_buf='../Collections_data/SESAR_5000_original.csv')
SESAR_5000_core.to_csv(path_or_buf='../Collections_data/SESAR_5000_core.csv')

In [8]:
#openContext data
# Look the count up by authority name: the position of an authority inside
# the summary list is not guaranteed to stay the same between requests.
authorityCounts = {entry['authority']: entry['count'] for entry in collectionsInfo['authority']}
# Reproducibly (random_state=1) draw 1000 offsets from 0..count-1. np.arange
# avoids materialising a Python list with one element per record first.
openContextIndex = pd.DataFrame(np.arange(authorityCounts['OPENCONTEXT'])).sample(n=1000, random_state=1)[0].tolist()

In [14]:
# Only 996 records come back: some of the requests hit 502 Bad Gateway errors.
openContext_1000 = getID(index=openContextIndex, collection='OPENCONTEXT')

offset 78698
response 200
0
offset 353035
response 200
1
offset 638581
response 200
2
offset 816920
response 200
3
offset 530571
response 200
4
offset 83959
response 200
5
offset 141797
response 200
6
offset 240223
response 200
7
offset 264186
response 200
8
offset 439279
response 200
9
offset 161507
response 200
10
offset 799381
response 200
11
offset 599718
response 200
12
offset 388067
response 200
13
offset 693762
response 200
14
offset 190135
response 200
15
offset 674545
response 200
16
offset 681454
response 200
17
offset 588377
response 200
18
offset 647376
response 200
19
offset 42665
response 200
20
offset 463858
response 200
21
offset 93306
response 200
22
offset 744193
response 200
23
offset 639486
response 200
24
offset 537183
response 200
25
offset 451190
response 200
26
offset 211122
response 200
27
offset 215428
response 200
28
offset 318238
response 200
29
offset 560079
response 200
30
offset 334747
response 200
31
offset 277530
response 200
32
offset 35060
response 20

In [16]:
# Download the resolved OpenContext records once per format.
OPENCONTEXT_996_original = convertToDf(id=openContext_1000, method='original')
OPENCONTEXT_996_core = convertToDf(id=openContext_1000, method='core')

In [18]:
# Persist both OpenContext tables next to the other collection exports.
OPENCONTEXT_996_original.to_csv(path_or_buf='../Collections_data/OPENCONTEXT_996_original.csv')
OPENCONTEXT_996_core.to_csv(path_or_buf='../Collections_data/OPENCONTEXT_996_core.csv')
