In [1]:
from pyspark.sql import SparkSession
#used to read from Met's object API
import requests
import time
from IPython.display import clear_output

In [2]:
# Create (or reuse) a Spark session named "Extract" with master "local[4]";
# the Spark UI port is set to 4050.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Extract")
    .config("spark.ui.port", '4050')
    .getOrCreate()
)
# Handle on the underlying SparkContext, used below to build RDDs.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/19 19:47:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Get the IDs of all of the objects in the collection.
# The parsed response is used below through its "total" and "objectIDs" keys.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON/KeyError failure in a later cell.
ids_response = requests.get(
    'https://collectionapi.metmuseum.org/public/collection/v1/objects',
    timeout=60,
)
ids_response.raise_for_status()
ids = ids_response.json()

In [4]:
# Distribute the object IDs from the API response as a Spark RDD
ids_rdd = sc.parallelize(ids["objectIDs"])

In [5]:
# Select a random sample of the objects in the database.
# n is the approximate total number of records to collect: the sample is
# drawn by fraction, so the resulting count is close to, not exactly, n.
n = 10000
# Sampling without replacement needs a fraction in [0, 1]; clamp it so that
# asking for more records than the collection holds simply keeps everything,
# and guard against a zero "total" to avoid a ZeroDivisionError.
sample_fraction = min(1.0, n / max(ids["total"], 1))
ids_sample = ids_rdd.sample(withReplacement=False, fraction=sample_fraction, seed=1)

In [6]:
# Bring the sampled IDs back to the driver; collect() already returns a
# Python list, so no extra list() copy is needed.
id_list = ids_sample.collect()

                                                                                

In [7]:
# The API requires that requests/second is limited to 80.
# Quick and dirty solution: make a request, then sleep for 1/80 seconds.
# Builds a list of JSON objects, one per ID that was randomly selected above.
# This process is extremely slow, so more than ~10,000 records cannot be
# gathered in a reasonable amount of time (9974 objects took 39 minutes).
url = 'https://collectionapi.metmuseum.org/public/collection/v1/objects/'
objects = []
i = 0
id_total = len(id_list)
# One shared session keeps the HTTP connection alive between requests instead
# of opening a new connection for every object.
with requests.Session() as session:
    # `object_id` rather than `id`, so the builtin id() is not shadowed.
    for i, object_id in enumerate(id_list, start=1):
        clear_output(wait=True)
        path = url + str(object_id)
        print("Collecting object {a}/{b}\nObject ID: {c}".format(a=i, b=id_total, c=object_id))
        # The timeout stops a single stalled request from hanging the whole run.
        request = session.get(path, timeout=30)
        try:
            obj_dict = request.json()
        except ValueError:
            # Body was not JSON (e.g. an HTML error page): report it and stop,
            # keeping everything collected so far in `objects`.
            print("Non-JSON response (HTTP {s}) for object {c}".format(s=request.status_code, c=object_id))
            break
        if "message" in obj_dict:
            # The API answered with a message instead of an object record:
            # show it and stop collecting.
            print(obj_dict)
            break
        objects.append(obj_dict)
        time.sleep(1 / 80)

Collecting object 9974/9974
Object ID: 832623


In [8]:
def getFields(obj:dict):
    """Return a new dict holding only the fields kept for the dataset.

    Every key of ``obj`` that appears in the keep-list below is copied,
    together with its value, into the result; all other keys are dropped.
    Kept keys appear in the same order as in ``obj``. Keep-list fields that
    ``obj`` does not contain are simply absent from the result. ``obj``
    itself is not modified.
    """
    # The fields I want to keep
    wanted = (
        'objectID', 'isHighlight', 'accessionYear', 'department', 'objectName',
        'culture', 'artistNationality', 'artistGender', 'objectEndDate', 'isTimelineWork',
    )
    return {field: value for field, value in obj.items() if field in wanted}

In [9]:
# Distribute the downloaded object records (a Python list) as a Spark RDD
objects_rdd = sc.parallelize(objects)

In [10]:
# Trim every record down to the fields selected by getFields
objects_rdd = objects_rdd.map(getFields)

In [11]:
# Preview a handful of trimmed records; collecting the whole RDD here would
# dump every record into the notebook output.
objects_rdd.take(5)

24/12/19 20:27:23 WARN TaskSetManager: Stage 1 contains a task of very large size (3502 KiB). The maximum recommended task size is 1000 KiB.


[{'objectID': 14,
  'isHighlight': False,
  'accessionYear': '1979',
  'department': 'The American Wing',
  'objectName': 'Coin',
  'culture': '',
  'artistNationality': 'American',
  'artistGender': '',
  'objectEndDate': 1907,
  'isTimelineWork': False},
 {'objectID': 78,
  'isHighlight': False,
  'accessionYear': '1937',
  'department': 'The American Wing',
  'objectName': 'Andiron',
  'culture': '',
  'artistNationality': '',
  'artistGender': '',
  'objectEndDate': 1700,
  'isTimelineWork': False},
 {'objectID': 108,
  'isHighlight': False,
  'accessionYear': '1989',
  'department': 'The American Wing',
  'objectName': 'Andiron',
  'culture': 'American',
  'artistNationality': 'American',
  'artistGender': '',
  'objectEndDate': 1814,
  'isTimelineWork': False},
 {'objectID': 227,
  'isHighlight': False,
  'accessionYear': '1968',
  'department': 'The American Wing',
  'objectName': 'Armchair',
  'culture': 'American',
  'artistNationality': 'French',
  'artistGender': '',
  'obje

In [12]:
# Build a DataFrame from the trimmed records (schema is inferred by Spark)
df = spark.createDataFrame(objects_rdd)

24/12/19 20:27:25 WARN TaskSetManager: Stage 2 contains a task of very large size (3502 KiB). The maximum recommended task size is 1000 KiB.


In [None]:
import os
# Write the dataset as CSV with a header row into "extractedDataset" under the
# current working directory, overwriting any previous output.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded "/".
path = os.path.join(os.getcwd(), "extractedDataset")
# repartition(1) puts all rows in a single partition before writing.
df.repartition(1).write.csv(path, mode='overwrite', header=True)

In [14]:
# Shut down the Spark session now that the extraction is finished
spark.stop()