## Download Images and Metadata (MoMA)

- Code in this notebook scrapes selected artwork images from the Museum of Modern Art's online collection.
- Associated metadata for each artwork was downloaded from MoMA's open collection dataset on GitHub, available [here](https://github.com/MuseumofModernArt/collection).

### Imports

In [None]:
import pandas as pd
import requests
import pickle
import random
import time
from bs4 import BeautifulSoup
from lxml import html
from fake_useragent import UserAgent
from itertools import islice

In [None]:
# Request headers for scraping: pick one random browser User-Agent string
# and reuse it for every request made in this notebook.
ua = UserAgent()
random_agent_string = ua.random
user_agent = {'User-agent': random_agent_string}

### Download and Clean Metadata

In [None]:
# Raise the column display limit so wide dataframes are shown without truncation
pd.options.display.max_columns = 999

In [None]:
# Load the MoMA artwork metadata from the local CSV file
metadata_csv = './moma_data/artworks_moma.csv'
moma = pd.read_csv(metadata_csv)

In [None]:
# Display the column headings of the metadata as loaded, to see which fields
# are available before filtering and renaming
moma.columns

In [None]:
# Keep only artworks that have a thumbnail URL, i.e. art with an image
# available in the online collection
has_thumbnail = moma['ThumbnailURL'].notna()
moma = moma[has_thumbnail]

In [None]:
# Rename columns, normalise the id to a string, tag the data source, and keep
# only the selected metadata.
#
# This is written as a single non-mutating chain: `moma` at this point is the
# result of a boolean filter, so renaming it in place and then assigning new
# columns to it triggers pandas' SettingWithCopyWarning. Each step below
# returns a new frame instead, which also makes the cell safe to reason about
# on re-run.
moma = (
    moma
    .rename(columns={'ObjectID': 'id', 'URL': 'page_url',
                     'ThumbnailURL': 'image_url', 'Title': 'title',
                     'Artist': 'artist', 'Date': 'date', 'Medium': 'medium',
                     'Classification': 'classification', 'Department': 'department'})
    .assign(
        # ids are used to build file names later, so store them as strings
        id=lambda frame: frame['id'].astype(str),
        # constant label identifying where these records came from
        source='moma',
    )
    # Keep only selected metadata, in this column order
    .loc[:, ['id', 'title', 'artist', 'date', 'classification',
             'department', 'medium', 'source', 'page_url', 'image_url']]
)

In [None]:
# Preview the cleaned metadata (first 5 rows)
moma.head(n=5)

### Pickle Dataframe

In [None]:
# Save the cleaned metadata dataframe. The `with` block guarantees the file is
# flushed and closed as soon as the dump finishes, instead of leaving an open
# handle behind.
with open('./all_data_final/moma.pickle', 'wb') as metadata_file:
    pickle.dump(moma, metadata_file)

### Download and Save Images

In [None]:
# Create image counter: the number of rows of `moma` the download loop has
# already processed. It starts at 0 for a fresh run and is used as the
# resume position by the download cell below.
imagecounter = 0

In [None]:
# Persist the counter so the download cell can load it and resume from this
# position. The `with` block guarantees the file is flushed and closed.
with open('./moma_data/Image counter.pickle', 'wb') as counter_file:
    pickle.dump(imagecounter, counter_file)

In [None]:
# Load the checkpointed counter so an interrupted run resumes where it left
# off instead of starting over. (The pickle file is one this notebook wrote
# itself; do not point this at an untrusted file.)
with open('./moma_data/Image counter.pickle', 'rb') as counter_file:
    imagecounter = pickle.load(counter_file)
print('Count:', imagecounter)
print()

# Loop through each piece of art, from the checkpoint position up to (but not
# including) row 100 of the metadata.
for index, row in islice(moma.iterrows(), imagecounter, 100):

    # Short random pause (20-29 ms) between requests
    timeDelay = random.randrange(20, 30)/1000
    time.sleep(timeDelay)

    # The metadata cells above renamed 'URL' to 'page_url' and 'ObjectID' to
    # 'id' (and dropped the original columns), so those are the keys to use.
    page = requests.get(row['page_url'], headers=user_agent)
    soup = BeautifulSoup(page.text, "lxml")

    # Get URL for artwork. soup.find() returns None when the element is
    # missing, so the chained lookup fails with AttributeError/TypeError, or
    # KeyError if the <img> has no src attribute.
    try:
        link = 'http://www.moma.org' + soup.find('div', class_='work__image-container').find('img')['src']
    except (AttributeError, TypeError, KeyError):
        # Skip this artwork explicitly. Without this, `link` would be
        # undefined (first row) or still hold the previous artwork's URL,
        # saving the wrong image under this id.
        link = None
        print('No image found for id', row['id'], '- skipping')

    if link is not None:
        # Get image and save it as <id>.jpg
        img = requests.get(link, headers=user_agent).content
        file = './moma_data/' + str(row['id']) + '.jpg'

        with open(file, 'wb') as handler:
            handler.write(img)

    # Increment image counter (for skipped rows too, since it tracks the
    # position in the dataframe) and save every 10 images
    imagecounter += 1
    if imagecounter % 10 == 0:
        with open('./moma_data/Image counter.pickle', 'wb') as counter_file:
            pickle.dump(imagecounter, counter_file)
        print('Count:', imagecounter)