## Download Images and Metadata (Met)

- Code in this notebook scrapes selected artwork images from the Metropolitan Museum of Art's (The Met's) online collection.
- Associated metadata for each artwork was downloaded as a CSV file from The Met's Open Access dataset [here](https://github.com/metmuseum/openaccess).

### Imports

In [None]:
import pandas as pd
import requests
import pickle
import random
import time
from bs4 import BeautifulSoup
from lxml import html
from fake_useragent import UserAgent
from itertools import islice

In [None]:
# Pick one random browser user-agent string and wrap it in a request-header dict
ua = UserAgent()
random_agent_string = ua.random
user_agent = {'User-agent': random_agent_string}

### Download and Clean Metadata

In [None]:
# Raise pandas' column display limit to 999 so wide dataframes are not truncated
pd.options.display.max_columns = 999

In [None]:
# Load the artwork metadata CSV into a dataframe
met_csv_path = './met_data/artworks_met.csv'
met = pd.read_csv(met_csv_path)

In [None]:
# Display the column headings of the metadata as loaded from the CSV
met.columns

In [None]:
# Filter dataframe to include only art with images available in the online collection
# (selected here as the rows whose 'Is Public Domain' value equals True).
public_domain_mask = met['Is Public Domain'] == True

# .copy() makes the filtered frame independent of the full table, so columns can be
# added or changed on it later without pandas raising SettingWithCopyWarning.
met = met.loc[public_domain_mask].copy()

In [None]:
# Rename columns: map the original CSV headings to the short names used from here on
column_names = {
    'Object ID': 'id',
    'Link Resource': 'page_url',
    'Title': 'title',
    'Artist Display Name': 'artist',
    'Object End Date': 'date',
    'Medium': 'medium',
    'Tags': 'tags',
    'Department': 'department',
    'Classification': 'classification',
}

# Keep only selected metadata, in this column order
selected_columns = ['id', 'title', 'artist', 'date', 'classification',
                    'department', 'medium', 'source', 'page_url', 'tags']

# Build the cleaned frame in one non-mutating chain instead of renaming in place and
# assigning columns on a filtered slice (which can raise SettingWithCopyWarning).
met = (
    met
    .rename(columns=column_names)
    .assign(
        id=lambda frame: frame['id'].astype(str),  # ids are stored as strings
        source='met',                              # constant tag naming the data source
    )
    [selected_columns]
)

In [None]:
# Preview the cleaned metadata: first 5 rows
met.head(n=5)

### Pickle Dataframe

In [None]:
# Save the cleaned metadata; the context manager closes the file handle
# (the original left it open, relying on garbage collection to flush it).
with open('./all_data_final/met.pickle', 'wb') as pickle_file:
    pickle.dump(met, pickle_file)

### Download and Save Images

In [None]:
# Imports for webscraping 
import os
from bs4 import BeautifulSoup
from selenium import webdriver

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment variable
# to point at a different location without editing the notebook; the original
# hard-coded path is kept as the default.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Users/mjordan/Downloads/chromedriver")

os.environ["webdriver.chrome.driver"] = chromedriver

# NOTE(review): passing the driver path positionally matches older Selenium releases;
# newer releases expect a Service object instead — confirm against the installed version.
driver = webdriver.Chrome(chromedriver)

In [None]:
# Create image counter.
# It counts how many rows of `met` have been processed and is the offset
# at which the download loop starts iterating.
imagecounter = 0

In [None]:
# Persist the counter so the download loop can pick up from this position.
# Note: running this cell overwrites any previously saved progress with the current value.
with open('./met_data/Image counter.pickle', 'wb') as counter_file:
    pickle.dump(imagecounter, counter_file)

In [None]:
# File that stores how many rows of `met` have been processed so far
counter_path = './met_data/Image counter.pickle'

# Resume from the saved position (the pickle is written by this notebook itself;
# do not point this at a file from an untrusted source).
with open(counter_path, 'rb') as counter_file:
    imagecounter = pickle.load(counter_file)
print('Count:', imagecounter)
print()

# Loop through each piece of art, skipping the rows already processed and stopping at row 100
for index, row in islice(met.iterrows(), imagecounter, 100):

    # Short random pause (20-29 ms) before each page load
    timeDelay = random.randrange(20, 30)/1000
    time.sleep(timeDelay)

    # The metadata columns were renamed during cleaning, so the page link lives in
    # 'page_url' and the object id in 'id' ('Link Resource' / 'Object ID' no longer exist).
    driver.get(row['page_url'])
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Get URL for artwork; either lookup may come back empty if the page layout differs
    artwork_div = soup.find('div', class_='cell the-artwork gtm__TGACTEST01')
    image_tag = artwork_div.find('img') if artwork_div is not None else None
    link = image_tag.get('src') if image_tag is not None else None

    if link:
        # Get image; the timeout keeps one stalled request from hanging the whole run
        response = requests.get(link, headers = user_agent, timeout=30)
        if response.ok:
            file = './met_data/' + str(row['id']) + '.jpg'
            with open(file, 'wb') as handler:
                handler.write(response.content)
        else:
            # Do not save an error page under a .jpg name
            print('Skipped', row['id'], '- HTTP status', response.status_code)
    else:
        print('Skipped', row['id'], '- no image found on page')

    # Increment image counter (for skipped rows too, since it tracks the position in
    # `met`, not the number of files written) and save every 10 images
    imagecounter += 1
    if imagecounter % 10 == 0:
        with open(counter_path, 'wb') as counter_file:
            pickle.dump(imagecounter, counter_file)
        print('Count:', imagecounter)

# Save the final position as well, in case the loop ended between two 10-image checkpoints
with open(counter_path, 'wb') as counter_file:
    pickle.dump(imagecounter, counter_file)
