In [None]:
import pandas as pd
import numpy as np

## Load Dataset

In [None]:
df_artworks = pd.read_csv('./collection/Artworks.csv')
# The artists table is loaded so that the columns it shares with the artworks table can be dropped
df_artists = pd.read_csv('./collection/Artists.csv')

### Explore Artwork dataset

In [None]:
# Keep only the artwork columns whose names do not also appear in the artists table.
artist_columns = set(df_artists.columns)
columns = [name for name in df_artworks.columns if name not in artist_columns]
df_artworks = df_artworks[columns]

In [None]:
df_artworks.head()

**Shape of artworks dataset**

In [None]:
df_artworks.shape

**Columns with NaN**

In [None]:
df_artworks.isnull().any()

**Date distribution**

**Check the different date formats**

In [None]:
# Start from the non-null 'Date' entries and discard the placeholders used for unknown dates.
dates = df_artworks['Date'].dropna()
dates = dates[~dates.isin(['n.d.', 'Unknown'])]
dates.shape

In [None]:
# Quick heuristic: take the first run of four digits in each date string as the year.
# This can pick up a wrong value for strings that are not simple years.
first_year = dates.str.extract(r'(\d{4})').dropna()[0]
data_creation = pd.to_numeric(first_year).rename('Date')
data_creation.head()

In [None]:
data_creation.hist(bins=100)

In [None]:
# Entries that consist of exactly four digits, e.g. 1987
is_plain_year = dates.str.match(r'\d{4}$')
date_dddd = dates[is_plain_year]
print(date_dddd.shape)
date_dddd.head()

In [None]:
# Remove the entries already classified as plain years.
# Filtering by index membership (rather than .drop) keeps this cell safe to re-run:
# .drop raises KeyError once the labels have already been removed from `dates`.
dates = dates[~dates.index.isin(date_dddd.index)]
dates.head()

In [None]:
# Entries that start with four digits followed by a hyphen, space or en dash and more text, e.g. 1983-...
is_period = dates.str.match(r'\d{4}[- –].+')
date_period = dates[is_period]
print(date_period.shape)
date_period.head()

In [None]:
# Remove the entries already classified as periods.
# Filtering by index membership (rather than .drop) keeps this cell safe to re-run:
# .drop raises KeyError once the labels have already been removed from `dates`.
dates = dates[~dates.index.isin(date_period.index)]
dates.head()

In [None]:
# Entries that start with "c. " followed by four digits, e.g. c. 1934...
is_circa = dates.str.match(r'^c\. \d{4}.*')
date_c = dates[is_circa]
print(date_c.shape)
date_c.head()

In [None]:
# Remove the entries already classified as "c. <year>" dates.
# Filtering by index membership (rather than .drop) keeps this cell safe to re-run:
# .drop raises KeyError once the labels have already been removed from `dates`.
dates = dates[~dates.index.isin(date_c.index)]
dates.head()

In [None]:
# Entries that are four digits followed by a full stop, e.g. 1968.
is_year_with_point = dates.str.match(r'^\d{4}\.$')
date_dddd_point = dates[is_year_with_point]
print(date_dddd_point.shape)
date_dddd_point.head()

**Classification distribution**

In [None]:
df_artworks['Classification'].value_counts().plot.bar()

**Cataloged distribution**

In [None]:
df_artworks['Cataloged'].value_counts()

### Artwork image extraction

In [None]:
import os
import urllib
import urllib.parse
import urllib.request
import zipfile

from bs4 import BeautifulSoup

In [None]:
# Site root; the image 'src' values scraped from artwork pages are combined with it
base_url = 'https://www.moma.org/'
# Directory under the current working directory that holds the downloaded images and the zip archive
base_image_path = os.path.join(os.getcwd(), 'collection/images')

**Drop artworks without link**

In [None]:
# Rows with a missing 'URL' cannot be scraped, so only rows with a link are kept
df_artworks_download = df_artworks.dropna(subset=['URL'])
df_artworks_download.head()

In [None]:
df_artworks_download.shape

**Download artwork's images**

In [None]:
# Mode 'a' appends to an existing archive; mode 'w' would write a new one instead
zf = zipfile.ZipFile(os.path.join(base_image_path, 'image.zip'), 'a')
# Write the images directory path itself into the archive before any image is added
zf.write(base_image_path)

In [None]:
def download_image(x):
    """Download the first image of an artwork page and add it to the open zip archive.

    The page at ``x['URL']`` is fetched and parsed, the first ``<img>`` tag's
    ``src`` is resolved against ``base_url``, the image is saved temporarily under
    ``base_image_path`` as ``<row label>.jpg``, written into the module-level
    archive ``zf`` and then removed from disk.

    Parameters
    ----------
    x : pandas.Series
        One row of the artworks table (as passed by ``DataFrame.apply(..., axis=1)``).
        ``x['URL']`` is the artwork page and ``x.name`` is the row label.

    Returns
    -------
    str
        The temporary path the image was saved under on success, otherwise
        ``'error at index <row label>'``. Failures are reported through the return
        value instead of being raised so that a long ``apply`` run keeps going.
    """
    path = os.path.join(base_image_path, str(x.name) + '.jpg')
    error_message = 'error at index ' + str(x.name)
    try:
        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(x['URL']) as html:
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(html, 'html.parser')
        imgs = soup.find_all('img')
        src = imgs[0].get('src') if imgs else None
        if not src:
            # No image tag (or no 'src' attribute) on the page.
            return error_message
        # urljoin handles both site-relative and absolute 'src' values and avoids
        # the doubled slash that plain string concatenation with base_url produces.
        image_url = urllib.parse.urljoin(base_url, src)
        urllib.request.urlretrieve(image_url, path)
        zf.write(path)
        return path
    except Exception:
        # Best effort: any download/parse/zip failure is reported in the result
        # column. KeyboardInterrupt and SystemExit are deliberately not caught,
        # so a long-running download can still be interrupted.
        return error_message
    finally:
        # The image only needs to live inside the archive; also clean up a
        # partially downloaded file when a later step failed.
        if os.path.exists(path):
            os.remove(path)

In [None]:
# Select a window of row labels to download in this run.
# .copy() makes the window an independent frame, so adding the result column below
# is an unambiguous write and does not raise SettingWithCopyWarning.
df_to_download = df_artworks_download.loc[50001:65000].copy()

df_to_download['Image path'] = df_to_download.apply(download_image, axis=1)

In [None]:
df_to_download.head()

In [None]:
zf.close()

In [None]:
df_to_download.to_csv('./tmp/download05.csv')

### Check data download

In [None]:
zfr = zipfile.ZipFile(os.path.join(base_image_path, 'image.zip'), 'r')

In [None]:
len(zfr.namelist())

In [None]:
zfr.namelist()[-1]

In [None]:
zfr.close()

In [None]:
# Inspect the tail of a saved download log.
# NOTE(review): this reads download04.csv while the export cell above writes
# download05.csv — presumably the previous window's log; confirm this is intended.
df_last_download = pd.read_csv('./tmp/download04.csv')
df_last_download.tail()

In [None]:
df_artworks['URL'][0]

In [None]:
# Fetch the first artwork's page; the context manager closes the HTTP response
# once it has been parsed. The parser is named explicitly so the result does not
# depend on which optional parsers are installed.
with urllib.request.urlopen(df_artworks['URL'][0]) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Find the <span> elements carrying the caption CSS classes (presumably the artwork title)
# and show the text of the first match
artwork_name = soup.findAll('span', {'class': 'work__short-caption__text--primary balance-text'})
artwork_name[0].text

In [None]:
imgs = soup.findAll('img')

In [None]:
# Resolve the first image's 'src' against the site root. urljoin handles both
# site-relative and absolute values and avoids the doubled slash that
# `base_url + src` produces when src starts with '/'.
image_url = urllib.parse.urljoin(base_url, imgs[0].get('src'))

In [None]:
urllib.request.urlretrieve(image_url, os.path.join(base_image_path, '0.jpg') )

In [None]:
# In Python 3 urlretrieve lives in urllib.request, and it needs an absolute URL,
# so the site-relative media path is resolved against base_url first.
urllib.request.urlretrieve(
    urllib.parse.urljoin(base_url, '/media/W1siZiIsIjU5NDA1Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMjAwMHgyMDAwXHUwMDNlIl1d.jpg?sha=7a282c9a6f26636d'),
    os.path.basename('example'),
)