# Where are the women artists here? 🖼️
## In this notebook, I scrape all the paintings belonging to the MASP collection (the São Paulo Museum of Art)

In [1]:
# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm  # Adding tqdm for the progress bar

# URL of the Wikipedia page that lists the MASP paintings
url = "https://pt.wikipedia.org/wiki/Lista_de_pinturas_do_Museu_de_Arte_de_S%C3%A3o_Paulo"

# Fetch and parse the listing page. The timeout keeps a stalled connection from
# hanging the cell, and raise_for_status() stops right here on an HTTP error
# instead of failing further down with an obscure IndexError on `tables[1]`.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extracting tables from the parsed HTML
tables = soup.select("table")
table = tables[1]  # The second table on the page is the one scraped here

# Collect the external links in that table that point to the MASP website
urls = []
for url_paint in table.find_all("a", class_="external autonumber"):
    href = url_paint.get("href")
    # .get() returns None when the anchor has no href attribute; skip those
    # instead of crashing on None.startswith(...)
    if href and href.startswith("https://masp.org.br"):
        urls.append(href)

# Lists to store the extracted data (one entry per URL, in the same order)
titles = []
authors = []
images = []

# Visit every painting page and pull out title, author and image URL
for url_paint in tqdm(urls, desc="Processing..."):
    # Every field defaults to NaN; it is only overwritten when the matching
    # element is found. Appending once, after the attempt, guarantees the
    # three lists always stay the same length as `urls`.
    title_text = author_text = img_url = np.nan

    try:
        response_paint = requests.get(url_paint, timeout=30)
    except requests.exceptions.RequestException as e:
        # One unreachable page must not throw away the pages already scraped
        print(f"Request Exception: {e}")
    else:
        soup_paint = BeautifulSoup(response_paint.text, "html.parser")

        # Extracting the title of the painting
        title_element = soup_paint.find("h4", class_="sub-category medium italic")
        if title_element:
            title_text = title_element.text.strip()

        # Extracting the author of the painting
        author_element = soup_paint.find("h3", class_="sub-title no-uppercase")
        if author_element:
            author_text = author_element.text.strip()

        # Extracting the image URL of the painting
        img_element = soup_paint.find("img", class_="image-to-zoom-1")
        if img_element:
            img_url = img_element.get("src")

    titles.append(title_text)
    authors.append(author_text)
    images.append(img_url)

# Create a dictionary with the extracted data
items = {"titles": titles, "authors": authors, "images": images, "urls": urls}

# Create a DataFrame
df = pd.DataFrame(items)

Processing...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 445/445 [02:57<00:00,  2.51it/s]


In [2]:
# Preview the first rows of the scraped DataFrame
df.head()

Unnamed: 0,titles,authors,images,urls
0,"Sra. Franck Rolleston, 1785-92",Gilbert Stuart,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/mrs-franck-rol...
1,"Vista de Salvador, 1951",Rafael Borjes de Oliveira,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vista-de-salvador
2,"Cabe√ßa de mulher, 1973",Emiliano Di Cavalcanti,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/cabeca-de-mulher
3,,,,https://masp.org.br/acervo/obra/paisagem-com-c...
4,"Vaca, Sem data",√âmile Claus,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vaca


### Oh no, my scraper has some problems. Let me see how many NaN values there are in `df`

In [3]:
# Count the missing (NaN) values in each column of df
nan_counts = df.isna().sum()
print(nan_counts)

titles     52
authors    53
images     53
urls        0
dtype: int64


In [4]:
# Derive each painting's name from its URL: split on "obra/" and keep
# the piece that follows it
url_pieces = df["urls"].str.split("obra/")
df["name"] = url_pieces.str[1]

In [5]:
# Preview df again to check the "name" column
df.head()

Unnamed: 0,titles,authors,images,urls,name
0,"Sra. Franck Rolleston, 1785-92",Gilbert Stuart,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/mrs-franck-rol...,mrs-franck-rolleston
1,"Vista de Salvador, 1951",Rafael Borjes de Oliveira,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vista-de-salvador,vista-de-salvador
2,"Cabe√ßa de mulher, 1973",Emiliano Di Cavalcanti,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/cabeca-de-mulher,cabeca-de-mulher
3,,,,https://masp.org.br/acervo/obra/paisagem-com-c...,paisagem-com-casal-de-camponeses
4,"Vaca, Sem data",√âmile Claus,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vaca,vaca


In [6]:
# Build a Google search query for each painting: its name followed by a
# `site:` filter pointing at the MASP collection pages. These queries feed the
# manual (and tedious) process of finding the missing links.
df["search"] = df["name"] + " site:https://masp.org.br/acervo/"

In [7]:
# Preview df again to check the "search" column
df.head()

Unnamed: 0,titles,authors,images,urls,name,search
0,"Sra. Franck Rolleston, 1785-92",Gilbert Stuart,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/mrs-franck-rol...,mrs-franck-rolleston,mrs-franck-rolleston site:https://masp.org.br/...
1,"Vista de Salvador, 1951",Rafael Borjes de Oliveira,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vista-de-salvador,vista-de-salvador,vista-de-salvador site:https://masp.org.br/ace...
2,"Cabe√ßa de mulher, 1973",Emiliano Di Cavalcanti,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/cabeca-de-mulher,cabeca-de-mulher,cabeca-de-mulher site:https://masp.org.br/acervo/
3,,,,https://masp.org.br/acervo/obra/paisagem-com-c...,paisagem-com-casal-de-camponeses,paisagem-com-casal-de-camponeses site:https://...
4,"Vaca, Sem data",√âmile Claus,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vaca,vaca,vaca site:https://masp.org.br/acervo/


In [8]:
# Save the DataFrame to a CSV file; index=False leaves the row index out of the file
df.to_csv("paints_links.csv", index=False)

### OK, after a little manual work I have all the links I need, saved as `paints_links_complete.csv`

### I couldn't find this painting, "Retrato de José Honorato dos Santos", on the museum's website. It's probably not there anymore, so I removed it from the list.

In [9]:
# Load a DataFrame from a CSV file named "paints_links_complete.csv"
df_complete = pd.read_csv("paints_links_complete.csv")

# Extract the 'urls' column from the DataFrame
urls = df_complete["urls"]

# Lists to store the extracted data (one entry per URL, in the same order)
titles = []
authors = []
images = []

# Loop through each URL, showing a progress bar with the description "Processing...".
# The loop variable is `url_paint` so it does not overwrite the notebook-level `url`.
for url_paint in tqdm(urls, desc="Processing..."):
    # Every field defaults to NaN and is only overwritten when its element is found
    title_text = author_text = img_url = np.nan

    try:
        # Send a GET request to the URL; the timeout keeps a stalled
        # connection from hanging the cell
        response_paint = requests.get(url_paint, timeout=30)

        # Parse the HTML content of the response using BeautifulSoup
        soup_paint = BeautifulSoup(response_paint.text, "html.parser")

        # Extracting the title of the painting
        title_element = soup_paint.find("h4", class_="sub-category medium italic")
        if title_element:
            title_text = title_element.text.strip()

        # Extracting the author of the painting
        author_element = soup_paint.find("h3", class_="sub-title no-uppercase")
        if author_element:
            author_text = author_element.text.strip()

        # Extracting the image URL of the painting
        img_element = soup_paint.find("img", class_="image-to-zoom-1")
        if img_element:
            img_url = img_element.get("src")

    except Exception as e:
        # Best-effort scraping: report the failure and record an all-NaN row.
        # Network failures keep their own label so they are easy to spot.
        if isinstance(e, requests.exceptions.RequestException):
            print(f"Request Exception: {e}")
        else:
            print(f"Exception: {e}")
        # Discard any field extracted before the failure so the row is
        # consistently empty
        title_text = author_text = img_url = np.nan

    # Append exactly once per URL, outside the try block. Appending inside it
    # could leave the lists with different lengths if an exception struck
    # between two appends.
    titles.append(title_text)
    authors.append(author_text)
    images.append(img_url)

# Create a dictionary with the extracted data
items = {"titles": titles, "authors": authors, "images": images, "urls": urls}

# Create a DataFrame
df_complete = pd.DataFrame(items)


Processing...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 442/442 [03:11<00:00,  2.31it/s]


In [10]:
# Preview the first rows of the re-scraped DataFrame
df_complete.head()

Unnamed: 0,titles,authors,images,urls
0,"Sra. Franck Rolleston, 1785-92",Gilbert Stuart,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/mrs-franck-rol...
1,"Vista de Salvador, 1951",Rafael Borjes de Oliveira,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vista-de-salvador
2,"Cabe√ßa de mulher, 1973",Emiliano Di Cavalcanti,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/cabeca-de-mulher
3,"Paisagem com um casal de camponeses, Sem data",Eug√®ne Laermans,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/paisagem-com-u...
4,"Vaca, Sem data",√âmile Claus,https://assets.masp.org.br/uploads/collection/...,https://masp.org.br/acervo/obra/vaca


### Let's check whether any values are still missing

In [11]:
# Count the missing (NaN) values in each column of df_complete
nan_counts = df_complete.isna().sum()
print(nan_counts)

titles     0
authors    0
images     0
urls       0
dtype: int64


### Great! Let's save this DataFrame to a new CSV file

In [13]:
# Save the DataFrame to a CSV file; index=False leaves the row index out of the file
df_complete.to_csv("paints_links_full.csv", index=False)