In [108]:
import pandas as pd
import re

In [21]:
#read in tables and print to see counts & columns. 
artists_full = pd.read_csv('artists_table_final.csv')
paintings_wip = pd.read_csv('paintings_wip.csv')


#read out object counts & unique counts - these are primary keys in the db so they all need to be unique
print(paintings_wip.Object_id.count())
print(paintings_wip.Object_id.nunique())
print("----------------")
print(artists_full.Artist_id.count())
print(artists_full.Artist_id.nunique())

2819
2652
----------------
2299
2299


In [22]:
#paintings_wip has duplicate ids in it. This will mess up the primary key constraint in our database, so dedupe & read out to confirm
paintings_wip.drop_duplicates(subset='Object_id', keep = 'first', inplace=True)
print(paintings_wip.Object_id.count())
print(paintings_wip.Object_id.nunique())

2652
2652


In [97]:
#We probably dropped a lot of artists when we dropped paintings outside the public domain, so we can consolidate our artist table to make our db more efficient
#This is important because we want to host on Heroku and therefore want to keep the database light. 

#read out artist_id into a list that we can use to filter our artist table
artist_list = paintings_wip.Artist_id.to_list()

#filter artist table
filter_in_paintings = artists_full.Artist_id.isin(artist_list)
artists_wip = artists_full[filter_in_paintings]

print(artists_wip.Artist_id.count())
print(artists_wip.Artist_id.nunique())


983
983


In [98]:
artists_wip.head(10)

Unnamed: 0,Surname,First_name,Full_name,Birth_date,Death_date,Artist_id,image,Biography
0,Defeuille,Louis Benjamin Huber,Louis Benjamin Huber Defeuille,Unknown,Unknown,0,image could not be found,Biography could not be found
2,Unknown,blank_,Unknown,Unknown,Unknown,2,image could not be found,Biography could not be found
4,Nini,Jean-Baptiste,Jean-Baptiste Nini,1717,1786,3,image could not be found,Biography could not be found
5,Sanford,H.,H. Sanford,1783,1822,4,image could not be found,Biography could not be found
6,Babcock,William P.,William P. Babcock,1826,1899,5,image could not be found,Biography could not be found
7,Stuart,Gilbert,Gilbert Stuart,1755,1828,6,/wiki/File:Gilbert_Stuart_crop.jpg,s foremost portraitists.[2] His best known wor...
9,Inman,Henry,Henry Inman,1801,1846,8,image could not be found,Biography could not be found
10,La Farge,John,John La Farge,1835,1910,9,/wiki/File:Portrait_of_John_LaFarge.jpg,"John La Farge (March 31, 1835 – November 14, 1..."
11,Ryder,Albert Pinkham,Albert Pinkham Ryder,1847,1917,10,/wiki/File:Albert_Pinkham_Ryder.jpg,"Albert Pinkham Ryder (March 19, 1847 – March 2..."
12,Abbey,Edwin Austin,Edwin Austin Abbey,1852,1911,11,/wiki/File:Edwin_Austin_Abbey_cropped.jpg,"Edwin Austin Abbey RA (April 1, 1852\xa0– Augu..."


In [41]:
#We have another problem in that our original 'scrape' of the artist images that we /could/ find was incorrect
#It's a file path to an image...NOT the actual hosted image path that we need in order to display it

#So I'll need to re-scrape for those who we originally found a wiki entry for. 
has_wiki = artists_wip.image != "image could not be found"

artists_for_scrape = artists_wip[has_wiki]


In [45]:
print("Artists we can scrape for: " + str(artists_for_scrape.Artist_id.count()))

Artists we can scrape for: 683


In [52]:
######### Scraper. I'm just repurposing most of the OG scraper from our ETL project. 
######### Let's do this. :P 

import requests
from bs4 import BeautifulSoup

results = []
error = []

limit = 20

for index, row in artists_for_scrape.iterrows():

    name = artists.loc[index,'Full_name']

    #url = input('SENT FROM USER REQUEST')
    url = 'https://en.wikipedia.org/wiki/'+ name
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    print('grabbing page for...' + name)

    try:
        header_artist_name = soup.find('h1', class_="firstHeading").text
        name_error = False
    except:
        header_artist_name = 'unfound' 
        name_error = True

    try: 
        artist_image_search = soup.find("a",{"class":"image"})
        artist_image_link = artist_image_search.findChild()
        img_src = artist_image_link["src"]
        artists_for_scrape.loc[index, 'Image_link'] = img_src
        image_error = False
    except:
        artist_image_link = 'unfound' 
        image_error = True
    
    print('Errors:' + str(name_error) + str(image_error))
    

print("Finished!")

grabbing page for...Gilbert Stuart
Errors:FalseFalse
grabbing page for...John La Farge
Errors:FalseFalse
grabbing page for...Albert Pinkham Ryder
Errors:FalseFalse
grabbing page for...Edwin Austin Abbey
Errors:FalseFalse
grabbing page for...Francis Alexander
Errors:FalseFalse
grabbing page for...John White Alexander
Errors:FalseFalse
grabbing page for...Washington Allston
Errors:FalseFalse
grabbing page for...Ezra Ames
Errors:FalseFalse
grabbing page for...Joseph Alexander Ames
Errors:FalseFalse
grabbing page for...Thomas Anshutz
Errors:FalseFalse
grabbing page for...John Woodhouse Audubon
Errors:FalseFalse
grabbing page for...Joseph Badger
Errors:FalseFalse
grabbing page for...W. H. Bean
Errors:FalseTrue
grabbing page for...J. Carroll Beckwith
Errors:FalseFalse
grabbing page for...Henry Benbridge
Errors:FalseFalse
grabbing page for...Albert Bierstadt
Errors:FalseFalse
grabbing page for...George Caleb Bingham
Errors:FalseFalse
grabbing page for...Robert Frederick Blum
Errors:FalseFalse

grabbing page for...Jacopo Amigoni
Errors:FalseFalse
grabbing page for...Fra Angelico
Errors:FalseFalse
grabbing page for...Zanobi Strozzi
Errors:FalseFalse
grabbing page for...Antonello da Messina
Errors:FalseFalse
grabbing page for...Antoniazzo Romano
Errors:FalseFalse
grabbing page for...Jacques d' Arthois
Errors:FalseFalse
grabbing page for...David Bailly
Errors:FalseFalse
grabbing page for...Hans Baldung
Errors:FalseFalse
grabbing page for...Charles Bargue
Errors:FalseFalse
grabbing page for...Fra Bartolomeo
Errors:FalseFalse
grabbing page for...Bartolomeo degli Erri
Errors:FalseFalse
grabbing page for...Jules Bastien-Lepage
Errors:FalseFalse
grabbing page for...Pompeo Batoni
Errors:FalseFalse
grabbing page for...Jean-Frédéric Bazille
Errors:FalseFalse
grabbing page for...William Beechey
Errors:FalseFalse
grabbing page for...Giovanni Bellini
Errors:FalseFalse
grabbing page for...Jacopo Bellini
Errors:FalseFalse
grabbing page for...Bernardo Bellotto
Errors:FalseFalse
grabbing page 

grabbing page for...Charles-Théodore Frère
Errors:FalseFalse
grabbing page for...Eugène Fromentin
Errors:FalseFalse
grabbing page for...Bernardino Fungai
Errors:FalseFalse
grabbing page for...Henry Fuseli
Errors:FalseFalse
grabbing page for...Jan Fyt
Errors:FalseFalse
grabbing page for...Agnolo Gaddi
Errors:FalseFalse
grabbing page for...Thomas Gainsborough
Errors:FalseFalse
grabbing page for...Saturnino Gatti
Errors:FalseFalse
grabbing page for...Paul Gauguin
Errors:FalseFalse
grabbing page for...Gentile da Fabriano
Errors:FalseFalse
grabbing page for...Artemisia Gentileschi
Errors:FalseFalse
grabbing page for...Théodore Gericault
Errors:FalseFalse
grabbing page for...Niccolò di Pietro Gerini
Errors:FalseFalse
grabbing page for...Jean-Léon Gérôme
Errors:FalseFalse
grabbing page for...Domenico Ghirlandaio
Errors:FalseFalse
grabbing page for...Davide Ghirlandaio
Errors:FalseFalse
grabbing page for...Michele Giambono
Errors:FalseFalse
grabbing page for...Giampietrino
Errors:FalseFalse
gr

grabbing page for...Arnold Boonen
Errors:FalseFalse
grabbing page for...Daniël Mijtens
Errors:FalseFalse
grabbing page for...John Everett Millais
Errors:FalseFalse
grabbing page for...Francisque Millet
Errors:FalseFalse
grabbing page for...Jean-François Millet
Errors:FalseFalse
grabbing page for...Pier Francesco Mola
Errors:FalseFalse
grabbing page for...Pieter de Molijn
Errors:FalseFalse
grabbing page for...Peter Monamy
Errors:FalseFalse
grabbing page for...Bartolomeo Montagna
Errors:FalseFalse
grabbing page for...Francesco Montemezzano
Errors:FalseFalse
grabbing page for...Adolphe Monticelli
Errors:FalseFalse
grabbing page for...Anthonis Mor van Dashorst
Errors:FalseFalse
grabbing page for...Paulus Moreelse
Errors:FalseFalse
grabbing page for...Moretto da Brescia
Errors:FalseFalse
grabbing page for...Berthe Morisot
Errors:FalseFalse
grabbing page for...George Morland
Errors:FalseFalse
grabbing page for...Giovanni Battista Moroni
Errors:FalseFalse
grabbing page for...Jan Mostaert
Erro

grabbing page for...Anne Vallayer-Coster
Errors:FalseFalse
grabbing page for...Carle Vanloo
Errors:FalseFalse
grabbing page for...Lippo Vanni
Errors:FalseFalse
grabbing page for...Velázquez
Errors:FalseFalse
grabbing page for...Willem van de Velde
Errors:FalseFalse
grabbing page for...Johannes Vermeer
Errors:FalseFalse
grabbing page for...Joseph Vernet
Errors:FalseFalse
grabbing page for...Horace Vernet
Errors:FalseFalse
grabbing page for...Paolo Veronese
Errors:FalseFalse
grabbing page for...Andrea del Verrocchio
Errors:FalseFalse
grabbing page for...Johannes Verspronck
Errors:FalseFalse
grabbing page for...Antoine Vestier
Errors:FalseFalse
grabbing page for...Jean-Georges Vibert
Errors:FalseFalse
grabbing page for...Jan Victors
Errors:FalseFalse
grabbing page for...Élisabeth Louise Vigée Le Brun
Errors:FalseFalse
grabbing page for...Antonio Vivarini
Errors:FalseFalse
grabbing page for...Bartolomeo Vivarini
Errors:FalseFalse
grabbing page for...Simon de Vlieger
Errors:FalseFalse
grabb

In [99]:
#now join back into the wip table; drop the original "Image" column (we'll use Image_link moving forward)
#Fill in all missing values with our anonymous_artist.png. 

artists_wip = artists_wip.join(artists_for_scrape, how = "left", on='Artist_id', rsuffix="_y")

In [100]:
artists_wip = artists_wip.drop(columns=["image", "Surname_y", "First_name_y", "Full_name_y", "Birth_date_y", "Death_date_y", "image_y", "Biography_y", "Artist_id_y"])

In [103]:
artists_wip = artists_wip.fillna("../static/img/anonymous_artist.png")
artists_wip.head(10)

Unnamed: 0,Surname,First_name,Full_name,Birth_date,Death_date,Artist_id,Biography,Image_link
0,Defeuille,Louis Benjamin Huber,Louis Benjamin Huber Defeuille,Unknown,Unknown,0,Biography could not be found,../static/img/anonymous_artist.png
2,Unknown,blank_,Unknown,Unknown,Unknown,2,Biography could not be found,../static/img/anonymous_artist.png
4,Nini,Jean-Baptiste,Jean-Baptiste Nini,1717,1786,3,Biography could not be found,../static/img/anonymous_artist.png
5,Sanford,H.,H. Sanford,1783,1822,4,Biography could not be found,../static/img/anonymous_artist.png
6,Babcock,William P.,William P. Babcock,1826,1899,5,Biography could not be found,../static/img/anonymous_artist.png
7,Stuart,Gilbert,Gilbert Stuart,1755,1828,6,s foremost portraitists.[2] His best known wor...,../static/img/anonymous_artist.png
9,Inman,Henry,Henry Inman,1801,1846,8,Biography could not be found,../static/img/anonymous_artist.png
10,La Farge,John,John La Farge,1835,1910,9,"John La Farge (March 31, 1835 – November 14, 1...",../static/img/anonymous_artist.png
11,Ryder,Albert Pinkham,Albert Pinkham Ryder,1847,1917,10,"Albert Pinkham Ryder (March 19, 1847 – March 2...",//upload.wikimedia.org/wikipedia/commons/thumb...
12,Abbey,Edwin Austin,Edwin Austin Abbey,1852,1911,11,"Edwin Austin Abbey RA (April 1, 1852\xa0– Augu...",//upload.wikimedia.org/wikipedia/commons/thumb...


In [107]:
#Finally, we need to double-check Biography to clean up some of the strings. Print out a few tests for records we know have bios
print(artists_wip.loc[10,'Biography'])
print("----")
print(artists_wip.loc[11,'Biography'])
print("----")
print(artists_wip.loc[12,'Biography'])


John La Farge (March 31, 1835 – November 14, 1910) was an American painter, muralist, stained glass window maker, decorator, and writer.\n
----
Albert Pinkham Ryder (March 19, 1847 – March 28, 1917) was an American painter best known for his poetic and moody allegorical works and seascapes, as well as his eccentric personality. While his art shared an emphasis on subtle variations of color with tonalist works of the time, it was unique for accentuating form in a way that some art historians regard as modernist.\n
----
Edwin Austin Abbey RA (April 1, 1852\xa0– August 1, 1911) was an American muralist, illustrator, and painter. He flourished at the beginning of what is now referred to as the "golden age" of illustration, and is best known for his drawings and paintings of Shakespearean and Victorian subjects, as well as for his painting of Edward VII\'s coronation.[1][2][3] His most famous set of murals, The Quest and Achievement of the Holy Grail, adorns the Boston Public Library.\n


In [130]:
#These Bios have a trailing "\n" as well a citation references "[x]" that we want to strip out

for index, row in artists_wip.iterrows():
    Biography = artists_wip.loc[index, "Biography"]
    
    if Biography != "Biography could not be found":
        Biography = Biography.replace(r"\n", "")
        Biography = Biography.replace("\\", "")
        Biography = re.sub("[\(\[].*?[\)\]]", "", Biography)
        Biography = Biography.replace(r'"]', "")
        
        artists_wip.loc[index, "Biography"] = Biography

In [133]:
#Tests are clean now!
print(artists_wip.loc[10,'Biography'])
print("----")
print(artists_wip.loc[11,'Biography'])
print("----")
print(artists_wip.loc[12,'Biography'])


John La Farge  was an American painter, muralist, stained glass window maker, decorator, and writer.
----
Albert Pinkham Ryder  was an American painter best known for his poetic and moody allegorical works and seascapes, as well as his eccentric personality. While his art shared an emphasis on subtle variations of color with tonalist works of the time, it was unique for accentuating form in a way that some art historians regard as modernist.
----
Edwin Austin Abbey RA  was an American muralist, illustrator, and painter. He flourished at the beginning of what is now referred to as the "golden age" of illustration, and is best known for his drawings and paintings of Shakespearean and Victorian subjects, as well as for his painting of Edward VII's coronation. His most famous set of murals, The Quest and Achievement of the Holy Grail, adorns the Boston Public Library.


In [None]:
#Save out the finalized paintings and artists files to csv
artists_wip.to_csv('')