Rhizome Dataframes: 
- 1 x Artist DF based on rdf information + manual additions 
- 1 x Artist DF w/ concatenated gender and nationality columns for visualization 
- 1 x Artworks DF based on rdf information + manual checks 
- 1 x Artworks DF w/ additional information (gender, nationality) from Artist DF for visualization 
- 1 x Artworks DF w/ additional info + scraped text

For collectives we display the IDs, genders, and nationalities of all potential members (if they exist as individuals in the DB). Individual members of a collective who were not originally included in the Rhizome DB were only added if the collective is a duo. Therefore, a collective with more than two members, none of whom existed in the original Rhizome DB, appears as a single entry with one corresponding ID, while every other type of collective has multiple IDs, one for each constituent member.

TO DO:
- clean scrapes, analyse for keywords

In [12]:
from __future__ import print_function
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
import pandas as pd
import pandas_profiling as pp
#absolute output directory: the pickles/CSV below are written to (and re-read from) path + file name, so the trailing slash is required
#NOTE(review): the input CSVs are read from the relative './Rhizome_data/' instead -- confirm both point at the same folder
path = '/Users/laurentfintoni/Desktop/University/COURSE DOCS/YEAR 2/EPUB/PROJECT/EPDS/Rhizome_data/'

In [6]:
#create a df for artists, pickle

rhz_artists = pd.read_csv('./Rhizome_data/artists_complete.csv', dtype='string')
#replace missing values in the text columns (incl. the collective fields) w/ empty strings
text_columns = ['artistLabel', 'artistPage', 'Nationality', 'Nationality Guessed', 'Gender', 'Gender Guessed', 'Wiki QID', 'ULAN', 'collectiveLabel', 'collectivePage']
rhz_artists[text_columns] = rhz_artists[text_columns].fillna('')
#missing birth/death values become 0 so that both columns can be cast to int
for year_column in ['Birth', 'Death']:
    rhz_artists[year_column] = rhz_artists[year_column].fillna('0').astype('int')
#create and populate ID field (1-based row number, stored as string) to cross reference with artworks
rhz_artists['ID'] = range(1, 1 + len(rhz_artists))
rhz_artists['ID'] = rhz_artists['ID'].astype('string')
#drop the notes column
rhz_artists = rhz_artists.drop('Notes', axis=1)
#rename columns; each source column appears exactly once in the mapping
#(a repeated dict key is silently overridden by its last occurrence)
column_names = {
    'artistLabel': 'Artist',
    'artistPage': 'Artist URL',
    'collectiveLabel': 'Collective',
    'collectivePage': 'Collective URL',
    'Nationality Guessed': 'nationalityGuessed',
    'Gender Guessed': 'genderGuessed',
    'Wiki QID': 'wikiQID',
}
rhz_artists.rename(columns=column_names, inplace=True)
rhz_artists.to_pickle(path+'rhizome_artists.pkl')

In [7]:
#create a second artists DF in which each recorded value is concatenated with its guessed counterpart

rhz_artists_extra = rhz_artists.copy()
for column, guessed_column in (('Gender', 'genderGuessed'), ('Nationality', 'nationalityGuessed')):
    #append the guessed value to the recorded one; an NA on either side is written as 'missing'
    combined = rhz_artists_extra[column].str.cat(rhz_artists_extra[guessed_column], join='outer', na_rep='missing')
    rhz_artists_extra[column] = combined
    #the guessed column is no longer needed once merged
    rhz_artists_extra = rhz_artists_extra.drop(guessed_column, axis=1)
    #rows where both values were empty strings are labelled 'missing'
    is_empty = rhz_artists_extra[column] == ''
    rhz_artists_extra.loc[is_empty, column] = 'missing'
rhz_artists_extra.to_pickle(path+'rhizome_artists_extra.pkl')

In [8]:
#create a df for artworks, pickle

rhz_artworks = pd.read_csv('./Rhizome_data/artwork_complete.csv', dtype='string')
#missing dates become '0'; longer values are cut down to their first 4 characters (presumably the year)
for date_column in ['accession', 'inception']:
    rhz_artworks[date_column] = rhz_artworks[date_column].fillna('0').str[0:4]
#missing URLs become empty strings
url_columns = ['summary_url', 'summary_url_2', 'description_url', 'description_url_2', 'statement_url']
rhz_artworks[url_columns] = rhz_artworks[url_columns].fillna('')
#rename columns
rhz_artworks.rename(columns={'artwork_label': 'Title', 'artist_label': 'Artist', 'accession': 'dateAcquired', 'inception': 'dateCreated', 'artwork_page': 'URL'}, inplace=True)

#the lookup tables do not depend on the artwork, so they are built once, before the loop
#artist name -> ID
artist_ids = dict(zip(rhz_artists_extra['Artist'], rhz_artists_extra['ID']))
#collective name -> IDs of all rows that list this collective, joined as 'id1, id2, ...'
members_by_collective = defaultdict(list)
for collective, artist_id in zip(rhz_artists_extra['Collective'], rhz_artists_extra['ID']):
    members_by_collective[collective].append(artist_id)
#rows without a collective are grouped under ''; pop() tolerates the case where there are none
members_by_collective.pop('', None)
collective_ids = {collective: ', '.join(members) for collective, members in members_by_collective.items()}

#auto populate ID based on artist df
artwork_ids = []
for credited in rhz_artworks['Artist']:
    ids = []
    #an artwork can credit several names, separated by ', '
    for person in credited.split(', '):
        if person in artist_ids:
            ids.append(str(artist_ids[person]))
        elif person in collective_ids:
            ids.append(collective_ids[person])
        else:
            ids.append('missing')
    artwork_ids.append(', '.join(ids))
rhz_artworks['ID'] = artwork_ids
rhz_artworks.to_pickle(path+'rhizome_artworks.pkl')

In [9]:
#create a second df for artworks w/ additional artist info for visualisation

def _artist_values_per_artwork(artworks, artists, field):
    """Collect one artist attribute for every artwork.

    Each entry of artworks['Artist'] is split on ', '. A name found in
    artists['Artist'] contributes that artist's `field` value; otherwise a
    name found in artists['Collective'] contributes the `field` values of all
    rows listing that collective (joined by ', '); any other name contributes
    'missing'.

    Returns a list of strings, one per row of `artworks`, in row order.
    """
    #both lookup tables are built once per call, not once per artwork
    by_artist = dict(zip(artists['Artist'], artists[field]))
    grouped = defaultdict(list)
    for collective, value in zip(artists['Collective'], artists[field]):
        grouped[collective].append(value)
    #rows without a collective are grouped under ''; pop() tolerates the case where there are none
    grouped.pop('', None)
    by_collective = {collective: ', '.join(values) for collective, values in grouped.items()}

    joined = []
    for credited in artworks['Artist']:
        values = []
        for person in credited.split(', '):
            if person in by_artist:
                values.append(str(by_artist[person]))
            elif person in by_collective:
                values.append(by_collective[person])
            else:
                values.append('missing')
        joined.append(', '.join(values))
    return joined


rhz_artworks_extra = rhz_artworks.copy()
#the URL columns for summaries/descriptions/statements are not needed in this DF
rhz_artworks_extra.drop(['summary_url', 'summary_url_2', 'description_url', 'description_url_2', 'statement_url'], inplace=True, axis=1)

#add nationality
rhz_artworks_extra['Nationality'] = _artist_values_per_artwork(rhz_artworks_extra, rhz_artists_extra, 'Nationality')

#add gender
rhz_artworks_extra['Gender'] = _artist_values_per_artwork(rhz_artworks_extra, rhz_artists_extra, 'Gender')

rhz_artworks_extra.to_pickle(path+'rhizome_artworks_extra.pkl')
rhz_artworks_extra.to_csv(path+'rhizome_artworks_extra.csv')

In [13]:
#scrape summaries, descriptions and artists statements from Rhizome website 

def url_to_text_rhizome(url, timeout=30):
    """Fetch a page and return the text of its description accordion.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before requests gives up.

    Returns:
        A list with the stripped text of every <div> inside the element whose
        id is "AccordionDescriptionBody", or an empty list when the page has
        no such element.
    """
    #without a timeout a single unresponsive page would stall the whole scrape
    page = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(page, "lxml")
    #progress output: one line per scraped page
    print(url)
    container = soup.find(id="AccordionDescriptionBody")
    if container is None:
        #find() returns None when nothing matches; report "no text" instead of raising AttributeError
        return []
    return [div.text.strip() for div in container.find_all('div')]

In [16]:
#scrape the page behind every artwork URL and store the results in a new 'Text' column
urls = list(rhz_artworks_extra['URL'])
scrapes = list(map(url_to_text_rhizome, urls))
text_column = pd.Series(scrapes)
rhz_artworks_extra['Text'] = text_column
#fix an erroneous ID in original first round of scraping
rhz_artworks_extra.loc[777, 'ID'] = '926, 1268'
#cast every column to the string dtype before pickling
rhz_artworks_extra = rhz_artworks_extra.astype('string')
rhz_artworks_extra.to_pickle(path+'rhizome_artworks_extra_text.pkl')

In [2]:
#reload the artworks DF incl. scraped text from its pickle (presumably so the scrape does not have to be re-run)
rhz_artworks_extra = pd.read_pickle(path+'rhizome_artworks_extra_text.pkl')

In [None]:
#fragments to strip from the scraped text: list brackets and 'description edit' / 'edit' remnants
#multi-character entries are regular expressions (backslashes are escaped); single characters are literals
remove = ['description edit\\\\n\\\\n\\\\t\\\\t\\\\t\\\\t', '[', ']', '\\\\n\\\\n\\\\n\\\\', '\'description edit\', ', 'edit\\\\n\\\\n']
for pattern in remove:
    #regex is passed explicitly: the default of Series.str.replace changed from True to False in pandas 2.0,
    #and single characters such as '[' are not valid regular expressions on their own
    rhz_artworks_extra['Text'] = rhz_artworks_extra['Text'].str.replace(pattern, '', regex=len(pattern) > 1)

In [None]:
#show the 'Text' value of the fourth row as a spot check
rhz_artworks_extra['Text'].iloc[3]

Reports 

In [None]:
#build a pandas_profiling report for the artworks DF
pp.ProfileReport(rhz_artworks_extra)