# NET ART REVOLUTION?

**Abstract**

The advent of the internet and personal computers as tools of communication and creation radically transformed the world we live in over the past 40 years. The impacts of both technologies across various fields and topics have been well documented, including on the art world. However, discussion in the latter field is often focused on how the internet and computers have transformed art creation as well as the ways in which memory institutions function and interact with their audiences. Slightly less focus has been given to how the technologies may have impacted how artworks are acquired. Net Art Revolution? aims to look at this particular aspect of the digital revolution by analyzing data from two different NYC-based institutions -- MoMA, representing the more traditional art world, and Rhizome, a smaller institution dedicated to digital-born art and culture, often referred to as net art. Both institutions make data about their collections publicly available -- MoMA in csv and json formats, Rhizome as Linked Open Data -- and this data is used to investigate our primary research question: **Did the internet and personal computers have an impact on the makeup of art collections?** 


This project was undertaken by Margherita Donelli, Chiara Catizone, and Laurent Fintoni for the Electronic Publishing and Digital Storytelling (a.y. 2021-2022) final examination, as part of the *Digital Humanities and Digital Knowledge Master's Degree* at the University of Bologna.

***

In [1]:
#IMPORTS + PATH
path = './'

#GENERIC
import sys
import pandas as pd
import numpy as np
import re
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
#PLOTLY
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pycountry
import chart_studio.tools as tls
import ipywidgets as ipw
from statsmodels.graphics.mosaicplot import mosaic
from plotly.subplots import make_subplots
#PREPPING/CLEANING
from SPARQLWrapper import SPARQLWrapper, JSON
from functools import reduce
from collections import defaultdict
pio.templates.default = "simple_white"
#SCRAPING
from __future__ import print_function
from bs4 import BeautifulSoup
import requests
#KEYWORDS
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize 
from nltk.util import ngrams

In [2]:
#FUNCTIONS

#CLEAN DATA
def normalizeDate(year):
    """Reduce a free-form date value to a single four-digit year.

    Parameters
    ----------
    year : any
        Raw date value (e.g. ``'c. 1896-1900'``, ``1987``, ``nan``). It is
        converted with ``str()`` before being searched.

    Returns
    -------
    int
        The first run of four digits found in the value, or ``0`` when the
        value contains no such run.
    """
    # Stringify once and use that text for both the test and the extraction:
    # the previous version stringified only for the test and then handed the
    # raw value to re.findall, which raises TypeError for non-string input
    # such as an int year.
    text = str(year)
    if not re.match(r'.*\d\d\d\d', text):
        return 0
    # normalize the date using the first year in the list of matches
    return int(re.findall(r'\d\d\d\d', text)[0])

def getGender(ids):
    """Translate a ', '-separated list of artist IDs into their genders.

    Each ID is looked up in the module-level ``artists`` frame (columns
    ``ID`` and ``Gender``). The gender of the first matching row is used;
    an ID with no matching row contributes the string 'missing'.

    Returns the collected values joined with ', '.
    """
    collected = []
    for artist_id in str(ids).split(', '):
        rows = artists[artists['ID'] == artist_id]
        if len(rows) > 0:
            collected.append(rows['Gender'].values[0])
        else:
            collected.append('missing')
    return ", ".join(collected)

def getNationality(ids):
    """Translate a ', '-separated list of artist IDs into nationalities.

    Every ID is matched against the ``ID`` column of the module-level
    ``artists`` frame; the ``Nationality`` of the first matching row is
    taken, and 'missing' is used when no row matches.

    Returns one ', '-joined string with an entry per ID.
    """
    id_tokens = str(ids).split(', ')
    per_artist = []
    for token in id_tokens:
        hits = artists[artists['ID'] == token]
        value = hits['Nationality'].values[0] if len(hits) > 0 else 'missing'
        per_artist.append(value)
    return ", ".join(per_artist)

#SPARQL QUERY
def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint at ``endpoint_url``.

    The request asks for JSON and the converted response is returned.
    The user agent advertises the running Python major.minor version.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    agent_string = "WDQS-example Python/%s.%s" % sys.version_info[:2]
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

#Scrape summaries, descriptions and artists statements from Rhizome website 
def url_to_text_rhizome(url):
    """Fetch ``url`` and return the stripped text of every <div> found
    inside the element whose id is "AccordionDescriptionBody".

    The URL is printed once the page has been parsed (progress trace).
    """
    html = requests.get(url).text
    document = BeautifulSoup(html, "html.parser")
    container = document.find(id="AccordionDescriptionBody")
    accordion = []
    for div in container.find_all('div'):
        accordion.append(div.text.strip())
    print(url)
    return accordion

#Extract text from MoMA website w/ exclusions for 404 status, missing URLs, and pages w/ no description
def url_to_text_moma(url):
    """Fetch the <p> elements describing a MoMA work.

    Returns
    -------
    - 'missing' when ``url`` is the placeholder string 'missing' (no request
      is made);
    - '404' when the server answers with status 404;
    - the result of ``find_all('p')`` on the 'main-content' element nested
      in the 'uneven-columns--work' element, when both exist;
    - '' when either element is absent.

    The URL is printed before returning in every case (progress trace).
    """
    if url == 'missing':
        text = 'missing'
    else:
        response = requests.get(url)
        if response.status_code == 404:
            text = '404'
        else:
            soup = BeautifulSoup(response.content, "html.parser")
            try:
                # .find on a missing outer wrapper (None) raises AttributeError
                main = soup.find(class_="uneven-columns--work").find(class_='main-content')
                text = main.find_all('p') if main is not None else ''
            except AttributeError:
                text = ''
    print(url)
    return text

#Extract keywords Rhizome and pass them to a new column
def get_keywords(row):
    """Return the 20 most frequent keywords of ``row['Text']`` as one
    ', '-joined string.

    The text is split on runs of word characters; a token is kept when it
    is purely alphabetic and not present in the module-level ``stop_words``.
    """
    words = nltk.RegexpTokenizer(r"\w+").tokenize(row['Text'])
    kept = []
    for word in words:
        if word.isalpha() and word not in stop_words:
            kept.append(word)
    top_twenty = nltk.FreqDist(kept).most_common(20)
    return ', '.join(pair[0] for pair in top_twenty)

#get keywords MoMA 
def get_keywords_2(row):
    """Same keyword extraction as ``get_keywords`` but reading the
    lower-case ``row['text']`` field.

    Alphabetic tokens that are not in the module-level ``stop_words`` are
    counted, and the 20 most common are returned joined with ', '.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    candidates = [
        token
        for token in tokenizer.tokenize(row['text'])
        if token.isalpha() and token not in stop_words
    ]
    ranked = nltk.FreqDist(candidates).most_common(20)
    top_words = [word for word, _freq in ranked]
    return ', '.join(top_words)

#group genders 
def group_genders(df):
    """Count artworks per gender label, pooling multi-artist entries.

    ``df['Gender']`` is tallied. Labels 'M', 'F', 'missing' and 'NB' keep
    their own row; every label containing a comma (several artists) is
    summed into one 'Collectives & Collabs' row; other labels are dropped.

    Returns a frame with columns ``Gender`` and ``Counts`` sorted by count
    (descending), with 'M'/'F'/'missing' spelled out as
    'Male'/'Female'/'Missing'.
    """
    single_labels = ['M', 'F', 'missing', 'NB']
    tally = df['Gender'].value_counts().rename_axis('Gender').reset_index(name='Counts')
    has_comma = tally.Gender.str.contains(',')
    joint_total = tally[has_comma]['Counts'].sum()
    tally = tally[tally.Gender.isin(single_labels)].reset_index(drop=True)
    # at most four single-artist rows (labels 0-3) exist, so label 5 is always new
    tally.loc[5] = ['Collectives & Collabs', joint_total]
    tally = tally.sort_values(by='Counts', ascending=False)
    for short, spelled_out in (('M', 'Male'), ('F', 'Female'), ('missing', 'Missing')):
        tally = tally.replace(short, spelled_out)
    return tally

#group genders by department and source
def group_genders_dept(df, period, department):
    """Gender tally (as in ``group_genders``) tagged with a period and a
    department.

    Parameters
    ----------
    df : DataFrame with a ``Gender`` column.
    period : value stored in the ``Period`` column; the exact string
        'Before 1983' additionally triggers the rescaling below.
    department : value stored in the ``Department`` column.

    Returns a frame with columns ``Gender``, ``Counts``, ``Period`` and
    ``Department``, sorted by ``Counts`` descending.
    """
    def _ordered_and_labelled(frame):
        # sort by count, then spell out the short gender labels
        frame = frame.sort_values(by='Counts', ascending=False)
        return frame.replace('M', 'Male').replace('F', 'Female').replace('missing', 'Missing')

    single_labels = ['M', 'F', 'missing', 'NB']
    tally = df['Gender'].value_counts().rename_axis('Gender').reset_index(name='Counts')
    joint_total = tally[tally.Gender.str.contains(',')]['Counts'].sum()
    tally = tally[tally.Gender.isin(single_labels)].reset_index(drop=True)
    tally.loc[5] = ['Collectives & Collabs', joint_total]
    tally = _ordered_and_labelled(tally)
    tally['Period'] = period
    tally['Department'] = department
    if period == 'Before 1983':
        # counts are scaled to 43/100 of their value and truncated to int;
        # the rationale for 43 is not visible here -- TODO confirm
        scaled = tally['Counts'].div(100).round(2).multiply(43).round(2)
        tally['Counts'] = scaled.astype(int)
    # second pass: the rescaling above can change the ordering
    return _ordered_and_labelled(tally)

#Turn nationalities into north/south divide
def getDivide(nation):
    """Classify ``nation`` as 'Global North' or 'Global South'.

    Membership is tested against the module-level ``global_north`` and
    ``global_south`` lists, in that order; anything in neither list yields
    'missing'.
    """
    if nation in global_north:
        return 'Global North'
    if nation in global_south:
        return 'Global South'
    return 'missing'

#Turn nationalities into world regions
def getRegion(nation):
    """Map ``nation`` to the label of the first regional list containing it.

    The module-level lists are consulted in a fixed order (africa,
    arab_states, asia_pacific, europe, middle_east, north_america,
    south_america); 'missing' is returned when none of them contains the
    value.
    """
    ordered_regions = (
        (africa, 'Africa'),
        (arab_states, 'Arab States'),
        (asia_pacific, 'Asia & Pacific'),
        (europe, 'Europe'),
        (middle_east, 'Middle east'),
        (north_america, 'North America'),
        (south_america, 'South/Latin America'),
    )
    for members, label in ordered_regions:
        if nation in members:
            return label
    return 'missing'

#Get country codes
def do_fuzzy_search(country):
    """Best-effort lookup of a three-letter country code.

    Parameters
    ----------
    country : value passed to ``pycountry.countries.search_fuzzy``.

    Returns
    -------
    The ``alpha_3`` attribute of the first search result, or ``np.nan``
    when the lookup fails for any reason.
    """
    try:
        matches = pycountry.countries.search_fuzzy(country)
        return matches[0].alpha_3
    except Exception:
        # Deliberately broad: an unmatched name or an unusable input both
        # map to NaN. Unlike the previous bare `except:`, this no longer
        # swallows KeyboardInterrupt / SystemExit, so a long-running
        # DataFrame.apply over this function can still be interrupted.
        return np.nan

def assign_codes(df):
    """Count artists per country and attach a three-letter country code.

    Parameters
    ----------
    df : DataFrame with at least ``Artist`` and ``Nationality`` columns.

    Returns
    -------
    DataFrame with columns ``Nation``, ``Count`` and ``country_code``,
    sorted by ``Nation`` with a fresh 0..n-1 index.

    Notes
    -----
    Downloads a demonym list from GitHub on every call, so network access
    is required.
    """
    # keep one row per artist (the last occurrence) so each artist counts once
    df_nats = df.drop_duplicates(subset ="Artist", keep = 'last')
    df_nats_set = set(df_nats.Nationality)
    # extra demonym -> country pairs added ahead of the downloaded list
    missing = pd.DataFrame({'Aalborgenser': ['Korean', 'Native American', 'Canadian Inuit'], 'Aalborg': ['Korea', 'United States', 'Canada']})
    # NOTE(review): the columns are addressed as 'Aalborgenser' / 'Aalborg',
    # which suggests the remote CSV has no header row and its first record is
    # consumed as the header -- confirm against the file
    parser = pd.read_csv('https://raw.githubusercontent.com/knowitall/chunkedextractor/master/src/main/resources/edu/knowitall/chunkedextractor/demonyms.csv')
    corrections = pd.concat([missing, parser])
    # number of (de-duplicated) artists per nationality label
    df_nats_2 = df_nats['Nationality'].value_counts().rename_axis('Nationality').reset_index(name='Count')
    # relabel each nationality with the country of its first matching demonym;
    # labels with no match are left as they are
    for item in df_nats_set:
        my = corrections[corrections['Aalborgenser'] == item]
        country = my[:1]['Aalborg'].values
        if len(country)>0:
            country_str = my[:1]['Aalborg'].values[0]
            df_nats_2.loc[df_nats_2["Nationality"] == item, "Nationality"] = country_str
    # several demonyms can now share one country name: sum their counts
    df_countries_count = pd.DataFrame(columns= ['Nation', 'Count'])
    new_set = set(df_nats_2.Nationality)
    for item in new_set:
        subCountry = df_nats_2[df_nats_2['Nationality'] == item]
        # despite the name, this is a sum of per-artist counts
        sum_acquisitions = subCountry['Count'].sum()
        df_countries_count.loc[len(df_countries_count.index)] = [item, sum_acquisitions]
    df_countries_count["country_code"] = df_countries_count["Nation"].apply(lambda country: do_fuzzy_search(country))
    # cast to str: failed lookups (NaN) presumably end up as the text 'nan' -- verify
    df_countries_count["country_code"] = df_countries_count["country_code"].astype(str)
    df_countries_count = df_countries_count.sort_values(by='Nation').reset_index(drop=True)
    return df_countries_count

In [3]:
#LOAD ALL NEEDED PICKLED/EXTERNAL DATA 

#complete DFs
# NOTE(review): the MoMA folder is spelled 'MOMA_data' here but 'MoMA_data'
# in other cells of this notebook; on a case-sensitive file system only one
# spelling can resolve -- confirm which directory actually exists.
# Rhizome artworks; the label 'N/B' is rewritten to 'NB' across the frame,
# matching the 'NB' label the gender-grouping helpers filter on
rhz_artworks = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra.pkl')
rhz_artworks = rhz_artworks.replace('N/B', 'NB')
# full cleaned MoMA artworks, plus its two time-based subsets
moma_artworks = pd.read_pickle(path+'MOMA_data/pickle/MoMAartworks.pkl')
moma_artworks_old =  pd.read_pickle(path+'MOMA_data/pickle/old_artworks.pkl')
moma_artworks_new = pd.read_pickle(path+'MOMA_data/pickle/new_artworks.pkl')
# "new" MoMA works acquired in 2000 or later (presumably the window used
# for the comparison with Rhizome -- confirm)
moma_rhz_compare = moma_artworks_new.loc[moma_artworks_new['dateAcquired'] >= 2000]

#MoMA department DFs: one frame per department, each also split on creation
#year (<= 1982 vs >= 1983); rows whose dateCreated is 0 land in the "before" half
moma_draw = moma_artworks[moma_artworks['Department'] == 'Drawings & Prints']
moma_draw_before, moma_draw_after = (
    moma_draw[moma_draw['dateCreated'] <= 1982],
    moma_draw[moma_draw['dateCreated'] >= 1983],
)
moma_films = moma_artworks[moma_artworks['Department'] == 'Film']
moma_films_before, moma_films_after = (
    moma_films[moma_films['dateCreated'] <= 1982],
    moma_films[moma_films['dateCreated'] >= 1983],
)
moma_media = moma_artworks[moma_artworks['Department'] == 'Media and Performance']
moma_media_before, moma_media_after = (
    moma_media[moma_media['dateCreated'] <= 1982],
    moma_media[moma_media['dateCreated'] >= 1983],
)
moma_photo = moma_artworks[moma_artworks['Department'] == 'Photography']
moma_photo_before, moma_photo_after = (
    moma_photo[moma_photo['dateCreated'] <= 1982],
    moma_photo[moma_photo['dateCreated'] >= 1983],
)

#Rhizome with text
# two pickled variants of the Rhizome artworks with scraped text: the cleaned
# text, and (going by the file name) the stop-word-filtered text with keywords
rhizome_txt_clean = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text_clean.pkl')
rhizome_txt_stop_kw = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text_clean_stop_keywords.pkl')

#MoMA with text
# one pickle per department and period; elsewhere in this notebook the '_mod'
# department pickles are written from the pre-1983 subset and the '_cont' ones
# from the 1983-onwards subset -- presumably the same split applies to these
# '_text_final_stop' files (confirm)
moma_arch_cont_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_cont_text_final_stop.pkl')
moma_arch_mod_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_mod_text_only_final_stop.pkl')
moma_draw_cont_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/draws_prints_cont_text_final_stop.pkl')
moma_draw_mod_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/draws_prints_mod_text_final_stop.pkl')
moma_films_cont_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/films_cont_text_final_stop.pkl')
moma_films_mod_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/films_mod_text_final_stop.pkl')
moma_media_cont_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/media_perf_cont_text_final_stop.pkl')
moma_media_mod_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/media_perf_mod_text_final_stop.pkl')
moma_paint_cont_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/paint_sculp_cont_text_final_stop.pkl')
moma_paint_mod_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/paint_sculp_mod_text_final_stop.pkl')
moma_photo_cont_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/photo_cont_text_final_stop.pkl')
moma_photo_mod_text_stop = pd.read_pickle(path+'MOMA_data/pickle/departments/photo_mod_text_final_stop.pkl')
# all department/period frames stacked into one
# NOTE(review): moma_arch_mod_text_stop is loaded above but is not part of
# this concat -- confirm the omission is intentional
moma_full_text_stop = pd.concat([moma_arch_cont_text_stop, moma_draw_cont_text_stop, moma_draw_mod_text_stop, moma_films_cont_text_stop, moma_films_mod_text_stop, moma_media_cont_text_stop, moma_media_mod_text_stop, moma_paint_cont_text_stop, moma_paint_mod_text_stop, moma_photo_cont_text_stop, moma_photo_mod_text_stop])

#create global north/south filters + regions
# each list holds the 'Country' values of the spreadsheet rows carrying the
# given 'Divide' or 'Region' label; getDivide/getRegion test membership in them
north_south_divide = pd.read_excel(path+'divide_list.xlsx')
global_north = north_south_divide.loc[north_south_divide['Divide'] == 'Global North', 'Country'].to_list()
global_south = north_south_divide.loc[north_south_divide['Divide'] == 'Global South', 'Country'].to_list()
africa = north_south_divide.loc[north_south_divide['Region'] == 'Africa', 'Country'].to_list()
arab_states = north_south_divide.loc[north_south_divide['Region'] == 'Arab States', 'Country'].to_list()
asia_pacific = north_south_divide.loc[north_south_divide['Region'] == 'Asia & Pacific', 'Country'].to_list()
europe = north_south_divide.loc[north_south_divide['Region'] == 'Europe', 'Country'].to_list()
middle_east = north_south_divide.loc[north_south_divide['Region'] == 'Middle east', 'Country'].to_list()
north_america = north_south_divide.loc[north_south_divide['Region'] == 'North America', 'Country'].to_list()
south_america = north_south_divide.loc[north_south_divide['Region'] == 'South/Latin America', 'Country'].to_list()

***

## Assessing the Available Data (Everyone)

Our research began by looking at the available datasets. We downloaded [MoMA's publicly available dumps](https://github.com/MuseumofModernArt/collection) and looked at their formatting, and we then explored Rhizome's dataset (AKA the ArtBase) via their [SPARQL endpoint](https://query.artbase.rhizome.org/). The first key observations were as follows:
- MoMA's artworks and artists datasets are linked via an ID system, which did not exist in Rhizome
- Rhizome's dataset has more missing information, such as biographical information of artists 
- The sizes of the datasets are drastically different, with Rhizome's artists set being 10% of MoMA's and its artworks set being 1.6% of MoMA's 

We contacted Rhizome with some questions about how their data was formatted and they did eventually supply us with a copy of their RDF dump as of February 2022. We used this RDF file to primarily check our queries and extracted all the data we needed directly via the endpoint. We would like to acknowledge and thank Dragan Espenschied, Preservation Director at Rhizome, for his willingness to answer some of our questions about the formatting of Rhizome's data and supplying us with a copy of the RDF dump. 

***

## Cleaning and Preparing MoMA Data (Chiara)

For the artworks dataset we selected the columns in MoMA's datasets most relevant to our research, as well as most aligned with existing Rhizome data formatting. These were: Title; Artist; ConstituentID (linking system); Date (renamed to Date Created); Medium; Department; DateAcquired; URL (for web scraping); ThumbnailURL (for potential use of images). We also decided to split the artworks dataset by Departments, creating subsequent filtered sets for potentially more meaningful analysis with Rhizome's collection.<br><br>
For the artists dataset all columns were kept as they were deemed potentially useful and we then used the Constituent ID mechanism to populate the artworks dataset with two key values for our analysis: gender and nationality. This way we would be able to work from the artworks dataset only. <br><br>
We also selected 1983 as a pivot date for the MoMA artworks dataset that could help us see changes between acquisitions before and after the internet. 1983 was the year that the first artwork was created in Rhizome as well as the year in which TCP/IP was standardized, in effect a turning point towards the ubiquity of the worldwide web. The MoMA datasets were thus also duplicated into a before and after version.<br><br>
Finally, considering that we knew there would be missing data relevant to our research (such as gender) we decided to normalize all missing gender values using the string 'missing' and all numerical values (years) using 0 and transformed all dates to 4-digit years as we did not want to go more granular than the year level when looking at time. 

In [236]:
""" #COMMENTED OUT TO AVOID BINDER RUNNING IT - select and clean artist columns 
original_artists = pd.read_csv(path+'MOMA_data/csv/Artists.csv')
artists= original_artists[['ConstituentID','DisplayName','Nationality','Gender','BeginDate','EndDate','Wiki QID','ULAN']]
artists.rename(columns={'ConstituentID': 'ID', 'DisplayName': 'Artist','BeginDate': 'Birth', 'EndDate': 'Death'}, inplace=True)
artists['Gender'] = artists['Gender'].fillna('missing').astype('str')
artists['Wiki QID'] = artists['Wiki QID'].fillna('missing').astype('str')
artists['ULAN'] = artists['ULAN'].fillna('0').astype('int')
artists['ID'] = artists['ID'].fillna('0').astype('str')
artists['Artist'] = artists['Artist'].fillna('Unknown').astype('str')
artists['Nationality'] = artists['Nationality'].fillna('missing').astype('str')
artists['Gender'] = artists['Gender'].fillna('missing').astype('str')
artists['Birth'] = artists['Birth'].fillna('0').astype('int')
artists['Death'] = artists['Death'].fillna('0').astype('int')

#normalize gender labels to M, F, NB for ease of comparison w/ Rhizome
for x, row in artists.iterrows():
    if re.match(r'[M|m]ale', row.Gender):
        artists.at[x, 'Gender'] = 'M'
    elif re.match(r'[F|f]emale',row.Gender):
        artists.at[x, 'Gender'] = 'F'
    elif re.match(r'Non-[B|b]inary', row.Gender):
        artists.at[x, 'Gender'] = 'NB'

#clean original artworks df
original_artworks = pd.read_csv(path+'MOMA_data/csv/Artworks.csv')
original_artworks =  original_artworks[['Title','Artist','ConstituentID','Date','Medium','Department','DateAcquired','URL','ThumbnailURL']]
original_artworks['Date'] = original_artworks['Date'].apply(lambda x: normalizeDate(x))
original_artworks['DateAcquired']=original_artworks['DateAcquired'].where((original_artworks['DateAcquired'].str.len() <= 4), original_artworks['DateAcquired'].str[0:4])
original_artworks['Gender'] = original_artworks['ConstituentID'].apply(lambda x: getGender(x))
original_artworks['Nationality'] = original_artworks['ConstituentID'].apply(lambda x: getNationality(x))

original_artworks = original_artworks.rename(columns={"Date": "dateCreated", "DateAcquired": "dateAcquired"})
original_artworks['Title'] = original_artworks['Title'].fillna('Unknown')
original_artworks['dateAcquired'] = original_artworks['dateAcquired'].astype('str')
original_artworks['dateAcquired'] = original_artworks['dateAcquired'].replace('nan', str('0'))
original_artworks['dateAcquired']=original_artworks['dateAcquired'].astype('int')
original_artworks['dateCreated'] = original_artworks['dateCreated'].astype('str')
original_artworks['dateCreated'] = original_artworks['dateCreated'].replace('nan', str('0'))
original_artworks['dateCreated']=original_artworks['dateCreated'].astype('int')

#pickle for reuse
artists.to_pickle(path+'MoMA_data/pickle/MoMAartists.pkl')
original_artworks.to_pickle(path+'MoMA_data/pickle/MoMAartworks.pkl')

#create time-based subsets
before83 = original_artworks[(original_artworks['dateAcquired'] < 1983) & (original_artworks['dateCreated'] < 1983)]
after83 = original_artworks[(original_artworks['dateAcquired'] >= 1983) & (original_artworks['dateCreated'] >= 1983)]
before83.to_pickle(path+'MoMA_data/pickle/old_artworks.pkl')
after83.to_pickle(path+'MoMA_data/pickle/new_artworks.pkl')

#split artwork datasets further based on department for ease of comparison
architecture_design = before83[before83['Department'] == "Architecture & Design"]
architecture_design.to_pickle(path+'MoMA_data/pickle/departments/architecture_design_mod.pkl')
architecture_design_img  = before83[before83['Department'] == "Architecture & Design - Image Archive"]
architecture_design_img.to_pickle(path+'MoMA_data/pickle/departments/architecture_design_img_mod.pkl')
draws_prints= before83[before83['Department'] == "Drawings & Prints"]
draws_prints.to_pickle(path+'MoMA_data/pickle/departments/draws_prints_mod.pkl')
films= before83[before83['Department'] == "Film"]
films.to_pickle(path+'MoMA_data/pickle/departments/films_mod.pkl')
fluxus= before83[before83['Department'] == "Fluxus Collection"]
fluxus.to_pickle(path+'MoMA_data/pickle/departments/fluxus_mod.pkl')
media_perf= before83[before83['Department'] == "Media and Performance"]
media_perf.to_pickle(path+'MoMA_data/pickle/departments/media_perf_mod.pkl')
painting_sculp= before83[before83['Department'] == "Painting & Sculpture"]
painting_sculp.to_pickle(path+'MoMA_data/pickle/departments/paint_sculp_mod.pkl')
photo= before83[before83['Department'] == "Photography"]
photo.to_pickle(path+'MoMA_data/pickle/departments/photo_mod.pkl')
architecture_design = after83[after83['Department'] == "Architecture & Design"]
architecture_design.to_pickle(path+'MoMA_data/pickle/departments/architecture_design_cont.pkl')
architecture_design_img  = after83[after83['Department'] == "Architecture & Design - Image Archive"]
architecture_design_img.to_pickle(path+'MoMA_data/pickle/departments/architecture_design_img_cont.pkl')
draws_prints= after83[after83['Department'] == "Drawings & Prints"]
draws_prints.to_pickle(path+'MoMA_data/pickle/departments/draws_prints_cont.pkl')
films= after83[after83['Department'] == "Film"]
films.to_pickle(path+'MoMA_data/pickle/departments/films_cont.pkl')
fluxus= after83[after83['Department'] == "Fluxus Collection"]
fluxus.to_pickle(path+'MoMA_data/pickle/departments/fluxus_cont.pkl')"""  """
media_perf= after83[after83['Department'] == "Media and Performance"]
media_perf.to_pickle(path+'MoMA_data/pickle/departments/media_perf_cont.pkl')
painting_sculp= after83[after83['Department'] == "Painting & Sculpture"]
painting_sculp.to_pickle(path+'MoMA_data/pickle/departments/paint_sculp_cont.pkl')
photo= after83[after83['Department'] == "Photography"]
photo.to_pickle(path+'MoMA_data/pickle/departments/photo_cont.pkl') """

In [4]:
#raw data
# load MoMA's unmodified Artworks.csv and display its first five rows, for
# comparison with the cleaned frame previewed in the next cell
moma_raw = pd.read_csv(path+'MOMA_data/csv/Artworks.csv')
moma_raw.head()

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,,,,40.6401,,,29.8451,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,,,,34.3,,,31.8,,
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",(),(1944),(0),(Male),1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,,,,50.8,,,50.8,,
4,"Villa, project, outside Vienna, Austria, Exter...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/media/W1siZiIsIjEyNiJdLFsi...,,,,38.4,,,19.1,,


In [237]:
#cleaned data
original_artworks = pd.read_pickle(path+'MoMA_data/pickle/MoMAartworks.pkl')
original_artworks.head()

Unnamed: 0,Title,Artist,ConstituentID,dateCreated,Medium,Department,dateAcquired,URL,ThumbnailURL,Gender,Nationality
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",Otto Wagner,6210,1896,Ink and cut-and-pasted painted pages on paper,Architecture & Design,1996,http://www.moma.org/collection/works/2,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,M,Austrian
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,1987,Paint and colored pencil on print,Architecture & Design,1995,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,M,French
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,1903,"Graphite, pen, color pencil, ink, and gouache ...",Architecture & Design,1997,http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,M,Austrian
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Photographic reproduction with colored synthet...,Architecture & Design,1995,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,M,missing
4,"Villa, project, outside Vienna, Austria, Exter...",Emil Hoppe,7605,1903,"Graphite, color pencil, ink, and gouache on tr...",Architecture & Design,1997,http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjEyNiJdLFsi...,M,Austrian


***

## Cleaning and Preparing Rhizome Data (Laurent & Margherita)

We began our exploration of Rhizome's dataset with a variety of SPARQL queries before settling on the following three to help us build artworks and artists datasets that could be equivalent to MoMA's: 
- All artists with available biographic information
- All artworks with available metadata (title, dates, associated artist, website URLs) 
- Artists who are/aren't part of a collective 

These queries were conducted directly on the endpoint (and checked against the local RDF file for consistency) to retrieve them as csv files and are replicated below formatted using the SPARQLwrapper library.


In [None]:
#set endpoint
# SPARQL endpoint of Rhizome's ArtBase; passed to get_results() together with
# one of the query strings defined in this cell

endpoint_url = "https://query.artbase.rhizome.org/proxy/wdqs/bigdata/namespace/wdq/sparql"

#query to get all artworks and available information
# Required: artwork label, artist (rt:P29) label and the page that is
# schema:about the artwork; rt:P3 r:Q5 presumably restricts to artworks (confirm).
# Optional: accession (rt:P85) and inception (rt:P26) share one OPTIONAL
# group, so they are bound only when both are present; the three remaining
# OPTIONAL groups fetch the page URL of a linked (rt:P123) item typed
# r:Q4985 (summary), r:Q9759 (description) or r:Q11838 (statement).
query_all_artworks = """SELECT ?artwork_page ?artwork_label ?artist_label ?accession ?inception ?summary_url ?description_url ?statement_url
{ 
  ?artwork rdfs:label ?artwork_label ;
   rt:P3 r:Q5 ;
   rt:P29 ?artist .
   ?artist rdfs:label ?artist_label . 
   ?artwork_page schema:about ?artwork .
  OPTIONAL {
   ?artwork rt:P85 ?accession ;
           rt:P26 ?inception . }
  OPTIONAL {
  ?artwork rt:P123 ?summary_description .
  ?summary_description rt:P3 r:Q4985 ;
          rdfs:label ?summary_label .
  ?summary_url schema:about ?summary_description .}
  OPTIONAL {
  ?artwork rt:P123 ?description .
  ?description rt:P3 r:Q9759 ;
          rdfs:label ?description_label .
  ?description_url schema:about ?description . }
  OPTIONAL {
    ?artwork rt:P123 ?statement .
  ?statement rt:P3 r:Q11838 ;
          rdfs:label ?statement_label .
  ?statement_url schema:about ?statement .
}}

"""
#collectives and their individual members
# One row per (collective, member) pair: ?subject is typed rt:P3 r:Q7 and is
# linked to each ?member through rt:P43; both labels are read from rdfs:label.
query_artists = '''SELECT ?subject ?member ?collectiveLabel ?memberLabel
WHERE { 
  ?subject rt:P3 r:Q7;
           rdfs:label ?collectiveLabel;
           rt:P43 ?member.
  ?member rdfs:label ?memberLabel.
  
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }}'''

#artists who aren't in a collective
# Distinct artists (typed rt:P3 r:Q6) credited via rt:P29 on at least one
# artwork (rt:P3 r:Q5), with their page on artbase.rhizome.org, ordered by label.
# NOTE(review): the FILTER compares the ?artist node itself with r:Q7; since
# ?artist is already constrained to rt:P3 r:Q6 the filter may be redundant --
# confirm it expresses the intended exclusion of collectives.
query_collective_filter = '''SELECT DISTINCT ?artistLabel ?artistPage
WHERE {
?artwork rt:P3 r:Q5.
?artwork rt:P29 ?artist.
?artist rt:P3 r:Q6.
?artistPage schema:about ?artist ;
schema:isPartOf <https://artbase.rhizome.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
FILTER (?artist not in (r:Q7) )
}
ORDER BY ?artistLabel
'''

#results = get_results(endpoint_url, query_all_artworks)

#for result in results["results"]["bindings"]:
#    print(result)

The resulting datasets were however not complete enough for us to conduct a meaningful analysis. A major issue was that Rhizome's ArtBase had no meaningful biographical information for artists, and with gender and nationality being key variables for our exploration we needed to complete this information. We tried a variety of automated ways -- including using Google's [Graph Search API](https://developers.google.com/knowledge-graph) -- however they proved limiting as the majority of artists in Rhizome's ArtBase are not popular enough to appear in Google's graph or if they did they were likely to have a name similar to someone else. <br><br>
In the end we decided to complete our dataset manually by conducting searches for nationality, gender, birth and death dates as well as WikiData and ULAN IDs. We used a simple common procedure: take the artist name and input it into Google and [Getty's ULAN database](https://www.getty.edu/research/tools/vocabularies/ulan/); from there try to quickly ascertain if information is available and input it. We also gave ourselves the following leeway: *if an artist's gender or nationality was not clearly stated we inferred it from available text (such as the use of pronouns and primary places of work/study), and we added this into a separate gender/nationality column marked as guessed to differentiate it from the confirmed information*. In total we were able to confirm or infer nationality information for 70% of artists and gender information for over 90% of artists (we inferred a total of 224 nationalities and 233 genders).<br><br>
The final results of this work can be seen in the artists_complete.csv file held in the Rhizome_data folder. <br><br>
Following this we ran the Rhizome datasets through some scripts to align them with the MoMA datasets as well as perform the same linking exercise using an ID system that would allow us to have an artwork dataset containing gender and nationality information for our analysis. At this point inferred gender and nationality were combined with the confirmed information. We also decided that artists who were part of a collective would be treated as a collective only unless they also appeared in the ArtBase as a solo artist or unless the collective was larger than 2 artists. This decision was made to allow us to increase the Rhizome dataset as much as possible to help with our analysis, and was refined over various iterations including some discoveries during the scraping stage which revealed that Rhizome contained artworks on their site that were not yet fully catalogued in the ArtBase. <br><br>
As a result the total artists dataset increased from 1,272 entries to 1,299, while the artworks dataset was decreased from a raw search of 2,743 down to 2,270, mainly due to the removal of duplicates or entries with missing information (Rhizome publicly claims the ArtBase to contain more than 2,200 artworks). 


In [None]:
""" #COMMENTED OUT TO AVOID BINDER RUNNING IT - build the cleaned Rhizome artist and artwork DFs, pickle
#imported here so the cell is self-contained when it is uncommented
from collections import defaultdict

#create a df for artists
rhz_artists = pd.read_csv('./Rhizome_data/csv/artists_complete.csv', dtype='string')
#replace missing values w/ empty strings in every text column
#(for the collective columns an empty string is what marks an artist w/o collective further down)
text_columns = ['artistLabel', 'artistPage', 'Nationality', 'Nationality Guessed', 'Gender', 'Gender Guessed', 'Wiki QID', 'ULAN', 'collectiveLabel', 'collectivePage']
rhz_artists[text_columns] = rhz_artists[text_columns].fillna('')
#missing birth/death values become 0 so the columns can be cast to integers
for year_column in ['Birth', 'Death']:
    rhz_artists[year_column] = rhz_artists[year_column].fillna('0').astype('int')
#create and populate ID field (1..n, stored as strings) to cross reference with artworks
rhz_artists['ID'] = range(1, 1+len(rhz_artists))
rhz_artists['ID'] = rhz_artists['ID'].astype('string')
#drop the notes column
rhz_artists = rhz_artists.drop('Notes', axis=1)
#rename columns (the original dict listed 'Wiki QID' twice; only the last entry, 'wikiQID', ever took effect)
rhz_artists.rename(columns={'artistLabel': 'Artist', 'artistPage': 'Artist URL', 'collectiveLabel': 'Collective', 'collectivePage': 'Collective URL', 'Nationality Guessed': 'nationalityGuessed', 'Gender Guessed': 'genderGuessed', 'Wiki QID': 'wikiQID'}, inplace=True)

#rhz_artists.to_pickle(path+'Rhizome_data/pickles/rhizome_artists.pkl')

#create a second artists DF w/ concatenated gender and nationality columns
rhz_artists_extra = rhz_artists.copy()
for confirmed, guessed in [('Gender', 'genderGuessed'), ('Nationality', 'nationalityGuessed')]:
    #confirmed and guessed values are joined into one column, then the guessed column is dropped
    rhz_artists_extra[confirmed] = rhz_artists_extra[confirmed].str.cat(rhz_artists_extra[guessed], join='outer', na_rep='missing')
    rhz_artists_extra = rhz_artists_extra.drop(guessed, axis=1)
    #rows where both values were empty are flagged as missing
    rhz_artists_extra.loc[rhz_artists_extra[confirmed] == '', confirmed] = 'missing'

#rhz_artists_extra.to_csv(path+'Rhizome_data/pickles/rhizome_artists_extra.csv')
#rhz_artists_extra.to_pickle(path+'Rhizome_data/pickles/rhizome_artists_extra.pkl')

#create a df for artworks
rhz_artworks = pd.read_csv('./Rhizome_data/csv/artwork_complete.csv', dtype='string')
#missing dates become '0'; longer values are cut down to their first 4 characters (presumably the year)
for date_column in ['accession', 'inception']:
    rhz_artworks[date_column] = rhz_artworks[date_column].fillna('0').str[:4]
url_columns = ['summary_url', 'summary_url_2', 'description_url', 'description_url_2', 'statement_url']
rhz_artworks[url_columns] = rhz_artworks[url_columns].fillna('')
#rename columns
rhz_artworks.rename(columns={'artwork_label': 'Title', 'artist_label': 'Artist', 'accession': 'dateAcquired', 'inception': 'dateCreated', 'artwork_page': 'URL'}, inplace=True)

def artist_lookups(column):
    #return (artist name -> value, collective name -> ', '-joined values of its members) for one artists column
    by_artist = dict(zip(rhz_artists_extra['Artist'], rhz_artists_extra[column]))
    members = defaultdict(list)
    for collective, value in zip(rhz_artists_extra['Collective'], rhz_artists_extra[column]):
        members[collective].append(value)
    #artists w/o collective end up grouped under the empty string: discard that group if there is one
    members.pop('', None)
    by_collective = {collective: ', '.join(values) for collective, values in members.items()}
    return by_artist, by_collective

def resolve_artists(artist_field, by_artist, by_collective):
    #resolve every name of a ', '-separated artist field: solo artists first, then collectives, else 'missing'
    resolved = []
    for person in artist_field.split(', '):
        if person in by_artist:
            resolved.append(str(by_artist[person]))
        elif person in by_collective:
            resolved.append(by_collective[person])
        else:
            resolved.append('missing')
    return ', '.join(resolved)

def add_artist_info(artworks, column):
    #add `column` to an artworks DF; the lookups are built once per column rather than once per artwork
    by_artist, by_collective = artist_lookups(column)
    artworks[column] = [resolve_artists(artist, by_artist, by_collective) for artist in artworks['Artist']]

#auto populate ID based on artist df
add_artist_info(rhz_artworks, 'ID')

#rhz_artworks.to_pickle(path+'Rhizome_data/pickles/rhizome_artworks.pkl')

#create a second df for artworks w/ additional artist info for visualisation
rhz_artworks_extra = rhz_artworks.copy()
rhz_artworks_extra.drop(url_columns, inplace=True, axis=1)

#add nationality
add_artist_info(rhz_artworks_extra, 'Nationality')

#add gender
add_artist_info(rhz_artworks_extra, 'Gender')

#rhz_artworks_extra.to_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra.pkl')
#rhz_artworks_extra.to_csv(path+'Rhizome_data/pickles/rhizome_artworks_extra.csv') """

In [12]:
#raw data: load the unique artists CSV and preview its first rows
rhz_raw = pd.read_csv(path+'Rhizome_data/csv/artists_unique.csv')
rhz_raw.head()

Unnamed: 0,artistLabel,artistPage
0,0100101110101101,https://artbase.rhizome.org/wiki/Q1167
1,1010 1010,https://artbase.rhizome.org/wiki/Q1005
2,11811,https://artbase.rhizome.org/wiki/Q4757
3,220hex,https://artbase.rhizome.org/wiki/Q139
4,6,https://artbase.rhizome.org/wiki/Q340


In [12]:
#cleaned data: load the pickled artworks DF (w/ artist ID, nationality and gender columns) and preview its first rows
rhz_artworks_extra = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra.pkl')
rhz_artworks_extra.head()

Unnamed: 0,ID,URL,Title,Artist,dateAcquired,dateCreated,Nationality,Gender
0,879,https://artbase.rhizome.org/wiki/Q2423,ZUR FARBENLEHRE (THEORY OF COLOURS),Steven Jones,2007,2007,British,M
1,1020,https://artbase.rhizome.org/wiki/Q4089,Zones de Convergence,cicero,2005,2005,missing,missing
2,"243, 701",https://artbase.rhizome.org/wiki/Q1475,Zombie and Mummy,"Dragan Espenschied, Olia Lialina",2004,2002,"German, Russian","M, F"
3,312,https://artbase.rhizome.org/wiki/Q4374,"Zaira, City of Memories",Gokcen Erguven,2004,2004,Turkish,F
4,920,https://artbase.rhizome.org/wiki/Q3972,Z_G [zeitgeist gestalten],Tiago Borges,2008,2007,Angolan,M


***

## Scraping Text (Laurent)

One last data preparation step involved scraping text about artworks from both MoMA and Rhizome's websites. This was suggested early on in our exploration process and as we came to better understand the differences between the available datasets we decided that grabbing the text might prove useful as an additional variable for our analysis. In addition it was decided that by doing this we might be able to create department/medium information for Rhizome somewhat equivalent to MoMA's.<br><br>
After some trials with the html construction of the Rhizome artwork pages we decided to grab the `<div>` that includes all three possible descriptions (summary, artist statement, description).  

In [None]:
""" #COMMENTED OUT TO AVOID BINDER RUNNING IT - grab URLs from artworks DF, scrape them and return them back to the DF
rhz_artworks_extra = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra.pkl')
urls = rhz_artworks_extra['URL'].to_list()
#one scrape per URL, kept in row order (url_to_text_rhizome is not defined in this cell)
scrapes = [url_to_text_rhizome(u) for u in urls]
rhz_artworks_extra_text = rhz_artworks_extra.copy()
#assign the list itself so results are matched to rows by position, as done for the MoMA scrapes;
#wrapping it in pd.Series would align on index labels and leave NaN wherever the index is not 0..n-1
rhz_artworks_extra_text['Text'] = scrapes
#fix an erroneous ID in original first round of scraping
rhz_artworks_extra_text.loc[777, 'ID'] = '926, 1268'
#store every column as plain strings before pickling
rhz_artworks_extra_text = rhz_artworks_extra_text.astype(str)
rhz_artworks_extra_text.to_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text.pkl') """

In [14]:
#show result: load the pickled artworks DF that includes the scraped 'Text' column and preview its first rows
rhz_artworks_extra_text = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text.pkl')
rhz_artworks_extra_text.head()

Unnamed: 0,ID,URL,Title,Artist,dateAcquired,dateCreated,Nationality,Gender,Text
0,879,https://artbase.rhizome.org/wiki/Q2423,ZUR FARBENLEHRE (THEORY OF COLOURS),Steven Jones,2007,2007,British,M,"['summary edit\n\n\t\t\t\tA short film.\n""When..."
1,1020,https://artbase.rhizome.org/wiki/Q4089,Zones de Convergence,cicero,2005,2005,missing,missing,['description edit\n\n\t\t\t\tâ€˜Zones de Conver...
2,"243, 701",https://artbase.rhizome.org/wiki/Q1475,Zombie and Mummy,"Dragan Espenschied, Olia Lialina",2004,2002,"German, Russian","M, F",['summary edit\n\n Zombie and Mummy is a websi...
3,312,https://artbase.rhizome.org/wiki/Q4374,"Zaira, City of Memories",Gokcen Erguven,2004,2004,Turkish,F,['description edit\n\n\t\t\t\tThe project is b...
4,920,https://artbase.rhizome.org/wiki/Q3972,Z_G [zeitgeist gestalten],Tiago Borges,2008,2007,Angolan,M,['description edit\n\n\t\t\t\tfrom looking at ...


For MoMA URLs there was only one possible description on the page, but the containing `<div>` is repeated elsewhere, so we used its parent container to extract only what we needed. Considering the size of the MoMA dataset we used the department splits created earlier to go through it in chunks. We show the process for one department only, but this was repeated for all of them.

In [None]:
""" # COMMENTED OUT TO AVOID BINDER RUNNING IT - load department DFs
#the whole cell is now a single string: a stray pair of triple quotes used to split it in two after the media_perf loads
moma_arch_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_cont.pkl')
moma_arch_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_mod.pkl')
moma_design_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_img_cont.pkl')
moma_design_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_img_mod.pkl')
moma_draw_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/draws_prints_cont.pkl')
moma_draw_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/draws_prints_mod.pkl')
moma_films_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/films_cont.pkl')
moma_films_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/films_mod.pkl')
moma_fluxus_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/fluxus_cont.pkl')
moma_fluxus_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/fluxus_mod.pkl')
moma_media_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/media_perf_cont.pkl')
moma_media_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/media_perf_mod.pkl')
moma_paint_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/paint_sculp_cont.pkl')
moma_paint_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/paint_sculp_mod.pkl')
moma_photo_cont = pd.read_pickle(path+'MOMA_data/pickle/departments/photo_cont.pkl')
moma_photo_mod = pd.read_pickle(path+'MOMA_data/pickle/departments/photo_mod.pkl')

#the steps below are shown for one department DF only (photo_mod)
#grab all links from a dept as a list
links = moma_photo_mod['URL'].to_list()
#process links w/ function (url_to_text_moma is not defined in this cell)
moma_photo_to_add = [url_to_text_moma(u) for u in links]
#add results back to a copy of the original DF
moma_photo_mod_text = moma_photo_mod.copy()
moma_photo_mod_text['Text'] = moma_photo_to_add
#store the scraped text as plain strings before pickling
moma_photo_mod_text['Text'] = moma_photo_mod_text['Text'].astype(str)
moma_photo_mod_text.to_pickle(path+'MOMA_data/pickle/departments/photo_mod_text.pkl') """


In [16]:
#show results: load one department DF w/ its scraped 'Text' column and preview its first rows
moma_arch_cont_text = pd.read_pickle(path+'MOMA_data/pickle/departments/architecture_design_cont_text.pkl')
moma_arch_cont_text.head()

Unnamed: 0,Title,Artist,ID,DateCreated,Medium,Department,DateAcquired,URL,ThumbnailURL,Nationality,Gender,Text
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,1987,Paint and colored pencil on print,Architecture & Design,1995,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,French,M,missing
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Photographic reproduction with colored synthet...,Architecture & Design,1995,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,missing,M,The Manhattan Transcripts are theoretical prop...
31,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Photographic reproduction with colored synthet...,Architecture & Design,1995,http://www.moma.org/collection/works/33,http://www.moma.org/media/W1siZiIsIjIwMCJdLFsi...,missing,M,The Manhattan Transcripts are theoretical prop...
35,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Photographic reproduction with colored synthet...,Architecture & Design,1995,http://www.moma.org/collection/works/38,http://www.moma.org/media/W1siZiIsIjI2NyJdLFsi...,missing,M,The Manhattan Transcripts are theoretical prop...
40,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Ink on tracing paper,Architecture & Design,1995,http://www.moma.org/collection/works/44,http://www.moma.org/media/W1siZiIsIjI5NiJdLFsi...,missing,M,The Manhattan Transcripts are theoretical prop...


### Cleaning & Keywords

Scraping Rhizome resulted in text heavy with encoding noise. We cleaned it as best we could without damaging the underlying text and then analyzed the text for the most common words using NLTK's stopwords and our own custom list based on analysis of the scraped results and some early tests. We then extracted 20 keywords from the available text and assigned those to their own column. 

In [None]:
""" #COMMENTED TO AVOID BINDER RUNNING IT - load the DF w/ scraped text
rhz_artworks_extra_text = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text.pkl')
rhz_artworks_extra_text = rhz_artworks_extra_text.astype('string')

#remove all punctuation
rhz_artworks_extra_text['Text'] = rhz_artworks_extra_text['Text'].replace(regex=r'[^\w\s]', value='')

#create a list of strings to remove from all scrapes
remove = ['description editnntttt', '[', ']', 'nnn', 'description edit', 'editnn', 'summary edit', 'tttt', 'nn']
for char in remove:
    #regex=False: every entry is a literal string ('[' and ']' are not valid patterns) and
    #the default of str.replace differs between pandas versions
    rhz_artworks_extra_text['Text'] = rhz_artworks_extra_text['Text'].str.replace(char, '', regex=False)

#export cleaned version to pickle
rhz_artworks_extra_text_clean = rhz_artworks_extra_text.copy()
rhz_artworks_extra_text_clean.to_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text_clean.pkl')

# create a list of stop words and add custom stopwords
stop_words = set(stopwords.words("english"))
new_words =  ['rhizome', 'attributed', 'summary', 'inception', 'staff', 'attribution', 'legacy', 'nattributed']
stop_words = stop_words.union(new_words)

#remove stopwords from cleaned text (lowercased first so the comparison is case-insensitive)
rhz_artworks_extra_text_clean['Text'] = rhz_artworks_extra_text_clean['Text'].str.lower()
rhz_artworks_extra_text_clean['Text'] = rhz_artworks_extra_text_clean['Text'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))
#remove errant strings
rhz_artworks_extra_text_clean['Text'] = rhz_artworks_extra_text_clean['Text'].replace(regex={r'.attributed.': '', r'.legacy descriptive tags.': '', r'.attribution: staff.': '', r'.attribution.': '', r'.inception.': '', r'http[a-z]*\s': ' '})

#extract kws and assign them to new column (get_keywords is not defined in this cell)
rhz_artworks_extra_text_clean['Keywords'] = rhz_artworks_extra_text_clean.apply(lambda row:get_keywords(row), axis=1) 

#export new pickle for reuse - written to the pickles folder, which is where the next cell reads it from
rhz_artworks_extra_text_clean.to_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text_clean_stop_keywords.pkl')
 """

In [18]:
#show results: load the pickled DF w/ cleaned 'Text' and extracted 'Keywords' columns and preview its first rows
rhz_artworks_extra_text_clean = pd.read_pickle(path+'Rhizome_data/pickles/rhizome_artworks_extra_text_clean_stop_keywords.pkl')
rhz_artworks_extra_text_clean.head()

Unnamed: 0,ID,URL,Title,Artist,dateAcquired,dateCreated,Nationality,Gender,Text,Keywords
0,879,https://artbase.rhizome.org/wiki/Q2423,ZUR FARBENLEHRE (THEORY OF COLOURS),Steven Jones,2007,2007,British,M,short filmnwhen eye sees colour immediately ex...,"colour, march, eye, sees, immediately, excited..."
1,1020,https://artbase.rhizome.org/wiki/Q4089,Zones de Convergence,cicero,2005,2005,missing,missing,zones de convergence treats days summit g8 evi...,"summit, art, august, zones, de, convergence, t..."
2,"243, 701",https://artbase.rhizome.org/wiki/Q1475,Zombie and Mummy,"Dragan Espenschied, Olia Lialina",2004,2002,"German, Russian","M, F",zombie mummy website hosted episodic online co...,"zombie, site, olia, espenschied, mummy, advent..."
3,312,https://artbase.rhizome.org/wiki/Q4374,"Zaira, City of Memories",Gokcen Erguven,2004,2004,Turkish,F,project based solely italo calvinos novel call...,"cities, content, city, zaira, project, based, ..."
4,920,https://artbase.rhizome.org/wiki/Q3972,Z_G [zeitgeist gestalten],Tiago Borges,2008,2007,Angolan,M,looking den zeitgeist gestalten design spirit ...,"experience, web, page, den, zeitgeist, gestalt..."


For MoMA the scrapes were cleaner of encoding noise so we simply ran everything through the same stopwords (and a different set of custom stopwords) and then removed empty entries. Out of over 138K artworks in MoMA's dataset we found text for 2,880 items.<br><br>
**NB**: while scraping one of the larger departments we made an error that resulted in the dataframe including only the extracted text (with the other original information dropped). Rather than rescrape everything we kept this DF but subjected it to slightly different processing as seen below.

In [None]:
""" #COMMENTED TO AVOID BINDER RUNNING IT - clean the scraped MoMA text, remove stopwords, extract keywords
#every department DF goes through the same steps, so they are processed in one loop;
#file names and the order of the steps are the same as when each DF was handled by hand
dept_path = path+'MOMA_data/pickle/departments/'

#file stems of the department DFs that hold their scraped text in a 'Text' column
dept_stems = ['architecture_design_cont_text', 'draws_prints_cont_text', 'draws_prints_mod_text', 'films_cont_text', 'films_mod_text', 'media_perf_cont_text', 'media_perf_mod_text', 'paint_sculp_cont_text', 'paint_sculp_mod_text', 'photo_cont_text', 'photo_mod_text']
#(file stem, text column, keyword function) for each DF to process
jobs = [(stem, 'Text', get_keywords) for stem in dept_stems]
#exception for DF that was formatted differently due to scraping error (it has one column only w/ scraped text)
jobs.append(('architecture_design_mod_text_only', 'text', get_keywords_2))

#leftover markup to strip from the scraped text
remove = ['[', ']', '</p>', '<p>', '<strong>', '</strong>', '<em>', '</em>', '</br>']

#load stopwords and add custom ones
stop_words = set(stopwords.words("english"))
new_words =  ['.', 'one', 'two', 'also']
stop_words = stop_words.union(new_words)

for stem, column, keyword_function in jobs:
    #load DF with text
    dept_text = pd.read_pickle(dept_path+stem+'.pkl')

    #clean it; regex=False because every entry is a literal string ('[' and ']' are not valid patterns)
    #and the default of str.replace differs between pandas versions
    for char in remove:
        dept_text[column] = dept_text[column].str.replace(char, '', regex=False)

    #replace empty strings w/ missing value to filter useful results only
    dept_text[column] = dept_text[column].replace([''], 'missing')

    #save cleaned DF back
    dept_text.to_pickle(dept_path+stem+'.pkl')

    #reduce DF down to entries w/ only useful text and remove duplicates
    dept_text_final = dept_text.loc[dept_text[column] != 'missing']
    dept_text_final = dept_text_final.drop_duplicates(subset=[column])

    #save DF (this version still has its original casing and stopwords)
    dept_text_final.to_pickle(dept_path+stem+'_final.pkl')

    #lowercase and remove stopwords
    dept_text_final[column] = dept_text_final[column].str.lower()
    dept_text_final[column] = dept_text_final[column].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))

    #get keywords
    dept_text_final['Keywords'] = dept_text_final.apply(keyword_function, axis=1)

    #pickle version w/ stopwords removed and keywords added
    dept_text_final.to_pickle(dept_path+stem+'_final_stop.pkl')
 """

In [19]:
#show results: reload the pickle saved with stopwords removed and keywords added, then preview it
arch_cont_pickle = path + 'MOMA_data/pickle/departments/architecture_design_cont_text_final_stop.pkl'
moma_arch_cont_text_final = pd.read_pickle(arch_cont_pickle)
moma_arch_cont_text_final.head()

Unnamed: 0,Title,Artist,ID,DateCreated,Medium,Department,DateAcquired,URL,ThumbnailURL,Nationality,Gender,Text,Keywords
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Photographic reproduction with colored synthet...,Architecture & Design,1995,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,missing,M,manhattan transcripts theoretical propositions...,"events, architecture, manhattan, transcripts, ..."
35,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,1980,Photographic reproduction with colored synthet...,Architecture & Design,1995,http://www.moma.org/collection/works/38,http://www.moma.org/media/W1siZiIsIjI2NyJdLFsi...,missing,M,manhattan transcripts theoretical propositions...,"events, architecture, manhattan, movement, tra..."
151,"Slow House Project, North Haven, New York, Pla...","Diller + Scofidio, Elizabeth Diller, Ricardo S...","8707, 6951, 6952",1989,Computer-generated print on frosted polymer sh...,Architecture & Design,1992,http://www.moma.org/collection/works/201,http://www.moma.org/media/W1siZiIsIjE5MjQiXSxb...,"American, American, American","missing, F, M",elizabeth diller ricardo scofidio's slow house...,"house, drawing, view, diller, scofidio, city, ..."
152,"The Peak Project, Hong Kong, China (Exterior p...",Zaha Hadid,6953,1991,Synthetic polymer on paper mounted on canvas,Architecture & Design,1992,http://www.moma.org/collection/works/202,http://www.moma.org/media/W1siZiIsIjE5MzgiXSxb...,British,F,painting depicts hadidâ€™s winning entry archite...,"hadid, painting, hills, structure, forms, winn..."
170,"Chikatsu-Asuka Historical Museum, Minami-Kawac...",Tadao Ando,7055,1989,Graphite and crayon with scoring on paper,Architecture & Design,1993,http://www.moma.org/collection/works/224,http://www.moma.org/media/W1siZiIsIjI0MjEiXSxb...,Japanese,M,tadao ando's drawings like hieroglyphic map re...,"ando, museum, building, landscape, tadao, like..."


***

## Manipulating and Visualising the Data (Everyone)

For our manipulation and visualisation of the data we decided on a workflow inspired by what we learnt during the class. Starting from our initial research question -- Did the internet and personal computers have an impact on the make up of art collections? -- we looked at the types of variable we had available, their potential relationships, and the ways in which this could help us explore and visualize the data. <br><br>
We decided on the following variables as essential to our exploration: 
- Gender (categorical + transformed into nominal)
- Nationality (categorical + transformed into nominal)
- Date Created and Acquired (ordinal)
- Departments (categorical)
- Keywords (categorical + transformed into numerical)

Considering the difference in size between our datasets we eventually decided to settle on the following approach to help our manipulation, visualization, and storytelling: **the Rhizome dataset would be used in full, MoMA would only be used in full untouched for the first visualization and subsequently would only be used as either full with the 1983 pivot date (to see if the internet and personal computers might have made a difference in acquisitions) or as a trimmed-down version for comparison with Rhizome, aligned to Rhizome by using the same creation date for the first artwork (1893) and the same first acquisition date (2000)**.<br><br> 
Therefore we would in effect have three versions of the MoMA artworks dataset: the full version (138K items); a version split at 1983 as a pivot date (69,310 items vs 30,227); and a version trimmed alongside Rhizome's key dates (21,580 items). With this we felt we could work on the data more confidently: when comparing MoMA before and after 1983 we could scale any needed counts for the pre-1983 period down to 43%, and when comparing MoMA and Rhizome we could scale the needed MoMA counts down to 10% (i.e. divide them by 10).<br><br>
As part of the cleaning work we decided that it would be useful to represent collectives and collaborations as a type of gender as they are relevant to Rhizome and help expand gender beyond the male/female binary. Similarly we found many artists in Rhizome with dual nationalities. In MoMA, collectives were not clearly marked but once we connected artworks to genders and nationalities we could see that some of them also had multiple artists, often due to collaborations or collected works. As such we decided to represent artworks with multiple creators as belonging to collectives and collaborations in terms of gender. Nationalities of collectives and collaborations were left out of specific nationality counts.<br><br>
From there we discussed what types of visualizations we might want -- comparisons, relationships, distribution, composition etc... -- and ended up with a list of questions expanding from the initial research question which became a guide for both creating visualizations and building the story. For the latter we also made use of Miro to create a storyboard of the website and help decide which sub question/visualization would prove useful. <br><br>
The final list of questions and their associated visualizations and datasets is:
1. **What is the gender representation in each dataset?**
    - Rhizome vs MoMA full + MoMA before and after 1983 
    - Pie charts -> gender as categorical and numerical  
2. **How does gender representation compare between both datasets?** 
    - Rhizome vs MoMA sampled
    - Pie charts -> gender as categorical and numerical 
3. **How many artworks were created by collectives and what is their gender breakdown?** 
    - Rhizome vs MoMA sampled
    - Sunburst charts -> gender as categorical and numerical  
4. **How does gender representation compare between datasets alongside departments?** 
    - Rhizome vs MoMA sampled + MoMA before and after 1983 
    - Barcharts -> gender as categorical and numerical + departments as categorical
5. **What is the nationality representation in each dataset?**
    - Rhizome vs MoMA sampled 
    - Map -> nationality as categorical and numerical  
6. **How does nationality representation compare between datasets?** 
    - Rhizome vs MoMA sampled
    - Bar charts -> nationality as numerical + dataset as categorical (with additional filtering using the concept of Global North/South divide)
7. **How does nationality relate to gender?** 
    - Rhizome vs MoMA sampled
    - Parallel chart -> gender as categorical + department as categorical + nationality as categorical and numerical (with some additional filtering using the concept of Global North/South divide)
8. **Which years have the highest acquisition date in relation to gender and/or nationality?**
    - Rhizome vs MoMA sampled 
    - Scatter plot -> date created/acquired as ordinal + gender/nationality as categorical and numerical (with some additional filtering using the concept of Global North/South divide)
9. **Which keywords are most prominent in artwork descriptions?** 
    - Rhizome vs MoMA TBC
    - Tree maps -> TBC
10. **How do keywords relate to gender and/or nationality?**
    - TBC
    - TBC

Let's do this! 💪

***

### Representing Gender by Dataset (Margherita & Laurent)

We decided to use pie and sunburst charts to give a simple overall perspective of the gender breakdown, and the collectives and collaborations segment is pulled out of the pie charts to link it directly to the sunburst charts in the storytelling. <br><br>
For the sunburst charts we tried to automate the counting process but kept hitting some walls with regular expressions, so in the end we exported the needed data to csv and did some regex and counting in Excel, as the Plotly figure could easily be populated manually. 

In [7]:
#Rhizome vs MoMA full
#keep one row per artist (the last occurrence) before counting genders
rhz_artists = rhz_artworks.drop_duplicates(subset='Artist', keep='last')
moma_artists = moma_artworks.drop_duplicates(subset='Artist', keep='last')
rhz_gender_rep = group_genders(rhz_artists)
moma_gender_rep = group_genders(moma_artists)

#trace and layout settings shared by both pies
full_pie_traces = dict(textposition='auto', textinfo='label+percent', hoverinfo='label+value+percent')
full_pie_layout = dict(width=800, title_x=0.5)

#the two palettes use the same colours in a different order
rhz_full_palette = ['#007c91', '#ff5252', '#8c7b75', '#ce93d8', '#ffd54f']
moma_full_palette = ['#007c91', '#ce93d8', '#ff5252', '#8c7b75', '#ffd54f']

rhz_gender_pie = px.pie(
    rhz_gender_rep,
    values='Counts',
    names='Gender',
    title='Gender Breakdown of Rhizome Collection',
    color_discrete_sequence=rhz_full_palette,
)
rhz_gender_pie.update_traces(**full_pie_traces)
rhz_gender_pie.update_layout(**full_pie_layout)

moma_gender_pie = px.pie(
    moma_gender_rep,
    values='Counts',
    names='Gender',
    title='Gender Breakdown of MoMA Collection (Full)',
    color_discrete_sequence=moma_full_palette,
)
moma_gender_pie.update_traces(**full_pie_traces)
moma_gender_pie.update_layout(**full_pie_layout)

#create local copies
#rhz_gender_pie.write_html(path+"Plotly_embeds/rhz_full_pie.html", include_plotlyjs='directory')
#moma_gender_pie.write_html(path+"Plotly_embeds/moma_full_pie.html", include_plotlyjs='directory')

#wrap both figures as widgets and show them side by side
rhz_gender_pie = go.FigureWidget(rhz_gender_pie)
moma_gender_pie = go.FigureWidget(moma_gender_pie)
gender_pies = ipw.HBox([rhz_gender_pie, moma_gender_pie])
gender_pies

HBox(children=(FigureWidget({
    'data': [{'domain': {'x': [0.0, 1.0], 'y': [0.0, 1.0]},
              'hover…

In [12]:
#Rhizome vs MoMA sampled
#one row per artist (last occurrence kept), then gender counts
moma_artists_compare = moma_rhz_compare.drop_duplicates(subset='Artist', keep='last')
moma_rhz_compare_gender_rep = group_genders(moma_artists_compare)

sampled_palette = ['#007c91', '#ff5252', '#ce93d8', '#8c7b75', '#ffd54f']
moma_gender_pie_2 = px.pie(
    moma_rhz_compare_gender_rep,
    values='Counts',
    names='Gender',
    title='Gender Breakdown of MoMA Collection (Sampled)',
    color_discrete_sequence=sampled_palette,
)
#pull offsets the fourth slice (presumably collectives & collabs -- confirm against group_genders);
#sort=False stops plotly from reordering the slices by size
moma_gender_pie_2.update_traces(
    textposition='auto',
    textinfo='label+percent',
    hoverinfo='label+value+percent',
    hovertemplate=None,
    pull=[0, 0, 0, 0.2],
    sort=False,
)
moma_gender_pie_2.update_layout(width=800, title_x=0.5)

#create local copies
#moma_gender_pie_2.write_html(path+"Plotly_embeds/moma_sampled_pie.html", include_plotlyjs='directory')

#pair the sampled MoMA pie with the Rhizome pie widget built in the previous cell
moma_gender_pie_2 = go.FigureWidget(moma_gender_pie_2)
gender_pies_2 = ipw.HBox([rhz_gender_pie, moma_gender_pie_2])
gender_pies_2

HBox(children=(FigureWidget({
    'data': [{'domain': {'x': [0.0, 1.0], 'y': [0.0, 1.0]},
              'hover…

In [14]:
#create a DF w/ genders and counts per period, group collectives together
#(one row per artist, last occurrence kept)
moma_artists_old = moma_artworks_old.drop_duplicates(subset='Artist', keep='last')
moma_artists_new = moma_artworks_new.drop_duplicates(subset='Artist', keep='last')
moma_b4_gender_rep = group_genders(moma_artists_old)
moma_after_gender_rep = group_genders(moma_artists_new)

#settings shared by the before/after pies; pull offsets the fourth slice
period_pie_traces = dict(
    textposition='auto',
    textinfo='label+percent',
    hoverinfo='label+value+percent',
    hovertemplate=None,
    pull=[0, 0, 0, 0.2],
)
period_pie_layout = dict(width=800, title_x=0.5)

#the "after" palette carries one extra colour
palette_b4 = ['#007c91', '#ff5252', '#ce93d8', '#8c7b75']
palette_after = ['#007c91', '#ff5252', '#ce93d8', '#8c7b75', '#ffd54f']

moma_gender_pie_b4 = px.pie(
    moma_b4_gender_rep,
    values='Counts',
    names='Gender',
    title='Gender Breakdown of MoMA Collection Before 1983',
    color_discrete_sequence=palette_b4,
)
moma_gender_pie_b4.update_traces(**period_pie_traces)
moma_gender_pie_b4.update_layout(**period_pie_layout)

moma_gender_pie_after = px.pie(
    moma_after_gender_rep,
    values='Counts',
    names='Gender',
    title='Gender Breakdown of MoMA Collection After 1983',
    color_discrete_sequence=palette_after,
)
moma_gender_pie_after.update_traces(**period_pie_traces)
moma_gender_pie_after.update_layout(**period_pie_layout)

#create local copies
#moma_gender_pie_b4.write_html(path+"Plotly_embeds/moma_b4_pie.html", include_plotlyjs='directory')
#moma_gender_pie_after.write_html(path+"Plotly_embeds/moma_after_pie.html", include_plotlyjs='directory')

#show both periods side by side
moma_gender_pie_3 = go.FigureWidget(moma_gender_pie_b4)
moma_gender_pie_4 = go.FigureWidget(moma_gender_pie_after)
gender_pies_3 = ipw.HBox([moma_gender_pie_3, moma_gender_pie_4])
gender_pies_3

HBox(children=(FigureWidget({
    'data': [{'domain': {'x': [0.0, 1.0], 'y': [0.0, 1.0]},
              'hover…

In [15]:
#count how often each Gender value occurs among the unique artists
moma_gender_counts = moma_artists_compare['Gender'].value_counts()
rhz_gender_counts = rhz_artists['Gender'].value_counts()

#value_counts sorts by frequency, so dropping the first three rows removes the three
#most common values -- NOTE(review): this assumes those are the single-artist values
#and that everything after them is a collective; confirm against the data
moma_gender_collectives = moma_gender_counts.iloc[3:].reset_index()
rhz_gender_collectives = rhz_gender_counts.iloc[3:].reset_index()

#send to csv for manual counting
moma_gender_collectives.to_csv(path+'MOMA_data/csv/moma_collectives_gender.csv')
rhz_gender_collectives.to_csv(path+'Rhizome_data/csv/rhz_collectives_gender.csv')

In [17]:
#Rhizome vs MoMA sampled - collectives gender rep
#colour sequence, one entry per label: root, 3x male, 3x female, 2x mixed
scale1 = ['#ffffff'] + ['#007c91'] * 3 + ['#ff5252'] * 3 + ['#ce93d8'] * 2

#hierarchy shared by both charts (label -> parent, "" marks the root)
sun_labels = ["Collectives & Collabs", "Males", "Male Only", "Male & Missing", "Females",
              "Female Only", "Female & Missing", "Mixed Gender", 'Incl. Missing Gender']
sun_parents = ["", "Collectives & Collabs", "Males", "Males", "Collectives & Collabs",
               "Females", "Females", "Collectives & Collabs", 'Mixed Gender']

#display settings shared by both charts
sun_traces = dict(textinfo='label+value', hoverinfo='label+value', hovertemplate=None,
                  marker=dict(colors=scale1))
sun_layout = dict(height=800, title_x=0.5, uniformtext=dict(minsize=8, mode='hide'))

#values were counted by hand from the exported csv files
rhz_sun = go.Figure(
    go.Sunburst(
        labels=sun_labels,
        parents=sun_parents,
        values=[71, 22, 20, 2, 7, 5, 2, 42, 2],
    ),
    layout=dict(title=dict(text="Gender Breakdown of Rhizome Collectives & Collaborations")),
)
rhz_sun.update_traces(**sun_traces)
rhz_sun.update_layout(**sun_layout)

#MoMA has one extra leaf under 'Mixed Gender'; scale1 holds no tenth colour for it
moma_sun = go.Figure(
    go.Sunburst(
        labels=sun_labels + ['Incl. Non-Binary'],
        parents=sun_parents + ['Mixed Gender'],
        values=[744, 434, 195, 239, 62, 28, 34, 248, 117, 1],
    ),
    layout=dict(title=dict(text="Gender Breakdown of MoMA Collectives & Collaborations")),
)
moma_sun.update_traces(**sun_traces)
moma_sun.update_layout(**sun_layout)

#create local copies
#rhz_sun.write_html(path+"Plotly_embeds/rhz_sunburst.html", include_plotlyjs='directory')
#moma_sun.write_html(path+"Plotly_embeds/moma_sunburst.html", include_plotlyjs='directory')

#show both sunbursts side by side
rhz_sun = go.FigureWidget(rhz_sun)
moma_sun = go.FigureWidget(moma_sun)
rhz_moma_sun = ipw.HBox([rhz_sun, moma_sun])
rhz_moma_sun

HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'label+value',
              'labels': [Collectives &…

***

### Representing Gender by Departments (Everyone)

When it came to gender and departments we decided to use barcharts and facet the results to allow the user to see things more clearly: in the first chart the facet is department because they're even, and this way you can easily see if gender representation changes before and after 1983; in the second chart we used the databases as the facet because the departments have been reduced to three and this way you can more easily see that Rhizome has acquired more evenly across departments.<br><br>
We also decided to use a shared logarithmic representation of the y axis for the first chart as we had some outliers in the Drawings & Prints department that made a standard barchart representation more difficult to read. <br><br>
For the departments we started with the four MoMA departments we felt were most aligned to the types of art also found in Rhizome: film, photography, media and performance, and drawing and prints. The latter might seem odd but there are over 300 items in the Rhizome collection that make reference to print and drawing. To align them with Rhizome we first created four equivalent departments for Rhizome, using the description text to assign them, and then we combined photography and drawings & prints into still images for MoMA and for Rhizome we combined installations and net art into media and performance. **The result is three overall departments: still images, moving images, and media and performance**.<br><br>
A note on the Rhizome department assignment: the process is far from perfect as it looks for a specific keyword in the text and uses the first one it finds to assign the department. Therefore it misses out on artworks that could potentially belong to multiple departments but considering MoMA is strict with its department assignment we feel taking the same approach for Rhizome is appropriate. <br><br>
Lastly, 9% of the Rhizome dataset does not have a department assigned after filtering (due to the keywords not appearing in the text or there not being any text associated with the artwork). So we added 9% to the Rhizome numbers when counting to keep them in line with the percentage reduction applied to the MoMA sampled dataset (10%).

In [11]:
#get gender counts per department and period; group_genders_dept receives the
#dataframe, the period label and the department label
moma_draw_before_gender_rep = group_genders_dept(moma_draw_before, 'Before 1983', 'Drawings & Prints')
moma_draw_after_gender_rep = group_genders_dept(moma_draw_after, 'After 1983', 'Drawings & Prints')
moma_films_before_gender_rep = group_genders_dept(moma_films_before, 'Before 1983', 'Film')
moma_films_after_gender_rep = group_genders_dept(moma_films_after, 'After 1983', 'Film')
moma_media_before_gender_rep = group_genders_dept(moma_media_before, 'Before 1983', 'Media and Performance')
moma_media_after_gender_rep = group_genders_dept(moma_media_after, 'After 1983', 'Media and Performance')
moma_photo_before_gender_rep = group_genders_dept(moma_photo_before, 'Before 1983', 'Photography')
moma_photo_after_gender_rep = group_genders_dept(moma_photo_after, 'After 1983', 'Photography')

#stack before/after for each department
moma_draw_gender_rep_all = pd.concat([moma_draw_before_gender_rep, moma_draw_after_gender_rep])
moma_films_gender_rep_all = pd.concat([moma_films_before_gender_rep, moma_films_after_gender_rep])
moma_media_gender_rep_all = pd.concat([moma_media_before_gender_rep, moma_media_after_gender_rep])
moma_photo_gender_rep_all = pd.concat([moma_photo_before_gender_rep, moma_photo_after_gender_rep])

#group all departments together; the concatenated frames repeat their index labels,
#so rebuild a clean 0..n-1 index (drop must be the boolean True, not the string 'True')
moma_depts_gender_rep_all = pd.concat([moma_photo_gender_rep_all, moma_media_gender_rep_all, moma_films_gender_rep_all, moma_draw_gender_rep_all])
moma_depts_gender_rep_all = moma_depts_gender_rep_all.reset_index(drop=True)

In [12]:
#grouped bars per gender, coloured by period, one facet per department, log-scaled y axis
dept_bar_options = dict(
    x="Gender",
    y="Counts",
    color="Period",
    text='Counts',
    facet_col="Department",
    facet_col_spacing=0.06,
    facet_col_wrap=2,
    barmode='group',
    log_y=True,
    title="Gender Representation in MoMA Departments before and after 1983",
    color_discrete_map={"Before 1983": "#49599a", "After 1983": "#aab6fe"},
    category_orders={"Period": ["Before 1983", 'After 1983']},
)
moma_gender_depts_full = px.bar(moma_depts_gender_rep_all, **dept_bar_options)

#fix the gender order on every facet and show tick labels on all axes
gender_axis_order = ['Male', 'Female', 'Collectives & Collabs', 'Missing', 'NB']
moma_gender_depts_full.update_xaxes(showticklabels=True, categoryorder='array', categoryarray=gender_axis_order)
moma_gender_depts_full.update_yaxes(showticklabels=True)
moma_gender_depts_full.update_traces(textposition="auto", hoverinfo='x+y+name', hovertemplate=None)
moma_gender_depts_full.update_layout(height=800, width=1600, title_x=0.5)

#create local copies
#moma_gender_depts_full.write_html(path+"Plotly_embeds/moma_full_gender_dept_bars.html", include_plotlyjs='directory')
moma_gender_depts_full.show()

In [13]:
#option 2: same chart on a linear scale, with an independent y axis for every facet
moma_gender_depts_full_2 = px.bar(
    moma_depts_gender_rep_all,
    x="Gender",
    y="Counts",
    color="Period",
    facet_col="Department",
    facet_col_spacing=0.06,
    facet_col_wrap=2,
    title="Gender Representation in MoMA Departments before and after 1983",
    barmode='group',
    color_discrete_map={"Before 1983": "#49599a", "After 1983": "#aab6fe"},
    category_orders={"Period": ["Before 1983", 'After 1983']},
    text='Counts',
)
gender_axis_order_2 = ['Male', 'Female', 'Collectives & Collabs', 'Missing', 'NB']
moma_gender_depts_full_2.update_xaxes(showticklabels=True, categoryorder='array', categoryarray=gender_axis_order_2)
#matches=None unlinks the y axes so each department gets its own range
moma_gender_depts_full_2.update_yaxes(matches=None, showticklabels=True)
moma_gender_depts_full_2.update_traces(textposition="auto", hoverinfo='x+y+name', hovertemplate=None)
moma_gender_depts_full_2.update_layout(height=800, width=1600, title_x=0.5)

#create local copies
#moma_gender_depts_full_2.write_html(path+"Plotly_embeds/moma_full_gender_dept_bars_option2.html", include_plotlyjs='directory')
moma_gender_depts_full_2.show()

In [4]:
#assign a medium to every Rhizome artwork based on keywords found in its description text
rhz_medium = rhizome_txt_stop_kw.copy()
rhz_medium = rhz_medium.replace('N/B', 'NB')

# one keyword pattern per medium; np.select keeps the FIRST condition that matches,
# so the order of this list decides ties between mediums.
# na=False makes rows without text count as "no match" instead of producing a
# non-boolean mask that np.select cannot use.
# NOTE(review): the patterns match substrings, not whole words (e.g. 'tv', 'net',
# 'map' also hit inside longer words) -- confirm this is intended.
conditions = [
    (rhz_medium['Text'].str.contains('video|film|animation|movie|tv', regex=True, na=False)),
    (rhz_medium['Text'].str.contains('images|image|photographs|photography|photo|photos|painting|drawing', regex=True, na=False)),
    (rhz_medium['Text'].str.contains('installation|performance|sound|audio|song|music', regex=True, na=False)),
    (rhz_medium['Text'].str.contains('website|flash|java|data|database|webpage|net|internet|site|sites|online|webbased|augmented|virtual|reality|game|interaction|interactive|interface|project|xml|mashup|map|generative|wifi|network|email|computer', regex=True, na=False)),
    ]

# the value assigned for each condition, in the same order
values = ['Moving Images', 'Still Images', 'Installation & Performance', 'Net Art']

# rows matching no pattern get 'missing' directly: the implicit integer default (0)
# cannot be combined with string choices on recent NumPy versions
rhz_medium['Medium'] = np.select(conditions, values, default='missing')

# keep the column as plain strings
rhz_medium['Medium'] = rhz_medium['Medium'].astype(str)

In [15]:
#count Rhizome genders by department (one group_genders call per assigned medium)
rhz_gender_rep_mi = group_genders(rhz_medium.loc[rhz_medium['Medium'] == 'Moving Images'])
rhz_gender_rep_img = group_genders(rhz_medium.loc[rhz_medium['Medium'] == 'Still Images'])
rhz_gender_rep_intall = group_genders(rhz_medium.loc[rhz_medium['Medium'] == 'Installation & Performance'])
rhz_gender_rep_netart = group_genders(rhz_medium.loc[rhz_medium['Medium'] == 'Net Art'])
rhz_gender_rep_mi['Department'] = 'Moving Images'
rhz_gender_rep_img['Department'] = 'Still Images'
rhz_gender_rep_intall['Department'] = 'Installation'
rhz_gender_rep_netart['Department'] = 'Net Art'
rhz_rep_all = pd.concat([rhz_gender_rep_mi, rhz_gender_rep_img, rhz_gender_rep_intall, rhz_gender_rep_netart])
rhz_rep_all['Source'] = 'Rhizome'

#align departments alongside new 3x3 grouping:
#Rhizome: Installation + Net Art -> Media and Performance
#MoMA: Photography + Drawings & Prints -> Still Images, Film -> Moving Images
rhz_rep_all = rhz_rep_all.replace(['Installation', 'Net Art'], 'Media and Performance').groupby(['Department', 'Source', 'Gender'],as_index=False).agg({'Counts': 'sum'})
moma_depts_gender_rep_all_2 = moma_depts_gender_rep_all.replace(['Photography', 'Drawings & Prints'], 'Still Images').replace(['Film'], 'Moving Images').groupby(['Department', 'Period', 'Gender'],as_index=False).agg({'Counts': 'sum'})

#create a combined dataframe
#only the post-1983 MoMA rows are compared with Rhizome; take an explicit copy so the
#column assignments below write to an independent frame (no SettingWithCopyWarning)
moma_depts_gender_rep_all_2 = moma_depts_gender_rep_all_2.loc[moma_depts_gender_rep_all_2['Period'] == 'After 1983'].copy()
#scale MoMA down to a tenth; the int cast truncates the fractional part
moma_depts_gender_rep_all_2['Counts'] = moma_depts_gender_rep_all_2['Counts'].div(10).astype(int)
moma_depts_gender_rep_all_2['Source'] = 'MoMA'
#scale Rhizome up by 9% and round down to whole artworks
rhz_rep_all['Counts'] = np.floor(rhz_rep_all['Counts'].multiply(1.09)).astype(int)
moma_rhz_gender_compare = pd.concat([rhz_rep_all, moma_depts_gender_rep_all_2])
#Period only exists on the MoMA rows and is constant after the filter above
moma_rhz_gender_compare = moma_rhz_gender_compare.drop('Period', axis=1)

In [16]:
#relabel the MoMA source so the facet title makes clear the counts are sampled
moma_rhz_gender_compare = moma_rhz_gender_compare.replace('MoMA', 'MoMA (sampled)')

#grouped bars per gender, coloured by department, one facet per source, log-scaled y axis
moma_rhz_depts_compare = px.bar(moma_rhz_gender_compare.sort_values(by='Source'), x="Gender", y="Counts", color="Department", facet_col="Source",
             facet_col_spacing=0.06,  facet_col_wrap=2,
             title="Gender Representation Between Rhizome and MoMA (sampled) by Departments", barmode='group', log_y=True,
             text_auto=True, color_discrete_map ={"Still Images":"#90caf9", "Media and Performance":"#ffe082", "Moving Images":"#b94e52"} , category_orders={"Department": ["Still Images", 'Media and Performance', "Moving Images"]})
#matches=None unlinks the x axes of the two facets; the gender order is fixed on both
moma_rhz_depts_compare.update_xaxes(matches=None, showticklabels=True,  categoryorder='array', categoryarray=['Male', 'Female', 'Collectives & Collabs', 'Missing', 'NB'])
moma_rhz_depts_compare.update_yaxes(showticklabels=True)
#(a stray trailing comma used to turn this statement into a throwaway tuple)
moma_rhz_depts_compare.update_traces(textposition="outside", hoverinfo='x+y+name', hovertemplate=None)
moma_rhz_depts_compare.update_layout(height=800, width=1600, title_x=0.5)
#create local copies
#moma_rhz_depts_compare.write_html(path+"Plotly_embeds/moma_rhz_depts_compare.html", include_plotlyjs='directory')
moma_rhz_depts_compare.show()

***

### Representing Nationality by Dataset (Everyone)

For nationalities we decided to use maps first for a simple and interactive visualisation of how the collections are spread across the globe. From there we decided to use the concept of the Global North/South divide to go deeper into what the maps showed us. We used [Wikimedia's list of country classifications](https://meta.wikimedia.org/wiki/List_of_countries_by_regional_classification) to assign north/south divide and regions and displayed the results using bar charts, first comparing both datasets by the north/south divide, then going into nationality and regional details for the Global South.<br><br>
Having populated the Rhizome dataset manually we were able to take into account dual nationalities of artists in these visualisations, by doing some additional manual cleaning of countries. 


In [12]:
#Create a DF of MoMA nationalities transformed into countries and country code w/ counts
moma_countries_count = assign_codes(moma_rhz_compare)
#check for countries w/ missing codes (stored as the string 'nan')
moma_no_code = moma_countries_count['country_code'] == 'nan'
missing_codes = moma_countries_count.loc[moma_no_code]
#display only the entries without a comma in their Nation value
missing_codes[~missing_codes['Nation'].str.contains(',')]

Unnamed: 0,Nation,Count,country_code
90,Beninese,1,
254,Ivory Coast,1,
282,Nationality unknown,3,
313,Scotland,14,
374,Yugoslavia,2,
376,missing,196,


In [13]:
#fix them: manual corrections applied in order, as (row label, column, new value)
#NOTE(review): the row labels are hard-coded and only valid for the exact frame
#returned by assign_codes above -- re-check them if the input data changes
moma_cell_fixes = [
    (254, 'country_code', 'CIV'),    #Ivory Coast
    (90, 'country_code', 'BEN'),     #Beninese -> Benin
    (90, 'Nation', 'Benin'),
    (266, 'Nation', 'South Korea'),
    (266, 'country_code', 'KOR'),
    (366, 'Count', 194),             #add Scots to GBR
    (315, 'Count', 44),              #add Serbs to SRB
    (152, 'Count', 8),               #add Croats to HRV
]
for fix_row, fix_column, fix_value in moma_cell_fixes:
    moma_countries_count.loc[fix_row, fix_column] = fix_value

#reset types and filter for final use
moma_countries_count['Count'] = moma_countries_count['Count'].astype(int)
#drop entries with a comma in Nation, then entries still lacking a country code
moma_single_nation = ~moma_countries_count['Nation'].str.contains(',')
moma_countries_count = moma_countries_count.loc[moma_single_nation]
moma_has_code = ~moma_countries_count['country_code'].str.contains('nan')
moma_countries_count = moma_countries_count.loc[moma_has_code]
moma_countries_count = moma_countries_count.sort_values(by='Nation').reset_index(drop=True)
moma_countries_count.head()

Unnamed: 0,Nation,Count,country_code
0,Afghanistan,1,AFG
1,Albania,2,ALB
2,Algeria,3,DZA
3,Argentina,20,ARG
4,Australia,10,AUS


In [14]:
#Create a DF of Rhizome nationalities transformed into countries and country code w/ counts
rhz_countries_count = assign_codes(rhz_artworks)
#check for countries w/ missing codes (stored as the string 'nan')
rhz_no_code = rhz_countries_count['country_code'] == 'nan'
missing_codes_2 = rhz_countries_count.loc[rhz_no_code]
#display only the slash-separated (dual nationality) entries
missing_codes_2[missing_codes_2['Nation'].str.contains('/')]

Unnamed: 0,Nation,Count,country_code
0,Afghan/American,1,
7,"American, Russian, British, Swiss/American, Du...",1,
9,American/British,2,
10,American/Canadian,1,
11,American/Israeli,1,
12,American/Mexican,1,
13,American/South African,1,
26,Brasilian/American,2,
31,British/American,1,
32,British/Canadian,1,


In [15]:
#fix them: hand-counted totals for existing rows, keyed by row label
#NOTE(review): the row labels are hard-coded and only valid for the exact frame
#returned by assign_codes above -- re-check them if the input data changes
rhz_count_overrides = {
    121: 7,     #Swiss
    74: 3,      #Iranians
    49: 2,      #Danes
    92: 11,     #Mexicans
    109: 3,     #South Africans
    97: 12,     #Poles
    40: 6,      #Chileans
    124: 6,     #Turks
    35: 51,     #Canadians
    106: 4,     #Serbs
    55: 3,      #Finns
    86: 11,     #Japanese
    127: 287,   #Americans
    44: 3,      #Colombians
    66: 34,     #Germans
    126: 58,    #Brits
    77: 3,      #Irish
    67: 5,      #Greeks
    18: 22,     #Australians
    114: 15,    #Spanish
    57: 30,     #French
    78: 15,     #Israeli
    93: 15,     #Dutch
    85: 22,     #Italians
}
for override_row, override_total in rhz_count_overrides.items():
    rhz_countries_count.loc[override_row, 'Count'] = override_total

#countries created by hand under their own row labels: (count, nation, country code)
rhz_created_rows = {
    140: (16, 'Brazil', 'BRA'),
    141: (1, 'Afghanistan', 'AFG'),
    142: (1, 'Iceland', 'ISL'),
}
for created_row, (created_total, created_nation, created_code) in rhz_created_rows.items():
    rhz_countries_count.loc[created_row, 'Count'] = created_total
    rhz_countries_count.loc[created_row, 'Nation'] = created_nation
    rhz_countries_count.loc[created_row, 'country_code'] = created_code

#reset types and filter for final use
rhz_countries_count['Count'] = rhz_countries_count['Count'].astype(int)
#drop entries with a comma in Nation, then entries still lacking a country code
rhz_single_nation = ~rhz_countries_count['Nation'].str.contains(',')
rhz_countries_count = rhz_countries_count.loc[rhz_single_nation]
rhz_has_code = ~rhz_countries_count['country_code'].str.contains('nan')
rhz_countries_count = rhz_countries_count.loc[rhz_has_code]
rhz_countries_count = rhz_countries_count.sort_values(by='Nation').reset_index(drop=True)
rhz_countries_count.head()

Unnamed: 0,Nation,Count,country_code
0,Afghanistan,1,AFG
1,American,11,ASM
2,Angola,1,AGO
3,Argentina,12,ARG
4,Argentine,1,ARG


In [16]:
#add stragglers based on north/south divide: totals for rows that absorb a duplicate
#spelling of the same country (e.g. row 3 'Argentina' takes the single 'Argentine' artwork)
#NOTE(review): row labels refer to the re-indexed frame from the previous cell
rhz_merged_totals = {56: 298, 20: 31, 33: 24, 3: 13}
for merged_row, merged_total in rhz_merged_totals.items():
    rhz_countries_count.loc[merged_row, 'Count'] = merged_total

#remove the rows whose counts were merged above
rhz_countries_count = rhz_countries_count.drop(index=[1, 4, 21, 32])
rhz_countries_count['Count'] = rhz_countries_count['Count'].astype(int)
rhz_countries_count = rhz_countries_count.sort_values(by='Nation').reset_index(drop=True)
rhz_countries_count.head()

Unnamed: 0,Nation,Count,country_code
0,Afghanistan,1,AFG
1,Angola,1,AGO
2,Argentina,13,ARG
3,Australia,22,AUS
4,Austria,16,AUT


In [54]:
#MAP IT!

def _nationality_choropleth(country_counts, colour_stops, map_title):
    """Return a world choropleth of the 'Count' column of *country_counts*.

    country_counts -- dataframe with 'country_code', 'Count' and 'Nation' columns
    colour_stops   -- plotly colorscale as a list of [position, colour] pairs
    map_title      -- title shown above the map
    """
    world_map = go.Figure(data=go.Choropleth(
        locations=country_counts['country_code'],
        z=country_counts['Count'].apply(np.floor),
        text=country_counts['Nation'],
        colorscale=colour_stops,
        autocolorscale=False,
        reversescale=True,
        marker_line_color='darkgray',
        marker_line_width=0.5,
        colorbar_title='Total artworks',
    ))
    #hover shows the count and the nation name
    world_map.update_traces(hoverinfo='z+text')
    world_map.update_layout(
        title_text=map_title,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='natural earth',
            showocean=True, oceancolor="LightBlue"
        ),
        height=700, title_x=0.5
    )
    return world_map


#blue scale for MoMA
moma_colour_stops = [
    [0, "#0e1123"], [0.1, "#12172f"], [0.2, "#171c3a"], [0.3, "#1b2246"],
    [0.4, "#222a58"], [0.5, "#394792"], [0.6, "#4252a9"], [0.7, "#4455af"],
    [0.8, "#5666bd"], [0.9, "#5c6cc0"], [1, "#7381c9"],
]
#green scale for Rhizome
rhz_colour_stops = [
    [0, "#18230c"], [0.1, "#243512"], [0.2, "#314617"], [0.3, "#3d581d"],
    [0.4, "#496a23"], [0.5, "#557b29"], [0.6, "#618d2f"], [0.7, "#6d9f35"],
    [0.8, "#79b03a"], [0.9, "#85c042"], [1, "#91c653"],
]

moma_map = _nationality_choropleth(
    moma_countries_count, moma_colour_stops,
    'Nationality of Solo Artists in MoMA Collection (Sampled)')
rhz_map = _nationality_choropleth(
    rhz_countries_count, rhz_colour_stops,
    'Nationality of Solo Artists in Rhizome Collection')

#create local copies
#rhz_map.write_html(path+"Plotly_embeds/rhz_map.html", include_plotlyjs='directory')
#moma_map.write_html(path+"Plotly_embeds/moma_map.html", include_plotlyjs='directory')

#stack the two maps vertically, Rhizome first
rhz_map = go.FigureWidget(rhz_map)
moma_map = go.FigureWidget(moma_map)
maps = ipw.VBox([rhz_map, moma_map])
maps

VBox(children=(FigureWidget({
    'data': [{'autocolorscale': False,
              'colorbar': {'title': {'tex…

In [18]:
#populate Rhizome country count DF w/ divide information 
# Work on a copy so rhz_countries_count itself keeps its original columns
rhz_north_south = rhz_countries_count.copy()
rhz_north_south['Divide'] = rhz_north_south['Nation'].apply(getDivide)
rhz_north_south['Region'] = rhz_north_south['Nation'].apply(getRegion)
rhz_north_south = (
    rhz_north_south
    .sort_values(by='Nation')
    .reset_index(drop=True)
)

#check for missing info
# rows whose Divide came back as the 'missing' placeholder
rhz_north_south[rhz_north_south['Divide'] == 'missing']

Unnamed: 0,Nation,Count,country_code,Divide,Region
26,Iran,3,IRN,missing,missing
40,Russia,8,RUS,missing,missing
44,South Korea,6,KOR,missing,missing
49,Tanzania,1,TZA,missing,missing


In [19]:
#fix them 
# Patch the nations whose Divide/Region came back as 'missing'.
# Rows are matched on the nation name instead of a hard-coded row label: a label such as 26
# only points at Iran for one particular version of the data, and assigning to a label that
# does not exist would append a new row rather than raise.
rhz_divide_region_fixes = {
    'Iran': ('Global South', 'Middle east'),
    'Russia': ('Global North', 'Europe'),
    'South Korea': ('Global North', 'Asia & Pacific'),
    'Tanzania': ('Global South', 'Africa'),
}
for _nation, (_divide, _region) in rhz_divide_region_fixes.items():
    _nation_rows = rhz_north_south['Nation'] == _nation
    rhz_north_south.loc[_nation_rows, 'Divide'] = _divide
    rhz_north_south.loc[_nation_rows, 'Region'] = _region

In [20]:
#same for MoMA
# Work on a copy so moma_countries_count itself keeps its original columns
moma_north_south = moma_countries_count.copy()
moma_north_south['Divide'] = moma_north_south['Nation'].apply(getDivide)
moma_north_south['Region'] = moma_north_south['Nation'].apply(getRegion)
moma_north_south = (
    moma_north_south
    .sort_values(by='Nation')
    .reset_index(drop=True)
)

#check for missing info
# rows whose Divide came back as the 'missing' placeholder
moma_north_south[moma_north_south['Divide'] == 'missing']

Unnamed: 0,Nation,Count,country_code,Divide,Region
10,Bosnia And Herzegovina,5,BIH,missing,missing
35,Iran,4,IRN,missing,missing
39,Ivory Coast,1,CIV,missing,missing
55,Palestine,2,PSE,missing,missing
60,Republic Of The Congo,3,COG,missing,missing
62,Russia,25,RUS,missing,missing
69,South Korea,22,KOR,missing,missing
76,The Bahamas,1,BHS,missing,missing


In [21]:
#fix them 
# Patch the nations whose Divide/Region came back as 'missing'.
# Rows are matched on the nation name instead of a hard-coded row label: the labels shift
# whenever the sampled country list changes, and assigning to a label that does not exist
# would append a new row rather than raise.
moma_divide_region_fixes = {
    'Bosnia And Herzegovina': ('Global North', 'Europe'),
    'Iran': ('Global South', 'Middle east'),
    'Ivory Coast': ('Global South', 'Africa'),
    'Palestine': ('Global South', 'Arab States'),
    'Republic Of The Congo': ('Global South', 'Africa'),
    'Russia': ('Global North', 'Europe'),
    'South Korea': ('Global North', 'Asia & Pacific'),
    'The Bahamas': ('Global South', 'South/Latin America'),
}
for _nation, (_divide, _region) in moma_divide_region_fixes.items():
    _nation_rows = moma_north_south['Nation'] == _nation
    moma_north_south.loc[_nation_rows, 'Divide'] = _divide
    moma_north_south.loc[_nation_rows, 'Region'] = _region

In [22]:
#combine the two
# Tag every row with its collection, then stack MoMA rows above Rhizome rows
# (the original row labels of each frame are kept as-is).
for _frame, _collection in ((moma_north_south, 'MoMA'), (rhz_north_south, 'Rhizome')):
    _frame['Source'] = _collection
north_v_south = pd.concat((moma_north_south, rhz_north_south))
north_v_south.head()

Unnamed: 0,Nation,Count,country_code,Divide,Region,Source
0,Afghanistan,1,AFG,Global South,Asia & Pacific,MoMA
1,Albania,2,ALB,Global North,Europe,MoMA
2,Algeria,3,DZA,Global South,Arab States,MoMA
3,Argentina,20,ARG,Global South,South/Latin America,MoMA
4,Australia,10,AUS,Global North,Asia & Pacific,MoMA


In [23]:
# Sum Count per collection and divide, relabel MoMA as sampled, and draw grouped bars
north_south_viz = (
    north_v_south
    .groupby(['Source', 'Divide'], as_index=False)
    .agg({'Count': 'sum'})
    .replace(['MoMA'], 'MoMA (sampled)')
)
north_south_bars = px.bar(
    north_south_viz,
    x="Divide",
    y="Count",
    color='Source',
    title="The Global North/South Divide across Rhizome and MoMA (sampled)",
    barmode='group',
    text_auto=True,
    color_discrete_map={"MoMA (sampled)": "#aab6fe", "Rhizome": "#9ccc65"},
    category_orders={"Source": ["MoMA (sampled)", 'Rhizome']},
)
north_south_bars.update_xaxes(showticklabels=True)
north_south_bars.update_yaxes(showticklabels=True)
# hovertemplate=None so that hoverinfo decides what the hover label shows
north_south_bars.update_traces(
    textposition="inside",
    hoverinfo='x+y+name',
    hovertemplate=None,
)
north_south_bars.update_layout(height=800, width=800, title_x=0.5)
#create local copies
#north_south_bars.write_html(path+"Plotly_embeds/rhz_moma_north_south_bars.html", include_plotlyjs='directory')
north_south_bars.show()

In [24]:
#focus only on Global South
rhz_moma_south = north_v_south[north_v_south['Divide'] == 'Global South']
# Count totals per collection at two levels of detail: per nation and per region
rhz_moma_south_nats = (
    rhz_moma_south
    .groupby(['Source', 'Nation', 'Region'], as_index=False)
    .agg({'Count': 'sum'})
)
rhz_moma_south_regions = (
    rhz_moma_south
    .groupby(['Source', 'Region'], as_index=False)
    .agg({'Count': 'sum'})
)

In [25]:
#BAR IT! 
# Two Global South bar charts (by nation, stacked; by region, grouped) shown one above the other.

# Settings shared by both charts
_south_bar_options = dict(
    y="Count",
    color='Source',
    color_discrete_map={"MoMA (sampled)": "#aab6fe", "Rhizome": "#9ccc65"},
    category_orders={"Source": ["MoMA (sampled)", 'Rhizome']},
    text_auto=True,
)

# -- by nationality
rhz_moma_south_nats = rhz_moma_south_nats.replace(['MoMA'], 'MoMA (sampled)')
north_south_nats = px.bar(
    rhz_moma_south_nats,
    x="Nation",
    title="Artists from the Global South by Nationality - Rhizome vs MoMA (sampled)",
    barmode='stack',
    **_south_bar_options,
)
north_south_nats.update_xaxes(showticklabels=True, categoryorder='sum descending', tickangle=45)
north_south_nats.update_yaxes(showticklabels=True)
north_south_nats.update_traces(textposition="outside", hoverinfo='x+y+name', hovertemplate=None)
north_south_nats.update_layout(height=600, title_x=0.5)

# -- by region
rhz_moma_south_regions = rhz_moma_south_regions.replace(['MoMA'], 'MoMA (sampled)')
north_south_regs = px.bar(
    rhz_moma_south_regions,
    x="Region",
    title="Artists from the Global South by Regions - Rhizome vs MoMA (sampled)",
    barmode='group',
    **_south_bar_options,
)
north_south_regs.update_xaxes(showticklabels=True, categoryorder='sum descending')
north_south_regs.update_yaxes(showticklabels=True)
north_south_regs.update_traces(textposition="outside", hoverinfo='x+y+name', hovertemplate=None)
north_south_regs.update_layout(height=600, title_x=0.5)

#create local copies
#north_south_nats.write_html(path+"Plotly_embeds/rhz_moma_north_south_bars_nats.html", include_plotlyjs='directory')
#north_south_regs.write_html(path+"Plotly_embeds/rhz_moma_north_south_bars_regions.html", include_plotlyjs='directory')

# Wrap as widgets and stack: nations chart first, regions chart below
north_south_nats = go.FigureWidget(north_south_nats)
north_south_regs = go.FigureWidget(north_south_regs)
north_south = ipw.VBox([north_south_nats, north_south_regs])
north_south

VBox(children=(FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hoverinfo': 'x+y+name',
 …

***

### Relationship between Gender and Nationality (Everyone)

Parallel sets: this time we keep the gender and department information and count artworks instead of artists. We display them by region, following on from the nationality work above.

In [26]:
#create a new Rhizome DF w/ gender + department info
# Demonym -> country lookup table. 'Aalborgenser' / 'Aalborg' are the column names pandas
# assigns when reading the remote CSV (presumably its first record is taken as the header —
# TODO confirm); `missing` supplies extra demonyms under those same column names.
missing = pd.DataFrame({
    'Aalborgenser': ['Korean', 'Native American', 'Canadian Inuit'],
    'Aalborg': ['Korea', 'United States', 'Canada'],
})
parser = pd.read_csv('https://raw.githubusercontent.com/knowitall/chunkedextractor/master/src/main/resources/edu/knowitall/chunkedextractor/demonyms.csv')
corrections = pd.concat([missing, parser])

rhz_parallel = rhz_medium.copy()
rhz_parallel['Nation'] = ''
nats = set(rhz_parallel.Nationality)
for demonym in nats:
    # country of the first lookup row matching this nationality, if there is one
    first_match = corrections[corrections['Aalborgenser'] == demonym][:1]['Aalborg'].values
    if len(first_match) > 0:
        rhz_parallel.loc[rhz_parallel["Nationality"] == demonym, "Nation"] = first_match[0]

# nationalities with no lookup hit keep the '' placeholder; relabel it
rhz_parallel['Nation'] = rhz_parallel['Nation'].replace('', 'missing')
rhz_parallel['country_code'] = rhz_parallel['Nation'].apply(do_fuzzy_search)
rhz_parallel['Divide'] = rhz_parallel['Nation'].apply(getDivide)
rhz_parallel['Region'] = rhz_parallel['Nation'].apply(getRegion)
rhz_parallel = (
    rhz_parallel
    .drop(columns=['ID'])
    .rename(columns={"Medium": "Department"})
)
rhz_parallel['country_code'] = rhz_parallel['country_code'].fillna('missing')
rhz_parallel['Source'] = 'Rhizome'
# fold two Rhizome media into a single 'Media and Performance' label (applied frame-wide)
rhz_parallel = rhz_parallel.replace(['Installation & Performance', 'Net Art'], 'Media and Performance')
rhz_parallel = rhz_parallel[rhz_parallel['Department'] != 'missing']
rhz_parallel.to_pickle(path+'Rhizome_data/pickles/rhizome_parallel.pkl')

In [27]:
#export to excel to manually clean up missing country values
#rhz_parallel.to_excel(path+'Rhizome_data/csv/rhz_parallel.xlsx')

In [43]:
# Replace the in-memory Rhizome frame with the contents of the *_clean.xlsx workbook
# (presumably the manually corrected version of the export above — confirm)
rhz_parallel = pd.read_excel(path+'Rhizome_data/csv/rhz_parallel_clean.xlsx')

In [32]:
# Display only: keep the last row per Artist value. The result is not assigned,
# so rhz_parallel itself is left unchanged.
rhz_parallel.drop_duplicates(subset='Artist', keep='last')

Unnamed: 0,URL,Title,Artist,dateAcquired,dateCreated,Nationality,Gender,Text,Keywords,Department,Nation,country_code,Divide,Region,Source
1,https://artbase.rhizome.org/wiki/Q4089,Zones de Convergence,cicero,2005,2005,missing,missing,zones de convergence treats days summit g8 evi...,"summit, art, august, zones, de, convergence, t...",Moving Images,missing,missing,missing,missing,Rhizome
2,https://artbase.rhizome.org/wiki/Q1475,Zombie and Mummy,"Dragan Espenschied, Olia Lialina",2004,2002,"German, Russian","M, F",zombie mummy website hosted episodic online co...,"zombie, site, olia, espenschied, mummy, advent...",Still Images,Germany,DEU,Global North,Europe,Rhizome
3,https://artbase.rhizome.org/wiki/Q4374,"Zaira, City of Memories",Gokcen Erguven,2004,2004,Turkish,F,project based solely italo calvinos novel call...,"cities, content, city, zaira, project, based, ...",Media and Performance,Turkey,TUR,Global North,Europe,Rhizome
4,https://artbase.rhizome.org/wiki/Q3972,Z_G [zeitgeist gestalten],Tiago Borges,2008,2007,Angolan,M,looking den zeitgeist gestalten design spirit ...,"experience, web, page, den, zeitgeist, gestalt...",Media and Performance,Angola,AGO,Global South,Africa,Rhizome
5,https://artbase.rhizome.org/wiki/Q2580,Z,Antoni Abad,2003,2001,Spanish,M,z freeware fly variable behaviour patterns fly...,"z, fly, online, flies, users, canal, abad, spa...",Media and Performance,Spain,ESP,Global North,Europe,Rhizome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,https://artbase.rhizome.org/wiki/Q4358,1999,joan escofet,2001,2000,missing,missing,straddle3xa0 1999xa0nmeasuring chains construc...,"space, na, reality, may, queer, public, narrat...",Media and Performance,missing,missing,missing,missing,Rhizome
2063,https://artbase.rhizome.org/wiki/Q3761,1969,Rhea Myers,2004,2004,British,F,1969 followup previous project 1968 1968 remix...,"images, remixed, make, december, followup, pre...",Still Images,United Kingdom,GBR,Global North,Europe,Rhizome
2064,https://artbase.rhizome.org/wiki/Q2283,1953,Skye Thorstenson,2003,2002,missing,M,1953 comic series three people trapped room ho...,"year, thorstenson, identity, comic, trapped, r...",Media and Performance,missing,missing,missing,missing,Rhizome
2065,https://artbase.rhizome.org/wiki/Q2511,160,Katie Lips,2005,2005,British,F,160 archive 160 sms messages treasured past 18...,"messages, people, content, sms, mobile, ipod, ...",Media and Performance,United Kingdom,GBR,Global North,Europe,Rhizome


In [28]:
#same for MoMA 
# Uses the `corrections` demonym -> country table ('Aalborgenser' -> 'Aalborg' columns)
moma_parallel = moma_rhz_compare.copy()
moma_parallel['Nation'] = ''
nats_moma = set(moma_parallel.Nationality)
for demonym in nats_moma:
    # country of the first lookup row matching this nationality, if there is one
    first_match = corrections[corrections['Aalborgenser'] == demonym][:1]['Aalborg'].values
    if len(first_match) > 0:
        moma_parallel.loc[moma_parallel["Nationality"] == demonym, "Nation"] = first_match[0]

# nationalities with no lookup hit keep the '' placeholder; relabel it
moma_parallel['Nation'] = moma_parallel['Nation'].replace('', 'missing')
moma_parallel['country_code'] = moma_parallel['Nation'].apply(do_fuzzy_search)
moma_parallel['Divide'] = moma_parallel['Nation'].apply(getDivide)
moma_parallel['Region'] = moma_parallel['Nation'].apply(getRegion)
moma_parallel = moma_parallel.drop(columns=['ThumbnailURL', 'ConstituentID', 'Medium'])
moma_parallel['country_code'] = moma_parallel['country_code'].fillna('missing')
moma_parallel['Source'] = 'MoMA (sampled)'
# relabel MoMA departments as 'Still Images' / 'Moving Images' (applied frame-wide)
moma_parallel = (
    moma_parallel
    .replace(['Photography', 'Drawings & Prints'], 'Still Images')
    .replace(['Film'], 'Moving Images')
)
# drop the departments that are excluded from this comparison
excluded_departments = ['Architecture & Design', 'Fluxus Collection', 'Painting & Sculpture']
moma_parallel = moma_parallel[~moma_parallel['Department'].isin(excluded_departments)]
moma_parallel.to_pickle(path+'MOMA_data/pickle/moma_parallel.pkl')

In [30]:
#export to CSV to manually clean up missing country values
moma_parallel.to_csv(path+'MOMA_data/csv/moma_parallel.csv')

In [44]:
# Reload the MoMA frame from the project's own pickle file
# (read_pickle must only ever be pointed at trusted, locally produced files)
moma_parallel = pd.read_pickle(path+'MOMA_data/pickle/moma_parallel.pkl')

In [45]:
#combine them, remove entries with missing gender/divide info, add a numerical version of gender for parallel viz
# Rhizome: keep rows where both Gender and Divide are known
rhz_parallel_both = rhz_parallel.copy()
_rhz_known = (rhz_parallel_both['Gender'] != 'missing') & (rhz_parallel_both['Divide'] != 'missing')
rhz_parallel_both = rhz_parallel_both.loc[_rhz_known]

# MoMA: only Gender is checked
moma_parallel_both = moma_parallel.copy()
moma_parallel_both = moma_parallel_both.loc[moma_parallel_both['Gender'] != 'missing']

moma_rhz_parallel_both = pd.concat([rhz_parallel_both, moma_parallel_both])

# Gender_ID: 0 = 'M', 1 = 'F', 3 = 'NB'; any value containing a comma
# (several genders listed for one work) becomes 2
_gender_codes = (
    moma_rhz_parallel_both['Gender']
    .replace('M', 0)
    .replace('F', 1)
    .replace('NB', 3)
    .astype(str)
)
_gender_codes = _gender_codes.mask(_gender_codes.str.contains(','), '2')
moma_rhz_parallel_both['Gender_ID'] = _gender_codes.astype(int)

In [53]:
#RHZ // Sets - by region
# Single parallel-categories diagram for Rhizome works from the Global South:
# Region -> Gender -> Department, ribbons coloured by Gender_ID.

# Select the Rhizome rows. The variable names, the 'Collectives' gender category and the
# figure title all describe Rhizome; filtering on 'MoMA (sampled)' plotted the wrong collection.
rhz_only = moma_rhz_parallel_both.loc[moma_rhz_parallel_both['Source'] == 'Rhizome']
#rhz_only = rhz_only.loc[rhz_only['Department'] != 'missing']
rhz_only = rhz_only.loc[rhz_only['Divide'] == 'Global South']
#create regions

# Create dimensions

#nats
nat_dim_asia = go.parcats.Dimension(
    values=rhz_only.Region,
    categoryorder='category ascending', label="Region"
)


#departments

dept_dim_asia = go.parcats.Dimension(values=rhz_only.Department, label="Department")


#genders
# Gender_ID codes 0/1/2 are displayed as Male/Female/Collectives

gender_dim_asia = go.parcats.Dimension(
    values=rhz_only.Gender_ID, label="Gender", categoryarray=[0, 1, 2],
    ticktext=['Male', 'Female', 'Collectives']
)

# Create parcats trace
color_asia = rhz_only.Gender_ID
colorscale_asia = ['#007c91', '#ff5252', '#ce93d8']


rhz_parallel_set_asia = go.Figure(data = [go.Parcats(dimensions=[nat_dim_asia, gender_dim_asia, dept_dim_asia], 
        line={'color': color_asia, 'colorscale': colorscale_asia},
        hoveron='color', hoverinfo='count',
        labelfont={'size': 12},
        tickfont={'size': 10},
        arrangement='freeform')])
rhz_parallel_set_asia.update_traces(dimensions=[{'categoryorder':'category ascending'}])
# last expression of the cell, so the notebook renders the figure
rhz_parallel_set_asia.update_layout(height=1000, title="Artists from the Global South by Region, Gender, and Department - Rhizome", title_x=0.5)


In [47]:
#RHZ // Sets - by region
# Parallel-categories diagrams for Rhizome works from the Global South, one figure per
# region: Nation -> Gender -> Department, ribbons coloured by Gender_ID.

def _rhz_nation_dim(region_rows):
    """Nation dimension, categories in ascending order."""
    return go.parcats.Dimension(
        values=region_rows.Nation,
        categoryorder='category ascending', label="Nation"
    )


def _rhz_department_dim(region_rows):
    """Department dimension."""
    return go.parcats.Dimension(values=region_rows.Department, label="Department")


def _rhz_gender_dim(region_rows):
    """Gender dimension: Gender_ID codes 0/1/2 displayed as Male/Female/Collectives."""
    return go.parcats.Dimension(
        values=region_rows.Gender_ID, label="Gender", categoryarray=[0, 1, 2],
        ticktext=['Male', 'Female', 'Collectives']
    )


def _rhz_parcats_figure(dimensions, line_color, line_colorscale, title):
    """Build one parallel-categories figure from three dimensions and a title."""
    figure = go.Figure(data=[go.Parcats(
        dimensions=dimensions,
        line={'color': line_color, 'colorscale': line_colorscale},
        hoveron='color', hoverinfo='count',
        labelfont={'size': 12},
        tickfont={'size': 10},
        arrangement='freeform',
    )])
    figure.update_traces(dimensions=[{'categoryorder': 'category ascending'}])
    figure.update_layout(height=1000, title=title, title_x=0.5)
    return figure


rhz_only = moma_rhz_parallel_both[moma_rhz_parallel_both['Source'] == 'Rhizome']
rhz_only = rhz_only[rhz_only['Department'] != 'missing']
rhz_only = rhz_only[rhz_only['Divide'] == 'Global South']
#create regions
rhz_only_asia = rhz_only[rhz_only['Region'] == 'Asia & Pacific']
rhz_only_africa = rhz_only[rhz_only['Region'] == 'Africa']
rhz_only_middle_east = rhz_only[rhz_only['Region'] == 'Middle east']
rhz_only_latin = rhz_only[rhz_only['Region'] == 'South/Latin America']

# Create dimensions

#nats
nat_dim_asia = _rhz_nation_dim(rhz_only_asia)
nat_dim_africa = _rhz_nation_dim(rhz_only_africa)
nat_dim_me = _rhz_nation_dim(rhz_only_middle_east)
nat_dim_latin = _rhz_nation_dim(rhz_only_latin)

#departments
dept_dim_asia = _rhz_department_dim(rhz_only_asia)
dept_dim_africa = _rhz_department_dim(rhz_only_africa)
dept_dim_me = _rhz_department_dim(rhz_only_middle_east)
dept_dim_latin = _rhz_department_dim(rhz_only_latin)

#genders
gender_dim_asia = _rhz_gender_dim(rhz_only_asia)
gender_dim_africa = _rhz_gender_dim(rhz_only_africa)
gender_dim_me = _rhz_gender_dim(rhz_only_middle_east)
gender_dim_latin = _rhz_gender_dim(rhz_only_latin)

# Create parcats trace
# ribbon colour follows Gender_ID; every region gets its own copy of the same three colours
color_asia = rhz_only_asia.Gender_ID
colorscale_asia = ['#007c91', '#ff5252', '#ce93d8']

color_me = rhz_only_middle_east.Gender_ID
colorscale_me = ['#007c91', '#ff5252', '#ce93d8']

color_africa = rhz_only_africa.Gender_ID
colorscale_africa = ['#007c91', '#ff5252', '#ce93d8']

color_latin = rhz_only_latin.Gender_ID
colorscale_latin = ['#007c91', '#ff5252', '#ce93d8']


rhz_parallel_set_asia = _rhz_parcats_figure(
    [nat_dim_asia, gender_dim_asia, dept_dim_asia],
    color_asia, colorscale_asia,
    "Artworks from Asia & Pacific - Rhizome",
)

rhz_parallel_set_africa = _rhz_parcats_figure(
    [nat_dim_africa, gender_dim_africa, dept_dim_africa],
    color_africa, colorscale_africa,
    "Artworks from Africa - Rhizome",
)

rhz_parallel_set_me = _rhz_parcats_figure(
    [nat_dim_me, gender_dim_me, dept_dim_me],
    color_me, colorscale_me,
    "Artworks from the Middle east - Rhizome",
)

rhz_parallel_set_latin = _rhz_parcats_figure(
    [nat_dim_latin, gender_dim_latin, dept_dim_latin],
    color_latin, colorscale_latin,
    "Artworks from South/Latin America - Rhizome",
)

#create local copies
#north_south_nats.write_html(path+"Plotly_embeds/rhz_moma_north_south_bars_nats.html", include_plotlyjs='directory')
#rhz_parallel_set_2.write_html(path+"Plotly_embeds/parallel_test.html", include_plotlyjs='directory')

# Wrap as widgets and stack the four regions vertically
rhz_parallel_set_asia = go.FigureWidget(rhz_parallel_set_asia)
rhz_parallel_set_africa = go.FigureWidget(rhz_parallel_set_africa)
rhz_parallel_set_me = go.FigureWidget(rhz_parallel_set_me)
rhz_parallel_set_latin = go.FigureWidget(rhz_parallel_set_latin)
rhz_parallel_together = ipw.VBox([
    rhz_parallel_set_asia,
    rhz_parallel_set_africa,
    rhz_parallel_set_me,
    rhz_parallel_set_latin,
])
rhz_parallel_together

In [132]:
#MoMA // Sets - by region
# Parallel-categories diagrams for MoMA (sampled) works from the Global South, one figure
# per region: Nationality -> Gender -> Department, ribbons coloured by Gender_ID.

def _moma_nationality_dim(region_rows):
    """Nationality dimension, categories in ascending order."""
    return go.parcats.Dimension(
        values=region_rows.Nationality,
        categoryorder='category ascending', label="Nationality"
    )


def _moma_department_dim(region_rows):
    """Department dimension."""
    return go.parcats.Dimension(values=region_rows.Department, label="Department")


def _moma_gender_dim(region_rows):
    """Gender dimension: Gender_ID codes 0/1 displayed as Male/Female."""
    return go.parcats.Dimension(
        values=region_rows.Gender_ID, label="Gender", categoryarray=[0, 1],
        ticktext=['Male', 'Female']
    )


def _moma_parcats_figure(dimensions, line_color, line_colorscale, title):
    """Build one parallel-categories figure from three dimensions and a title."""
    figure = go.Figure(data=[go.Parcats(
        dimensions=dimensions,
        line={'color': line_color, 'colorscale': line_colorscale},
        hoveron='color', hoverinfo='count',
        labelfont={'size': 12},
        tickfont={'size': 10},
        arrangement='freeform',
    )])
    figure.update_traces(dimensions=[{'categoryorder': 'category ascending'}])
    figure.update_layout(height=1000, title=title, title_x=0.5)
    return figure


moma_only = moma_rhz_parallel_both.loc[moma_rhz_parallel_both['Source'] == 'MoMA (sampled)']
# Narrow the MoMA subset itself. Filtering the combined frame a second time would throw
# away the Source filter above and let Rhizome rows back into the MoMA figures.
moma_only = moma_only.loc[moma_only['Divide'] == 'Global South']
#create regions
moma_only_asia = moma_only.loc[moma_only['Region'] == 'Asia & Pacific']
moma_only_africa = moma_only.loc[moma_only['Region'] == 'Africa']
moma_only_middle_east = moma_only.loc[moma_only['Region'] == 'Middle east']
moma_only_latin = moma_only.loc[moma_only['Region'] == 'South/Latin America']
moma_only_arab = moma_only.loc[moma_only['Region'] == 'Arab States']

# Create dimensions

#nats
moma_nat_dim_asia = _moma_nationality_dim(moma_only_asia)
moma_nat_dim_africa = _moma_nationality_dim(moma_only_africa)
moma_nat_dim_me = _moma_nationality_dim(moma_only_middle_east)
moma_nat_dim_latin = _moma_nationality_dim(moma_only_latin)
moma_nat_dim_arab = _moma_nationality_dim(moma_only_arab)

#departments
moma_dept_dim_asia = _moma_department_dim(moma_only_asia)
moma_dept_dim_africa = _moma_department_dim(moma_only_africa)
moma_dept_dim_me = _moma_department_dim(moma_only_middle_east)
moma_dept_dim_latin = _moma_department_dim(moma_only_latin)
moma_dept_dim_arab = _moma_department_dim(moma_only_arab)

#genders
moma_gender_dim_asia = _moma_gender_dim(moma_only_asia)
moma_gender_dim_africa = _moma_gender_dim(moma_only_africa)
moma_gender_dim_me = _moma_gender_dim(moma_only_middle_east)
moma_gender_dim_latin = _moma_gender_dim(moma_only_latin)
moma_gender_dim_arab = _moma_gender_dim(moma_only_arab)

# Create parcats trace
# ribbon colour follows Gender_ID; the colorscale_* names are kept as in the original cell
moma_color_asia = moma_only_asia.Gender_ID
colorscale_asia = ['#007c91', '#ff5252']

moma_color_africa = moma_only_africa.Gender_ID
colorscale_africa = ['#007c91', '#ff5252']

moma_color_me = moma_only_middle_east.Gender_ID
colorscale_me = ['#007c91', '#ff5252']

moma_color_latin = moma_only_latin.Gender_ID
colorscale_latin = ['#007c91', '#ff5252']

moma_color_arab = moma_only_arab.Gender_ID
colorscale_arab = ['#007c91', '#ff5252']


# Every figure is built from the moma_* dimensions and colours defined in this cell.
# The un-prefixed nat_dim_*/gender_dim_*/dept_dim_*/color_* names are not defined here
# (there is no *_arab variant at all), so using them drew non-MoMA data or raised NameError.
moma_parallel_set_asia = _moma_parcats_figure(
    [moma_nat_dim_asia, moma_gender_dim_asia, moma_dept_dim_asia],
    moma_color_asia, colorscale_asia,
    "Artworks from Asia & Pacific - MoMA (sampled)",
)

moma_parallel_set_africa = _moma_parcats_figure(
    [moma_nat_dim_africa, moma_gender_dim_africa, moma_dept_dim_africa],
    moma_color_africa, colorscale_africa,
    "Artworks from Africa - MoMA (sampled)",
)

moma_parallel_set_me = _moma_parcats_figure(
    [moma_nat_dim_me, moma_gender_dim_me, moma_dept_dim_me],
    moma_color_me, colorscale_me,
    "Artworks from the Middle east - MoMA (sampled)",
)

moma_parallel_set_latin = _moma_parcats_figure(
    [moma_nat_dim_latin, moma_gender_dim_latin, moma_dept_dim_latin],
    moma_color_latin, colorscale_latin,
    "Artworks from South/Latin America - MoMA (sampled)",
)

moma_parallel_set_arab = _moma_parcats_figure(
    [moma_nat_dim_arab, moma_gender_dim_arab, moma_dept_dim_arab],
    moma_color_arab, colorscale_arab,
    "Artworks from Arab States - MoMA (sampled)",
)

#create local copies
#north_south_nats.write_html(path+"Plotly_embeds/rhz_moma_north_south_bars_nats.html", include_plotlyjs='directory')
#rhz_parallel_set_2.write_html(path+"Plotly_embeds/parallel_test.html", include_plotlyjs='directory')

# Wrap as widgets and stack the five regions vertically
moma_parallel_set_asia = go.FigureWidget(moma_parallel_set_asia)
moma_parallel_set_africa = go.FigureWidget(moma_parallel_set_africa)
moma_parallel_set_me = go.FigureWidget(moma_parallel_set_me)
moma_parallel_set_latin = go.FigureWidget(moma_parallel_set_latin)
moma_parallel_set_arab = go.FigureWidget(moma_parallel_set_arab)
moma_parallel_together = ipw.VBox([moma_parallel_set_asia, moma_parallel_set_africa, moma_parallel_set_me, moma_parallel_set_latin, moma_parallel_set_arab])
moma_parallel_together

VBox(children=(FigureWidget({
    'data': [{'arrangement': 'freeform',
              'dimensions': [{'category…

In [51]:
# Row counts per gender, region and collection; comma-separated genders form one group
_count_columns = ['Nation', 'Gender', 'Divide', 'Region', 'Source']
test = moma_rhz_parallel_both[_count_columns].copy()
test['Counts'] = 1
_several_genders = test.Gender.str.contains(',')
test.loc[_several_genders, 'Gender'] = 'Collectives & Collabs'
test = (
    test
    .groupby(['Gender', 'Region', 'Source'], as_index=False)
    .agg({'Counts': 'sum'})
)

In [52]:
# Display only: the aggregated rows whose Gender is 'NB'
test.loc[test['Gender'] == 'NB']

Unnamed: 0,Gender,Region,Source,Counts
30,NB,Europe,MoMA (sampled),1
31,NB,Europe,Rhizome,2
32,NB,North America,MoMA (sampled),9


In [60]:
# Bubble chart of row counts per Gender (x) and Region (y), one colour per collection.
# The title names both collections and Region, which is what `test` contains and what is
# plotted; the previous title mentioned only Rhizome and Nationality.
fig43 = px.scatter(test,
                 y="Region", x="Gender", size="Counts", color="Source",
                 opacity = 0.3,
                 title="Artworks by Region and Gender - Rhizome vs MoMA (sampled)")
# the argument-less update_xaxes() / update_layout() calls changed nothing and were dropped
fig43.update_yaxes(matches=None)
fig43.show()

In [196]:
#add source info
# NOTE(review): elsewhere in this notebook Gender_ID encodes 'M' as 0 and 'F' as 1, whereas
# the labels below turn 1 into 'Male' and 0 into 'Female'. moma_rhz_parallel_both_south is
# not built in this part of the notebook — confirm which encoding its Gender column uses.
bubble_south = moma_rhz_parallel_both_south.copy()
bubble_south['Gender'] = bubble_south['Gender'].replace(1, 'Male').replace(0, 'Female')
_year_columns = ['dateAcquired', 'dateCreated']
bubble_south[_year_columns] = bubble_south[_year_columns].astype(int)
#clean from 0
# keep works created from 2000 onwards that have a non-zero acquisition year
_keep_rows = (bubble_south['dateCreated'] >= 2000) & (bubble_south['dateAcquired'] != 0)
bubble_south = bubble_south[_keep_rows]

#group by
# one row per (Source, Gender, dateCreated, dateAcquired, Nationality) with its row count in 'size'
bubble_south = (
    bubble_south
    .groupby(['Source', 'Gender', 'dateCreated', 'dateAcquired', 'Nationality'])
    .size()
    .to_frame(name='size')
    .reset_index()
)


In [197]:
#create bubble chart w/facet
# One panel per Source: acquisition year on x, creation year on y, bubble area = group size,
# colour = Nationality, marker symbol = Gender.
fig = px.scatter(
    bubble_south,
    y='dateCreated',
    x='dateAcquired',
    color='Nationality',
    size='size',
    symbol='Gender',
    facet_col='Source',
    title= "Acquisitions of Artworks from the Global South across Collection, by Gender and Nationality",
    template='seaborn',
)
#green line before and after
#fig.add_hline(y=2007, line_dash="dot", line_width=3, annotation_text="2007", annotation_position="bottom left", annotation_font_size=10, annotation_font_color="blue")
#fig.add_vline(x=2007, line_dash="dot", line_width=3)
#OR add inside scatter the faceting
#facet_col='Period', category_orders={"Period": ["Before 1980", "After 1980"]}
#trendline="ols"
fig.update_layout(width=1400)
fig.show()