# Analysis of the Collection of Artworks at MoMA Dataset 

How has the proportion of aquisitions of artworks created by women changed over time?

In [127]:
import pandas as pd

In [128]:
#importing artists dataset
df = pd.read_csv('artists.csv')

In [129]:
df.head()

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930.0,1992.0
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,
2,3,Bill Arnold,American,Male,1941.0,
3,4,Charles Arnoldi,American,Male,1946.0,
4,5,Per Arnoldi,Danish,Male,1941.0,


In [None]:
#converting "Birth Year" and "Death Year" Columns to integer

In [130]:
df["Birth Year Int"] = df["Birth Year"].fillna(value=0).astype(int)

In [532]:
df["Death Year Int"] = df["Death Year"].fillna(value=0).astype(int)

In [None]:
#check the datatypes of the dataframe

In [133]:
df.dtypes

Artist ID           int64
Name               object
Nationality        object
Gender             object
Birth Year        float64
Death Year        float64
Birth Year Int      int64
Death Year Int      int64
dtype: object

In [None]:
#create a nationality subset and count the frequecy of each nationality

In [135]:
df_nat_u = df.groupby('Nationality').size().reset_index(name='Freq')

In [136]:
df_nat_u

Unnamed: 0,Nationality,Freq
0,Afghan,1
1,Albanian,4
2,Algerian,6
3,American,5198
4,Angolan,1
...,...,...
120,Vietnamese,7
121,Welsh,2
122,Yugoslav,24
123,Zimbabwean,4


In [137]:
df_nat_u_sort = df_nat_u.sort_values(by=['Freq'])

In [138]:
df_nat_u_sort

Unnamed: 0,Nationality,Freq
0,Afghan,1
36,Emirati,1
52,Indonesian,1
58,Ivorian,1
63,Kuwaiti,1
...,...,...
57,Italian,531
14,British,835
41,French,839
43,German,930


In [37]:
import plotly.graph_objects as go
import plotly.express as px

In [541]:
fig = go.Figure(data=go.Bar(x=df_nat_u_sort.Nationality,
            y=df_nat_u_sort.Freq), layout=go.Layout(
        title=go.layout.Title(text="Nationalities")
    ))
fig.show()

In [None]:
#create a gender subset and count the frequencies

In [148]:
genders = pd.DataFrame(df['Gender'].value_counts())

In [149]:
genders

Unnamed: 0,Gender
Male,9820
Female,2193
male,6


In [None]:
#rename missing data to "gender unknown"

In [150]:
gender_unknown = df['Gender'].isna().sum()

In [151]:
gender_unknown

3072

In [None]:
#add the "male" category to "Male"

In [152]:
genders.iloc[0] = genders.iloc[0] + genders.iloc[2]

In [153]:
genders = genders.drop('male')

In [154]:
genders.loc['Gender Unknown'] = gender_unknown

In [155]:
genders

Unnamed: 0,Gender
Male,9826
Female,2193
Gender Unknown,3072


In [549]:
fig = go.Figure(data=go.Pie(labels = genders.index, values = genders.Gender), layout=go.Layout(
        title=go.layout.Title(text="Genders")))
fig.show()

In [None]:
#birth year scatter chart

In [550]:
fig = go.Figure(data=go.Scatter(
                x=df['Birth Year'],
                hovertext=df['Name'],
                mode='markers'), layout=go.Layout(
        title=go.layout.Title(text="Birth Years")))
fig.show()

In [None]:
#female only birth year scatter chart

In [175]:
female_by = df[df.Gender != "Male"]

In [None]:
female_by = female_by[df.Gender != "male"]

In [None]:
female_by = female_by[df['Gender'].notna()]

In [187]:
female_by

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Birth Year Int,Death Year Int
8,10,Irene Aronson,American,Female,1918.0,,1918,0
16,21,Ruth Asawa,American,Female,1926.0,2013.0,1926,2013
17,22,Isidora Aschheim,Israeli,Female,,,0,0
23,28,Geneviève Asse,French,Female,1923.0,,1923,0
25,31,Dana Atchley,American,Female,1941.0,2000.0,1941,2000
...,...,...,...,...,...,...,...,...
15013,50154,Ann Magnuson,American,Female,1956.0,,1956,0
15022,67012,Ka Markelius,,Female,,,0,0
15034,67122,Giorgia Lupi,Italian,Female,1981.0,,1981,0
15044,67272,Toyin Ojih Odutola,American,Female,1985.0,,1985,0


In [551]:
fig = go.Figure(data=go.Scatter(
                x=female_by['Birth Year'],
                hovertext=female_by['Name'],
                mode='markers'), layout=go.Layout(
        title=go.layout.Title(text=" Female Birth Years")))
fig.show()

In [None]:
#reading the artworks dataset

In [346]:
artworks = pd.read_csv('artworks.csv')

In [552]:
artworks.head()

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Medium,Dimensions,Acquisition Date,Credit,Catalogue,...,Diameter (cm),Circumference (cm),Height (cm),Length (cm),Width (cm),Depth (cm),Weight (kg),Duration (s),Gender,Parsed_date
0,2.0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",6210,Otto Wagner,1896,Ink and cut-and-pasted painted pages on paper,"19 1/8 x 66 1/2"" (48.6 x 168.9 cm)",1996-04-09,Fractional and promised gift of Jo Carole and ...,Y,...,,,48.6,,168.9,,,,Male,1996-04-09
1,3.0,"City of Music, National Superior Conservatory ...",7470,Christian de Portzamparc,1987,Paint and colored pencil on print,"16 x 11 3/4"" (40.6 x 29.8 cm)",1995-01-17,Gift of the architect in honor of Lily Auchinc...,Y,...,,,40.6401,,29.8451,,,,Male,1995-01-17
2,4.0,"Villa near Vienna Project, Outside Vienna, Aus...",7605,Emil Hoppe,1903,"Graphite, pen, color pencil, ink, and gouache ...","13 1/2 x 12 1/2"" (34.3 x 31.8 cm)",1997-01-15,Gift of Jo Carole and Ronald S. Lauder,Y,...,,,34.3,,31.8,,,,Male,1997-01-15
3,5.0,"The Manhattan Transcripts Project, New York, N...",7056,Bernard Tschumi,1980,Photographic reproduction with colored synthet...,"20 x 20"" (50.8 x 50.8 cm)",1995-01-17,Purchase and partial gift of the architect in ...,Y,...,,,50.8,,50.8,,,,Male,1995-01-17
4,6.0,"Villa, project, outside Vienna, Austria, Exter...",7605,Emil Hoppe,1903,"Graphite, color pencil, ink, and gouache on tr...","15 1/8 x 7 1/2"" (38.4 x 19.1 cm)",1997-01-15,Gift of Jo Carole and Ronald S. Lauder,Y,...,,,38.4,,19.1,,,,Male,1997-01-15


In [196]:
#map the gender of the artist to each artwork

In [210]:
"""checking the datatypes, we see that Artist ID is an object in the artworks dataframe, 
while it is an integer in the artworks df, so map throws an error
"""

In [232]:
artworks.dtypes

Artwork ID              int64
Title                  object
Artist ID              object
Name                   object
Date                   object
Medium                 object
Dimensions             object
Acquisition Date       object
Credit                 object
Catalogue              object
Department             object
Classification         object
Object Number          object
Diameter (cm)         float64
Circumference (cm)    float64
Height (cm)           float64
Length (cm)           float64
Width (cm)            float64
Depth (cm)            float64
Weight (kg)           float64
Duration (s)          float64
dtype: object

In [358]:
#the attempt to convert it throws another error

In [348]:
artworks['Artist ID'] = pd.to_numeric(artworks['Artist ID'])

ValueError: Unable to parse string "6969, 8134" at position 65

In [None]:
#we see that there can be multiple Artist IDs for an artwork

In [268]:
#loop over each Artist ID cell
#if there is more than one ID, take the first one

In [366]:
for index, row in artworks.iterrows():
    if isinstance(row['Artist ID'], str):
        x = row['Artist ID'].split(',')
        if len(x) > 1:
            artworks.at[index, 'Artist ID'] = x[0]
            

In [553]:
artworks.head()

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Medium,Dimensions,Acquisition Date,Credit,Catalogue,...,Diameter (cm),Circumference (cm),Height (cm),Length (cm),Width (cm),Depth (cm),Weight (kg),Duration (s),Gender,Parsed_date
0,2.0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",6210,Otto Wagner,1896,Ink and cut-and-pasted painted pages on paper,"19 1/8 x 66 1/2"" (48.6 x 168.9 cm)",1996-04-09,Fractional and promised gift of Jo Carole and ...,Y,...,,,48.6,,168.9,,,,Male,1996-04-09
1,3.0,"City of Music, National Superior Conservatory ...",7470,Christian de Portzamparc,1987,Paint and colored pencil on print,"16 x 11 3/4"" (40.6 x 29.8 cm)",1995-01-17,Gift of the architect in honor of Lily Auchinc...,Y,...,,,40.6401,,29.8451,,,,Male,1995-01-17
2,4.0,"Villa near Vienna Project, Outside Vienna, Aus...",7605,Emil Hoppe,1903,"Graphite, pen, color pencil, ink, and gouache ...","13 1/2 x 12 1/2"" (34.3 x 31.8 cm)",1997-01-15,Gift of Jo Carole and Ronald S. Lauder,Y,...,,,34.3,,31.8,,,,Male,1997-01-15
3,5.0,"The Manhattan Transcripts Project, New York, N...",7056,Bernard Tschumi,1980,Photographic reproduction with colored synthet...,"20 x 20"" (50.8 x 50.8 cm)",1995-01-17,Purchase and partial gift of the architect in ...,Y,...,,,50.8,,50.8,,,,Male,1995-01-17
4,6.0,"Villa, project, outside Vienna, Austria, Exter...",7605,Emil Hoppe,1903,"Graphite, color pencil, ink, and gouache on tr...","15 1/8 x 7 1/2"" (38.4 x 19.1 cm)",1997-01-15,Gift of Jo Carole and Ronald S. Lauder,Y,...,,,38.4,,19.1,,,,Male,1997-01-15


In [None]:
# now we can map the gender to each artwork

In [None]:
artworks['Gender'] = artworks['Artist ID'].map(df.set_index('Artist ID')['Gender'])

In [371]:
artworks.loc[artworks['Name'] == 'Robert Arneson']

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Medium,Dimensions,Acquisition Date,Credit,Catalogue,...,Object Number,Diameter (cm),Circumference (cm),Height (cm),Length (cm),Width (cm),Depth (cm),Weight (kg),Duration (s),Gender
32062,33599.0,Study for Head Bath,1,Robert Arneson,1977,Conté crayon and pencil on paper,"41 5/8 x 29 7/8"" (105.8 x 75.8 cm)",1981-04-28,Gift of the Friends of Contemporary Drawing,Y,...,67.1981,,,105.7,,75.9,,,,Male
60218,64139.0,General Nuke,1,Robert Arneson,1986,Lithograph,"composition (irreg.): 31 11/16 x 23 5/16"" (80....",1997-05-28,Gift of Landfall Press,Y,...,192.1997,,,80.5,,59.2,,,,Male


In [None]:
#parsing Aquisition Date

In [None]:
artworks['Parsed_date'] = pd.to_datetime(artworks['Acquisition Date'], format='%Y-%m-%d')

In [554]:
artworks.tail(10)

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Medium,Dimensions,Acquisition Date,Credit,Catalogue,...,Diameter (cm),Circumference (cm),Height (cm),Length (cm),Width (cm),Depth (cm),Weight (kg),Duration (s),Gender,Parsed_date
"67310, 67311",,,67310,,,,,,,,...,,,,,,,,,,NaT
"67315, 67316",,,67315,,,,,,,,...,,,,,,,,,,NaT
"67318, 67319",,,67318,,,,,,,,...,,,,,,,,,,NaT
"67320, 67321",,,67320,,,,,,,,...,,,,,,,,,,NaT
"67336, 67337",,,67336,,,,,,,,...,,,,,,,,,,NaT
"32196, 46463",,,32196,,,,,,,,...,,,,,,,,,Male,NaT
"67352, 67353",,,67352,,,,,,,,...,,,,,,,,,,NaT
"24409, 34803, 49699, 24885, 35641",,,24409,,,,,,,,...,,,,,,,,,,NaT
"24409, 49077, 40921, 44883",,,24409,,,,,,,,...,,,,,,,,,,NaT
"42821, 2928",,,42821,,,,,,,,...,,,,,,,,,,NaT


In [None]:
#drop empty rows and sort

In [391]:
artworks_noNaT = artworks.dropna(subset=['Parsed_date'])

In [394]:
artworks_noNaT.tail()

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Medium,Dimensions,Acquisition Date,Credit,Catalogue,...,Diameter (cm),Circumference (cm),Height (cm),Length (cm),Width (cm),Depth (cm),Weight (kg),Duration (s),Gender,Parsed_date
130257,217983.0,Seul/NY/MAX,4469,Nam June Paik,,VHS,,2008-10-08,The Gilbert and Lila Silverman Fluxus Collecti...,N,...,,,0.0,,0.0,0.0,,,Male,2008-10-08
130258,217984.0,Fluxus-Manifestatie in en Rond Kunsthandel Monet,0,,1962.0,VHS,,2008-10-08,The Gilbert and Lila Silverman Fluxus Collecti...,N,...,,,0.0,,0.0,0.0,,,,2008-10-08
130259,217985.0,Fluxphone Compositions,67695,Ely Ramen,1969.0,Cassette,,2008-10-08,The Gilbert and Lila Silverman Fluxus Collecti...,N,...,,,0.0,,0.0,0.0,,,,2008-10-08
130260,217986.0,Unidentified,0,,,Cassette,,2008-10-08,The Gilbert and Lila Silverman Fluxus Collecti...,N,...,,,0.0,,0.0,0.0,,,,2008-10-08
130261,218011.0,Portrait of Takako Saito,21398,George Maciunas,,Gelatin silver print,"image: 7 1/2 × 7 1/2"" (19.1 × 19.1 cm); sheet:...",2008-10-08,The Gilbert and Lila Silverman Fluxus Collecti...,N,...,,,0.0,,0.0,0.0,,,Male,2008-10-08


In [416]:
sbd = artworks_noNaT.sort_values(by='Parsed_date')

In [None]:
#creating a subset of Aquisition Date and Gender, fill empty cells with "Unknown Gender" and change "male" to "Male"

In [438]:
d_g = sbd[['Parsed_date', 'Gender']]

In [None]:
d_g["Gender"] = d_g["Gender"].fillna(value="Unknown Gender")

In [None]:
d_g["Gender"] = d_g["Gender"].replace(['male'], 'Male')

In [596]:
d_g.tail(50)

Unnamed: 0,Parsed_date,Gender
129519,2016-11-14,Female
129518,2016-11-14,Female
129511,2016-11-14,Female
129512,2016-11-14,Female
129513,2016-11-14,Female
129514,2016-11-14,Female
129515,2016-11-14,Female
129933,2016-11-14,Male
130159,2016-11-14,Male
129966,2016-11-14,Male


In [469]:
# group cells with the same date and count values

In [471]:
dgt2 = pd.DataFrame(d_g.groupby(['Parsed_date', 'Gender']).size().unstack())

In [515]:
dgt2 = dgt2.reset_index()

In [520]:
dgt2 = dgt2.fillna(value = 0)

In [None]:
#calculating the cumulative sum for each date and gender

In [None]:
dgt2['Male_sum'] = dgt2['Male_sum'].cumsum()

In [528]:
dgt2['Female_sum'] = dgt2['Female_sum'].cumsum()

In [None]:
dgt2['Unknown Gender_sum'] = dgt2['Unknown Gender_sum'].cumsum()

In [529]:
dgt2.head(50)

Gender,Parsed_date,Female,Male,Unknown Gender,Female_sum,Male_sum,Unknown Gender_sum
0,1929-11-19,0.0,9.0,0.0,0.0,9.0,0.0
1,1930-01-12,0.0,3.0,0.0,0.0,21.0,0.0
2,1930-04-02,0.0,2.0,0.0,0.0,35.0,0.0
3,1930-06-08,0.0,1.0,0.0,0.0,50.0,0.0
4,1930-10-23,0.0,2.0,0.0,0.0,67.0,0.0
5,1931-01-15,0.0,1.0,0.0,0.0,85.0,0.0
6,1931-01-19,0.0,1.0,0.0,0.0,104.0,0.0
7,1931-03-11,0.0,1.0,0.0,0.0,124.0,0.0
8,1932-01-01,0.0,2.0,0.0,0.0,146.0,0.0
9,1932-02-24,0.0,2.0,0.0,0.0,170.0,0.0


In [476]:
dgt3 = dgt2.fillna(value=0)

In [481]:
dgt4 = dgt3.reset_index()

In [None]:
#line chart without cumsum

In [603]:
fig = px.line(title='Aquisitions by Gender')
fig.add_trace(go.Scatter(x=dgt4["Parsed_date"], y=dgt4["Female"],
                 name='Female'))
fig.add_trace(go.Scatter(x=dgt4["Parsed_date"], y=dgt4["Male"],
                 name='Male'))
fig.add_trace(go.Scatter(x=dgt4["Parsed_date"], y=dgt4["Unknown Gender"],
                 name='Unknown Gender'))
fig.show()

In [None]:
#line chart with cumsum

In [604]:
fig = px.line(title='Cumulative Aquisitions by Gender')
fig.add_trace(go.Scatter(x=dgt2["Parsed_date"], y=dgt2["Female_sum"],
                 name='Female'))
fig.add_trace(go.Scatter(x=dgt2["Parsed_date"], y=dgt2["Male_sum"],
                 name='Male'))
fig.add_trace(go.Scatter(x=dgt2["Parsed_date"], y=dgt2["Unknown Gender_sum"],
                 name='Unknown Gender'))
fig.show()

In [None]:
#stacked area chart Cumulative Aquisitions by Gender

In [607]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=dgt2["Parsed_date"], y=dgt2["Unknown Gender_sum"],
    mode='lines',
    line=dict(width=0.5, color='rgb(127, 166, 238)'),
    stackgroup='one',
    name="Unknown Gender"
))

fig.add_trace(go.Scatter(
    x=dgt2["Parsed_date"], y=dgt2["Female_sum"],
    mode='lines',
    line=dict(width=0.5, color='rgb(111, 231, 219)'),
    stackgroup='one',
    name="Female"
))

fig.add_trace(go.Scatter(
    x=dgt2["Parsed_date"], y=dgt2["Male_sum"],
    mode='lines',
    line=dict(width=0.5, color='rgb(184, 247, 212)'),
    stackgroup='one',
    groupnorm='percent',
    name="Male"
))

fig.update_layout(
    showlegend=True,
    yaxis=dict(
        type='linear',
        range=[1, 100],
        ticksuffix='%'))


fig.update_layout(
    xaxis_tickformat = '%d %B %Y'
)

fig.show()