In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Relative path of the raw Mia collection-objects CSV read by the next cell.
csv_file = 'resources/Mia_objects_raw.csv'

In [3]:
# Read the raw export, using the CSV's 'Unnamed: 0' column as the row index,
# then preview the first rows.
index_column = 'Unnamed: 0'
df = pd.read_csv(csv_file, index_col=index_column)
df.head()

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,life_date,medium,nationality,object_name,provenance,room,style,title
0,10.1,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c.1888-89,Prints and Drawings,0,"English, 1833 - 1911","Pen and ink, brush and wash over graphite",English,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Air, from the series The Four Elements"
1,10.2,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,1,"English, 1833 - 1911","Pen and ink, brush and wash over graphite",English,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Earth, from the series The Four Elements"
2,10.3,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,2,"English, 1833 - 1911","Pen and ink, brush and wash over graphite",English,Drawing,"[Art dealer, London, acquired from ""an old hou...",G352,19th century,"Fire, from the series The Four Elements"
3,10.4,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,3,"English, 1833 - 1911","Pen and ink, brush and wash over graphite",English,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Water, from the series The Four Elements"
4,13.29,Walter Shirlaw,Drawings,North America,United States,Gift of Mrs. Florence M. Shirlaw,,19th century,Prints and Drawings,4,"American, 1838 - 1909",Watercolor,American,Drawing,,Not on View,19th century,Montana Indian Reservation I


In [4]:
# Remove the leading "P."/"p." marker used on prints & drawings.
# lstrip() removes any leading run of the listed characters, so the two passes
# are kept separate and in their original order.
accession_number = df['accession_number'].str.lstrip('P.').str.lstrip('p.')
df['accession_number'] = accession_number

# Whatever precedes the first '.' becomes the accession-year candidate.
df['accession_year'] = accession_number.str.split('.', n=1, expand=True)[0]

# Accession Number Cleaning

### RBL/RB/X
Adding columns for unknown accession letters

In [5]:
# One flag column per letter marker found in accession_year. Each new column
# holds the matched text, or NaN where the pattern does not match.
prefix_patterns = (
    ('RBL', r'(RBL)'),  # 'RBL' anywhere in the value
    ('RB', r'(RB)'),    # 'RB' anywhere in the value (so it also matches inside 'RBL')
    ('X', r'(^X)'),     # 'X' only at the very start of the value
)
for column_name, pattern in prefix_patterns:
    df[column_name] = df['accession_year'].str.extract(pattern, expand=True)

### L
Adding column for loaned objects

In [6]:
# Loan flag: 'L' where accession_year starts with that letter, NaN otherwise.
starts_with_l = df['accession_year'].str.extract(r'(^L)', expand=True)
df['loan'] = starts_with_l

Stripping letters from accession_year

In [7]:
# Remove leading letters from accession_year. lstrip() works on a character
# set, so the first pass drops any leading run of 'R', 'B' and 'L' (which
# covers the RBL, RB and L prefixes); the second pass drops leading 'X'.
letters_removed = df['accession_year'].str.lstrip('RBL').str.lstrip('X')
df['accession_year'] = letters_removed

### Filter out invalid years

Filter dataframe two ways:
- Filter out accession years longer than 4 digits (years go up to 4 digits e.g. 2019)
- Use groupby to filter out any accession years with two or fewer entries (only years with more than two entries are kept)

In [8]:
# replace all empty cells with None
# NOTE(review): when value is None, older pandas 1.x releases are documented to
# fall back to method='pad' (forward-fill from the previous row) instead of
# inserting None -- confirm which behaviour the installed pandas gives. The
# len()-based filter in the next cell cannot handle None/NaN values, so the
# two cells must be changed together if this is switched to a real missing value.
df = df.replace('', None)

In [9]:
# Split rows by the length of accession_year: a year has at most 4 characters
# (e.g. 2019), so a longer value cannot be a plain year.
# .str.len() returns NaN for a missing value, whereas len() raises TypeError;
# NaN fails the "< 5" test, so rows with a missing year land in df_2 alongside
# the over-long values. String values are split exactly as before.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]    # up to 4 characters
df_2 = df[~is_year_sized]   # more than 4 characters, or missing

In [10]:
# Split df_1 by how often each accession_year occurs:
#   df_grouped   -> years that occur more than twice
#   df_remainder -> years that occur once or twice
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]

### Check filtered dataframe for invalid accession years

- use df_grouped.accession_year.value_counts() to find non-year accession year values and see if there is information
in creditline or provenance
- update accession_year in original dataframe and re-filter, then recheck grouped dataframe

In [25]:
# Frequency of every accession_year kept in df_grouped (the main dataframe);
# scan the listing for values that are not years.
df_grouped['accession_year'].value_counts()

99      7717
16      5066
2013    4006
2003    3731
98      3550
2007    3121
2004    3066
2002    2864
96      2650
2015    2625
2001    2490
2010    2356
97      2306
2005    2180
74      2019
95      1960
2000    1957
2017    1828
2018    1771
2014    1771
2012    1771
82      1682
77      1677
90      1672
2016    1651
2006    1625
81      1581
94      1468
75      1361
2008    1323
        ... 
45       125
44       123
24       109
46       106
26       106
09       105
20       104
29       101
32        99
43        92
13        78
22        63
55        62
36        60
33        57
59        57
39        55
49        48
60        47
57        46
38        45
18        37
48        37
53        37
52        34
19        30
121       19
10         6
12         5
11         3
Name: accession_year, Length: 112, dtype: int64

In [11]:
# Inspect the creditline of the rows whose accession_year is "234".
# (121 was checked as well and gave no useful information.)
is_234 = df['accession_year'] == "234"
df_test = df[is_234]
df_test['creditline'].all()

'The William M. Ladd Collection\r\nGift of Herschel V. Jones, 1916'

In [12]:
# Find every row in the full dataframe that carries this exact creditline.
ladd_creditline = 'The William M. Ladd Collection\r\nGift of Herschel V. Jones, 1916'
has_ladd_creditline = df['creditline'] == ladd_creditline
df_test_2 = df[has_ladd_creditline]
# count (together with df_test_2.head()) was used to double check the data
df_test_2.count()
# The creditline gives 1916, so set accession_year to '16' on all matching rows.
df.loc[has_ladd_creditline, 'accession_year'] = '16'

In [17]:
# Re-split the updated dataframe.
# Length test: a year has at most 4 characters. .str.len() returns NaN for a
# missing value (len() would raise TypeError), so such rows fail "< 5" and go
# to df_2 together with the over-long values.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]
df_2 = df[~is_year_sized]
# Frequency test: years that occur more than twice stay in df_grouped; years
# that occur once or twice go to df_remainder.
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]

### Manually Updating Using Creditline info
Using creditline info, update accession year when possible for objects filtered out of dataframe with invalid accession years

In [32]:
# Pool the over-long values (df_2) with the rare ones (df_remainder) into one
# dataframe for manual review. pd.concat is used because DataFrame.append is
# deprecated since pandas 1.4 and removed in pandas 2.0.
# Note: df_remainder is reassigned from itself, so running this cell again
# without first re-running the filter cell adds the df_2 rows a second time.
df_remainder = pd.concat([df_remainder, df_2])
# Ten most frequent creditlines among the rows still to be resolved.
df_remainder['creditline'].value_counts().head(10)

The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund, 1966       22719
Gift of Mrs. Charles C. Bovey, 1924                                       9397
Gift of George A. Goddard, 1919                                           1842
Gift of H. V. Jones                                                       1554
The William M. Ladd Collection\r\nGift of Herschel V. Jones, 1916\r\n      975
Gift of Mrs. George P. Douglas, 1946                                       801
Gift of Herschel V. Jones, 1926                                            693
Gift of Mrs. George P. Douglas, 1929                                       636
Gift of Mrs.C.C.Bovey, 1924                                                489
Gift of Mr. R. E. Lewis, 1965                                              459
Name: creditline, dtype: int64

In [37]:
# Creditlines that end in a year, mapped to the two-digit accession_year to
# assign. Matching is exact, so spelling variants and a trailing '\r\n' are
# listed as separate keys.
creditline_to_year = {
    'The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund, 1966': '66',
    'Gift of Mrs. Charles C. Bovey, 1924': '24',
    'Gift of George A. Goddard, 1919': '19',
    'The William M. Ladd Collection\r\nGift of Herschel V. Jones, 1916\r\n': '16',
    'Gift of Mrs. George P. Douglas, 1946': '46',
    'Gift of Herschel V. Jones, 1926': '26',
    'Gift of Mrs. George P. Douglas, 1929': '29',
    'Gift of Mrs.C.C.Bovey, 1924': '24',
    'Gift of Mr. R. E. Lewis, 1965': '65',
}
# Overwrite accession_year on every row whose creditline equals a key.
for creditline_text, year_code in creditline_to_year.items():
    df.loc[df['creditline'] == creditline_text, 'accession_year'] = year_code

In [54]:
# Re-split the updated dataframe.
# Length test: a year has at most 4 characters. .str.len() returns NaN for a
# missing value (len() would raise TypeError), so such rows fail "< 5" and go
# to df_2 together with the over-long values.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]
df_2 = df[~is_year_sized]
# Frequency test: years that occur more than twice stay in df_grouped; years
# that occur once or twice go to df_remainder.
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]
# Pool the rare and over-long rows for manual review. pd.concat is used because
# DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0.
df_remainder = pd.concat([df_remainder, df_2])
# Eleven most frequent creditlines still to be resolved.
df_remainder['creditline'].value_counts().head(11)

Gift of H. V. Jones                                                                         518
The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund                                114
The Ethel Morrison Van Derlip Fund                                                           55
Gift of F.N. Edmonds                                                                         52
Gift of Frederick B. Wells                                                                   38
Gift of Mrs. Carl W. Jones in Memory of Her Husband                                          38
Gift of Mrs. Darwin R. Martin                                                                38
Gift of the Estate of Dorothy Millett Lindeke                                                30
The William Hood Dunwoody Fund                                                               29
Gift of Miss Tessie Jones in memory of her parents, Mr. and Mrs. Herschel V. Jones, 1966     27
Gift of Bruce B. Dayton                 

In [39]:
# Creditlines that end in a year, mapped to the two-digit accession_year to
# assign. Matching is exact, so each spelling variant is its own key.
creditline_to_year = {
    'Gift of Herschel V. Jones, 1925': '25',
    'Gift of Miss A. G. Latham, 1943': '43',
    'Bequest of Mrs. Charles S. Pillsbury, 1958': '58',
    'Gift of Philip Little in memory of Annie Jeannette Jackson, 1931': '31',
    'Gift of Mrs. C.C. Bovey, 1941': '41',
    'Gift of Mr. George A. Goddard, 1919': '19',
    'Gift of Mrs. C.C. Bovey, 1924': '24',
    'Gift of Mrs. Philip Little, Jr., 1963': '63',
}
# Overwrite accession_year on every row whose creditline equals a key.
for creditline_text, year_code in creditline_to_year.items():
    df.loc[df['creditline'] == creditline_text, 'accession_year'] = year_code

In [40]:
# Re-split the updated dataframe.
# Length test: a year has at most 4 characters. .str.len() returns NaN for a
# missing value (len() would raise TypeError), so such rows fail "< 5" and go
# to df_2 together with the over-long values.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]
df_2 = df[~is_year_sized]
# Frequency test: years that occur more than twice stay in df_grouped; years
# that occur once or twice go to df_remainder.
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]
# Pool the rare and over-long rows for manual review. pd.concat is used because
# DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0.
df_remainder = pd.concat([df_remainder, df_2])
# Ten most frequent creditlines still to be resolved.
df_remainder['creditline'].value_counts().head(10)

Gift of H. V. Jones                                                                                          518
The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund                                                 114
Gift of Miss Perrie Jones, 1961                                                                               65
Gift of Mrs. Philip Little, Jr., 1962                                                                         65
Gift of Mrs. B.J.O Nordfeldt, 1955                                                                            63
Gift of Howard Mansfield, 1926                                                                                63
Gift of Ruth Lathrop Sikes in memory of her brother Bruce Sikes, 1967                                         61
Gift of Miss Eileen Bigelow and Mrs. O. H. Ingram, in memory of their mother, Mrs. Alice F. Bigelow, 1965     60
Gift of the Artist, 1923                                                                        

In [43]:
# Creditlines that end in a year, mapped to the two-digit accession_year to
# assign. Matching is exact.
creditline_to_year = {
    'Gift of Miss Perrie Jones, 1961': '61',
    'Gift of Mrs. Philip Little, Jr., 1962': '62',
    'Gift of Mrs. B.J.O Nordfeldt, 1955': '55',
    'Gift of Howard Mansfield, 1926': '26',
    'Gift of Ruth Lathrop Sikes in memory of her brother Bruce Sikes, 1967': '67',
    'Gift of Miss Eileen Bigelow and Mrs. O. H. Ingram, in memory of their mother, Mrs. Alice F. Bigelow, 1965': '65',
    'Gift of the Artist, 1923': '23',
}
# Overwrite accession_year on every row whose creditline equals a key.
for creditline_text, year_code in creditline_to_year.items():
    df.loc[df['creditline'] == creditline_text, 'accession_year'] = year_code

In [44]:
# Re-split the updated dataframe.
# Length test: a year has at most 4 characters. .str.len() returns NaN for a
# missing value (len() would raise TypeError), so such rows fail "< 5" and go
# to df_2 together with the over-long values.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]
df_2 = df[~is_year_sized]
# Frequency test: years that occur more than twice stay in df_grouped; years
# that occur once or twice go to df_remainder.
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]
# Pool the rare and over-long rows for manual review. pd.concat is used because
# DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0.
df_remainder = pd.concat([df_remainder, df_2])
# Ten most frequent creditlines still to be resolved.
df_remainder['creditline'].value_counts().head(10)

Gift of H. V. Jones                                                                        518
The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund                               114
The Ethel Morrison Van Derlip Fund                                                          55
The Minnich Collection\r\nThe Ethel Morrison Van DerLip Fund, 1966                          53
Gift of F.N. Edmonds                                                                        52
Gift of Miss Katherine Bullard, 1918                                                        48
The Ethel Morrison Van Derlip Fund, 1967                                                    41
Gift of Mrs. C. C. Bovey, 1941                                                              41
Gift of Funds from Mrs. Franklin M. Crosby Jr. and the William Hood Dunwoody Fund, 1947     41
Pillsbury Bequest, 1958                                                                     40
Name: creditline, dtype: int64

In [45]:
# Creditlines that end in a year, mapped to the two-digit accession_year to
# assign. Matching is exact, so the 'Van DerLip' capitalisation is its own key.
creditline_to_year = {
    'The Minnich Collection\r\nThe Ethel Morrison Van DerLip Fund, 1966': '66',
    'Gift of Miss Katherine Bullard, 1918': '18',
    'The Ethel Morrison Van Derlip Fund, 1967': '67',
    'Gift of Mrs. C. C. Bovey, 1941': '41',
    'Gift of Funds from Mrs. Franklin M. Crosby Jr. and the William Hood Dunwoody Fund, 1947': '47',
    'Pillsbury Bequest, 1958': '58',
}
# Overwrite accession_year on every row whose creditline equals a key.
for creditline_text, year_code in creditline_to_year.items():
    df.loc[df['creditline'] == creditline_text, 'accession_year'] = year_code

In [47]:
# Re-split the updated dataframe.
# Length test: a year has at most 4 characters. .str.len() returns NaN for a
# missing value (len() would raise TypeError), so such rows fail "< 5" and go
# to df_2 together with the over-long values.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]
df_2 = df[~is_year_sized]
# Frequency test: years that occur more than twice stay in df_grouped; years
# that occur once or twice go to df_remainder.
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]
# Pool the rare and over-long rows for manual review. pd.concat is used because
# DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0.
df_remainder = pd.concat([df_remainder, df_2])
# Fifteen most frequent creditlines still to be resolved.
df_remainder['creditline'].value_counts().head(15)

Gift of H. V. Jones                                             518
The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund    114
The Ethel Morrison Van Derlip Fund                               55
Gift of F.N. Edmonds                                             52
Gift of Mrs. Carl W. Jones in Memory of Her Husband              38
Gift of Mrs. Darwin R. Martin                                    38
Gift of Frederick B. Wells                                       38
The Ethel Morrison Van Derlip Fund, 1965                         36
Gift of Mrs. George P. Douglas, 1955                             35
The Christina N. and Swan J. Turnblad Memorial Fund, 1964        35
Gift of the Estate of Dorothy Millett Lindeke                    30
Gift of Mrs. Hiram C. Truesdale, 1927                            30
The William Hood Dunwoody Fund                                   29
Gift of Mrs. George P. Douglas, 1929\r\n                         27
Gift of Friends of Art in Minneapolis, 1917     

In [48]:
# Creditlines that end in a year, mapped to the two-digit accession_year to
# assign. Matching is exact, so the variant with a trailing '\r\n' is its own key.
creditline_to_year = {
    'The Ethel Morrison Van Derlip Fund, 1965': '65',
    'Gift of Mrs. George P. Douglas, 1955': '55',
    'The Christina N. and Swan J. Turnblad Memorial Fund, 1964': '64',
    'Gift of Mrs. Hiram C. Truesdale, 1927': '27',
    'Gift of Mrs. George P. Douglas, 1929\r\n': '29',
    'Gift of Friends of Art in Minneapolis, 1917': '17',
}
# Overwrite accession_year on every row whose creditline equals a key.
for creditline_text, year_code in creditline_to_year.items():
    df.loc[df['creditline'] == creditline_text, 'accession_year'] = year_code

In [49]:
# Re-split the updated dataframe.
# Length test: a year has at most 4 characters. .str.len() returns NaN for a
# missing value (len() would raise TypeError), so such rows fail "< 5" and go
# to df_2 together with the over-long values.
year_length = df['accession_year'].str.len()
is_year_sized = year_length < 5
df_1 = df[is_year_sized]
df_2 = df[~is_year_sized]
# Frequency test: years that occur more than twice stay in df_grouped; years
# that occur once or twice go to df_remainder.
rows_per_year = df_1['accession_year'].map(df_1['accession_year'].value_counts())
df_grouped = df_1[rows_per_year > 2]
df_remainder = df_1[rows_per_year <= 2]
# Pool the rare and over-long rows for manual review. pd.concat is used because
# DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0.
df_remainder = pd.concat([df_remainder, df_2])
# Fifteen most frequent creditlines still to be resolved.
df_remainder['creditline'].value_counts().head(15)

Gift of H. V. Jones                                                                         518
The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund                                114
The Ethel Morrison Van Derlip Fund                                                           55
Gift of F.N. Edmonds                                                                         52
Gift of Frederick B. Wells                                                                   38
Gift of Mrs. Carl W. Jones in Memory of Her Husband                                          38
Gift of Mrs. Darwin R. Martin                                                                38
Gift of the Estate of Dorothy Millett Lindeke                                                30
The William Hood Dunwoody Fund                                                               29
Gift of Miss Tessie Jones in memory of her parents, Mr. and Mrs. Herschel V. Jones, 1966     27
Gift of Bruce B. Dayton                 

In [53]:
# Frequency of every creditline left in df_remainder.
df_remainder['creditline'].value_counts()

Gift of H. V. Jones                                                                         518
The Minnich Collection\r\nThe Ethel Morrison Van Derlip Fund                                114
The Ethel Morrison Van Derlip Fund                                                           55
Gift of F.N. Edmonds                                                                         52
Gift of Frederick B. Wells                                                                   38
Gift of Mrs. Carl W. Jones in Memory of Her Husband                                          38
Gift of Mrs. Darwin R. Martin                                                                38
Gift of the Estate of Dorothy Millett Lindeke                                                30
The William Hood Dunwoody Fund                                                               29
Gift of Miss Tessie Jones in memory of her parents, Mr. and Mrs. Herschel V. Jones, 1966     27
Gift of Bruce B. Dayton                 