In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path to the cleaned accession-year CSV that this notebook loads.
csv_file = "resources/Mia_objects_accession_year.csv"

In [3]:
# Load the cleaned accession data. The CSV's 'Unnamed: 0' column is used as
# the row index; low_memory=False makes pandas read the file in one pass
# rather than in internally chunked pieces.
df = pd.read_csv(
    csv_file,
    index_col='Unnamed: 0',
    low_memory=False,
)

# Preview the first rows.
df.head()

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,object_name,provenance,room,style,title,RBL,RB,X,loan,accession_year
0,10.1,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c.1888-89,Prints and Drawings,0,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Air, from the series The Four Elements",,,,,10
1,10.2,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,1,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Earth, from the series The Four Elements",,,,,10
2,10.3,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,2,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",G352,19th century,"Fire, from the series The Four Elements",,,,,10
3,10.4,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,3,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Water, from the series The Four Elements",,,,,10
4,13.29,Walter Shirlaw,Drawings,North America,United States,Gift of Mrs. Florence M. Shirlaw,,19th century,Prints and Drawings,4,...,Drawing,,Not on View,19th century,Montana Indian Reservation I,,,,,13


## Using Tableau's Unknown Values to Fix Typos in Country Data
The data frame was pulled into Tableau, which flagged 311 country values that it could not identify. The countries Tableau didn't recognize were fixed using the following steps (unclear data was not changed, e.g. 'India or China'):
* Group the data by country type and filter the number of entries, starting with large values (200)
* Make test output file and read into Tableau
* Pull up unknown values from bottom corner of map, and update if possible
* Repeat steps down to reasonable number of entries

In [4]:
# Add a `tableau_country` column: a normalised copy of `country` for Tableau.
#   1. strip surrounding whitespace,
#   2. strip any trailing commas (and whitespace left in front of them),
#   3. title-case the result so spelling variants differ only by content.
# NOTE: the previous `.str.rstrip('')` passed an empty character set, which
# strips nothing, so trailing commas were never actually removed.
# Missing (NaN) countries stay NaN through the `.str` accessors.
df['tableau_country'] = (
    df['country']
    .str.strip()
    .str.rstrip(', ')
    .str.title()
)

# Show how many objects fall under each normalised country value.
df['tableau_country'].value_counts()

United States                                     35945
France                                            17749
Japan                                             11218
China                                              9866
England                                            8550
Germany                                            3890
Italy                                              3078
Netherlands                                        1103
Guatemala                                          1057
Mexico                                              901
India                                               872
Switzerland                                         784
The Netherlands                                     689
Peru                                                609
Thailand                                            609
Scotland                                            551
Belgium                                             548
Spain                                           

In [5]:
# Country fixes for values with more than 200 entries.
# Each .replace() rewrites exact matches in `tableau_country` only.
df['tableau_country'] = (
    df['tableau_country']
    .replace(['England', 'Scotland', 'Engalnd', 'Great Britain'], 'United Kingdom')
    .replace('Korea', 'South Korea')
)

In [6]:
# Country fixes for values with 50 - 199 entries.
flanders_variants = ['Flanders/Belgium', 'Belgium (Flanders)', 'Flemish', 'Flanders']
persia_variants = ['Persia (Iran)', 'Luristan, Persia', 'Iran (Persia)',
                   'Iran (Persia (Khourvin))']

# Exact-match replacement, limited to the `tableau_country` column.
df['tableau_country'] = (
    df['tableau_country']
    .replace(flanders_variants, 'Belgium')
    .replace(persia_variants, 'Iran')
)

In [7]:
# Country fixes for values with 25 - 49 entries.
germany_variants = ['Germany (Saxony)', 'West Germany', 'East Germany', 'Bavaria',
                    'Germany, Europe', 'Germany (Prussia)']

# Exact-match replacement, limited to the `tableau_country` column.
df['tableau_country'] = (
    df['tableau_country']
    .replace(germany_variants, 'Germany')
    .replace('Bohemia', 'Czechia')
)

In [8]:
# Country fixes for values with 10 - 24 entries.
# Each .replace() rewrites exact matches in `tableau_country` only.
# NOTE: the replacement for 'Aotearoa (New Zealand)' was previously spelled
# 'New Zeleand', which is itself not a valid country name; corrected here.
df['tableau_country'] = (
    df['tableau_country']
    .replace('Aotearoa (New Zealand)', 'New Zealand')
    .replace('Jingdezhen, China', 'China')
    .replace(['Siam', 'Central Thailand'], 'Thailand')
    .replace(['Greek Islands', 'Greek'], 'Greece')
)

In [9]:
# Country fixes for values with 5 - 9 entries.
indonesia_variants = ['Bali', 'East Bali', 'Java', 'Kalimanta', 'Lombok Island']
# NOTE(review): `tableau_country` is title-cased when it is created, so the
# lowercase-'or' entry 'Italy or Sicily' presumably never matches; it is kept
# so the list is unchanged.
italy_variants = ['Italy Or Sicily', 'Sicily', 'Itlay', 'Etruria', 'Italy, Ancient Etruria',
                  'Italy Or Sardinia', 'Italy Or Sardina', 'Italy or Sicily',
                  'Italy (Siculo-Campanian)']

# Exact-match replacement, limited to the `tableau_country` column.
df['tableau_country'] = (
    df['tableau_country']
    .replace(indonesia_variants, 'Indonesia')
    .replace('Fiji Islands', 'Fiji')
    .replace(italy_variants, 'Italy')
)

In [10]:
# Country fixes for values with fewer than 5 entries (Admiralty Islands
# through Chinese Export). Each .replace() rewrites exact matches in
# `tableau_country` only, in the same order as before.
df['tableau_country'] = (
    df['tableau_country']
    .replace('Admiralty Islands', 'Papua New Guinea')
    .replace('Anatolia', 'Turkey')
    .replace(['Austral Islands', 'Marquesas Islands'], 'French Polynesia')
    .replace('Babylon', 'Iraq')
    .replace('Banks Islands', 'Vanuatu')
    .replace(['Bokhara', 'Bukhara'], 'Uzbekistan')
    .replace('Canary Islands', 'Spain')
    .replace('Chinese Export', 'China')
)

In [11]:
# Country fixes for values with fewer than 5 entries (Dagestan through
# Indian Export). Each .replace() rewrites exact matches in
# `tableau_country` only, in the same order as before.
df['tableau_country'] = (
    df['tableau_country']
    .replace('Dagestan', 'Russia')
    .replace('Dahomey', 'Benin')
    .replace('Fakara Lamedi', 'Niger')
    .replace('Guatamala', 'Guatemala')
    .replace('Hispano-Moorish', 'Spain')
    .replace('Indian Export', 'India')
)

In [12]:
# Country fixes for values with fewer than 5 entries (Limoges, Lorraine and
# the 'Nehterlands' misspelling). Exact matches in `tableau_country` only.
df['tableau_country'] = (
    df['tableau_country']
    .replace(['Limoges', 'Lorraine'], 'France')
    .replace('Nehterlands', 'Netherlands')
)

In [13]:
# Not remapped above: historical countries that correspond to several present-day
# countries, and cultures that span more than one country:
# Czechoslovakia, New Guinea, Tibet, Roman, Khmer, Persia, Yugoslavia, Roman Empire
# Flanders is mapped to Belgium, except for entries recorded as 'Flanders, Europe'

In [14]:
# Re-count objects per country now that the replacements have been applied.
df['tableau_country'].value_counts()

United States                                     35945
France                                            17751
Japan                                             11218
China                                              9880
United Kingdom                                     9128
Germany                                            3934
Italy                                              3103
Netherlands                                        1104
Guatemala                                          1058
Mexico                                              901
India                                               874
Switzerland                                         784
The Netherlands                                     689
Belgium                                             640
Thailand                                            628
Peru                                                609
Spain                                               529
Austria                                         

In [15]:
# Write the frame, including the `tableau_country` column, to a new CSV
# used for the visualizations.
output_datafile = "resources/Mia_objects_country.csv"
df.to_csv(
    output_datafile,
    encoding='utf-8',
)