In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the cleaned accession-year CSV that the next cell loads
csv_file = 'resources/Mia_objects_accession_year.csv'

In [3]:
# Load the accession CSV, using its 'Unnamed: 0' column as the row index.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
read_options = {'index_col': 'Unnamed: 0', 'low_memory': False}
data = pd.read_csv(csv_file, **read_options)
data.head()

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,object_name,provenance,room,style,title,RBL,RB,X,loan,accession_year
0,10.1,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c.1888-89,Prints and Drawings,0,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Air, from the series The Four Elements",,,,,10
1,10.2,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,1,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Earth, from the series The Four Elements",,,,,10
2,10.3,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,2,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",G352,19th century,"Fire, from the series The Four Elements",,,,,10
3,10.4,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,3,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Water, from the series The Four Elements",,,,,10
4,13.29,Walter Shirlaw,Drawings,North America,United States,Gift of Mrs. Florence M. Shirlaw,,19th century,Prints and Drawings,4,...,Drawing,,Not on View,19th century,Montana Indian Reservation I,,,,,13


In [4]:
# Keep only objects owned by Mia: drop every row whose 'loan' flag is 'L'.
# Rows with a missing 'loan' value are kept; .copy() gives an independent frame.
not_on_loan = data['loan'].ne('L')
df = data.loc[not_on_loan].copy()

## Using Tableau's Unknown Values to Fix Typos in Country Data
Pulled the data frame into Tableau and found 311 country values that Tableau could not identify. Using the following steps, the countries Tableau didn't recognize were fixed (unclear data was not changed, e.g. 'India or China'):
* Group the data by country type and filter the number of entries, starting with large values (200)
* Make test output file and read into Tableau
* Pull up unknown values from bottom corner of map, and update if possible
* Repeat steps down to reasonable number of entries

In [8]:
# Add a column of country names cleaned up for Tableau.
# Strip trailing commas (and any spaces around them), then title-case the
# names so capitalization is consistent across entries.
# NOTE: the earlier `.str.rstrip('')` passed an empty character set, which
# strips nothing, so trailing commas were never actually removed.
df['tableau_country'] = df['country'].str.rstrip(', ').str.title()
df['tableau_country'].value_counts()

United States                             27406
France                                    17086
Japan                                      9143
England                                    8106
China                                      8012
Germany                                    3654
Italy                                      2888
Netherlands                                1052
Guatemala                                   862
India                                       756
Switzerland                                 753
Mexico                                      732
The Netherlands                             684
Peru                                        554
Belgium                                     529
Thailand                                    526
Scotland                                    524
Spain                                       477
Austria                                     469
Ghana                                       405
Turkey                                  

In [11]:
# Normalize country values whose target name starts with A-C.
# Each key is the name written to 'tableau_country'; its list holds the
# variant spellings / regions that are mapped to it.
country_aliases_a_c = {
    'Belgium': ['Flanders/Belgium', 'Belgium (Flanders)', 'Flemish', 'Flanders'],
    'Benin': ['Dahomey'],
    'Chile': ['Rapa Nui (Easter Island)'],
    'China': ['Jingdezhen, China', 'Chinese Export', 'Southern China'],
    'Czechia': ['Bohemia'],
}
for canonical, variants in country_aliases_a_c.items():
    df['tableau_country'] = df['tableau_country'].replace(variants, canonical)


In [12]:
# Normalize country values whose target name starts with F-G
# (Fiji, France, French Polynesia, Germany, Greece, Guatemala).
# Each key is the name written to 'tableau_country'; its list holds the
# variant spellings / regions that are mapped to it.
country_aliases_f_g = {
    'Fiji': ['Fiji Islands'],
    'France': ['Limoges', 'Lorraine'],
    'French Polynesia': ['Austral Islands', 'Marquesas Islands', 'Society Islands'],
    'Germany': ['Germany (Saxony)', 'West Germany', 'East Germany', 'Bavaria',
                'Germany, Europe', 'Germany (Prussia)', 'Southern Germany'],
    'Greece': ['Greek Islands', 'Greek'],
    'Guatemala': ['Guatamala'],
}
for canonical, variants in country_aliases_f_g.items():
    df['tableau_country'] = df['tableau_country'].replace(variants, canonical)


In [13]:
# Normalize country values whose target name starts with I
# (India, Indonesia, Iran, Iraq, Italy).
# Each key is the name written to 'tableau_country'; its list holds the
# variant spellings / regions that are mapped to it.
# NOTE(review): 'Italy or Sicily' (lowercase "or") cannot match once the column
# has been title-cased; it is kept so the lookup list stays unchanged.
country_aliases_i = {
    'India': ['Indian Export'],
    'Indonesia': ['Bali', 'East Bali', 'Java', 'Kalimanta', 'Lombok Island', 'Papua'],
    'Iran': ['Persia (Iran)', 'Luristan, Persia', 'Iran (Persia)',
             'Iran (Persia (Khourvin))'],
    'Iraq': ['Babylon', 'Mesopotamia (Iraq)', 'Sumeria (Iraq)'],
    'Italy': ['Italy Or Sicily', 'Sicily', 'Itlay', 'Etruria', 'Italy, Ancient Etruria',
              'Italy Or Sardinia', 'Italy Or Sardina', 'Italy or Sicily',
              'Italy (Siculo-Campanian)', 'Northern Italy', 'Sardinia',
              'Sardinia Or Italy', 'Sicily Or Italy', 'Venice Or Padua, Italy'],
}
for canonical, variants in country_aliases_i.items():
    df['tableau_country'] = df['tableau_country'].replace(variants, canonical)

In [14]:
# Normalize country values whose target name starts with L-P
# (Laos through Philippines).
# Each key is the name written to 'tableau_country'; its list holds the
# variant spellings / regions that are mapped to it.
country_aliases_l_p = {
    'Laos': ['Northwest Laos'],
    'Mexico': ['Southeastern Mexico', 'Tarascan'],
    'Netherlands': ['The Netherlands', 'Nehterlands', 'Netherlands/Flanders'],
    'New Zealand': ['Aotearoa (New Zealand)', 'New Zeleand'],
    'Niger': ['Fakara Lamedi'],
    'Nigeria': ['North Nigeria'],
    'Papua New Guinea': ['Admiralty Islands', 'New Ireland'],
    'Philippines': ['Philippine Islands'],
}
for canonical, variants in country_aliases_l_p.items():
    df['tableau_country'] = df['tableau_country'].replace(variants, canonical)


In [15]:
# Normalize country values whose target name starts with R-S
# (Russia, South Korea, Spain, Suriname).
# Each key is the name written to 'tableau_country'; its list holds the
# variant spellings / regions that are mapped to it.
country_aliases_r_s = {
    'Russia': ['Dagestan', 'Syberia'],
    'South Korea': ['Korea'],
    'Spain': ['Canary Islands', 'Hispano-Moorish'],
    'Suriname': ['Dutch Guiana (Suriname)'],
}
for canonical, variants in country_aliases_r_s.items():
    df['tableau_country'] = df['tableau_country'].replace(variants, canonical)


In [16]:
# Normalize country values whose target name starts with T-Z
# (Thailand through Vietnam).
# Each key is the name written to 'tableau_country'; its list holds the
# variant spellings / regions that are mapped to it.
# England, Scotland, Wales and Northern Ireland are all folded into 'United Kingdom'.
country_aliases_t_z = {
    'Thailand': ['Siam', 'Central Thailand', 'Northern Thailand'],
    'Tonga': ['Tonga Islands'],
    'Turkey': ['Anatolia'],
    'United Kingdom': ['England', 'Scotland', 'Engalnd', 'Great Britain',
                       'Northern Ireland', 'Scotland And England', 'Wales'],
    'United States': ['New England, United States', 'Unted States'],
    'Uzbekistan': ['Bokhara', 'Bukhara', "Qoraqalpog'Iston / Uzbekistan",
                   "Qoraqalpog'Iston", 'Uzbekestan'],
    'Vanuatu': ['Banks Islands'],
    'Vietnam': ['Northern Vietnam'],
}
for canonical, variants in country_aliases_t_z.items():
    df['tableau_country'] = df['tableau_country'].replace(variants, canonical)


In [17]:
# Historical countries that span multiple current countries, or cultures that span two countries
# (not remapped in the cells above):
# Arabia, Austro-Hungarian Empire, Balkans, Borneo, Czechoslovakia, Flanders (Europe),
# Graeco-Roman, Khmer, Kurdistan, Mesopotamia, Micronesian Islands, New Guinea,
# Nubia, Ottoman Empire, Persia, Ptolemaic Egypt, Roman, Tibet, Yugoslavia

In [18]:
# Write the frame, including the normalized 'tableau_country' column, to a
# separate CSV that the visualizations read
output_datafile = 'resources/Mia_objects_country.csv'
df.to_csv(path_or_buf=output_datafile, encoding='utf-8')