In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path (relative to the notebook's working directory) of the cleaned accession-year export
csv_file = "resources/Mia_objects_accession_year.csv"

In [4]:
# Load the accession data. The column named 'Unnamed: 0' in the CSV is used as the
# frame's index; low_memory=False has pandas parse the file in a single pass.
df = pd.read_csv(
    csv_file,
    index_col='Unnamed: 0',
    low_memory=False,
)
# Preview the first three rows
df.head(3)

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,object_name,provenance,room,style,title,RBL,RB,X,loan,accession_year
0,10.1,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c.1888-89,Prints and Drawings,0,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Air, from the series The Four Elements",,,,,10
1,10.2,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,1,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Earth, from the series The Four Elements",,,,,10
2,10.3,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,2,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",G352,19th century,"Fire, from the series The Four Elements",,,,,10


## Fix Typos in Country Data
There are 505 unique country entries, most of which occur only once or twice.
* Pull out list of unique country names, sort alphabetically
* Fix typos and formatting differences
* For cultures or countries that no longer exist:
    * Change to the modern country if possible
    * Add the province or older country name to the `culture` column
* Will pull data into Tableau and further edit if necessary

In [5]:
# Add a column for cleaned country data.
#   1. Strip trailing commas and spaces. The characters must be listed explicitly:
#      an empty string (`rstrip('')`) is an empty character set and strips nothing.
#   2. Title-case every name so capitalization differences collapse to one spelling.
df['country_cleaned'] = df['country'].str.rstrip(', ').str.title()

# Add column for country status, default value "clear"
df['country_status'] = 'clear'

# Dictionary mapping a cleaned country name to its status for ambiguous entries
# (several candidate countries, or a "(?)" marker). Filled in by the cells below.
status_dict = {}

In [6]:
# Collect the distinct cleaned country names, cast to str, as a plain Python list
countries = df['country_cleaned'].unique().astype(str)
country_list = countries.tolist()

In [7]:
# Sort the names in place so they can be reviewed alphabetically,
# then preview the first fifteen
country_list.sort()
country_list[:15]

['Admiralty Islands',
 'Afghanistan',
 'Afghanistan Or Pakistan',
 'Afghanistan Or Uzbekistan',
 'Africa',
 'Albania',
 'Algeria',
 'Algeria, Mali, Or Niger',
 'American',
 'Anatolia',
 'Angola',
 'Angola / Drc',
 'Angola Or Democratic Republic Of Congo',
 'Angola Or Democratic Republic Of The Congo',
 'Angola, Zambia Or Democratic Republic Of Congo']

In [8]:
# Countries A-B: merge spelling variants in `country_cleaned`, record historic
# region names in `culture`, and register ambiguous entries in status_dict.

# All spellings of the Angola / DRC pairing collapse to one canonical label
angola_drc_variants = [
    'Angola / Drc',
    'Angola Or Democratic Republic Of Congo',
    'Democratic Republic Of Congo Or Angola',
]
df['country_cleaned'] = df['country_cleaned'].replace(
    angola_drc_variants, 'Angola Or Democratic Republic Of The Congo')

status_dict.update({name: 'unclear_multiple' for name in [
    'Afghanistan Or Pakistan',
    'Afghanistan Or Uzbekistan',
    'Algeria, Mali, Or Niger',
    'Angola Or Democratic Republic Of The Congo',
    'Angola, Zambia Or Democratic Republic Of Congo',
    'Armenia Or Syria',
    'Austria Or Germany',
]})
status_dict['Austria (?)'] = 'unclear_possible'

df['country_cleaned'] = df['country_cleaned'].replace('Balkan', 'Balkans')

# Keep the regional name in `culture` (matched on the raw `country` value)
# before the cleaned country is reduced to the modern state
df.loc[df['country'] == 'Belgium (Flanders)', 'culture'] = "Flanders"
df['country_cleaned'] = df['country_cleaned'].replace('Belgium (Flanders)', 'Belgium')

# Both spellings of Bukhara are recorded under the same culture name
df.loc[df['country'].isin(['Bokhara', 'Bukhara']), 'culture'] = "Bukhara"

df['country_cleaned'] = df['country_cleaned'].replace('East Bali', 'Bali')

status_dict.update({name: 'unclear_multiple' for name in [
    'Belgium Or France',
    'Burkina Faso Or Côte D’Ivoire',
    'Burkina Faso Or Mali',
]})
status_dict['Bohemia (?)'] = 'unclear_possible'



In [9]:
# Countries C-D: apply (old name -> new name) fixes one after another,
# then register the ambiguous entries in status_dict.
for old_name, new_name in [
    ('Chinese Export', 'China'),
    ('Chinese Export (?)', 'China (?)'),
    ("Côte D'Ivoire", "Cote D'Ivoire"),
]:
    df['country_cleaned'] = df['country_cleaned'].replace(old_name, new_name)

status_dict.update({name: 'unclear_multiple' for name in [
    'Cambodia Or Thailand',
    'China Or India',
    'Costa Rica Or Panama',
    "Côte D'Ivoire Or Liberia",
    "Côte D'Ivoire Or Mali",
]})
status_dict.update({name: 'unclear_possible' for name in [
    'Cameroon(?)',
    'China (?)',
    'Czech Republic (?)',
    'Czechoslovakia (?)',
]})

# Record the regional name in `culture` (matched on the raw `country` value)
df.loc[df['country'] == 'Dagestan', 'culture'] = "Dagestan"

# Use the full "Democratic Republic Of The Congo" form, with and without the "(?)" marker
for old_name, new_name in [
    ('Democratic Republic Of Congo', 'Democratic Republic Of The Congo'),
    ('Democratic Republic Of Congo (?)', 'Democratic Republic Of The Congo (?)'),
]:
    df['country_cleaned'] = df['country_cleaned'].replace(old_name, new_name)

status_dict.update({name: 'unclear_multiple' for name in [
    'Democratic Republic Of The Congo Or Central African Republic',
    'Denmark England',
    'Denmark Or Belgium ?',
    'Denmark Or Sweden',
]})
status_dict.update({name: 'unclear_possible' for name in [
    'Democratic Republic Of The Congo (?)',
    'Denmark (?)',
    'Dutch Republic(?)',
]})



In [None]:
# Countries E-F: fix typos and merge variants in `country_cleaned`, record historic
# region names in `culture`, and register ambiguous entries in status_dict.
df.replace({'country_cleaned' : 'Egnland Or America'}, 'England Or American', inplace=True)
df.replace({'country_cleaned' : 'Engalnd'}, 'England', inplace=True)
df.replace({'country_cleaned' : ['England (Possibly)', 'England (Probably)']}, 'England (?)', inplace=True)
df.replace({'country_cleaned' : 'England Or Netherlands'}, 'England Or The Netherlands', inplace=True)

status_dict.update(dict.fromkeys(['East Germany-Austria-Hungary','Egypt Or Mesopotamia',
            'Egypt Or Syria','England And Switzerland','England And United States',
            'England Or American', 'England Or France', 'England Or Hungary (?)', 
            'England Or Ireland','England Or The Netherlands','England Or United States',
            'Ethiopia Or Kenya'
            ], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Egypt (?)','England (?)'], 'unclear_possible'))

df.replace({'country_cleaned' : 'Fiji Islands'}, 'Fiji', inplace=True)

# Record "Flanders" as the culture for the rows about to be rewritten to Belgium.
# The mask is built before the replace and also checks the title-cased column, so a
# raw value with different capitalization (e.g. "Flanders (and Italy)") is still caught.
flanders_italy = ((df['country'] == 'Flanders (And Italy)')
                  | (df['country_cleaned'] == 'Flanders (And Italy)'))
df.loc[flanders_italy, 'culture'] = "Flanders"
df.replace({'country_cleaned' : 'Flanders (And Italy)'}, 'Belgium', inplace=True)
# NOTE(review): a second replace of 'Flanders (And Italy)' -> 'Italy' used to follow
# further down. It could never match once the value had been rewritten to 'Belgium'
# above, so it was removed. Confirm that Belgium is the intended country.

status_dict.update(dict.fromkeys(['Fiji, Tonga, Or Samoa Islands',
            'Flanders Or France'
            ], 'unclear_multiple'))

df.replace({'country_cleaned' : 'East Germany'}, 'Germany', inplace=True)


# TODO: remaining F entries still to review. This worklist was pasted into the cell
# as bare indented string literals, which made the cell fail with an IndentationError;
# it is kept here as comments.
#  'Fakara Lamedi',
#  'Fiji',
#  'Finland',
#  'Flanders',
#  'Flanders, Europe',
#  'Flanders/Belgium',
#  'Flemish',
#  'France',
#  'France & Germany',
#  'France (?)',
#  'France (And England)',
#  'France (Northern), Flanders Or Holland',
#  'France / Austria',
#  'France / Belgium',
#  'France And Belgium',
#  'France And Switzerland',
#  'France Or Belgium',
#  'France Or England',
#  'France Or Flanders',
#  'France Or Italy',
#  'France Or Switzerland',
#  'France Or United States',
#  'France Or United States Or England',
#  'France, Switzerland And London, England',
#  'France/Germany',
#  'France?',
#  'French Or Italian',

In [None]:
# Replaced country names for countries N-
# ('New Zealand' was previously misspelled 'New Zeleand' here, which put the typo
# into country_cleaned and the exported CSV.)
df.replace({'country_cleaned' : 'Aotearoa (New Zealand)'}, 'New Zealand', inplace=True)

# Replaced country names for countries R-S
df.replace({'country_cleaned' : 'Dagestan'}, 'Russia', inplace=True)

# Replaced country names for countries T-U
df.replace({'country_cleaned' : 'Central Thailand'}, 'Thailand', inplace=True)
df.replace({'country_cleaned' : ['Bokhara', 'Bukhara']}, 'Uzbekistan', inplace=True)

In [10]:
# Display the status lookup accumulated so far (cleaned country name -> status)
status_dict

{'Afghanistan Or Pakistan': 'unclear_multiple',
 'Afghanistan Or Uzbekistan': 'unclear_multiple',
 'Algeria, Mali, Or Niger': 'unclear_multiple',
 'Angola Or Democratic Republic Of The Congo': 'unclear_multiple',
 'Angola, Zambia Or Democratic Republic Of Congo': 'unclear_multiple',
 'Armenia Or Syria': 'unclear_multiple',
 'Austria Or Germany': 'unclear_multiple',
 'Austria (?)': 'unclear_possible',
 'Belgium Or France': 'unclear_multiple',
 'Burkina Faso Or Côte D’Ivoire': 'unclear_multiple',
 'Burkina Faso Or Mali': 'unclear_multiple',
 'Bohemia (?)': 'unclear_possible',
 'Cambodia Or Thailand': 'unclear_multiple',
 'China Or India': 'unclear_multiple',
 'Costa Rica Or Panama': 'unclear_multiple',
 "Côte D'Ivoire Or Liberia": 'unclear_multiple',
 "Côte D'Ivoire Or Mali": 'unclear_multiple',
 'Cameroon(?)': 'unclear_possible',
 'China (?)': 'unclear_possible',
 'Czech Republic (?)': 'unclear_possible',
 'Czechoslovakia (?)': 'unclear_possible',
 'Democratic Republic Of The Congo Or C

In [17]:
# Spot check: show the rows whose cleaned country is still 'Flanders, Europe'
test = df.loc[df['country_cleaned'].eq('Flanders, Europe')]
test
# Kept for reference only (not executed):
# df.loc[(df['creditline'] == 'The Miscellaneous Works of Art Purchase Fund, 1952 (2nd Biennial)'), 
#             'accession_year'] = '52'

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,room,style,title,RBL,RB,X,loan,accession_year,country_cleaned,country_status
99154,30.1,Author: Thomas a Kempis; Publisher: Henry Iaye,Books,,"Flanders, Europe",39.46\r\nGift of Mrs. John Washburn,,1616,Prints and Drawings,103102,...,Not on View,17th century,The Following of Christ,,,,,30,"Flanders, Europe",clear


In [138]:
# Look up each cleaned country name in status_dict. Names without an entry map to
# NaN and therefore keep the value already stored in country_status.
mapped_status = df['country_cleaned'].map(status_dict)
df['country_status'] = mapped_status.fillna(df['country_status'])
# List the distinct status values now present
df['country_status'].unique()

array(['clear', 'unclear_multiple', 'unclear_possible'], dtype=object)

## Tableau Country Names
Pulled the data frame into Tableau and found 311 values that Tableau did not recognize as countries. The unrecognized countries were fixed using the following steps (unclear data was not changed, e.g. 'India or China'):
* Make test output file and read into Tableau
* Pull up unknown values from bottom corner of map, and update if possible
    * Created a new column for just Tableau values
    * Changed country name values to modern country if possible, updated culture with any extraneous values
* Divide by alphabet to help keep track of changes

In [None]:
# Start the Tableau-specific column from the cleaned names; the cells below edit only this column
df['tableau_country'] = df.country_cleaned

In [6]:
# Tableau country values, A-C: each (old name(s), modern country) pair is applied in order
tableau_fixes_a_c = [
    (['Flanders/Belgium', 'Flemish', 'Flanders'], 'Belgium'),
    ('Dahomey', 'Benin'),
    ('Rapa Nui (Easter Island)', 'Chile'),
    (['Jingdezhen, China', 'Southern China'], 'China'),
    ('Bohemia', 'Czechia'),
]
for old_names, new_name in tableau_fixes_a_c:
    df['tableau_country'] = df['tableau_country'].replace(old_names, new_name)



In [7]:
# Tableau country values, F-G: each (old name(s), modern country) pair is applied in order
tableau_fixes_f_g = [
    (['Limoges', 'Lorraine'], 'France'),
    (['Austral Islands', 'Marquesas Islands', 'Society Islands'], 'French Polynesia'),
    (['Germany (Saxony)', 'West Germany', 'Bavaria',
      'Germany, Europe', 'Germany (Prussia)', 'Southern Germany'], 'Germany'),
    (['Greek Islands', 'Greek'], 'Greece'),
    ('Guatamala', 'Guatemala'),
]
for old_names, new_name in tableau_fixes_f_g:
    df['tableau_country'] = df['tableau_country'].replace(old_names, new_name)



In [8]:
# Tableau country values, I-K: each (old name(s), modern country) pair is applied in order
tableau_fixes_i_k = [
    ('Indian Export', 'India'),
    (['Bali', 'Java', 'Kalimanta', 'Lombok Island', 'Papua'], 'Indonesia'),
    (['Persia (Iran)', 'Luristan, Persia', 'Iran (Persia)',
      'Iran (Persia (Khourvin))'], 'Iran'),
    (['Babylon', 'Mesopotamia (Iraq)', 'Sumeria (Iraq)'], 'Iraq'),
    (['Italy Or Sicily', 'Sicily', 'Itlay', 'Etruria', 'Italy, Ancient Etruria',
      'Italy Or Sardinia', 'Italy Or Sardina', 'Italy or Sicily',
      'Italy (Siculo-Campanian)', 'Northern Italy', 'Sardinia',
      'Sardinia Or Italy', 'Sicily Or Italy', 'Venice Or Padua, Italy'], 'Italy'),
]
for old_names, new_name in tableau_fixes_i_k:
    df['tableau_country'] = df['tableau_country'].replace(old_names, new_name)

In [9]:
# Tableau country values, L-P: each (old name(s), modern country) pair is applied in order
tableau_fixes_l_p = [
    ('Northwest Laos', 'Laos'),
    (['Southeastern Mexico', 'Tarascan'], 'Mexico'),
    (['The Netherlands', 'Nehterlands', 'Netherlands/Flanders'], 'Netherlands'),
    (['New Zeleand'], 'New Zealand'),
    ('Fakara Lamedi', 'Niger'),
    ('North Nigeria', 'Nigeria'),
    (['Admiralty Islands', 'New Ireland'], 'Papua New Guinea'),
    ('Philippine Islands', 'Philippines'),
]
for old_names, new_name in tableau_fixes_l_p:
    df['tableau_country'] = df['tableau_country'].replace(old_names, new_name)



In [10]:
# Tableau country values, Q-S: each (old name(s), modern country) pair is applied in order
tableau_fixes_q_s = [
    (['Syberia'], 'Russia'),
    ('Korea', 'South Korea'),
    (['Canary Islands', 'Hispano-Moorish'], 'Spain'),
    ('Dutch Guiana (Suriname)', 'Suriname'),
]
for old_names, new_name in tableau_fixes_q_s:
    df['tableau_country'] = df['tableau_country'].replace(old_names, new_name)



In [11]:
# Tableau country values, T-Z: each (old name(s), modern country) pair is applied in order
tableau_fixes_t_z = [
    (['Siam', 'Northern Thailand'], 'Thailand'),
    ('Tonga Islands', 'Tonga'),
    ('Anatolia', 'Turkey'),
    (['England', 'Scotland', 'Engalnd', 'Great Britain',
      'Northern Ireland', 'Scotland And England', 'Wales'], 'United Kingdom'),
    (['New England, United States', 'Unted States'], 'United States'),
    (["Qoraqalpog'Iston / Uzbekistan",
      "Qoraqalpog'Iston", 'Uzbekestan'], 'Uzbekistan'),
    ('Banks Islands', 'Vanuatu'),
    ('Northern Vietnam', 'Vietnam'),
]
for old_names, new_name in tableau_fixes_t_z:
    df['tableau_country'] = df['tableau_country'].replace(old_names, new_name)



In [17]:
# Historical countries that span multiple current countries, or cultures that span two countries:
# Arabia, Austro-Hungarian Empire, Balkans, Borneo, Czechoslovakia, Flanders (Europe),
# Graeco-Roman, Khmer, Kurdistan, Mesopotamia, Micronesian Islands, New Guinea,
# Nubia, Ottoman Empire, Persia, Ptolemaic Egypt, Roman, Tibet, Yugoslavia

In [18]:
# Write the frame, including the updated country columns, to a new CSV for the visualizations
output_datafile = 'resources/Mia_objects_country.csv'
df.to_csv(path_or_buf=output_datafile, encoding='utf-8')