In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the cleaned accession data that the next cell loads
csv_file = 'resources/Mia_objects_accession_year.csv'

In [3]:
# Load the accession table, using the saved 'Unnamed: 0' column as the row index
df = pd.read_csv(
    csv_file,
    index_col='Unnamed: 0',
    low_memory=False,
)
# Preview the first three rows
df.head(3)

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,object_name,provenance,room,style,title,RBL,RB,X,loan,accession_year
0,10.1,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c.1888-89,Prints and Drawings,0,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Air, from the series The Four Elements",,,,,10
1,10.2,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,1,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Earth, from the series The Four Elements",,,,,10
2,10.3,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,2,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",G352,19th century,"Fire, from the series The Four Elements",,,,,10


## Fix Typos in Country Data
There are 505 country entries, most of which are only 1-2 long.
* Pull out list of unique country names, sort alphabetically
* Fix typos and formatting differences
* For cultures or historical countries that are not modern countries:
    * Change to modern country if possible
    * Add province or older country name to the `culture` column
* Will pull data into Tableau and further edit if necessary

In [4]:
# add column for cleaned country data
# strip any trailing commas (and stray spaces next to them), then title-case so
# capitalization variants of the same name collapse to one spelling.
# rstrip('') would be a no-op, so the characters to strip are spelled out.
df['country_cleaned'] = df['country'].str.rstrip(', ')
df['country_cleaned'] = df['country_cleaned'].str.title()

# add column for country status, default value "clear"
df['country_status'] = 'clear'

# build dictionary to hold country status information (multiple, (?) values)
# keys are cleaned country names, values are the status label mapped onto
# the 'country_status' column later in the notebook
status_dict = {}
# add continent labels from country list
status_dict.update(dict.fromkeys(['Africa', 'Central Asia', 'Europe', 'Europe Or Asia',
            'West Africa'], 'unclear_continent'))

In [5]:
# Distinct cleaned country names, cast to str (a missing value becomes the text 'nan')
countries = df['country_cleaned'].unique().astype(str)
# Plain Python list so the names can be sorted and sliced in the next cell
country_list = countries.tolist()

In [6]:
# Sort the names alphabetically (same list object, reordered in place)
country_list[:] = sorted(country_list)
# Show the first 15 names to review
country_list[:15]

['Admiralty Islands',
 'Afghanistan',
 'Afghanistan Or Pakistan',
 'Afghanistan Or Uzbekistan',
 'Africa',
 'Albania',
 'Algeria',
 'Algeria, Mali, Or Niger',
 'American',
 'Anatolia',
 'Angola',
 'Angola / Drc',
 'Angola Or Democratic Republic Of Congo',
 'Angola Or Democratic Republic Of The Congo',
 'Angola, Zambia Or Democratic Republic Of Congo']

In [7]:
# Countries A-B
# 1) Collapse spelling variants in `country_cleaned`; replacements run in the listed order.
for variants, canonical in [
    (['Angola / Drc', 'Angola Or Democratic Republic Of Congo',
      'Democratic Republic Of Congo Or Angola'],
     'Angola Or Democratic Republic Of The Congo'),
    ('Balkan', 'Balkans'),
    (['Belgium (Flanders)', 'Flanders/Belgium'], 'Belgium'),
    ('East Bali', 'Bali'),
    ('Dahomey', 'Benin'),
]:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

# 2) Keep the regional / historical name in `culture`, matched on the raw `country` value.
for raw_name, culture_label in [
    ('Belgium (Flanders)', "Flanders"),
    ('Bokhara', "Bukhara"),
    ('Bukhara', "Bukhara"),
    ('Dahomey', "Dahomey"),
]:
    df.loc[df['country'] == raw_name, 'culture'] = culture_label

# 3) Flag ambiguous names: several candidate countries -> 'unclear_multiple',
#    a single questioned country -> 'unclear_possible'.
status_dict.update({name: 'unclear_multiple' for name in (
    'Afghanistan Or Pakistan', 'Afghanistan Or Uzbekistan',
    'Algeria, Mali, Or Niger', 'Angola Or Democratic Republic Of The Congo',
    'Angola, Zambia Or Democratic Republic Of Congo', 'Armenia Or Syria',
    'Austria Or Germany',
)})
status_dict['Austria (?)'] = 'unclear_possible'
status_dict.update({name: 'unclear_multiple' for name in (
    'Belgium Or France', 'Burkina Faso Or Côte D’Ivoire',
    'Burkina Faso Or Mali',
)})
status_dict['Bohemia (?)'] = 'unclear_possible'


In [8]:
# Countries C-D
# 1) Collapse spelling variants in `country_cleaned`; replacements run in the listed order.
for variants, canonical in [
    (['Chinese Export', 'Jingdezhen, China',
      "People'S Republic Of China", 'Southern China'], 'China'),
    ('Chinese Export (?)', 'China (?)'),
    ("Côte D'Ivoire", "Cote D'Ivoire"),
    ('Democratic Republic Of Congo', 'Democratic Republic Of The Congo'),
    ('Democratic Republic Of Congo (?)', 'Democratic Republic Of The Congo (?)'),
]:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

# 2) Rows whose raw `country` is Dagestan keep that name as their culture.
df.loc[df['country'] == 'Dagestan', 'culture'] = "Dagestan"

# 3) Flag ambiguous names (C entries first, then D entries).
status_dict.update({name: 'unclear_multiple' for name in (
    'Cambodia Or Thailand', 'China Or India', 'Costa Rica Or Panama',
    "Côte D'Ivoire Or Liberia", "Côte D'Ivoire Or Mali",
)})
status_dict.update({name: 'unclear_possible' for name in (
    'Cameroon(?)', 'China (?)', 'Czech Republic (?)',
    'Czechoslovakia (?)',
)})
status_dict.update({name: 'unclear_multiple' for name in (
    'Democratic Republic Of The Congo Or Central African Republic',
    'Denmark England', 'Denmark Or Belgium ?', 'Denmark Or Sweden',
)})
status_dict.update({name: 'unclear_possible' for name in (
    'Democratic Republic Of The Congo (?)', 'Denmark (?)',
    'Dutch Republic(?)',
)})


In [9]:
# Replaced country names for countries E-F
# Title-cased view of the raw `country` column. The culture assignments below use
# title-cased names, so they are matched against this view instead of the raw text;
# a raw value written as e.g. 'France or Flanders' would otherwise never match.
raw_country_title = df['country'].str.title()

# Fix typos and fold "possibly/probably" wording into the '(?)' convention
df.replace({'country_cleaned' : 'Egnland Or America'}, 'England Or American', inplace=True)
df.replace({'country_cleaned' : 'Engalnd'}, 'England', inplace=True)
df.replace({'country_cleaned' : ['England (Possibly)', 'England (Probably)',
                                'Probably English']}, 'England (?)', inplace=True)
df.replace({'country_cleaned' : 'England Or Netherlands'}, 'England Or The Netherlands', inplace=True)

# Flag ambiguous E entries
status_dict.update(dict.fromkeys(['East Germany-Austria-Hungary','Egypt Or Mesopotamia',
            'Egypt Or Syria','England And Switzerland','England And United States',
            'England Or American', 'England Or France', 'England Or Hungary (?)', 
            'England Or Ireland','England Or The Netherlands','England Or United States',
            'Ethiopia Or Kenya'
            ], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Egypt (?)','England (?)'], 'unclear_possible'))

df.replace({'country_cleaned' : 'Fiji Islands'}, 'Fiji', inplace=True)
# Record Flanders as the culture for every Flemish variant of the country name
df.loc[(raw_country_title == 'Flanders (And Italy)'), 'culture'] = "Flanders"
df.loc[(raw_country_title == 'Flanders/Belgium'), 'culture'] = "Flanders"
df.loc[(raw_country_title == 'Flanders, Europe'), 'culture'] = "Flanders"
df.loc[(raw_country_title == 'Flemish'), 'culture'] = "Flanders"
df.replace({'country_cleaned' : 'Flemish'}, 'Flanders', inplace=True)
df.replace({'country_cleaned' : 'France/Germany'}, 'France & Germany', inplace=True)
df.replace({'country_cleaned' : 'France?'}, 'France (?)', inplace=True)
df.replace({'country_cleaned' : 'France / Belgium'}, 'France And Belgium', inplace=True)
df.loc[(raw_country_title == 'France Or Flanders'), 'culture'] = "Flanders"
df.replace({'country_cleaned' : 'French Or Italian'}, 'France Or Italy', inplace=True)

# Flag ambiguous F entries
status_dict.update(dict.fromkeys(['Fiji, Tonga, Or Samoa Islands','Flanders Or France', 
            'Flanders, Europe', 'France & Germany', 'France (And England)',
            'France (Northern), Flanders Or Holland','France / Austria','France And Belgium',
            'France And Switzerland', 'France Or Belgium', 'France Or England',
            'France Or Flanders','France Or Italy','France Or Switzerland', 
            'France Or United States','France Or United States Or England',
             'France, Switzerland And London, England'], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['France (?)'], 'unclear_possible'))



In [10]:
# Replaced country names for countries G-I
# Fold regional and historical German names into 'Germany'
df.replace({'country_cleaned' : ['East Germany', 'Germany, Europe', 'Germany (Prussia)',
                                'Germany (Saxony)', 'Southern Germany', 'West Germany'
                                ]}, 'Germany', inplace=True)
df.replace({'country_cleaned' : ['Germany (Possibly)', 'Germany (Probably)', 'Germany?'
                                ]}, 'Germany (?)', inplace=True)
# Keep the historical German state in `culture` (matched on the raw `country` value)
df.loc[(df['country'] == 'Germany (Prussia)'), 'culture'] = "Prussia"
# Saxony rows get their own label rather than being filed under Prussia
df.loc[(df['country'] == 'Germany (Saxony)'), 'culture'] = "Saxony"
df.replace({'country_cleaned' : 'Germany/Italy?'}, 'Germany Or Italy', inplace=True)
df.replace({'country_cleaned' : ['Greek','Greek Islands']}, 'Greece', inplace=True)
df.replace({'country_cleaned' : 'Guatamala'}, 'Guatemala', inplace=True)

# Flag ambiguous G entries
status_dict.update(dict.fromkeys(['Gabon Or Republic Of The Congo', 'Germany / Holland',
            'Germany Or Austria', 'Germany Or France(?)', 'Germany Or Hungary (?)',
            'Germany Or Italy', 'Germany Or Netherlands', 'Germany Or United States',
            'Ghana / Togo', 'Ghana Or Burkina Faso', 'Ghana Or Togo', 'Greece Or Cyprus',
            'Guatemala Or Honduras'], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Germany (?)','Guatemala (Possibly)'], 'unclear_possible'))

df.loc[(df['country'] == 'Hispano-Moorish'), 'culture'] = 'Hispano-Moorish'

df.replace({'country_cleaned' : 'India (Probably) For The English Market'}, 
                               'India (?)', inplace=True)
df.replace({'country_cleaned' : ['Indian Export', 'Northern India']}, 'India', inplace=True)
df.replace({'country_cleaned' : ['Iran (Persia (Khourvin))', 'Iran (Persia)', 'Luristan, Persia',
                                 'Persia (Iran)']}, 'Iran', inplace=True)
df.loc[(df['country'] == 'Iran (Persia)'), 'culture'] = 'Persian'
# Both closing parentheses are required to match the name replaced just above
df.loc[(df['country'] == 'Iran (Persia (Khourvin))'), 'culture'] = 'Persian'
df.replace({'country_cleaned' : 'Sumeria (Iraq)'}, 'Iraq', inplace=True)
df.replace({'country_cleaned' : 'Israel (Probably)'}, 'Israel (?)', inplace=True)

# Flag ambiguous H-I entries
status_dict.update(dict.fromkeys(['India Or Burma', 'India Or China','India Or Pakistan',
            'Iran Or India', 'Iran Or Trasoxiana', 'Iran Or Turkey', 
            'Iran, Nishapur Or Samarkand, Uzbekistan', 'Iraq Or Iran', 'Israel And France',
            'Isreal / Palestine', 'Ivory Coast/Nigeria'], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Hungary (Probably)','India (?)', 'Iraq (?)', 'Ireland (?)',
            'Israel (?)'], 'unclear_possible'))


In [11]:
# Replaced country names for Italy
# Title-cased view of the raw `country` column. The culture assignments below are
# matched against this view with title-cased names, so 'Italy or Sicily' and
# 'Italy Or Sicily' in the raw data are both caught (the comparisons previously
# mixed 'or' and 'Or', so some of them could not match).
raw_country_title = df['country'].str.title()

df.replace({'country_cleaned' : ['Italy (Likely)', 'Sardinia (?)', 'Sicily (?)'
                                ]}, 'Italy (?)', inplace=True)
df.replace({'country_cleaned' : ['Flanders (And Italy)', 'Italy (Siculo-Campanian)', 
                                 'Italy Or Sardina', 'Italy Or Sardinia', 'Italy Or Sicily', 
                                 'Italy, Ancient Etruria', 'Itlay', 'Northern Italy', 
                                 'Sardinia Or Italy', 'Sardinia', 'Sicily', 'Sicily Or Italy',
                                 'Venice Or Padua, Italy',
                                ]},  'Italy', inplace=True)
# Keep the region / ancient culture that was folded into 'Italy' above
df.loc[(raw_country_title == 'Italy (Siculo-Campanian)'), 'culture'] = 'Siculo-Campanian'
df.loc[(raw_country_title == 'Italy Or Sardina'), 'culture'] = 'Sardinia'
df.loc[(raw_country_title == 'Italy Or Sardinia'), 'culture'] = 'Sardinia'
df.loc[(raw_country_title == 'Italy Or Sicily'), 'culture'] = 'Sicily'
df.loc[(raw_country_title == 'Italy, Ancient Etruria'), 'culture'] = 'Etruria'
df.loc[(raw_country_title == 'Sardinia'), 'culture'] = 'Sardinia'
df.loc[(raw_country_title == 'Sardinia (?)'), 'culture'] = 'Sardinia'
df.loc[(raw_country_title == 'Sardinia Or Italy'), 'culture'] = 'Sardinia'
df.loc[(raw_country_title == 'Sicily'), 'culture'] = 'Sicily'
df.loc[(raw_country_title == 'Sicily (?)'), 'culture'] = 'Sicily'
df.loc[(raw_country_title == 'Sicily Or Italy'), 'culture'] = 'Sicily'

# Flag ambiguous Italy entries
status_dict.update(dict.fromkeys(['Italy - Belgium', 'Italy / Greece', 'Italy And Antwerp',
            'Italy Or France', 'Italy Or Holland','Italy Or Spain', 
            'Italy, Flanders, Or France?'], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Italy (?)'], 'unclear_possible'))




In [12]:
# Countries J-L
# 1) Collapse spelling variants in `country_cleaned`; replacements run in the listed order.
for variants, canonical in [
    ('Japan (Or Korea?)', 'Japan Or Korea'),
    ('South Korea', 'Korea'),
    ('Laos, Possibly Vietnam', 'Laos Or Vietnam'),
    ('Northwest Laos', 'Laos'),
]:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

# 2) Keep the place name in `culture`, matched on the raw `country` value.
for raw_name, culture_label in [
    ('Jingdezhen, China', "Jingdezhen"),
    ('Luristan, Persia', "Persia"),
]:
    df.loc[df['country'] == raw_name, 'culture'] = culture_label

# 3) Flag ambiguous names (J-K entries first, then L entries).
status_dict.update({name: 'unclear_multiple' for name in (
    'Japan Or Korea', 'Kenya Or Tanzania',
    'Kyrgystan Or Uzbekistan', 'Kyrgyzstan Or Kazakhstan',
)})
status_dict.update({name: 'unclear_multiple' for name in (
    'Laos And North Vietnam', 'Laos Or Northern Thailand',
    'Laos Or Vietnam', "Liberia Or Cote D'Ivoire", 'Liberia Or Guinea',
    "Liberia/Côte D'Ivoire",
)})
status_dict.update({name: 'unclear_possible' for name in ('Laos (?)',)})


In [13]:
# Replaced country names M-N
df.replace({'country_cleaned' : 'Mesopotamia (Iraq)'}, 'Iraq', inplace=True)
# Keep Mesopotamia as the culture (matched on the raw `country` value)
df.loc[(df['country'] == 'Mesopotamia (Iraq)'), 'culture'] = 'Mesopotamia'
df.loc[(df['country'] == 'Mesopotamia'), 'culture'] = 'Mesopotamia'
df.replace({'country_cleaned' : 'Southeastern Mexico'}, 'Mexico', inplace=True)
df.replace({'country_cleaned' : 'Myanmar (Burma)'}, 'Myanmar', inplace=True)

# Flag ambiguous M entries
status_dict.update(dict.fromkeys(['Malawi Or Zimbabwe', 'Mali /Niger', 'Mali Or Burkina Faso',
            'Mali Or Nigeria', 'Mexico Or Costa Rica', 'Mexico Or Guatemala',
            'Middle East Or India', 'Mozambique Or Tanzania', 'Myanmar (Burma) / Laos'
            ], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Mexico (?)', 'Morocco (?)'], 'unclear_possible'))

# Replacement target is spelled 'New Zealand' (it was misspelled 'New Zeleand')
df.replace({'country_cleaned' : 'Aotearoa (New Zealand)'}, 'New Zealand', inplace=True)
df.replace({'country_cleaned' : ['Nehterlands', 'The Netherlands']}, 'Netherlands', inplace=True)
df.replace({'country_cleaned' : 'The Netherlands (Probably)'}, 'Netherlands (?)', inplace=True)
df.replace({'country_cleaned' : 'North Nigeria'}, 'Nigeria', inplace=True)


# Flag ambiguous N entries
status_dict.update(dict.fromkeys(['Netherlands / Flanders', 'Netherlands Or Germany',
            'Nicaragua Or Costa Rica', 'Niger Or Algeria', 'Niger Or Mali', 'Nigeria Or Benin',
            'Nigeria Or Cameroon', 'North Thailand Or Laos',], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Netherlands (?)', 'Niger (?)', 'Norway (?)'
            ], 'unclear_possible'))


In [14]:
# Replaced country names P-R
df.loc[(df['country'] == 'Persia (Iran)'), 'culture'] = 'Persian'
df.loc[(df['country'] == 'Persia'), 'culture'] = 'Persian'
df.replace({'country_cleaned' : ['Philippine Islands', 'Phillipines']}, 'Philippines', inplace=True)
df.replace({'country_cleaned' : ['Poland (Possibly)', 'Poland (Probably)']}, 'Poland (?)', inplace=True)


status_dict.update(dict.fromkeys(['Pakistan (Or Afghanistan?)','Pakistan Or India',
            'Panama Or Costa Rica', 'Poland Or East Germany'], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Peru (?)', 'Poland (?)'], 'unclear_possible'))

df.replace({'country_cleaned' : 'Dagestan'}, 'Russia', inplace=True)
df.replace({'country_cleaned' : ['Russia (Possibly)', 'Russia (Probably)']}, 'Russia (?)', inplace=True)

status_dict.update(dict.fromkeys(['Russia And Germany','Rwanda Or Burundi'], 'unclear_multiple'))
status_dict.update(dict.fromkeys(['Romania(?)', 'Russia (?)'], 'unclear_possible'))


In [15]:
# Countries S-T
# 1) Collapse spelling variants in `country_cleaned`; replacements run in the listed order.
for variants, canonical in [
    ('Scotland (Probably)', 'Scotland (?)'),
    ('Hispano-Moorish', 'Spain'),
    ('Republic Of The Sudan', 'Sudan'),
    ('Switzerland/France', 'Switzerland / France'),
    (['Central Thailand', 'Northern Thailand', "Siam"], 'Thailand'),
    ('Western Tibet Or Central Regions, Tibet', 'Tibet'),
]:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

# 2) Flag ambiguous names (S entries first, then T entries).
status_dict.update({name: 'unclear_multiple' for name in (
    'Scotland And England', 'Senegal Or Guinea',
    'Senegal Or Mauritania', 'Spain Or Austria', 'Spain Or Italy', 'Spain Or Phiippines',
    'Spain Or Portugal', 'Switzerland / France', 'Switzerland Or France',
    'Syria Or Turkey',
)})
status_dict.update({name: 'unclear_possible' for name in (
    'Scotland (?)', 'Spain (?)', 'Switzerland (?)',
)})
status_dict.update({name: 'unclear_multiple' for name in (
    'Tajikistan Or Uzbekistan', 'Tanzania Or Mozambique',
    'Thailand Or Burma', 'Thailand Or Laos', 'Thailand/Laos', 'Tibet Or Mongolia',
    'Turkey Or Greece', 'Turkey Or Iran', 'Turkey Or Kurdistan',
    'Turkey Or North Africa', 'Turkmenistan Or Uzbekistan',
)})
status_dict.update({name: 'unclear_possible' for name in ('Turkey (?)',)})


In [16]:
# Countries U
# 1) Collapse spelling variants in `country_cleaned`; replacements run in the listed order.
for variants, canonical in [
    (['New England, United States', 'Unted States',
      'United States Of America', 'American'], 'United States'),
    (['Probably United States', 'United States (Possibly)',
      'United States (Probably)', 'United States?'], 'United States (?)'),
    (['Bokhara', 'Bukhara', "Qoraqalpog'Iston",
      "Qoraqalpog'Iston / Uzbekistan", 'Uzbekestan'], 'Uzbekistan'),
]:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

# 2) Flag ambiguous names.
status_dict.update({name: 'unclear_multiple' for name in (
    'United States And England', 'United States And Europe',
    'United States And France', 'United States And Japan', 'United States And Mexico',
    'United States Or Canada', 'United States Or England', 'United States Or England (?)',
    'United States Or Great Britain', 'United States Or Great Britain (?)',
    'Uzbekistan Or Afghanistan', 'Uzbekistan Or Tajikistan',
)})
status_dict.update({name: 'unclear_possible' for name in (
    'United Kingdom (?)', 'United States (?)',
)})


In [17]:
# Countries V-Z
# 1) Collapse spelling variants in `country_cleaned`.
for variants, canonical in [
    ('Northern Vietnam', 'Vietnam'),
    ('Republic Of Yemen', 'Yemen'),
]:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

# 2) Flag ambiguous names.
status_dict.update({name: 'unclear_multiple' for name in (
    'Vietnam Or Cambodia', 'Vietnam Or Laos',
    'Vietnam Or Thailand (?)', 'Zambia Or Botswana',
    'Zambia Or Democratic Republic Of Congo',
)})
status_dict.update({name: 'unclear_possible' for name in ('Vietnam (?)', 'Wales (?)')})


In [18]:
# Double check dictionary: display every flagged country name with its status label
status_dict

{'Africa': 'unclear_continent',
 'Central Asia': 'unclear_continent',
 'Europe': 'unclear_continent',
 'Europe Or Asia': 'unclear_continent',
 'West Africa': 'unclear_continent',
 'Afghanistan Or Pakistan': 'unclear_multiple',
 'Afghanistan Or Uzbekistan': 'unclear_multiple',
 'Algeria, Mali, Or Niger': 'unclear_multiple',
 'Angola Or Democratic Republic Of The Congo': 'unclear_multiple',
 'Angola, Zambia Or Democratic Republic Of Congo': 'unclear_multiple',
 'Armenia Or Syria': 'unclear_multiple',
 'Austria Or Germany': 'unclear_multiple',
 'Austria (?)': 'unclear_possible',
 'Belgium Or France': 'unclear_multiple',
 'Burkina Faso Or Côte D’Ivoire': 'unclear_multiple',
 'Burkina Faso Or Mali': 'unclear_multiple',
 'Bohemia (?)': 'unclear_possible',
 'Cambodia Or Thailand': 'unclear_multiple',
 'China Or India': 'unclear_multiple',
 'Costa Rica Or Panama': 'unclear_multiple',
 "Côte D'Ivoire Or Liberia": 'unclear_multiple',
 "Côte D'Ivoire Or Mali": 'unclear_multiple',
 'Cameroon(?)': 

In [19]:
# Look up each cleaned country name in status_dict; names without an entry come back as NaN
mapped_status = df['country_cleaned'].map(status_dict)
# Fall back to the existing 'country_status' value (the 'clear' default) where no flag applies
df['country_status'] = mapped_status.fillna(df['country_status'])
# Show the distinct status labels now present
df['country_status'].unique()

array(['clear', 'unclear_multiple', 'unclear_possible',
       'unclear_continent'], dtype=object)

In [None]:
# TODO: draft replacements (commented out, not applied) and country names that still need code or a lookup
# df.replace({'tableau_country' : ['Limoges','Lorraine']}, 'France', inplace=True)
# df.replace({'tableau_country' : ['Bavaria']},'Germany', inplace=True)
# df.replace({'tableau_country' : ['Babylon','Mesopotamia (Iraq)', 'Sumeria (Iraq)'
#                                 ]}, 'Iraq', inplace=True)
# df.replace({'tableau_country' : 'Fakara Lamedi'}, 'Niger', inplace=True)
# df.replace({'tableau_country' : ['Southeastern Mexico', 'Tarascan']}, 'Mexico', inplace=True)
# df.replace({'tableau_country' : ['Syberia']}, 'Russia', inplace=True)
# df.replace({'tableau_country' : 'Anatolia'}, 'Turkey', inplace=True)

# Anatolia
# Arabia
# Austro_hungarian Empire
# Balkans
# Bavaria
# Borneo
# Czechoslovokia
# Etruria
# Flanders
# Graeco-Roman
# Khmer
# Kurdistan
# Babylon

# Mesopotamia
# Micornesian Islands
# New Guinea
# New Zeleand
# Northern Ireland
# Nubia
# Ottoman Empire
# Persia
# Probably Europe
# Ptolemaic Egypt
# Roman
# Roman Empire
# Roman/Near Eastern
# Syberia
# Tarascan
# Tibet
# Tonga Islands
# Yugoslavia

# Limoges
# Lorraine


In [None]:
# Historical countries with multiple current countries or cultures that span two countries:
# Arabia, Austro_hungarian Empire, Balkans, Borneo, Czechoslovakia, Flanders (Europe)
## Graeco-Roman, Khmer, Kurdistan, Mesopotamia, Micronesian Islands, New Guinea
## Nubia, Ottoman Empire, Persia, Ptolemaic Egypt, Roman, Tibet, Yugoslavia

## Tableau Country Names
Pulled the data frame into Tableau and found 311 entries whose country Tableau could not identify. The countries Tableau didn't recognize were fixed using the following steps (unclear data was not changed, e.g. 'India or China'):
* Make test output file and read into Tableau
* Pull up unknown values from bottom corner of map, and update if possible
    * Created a new column for just Tableau values
    * Changed country name values to modern country if possible, updated culture with any extraneous values
* Divide by alphabet to help keep track of changes

In [20]:
# Start the Tableau-specific column from the cleaned names; the replacements in the
# following cells target only 'tableau_country', leaving 'country_cleaned' as is
df['tableau_country'] = df['country_cleaned']

In [21]:
# Tableau names A-K: map islands and historical regions onto the country name used for Tableau
for variants, canonical in [
    ('Rapa Nui (Easter Island)', 'Chile'),
    ('Bohemia', 'Czechia'),
    (['Austral Islands', 'Marquesas Islands', 'Society Islands'], 'French Polynesia'),
    (['Bali', 'Java', 'Kalimanta', 'Lombok Island', 'Papua'], 'Indonesia'),
]:
    df.replace({'tableau_country': variants}, canonical, inplace=True)

In [22]:
# Tableau names L-S: same approach, applied in the listed order
for variants, canonical in [
    (['Admiralty Islands', 'New Ireland'], 'Papua New Guinea'),
    ('Korea', 'South Korea'),
    ('Canary Islands', 'Spain'),
    ('Dutch Guiana (Suriname)', 'Suriname'),
]:
    df.replace({'tableau_country': variants}, canonical, inplace=True)

In [23]:
# Tableau names T-Z: the UK constituent countries are rolled up into 'United Kingdom'
for variants, canonical in [
    ('Tonga Islands', 'Tonga'),
    (['England', 'Scotland', 'Northern Ireland', 'Wales'], 'United Kingdom'),
    ('Banks Islands', 'Vanuatu'),
]:
    df.replace({'tableau_country': variants}, canonical, inplace=True)


In [24]:
# Write the frame, including the updated country columns, to a new CSV for visualizations
output_datafile = 'resources/Mia_objects_country.csv'
df.to_csv(
    output_datafile,
    encoding='utf-8',
)