In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the cleaned accession data CSV that the next cell loads
csv_file = 'resources/Mia_objects_accession_year.csv'

In [61]:
# Load the accession data, using the CSV's 'Unnamed: 0' column as the index,
# then preview the first rows.
df = pd.read_csv(
    csv_file,
    index_col='Unnamed: 0',
    low_memory=False,
)
df.head()

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,object_name,provenance,room,style,title,RBL,RB,X,loan,accession_year
0,10.1,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c.1888-89,Prints and Drawings,0,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Air, from the series The Four Elements",,,,,10
1,10.2,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,1,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Earth, from the series The Four Elements",,,,,10
2,10.3,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,2,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",G352,19th century,"Fire, from the series The Four Elements",,,,,10
3,10.4,Frederick G. Smith; Artist: Formerly attribute...,Drawings,Europe,England,"Gift of Mrs. C. J. Martin, in memory of Charle...",,c. 1888-89,Prints and Drawings,3,...,Drawing,"[Art dealer, London, acquired from ""an old hou...",Not on View,19th century,"Water, from the series The Four Elements",,,,,10
4,13.29,Walter Shirlaw,Drawings,North America,United States,Gift of Mrs. Florence M. Shirlaw,,19th century,Prints and Drawings,4,...,Drawing,,Not on View,19th century,Montana Indian Reservation I,,,,,13


## Fix Typos in Country Data
There are 505 distinct country entries, most of which apply to only 1-2 objects.
* Using groupby, read through country data in batches
    * e.g. df.groupby('country_cleaned').filter(lambda x: len(x) > 10)
    * This gives a list of all country entries with more than 10 objects
* Update all unclear entries with "Unclear"
* Fix typos and formatting differences
* Changed to modern country if possible, added the province or older country name to the Culture tab

* Divided by alphabet to help keep track of changes

In [82]:
# Add a column for cleaned country data.
# Trim surrounding whitespace and any trailing commas, then title-case the
# result so entries are capitalised consistently.
# (The previous `.str.rstrip('')` call stripped nothing: an empty character
# set removes no characters, so trailing commas were left in place.)
df['country_cleaned'] = (
    df['country']
    .str.strip()
    .str.rstrip(', ')
    .str.title()
)


In [83]:
# Collect the distinct cleaned country values, cast them to strings,
# and keep a plain Python list for browsing.
countries = df['country_cleaned'].unique().astype(str)
country_list = countries.tolist()

In [91]:
# Sort the country names in place, then preview the first fifteen
country_list.sort()
country_list[:15]

['Admiralty Islands',
 'Afghanistan',
 'Afghanistan Or Pakistan',
 'Afghanistan Or Uzbekistan',
 'Africa',
 'Albania',
 'Algeria',
 'Algeria, Mali, Or Niger',
 'American',
 'Anatolia',
 'Angola',
 'Angola / Drc',
 'Angola Or Democratic Republic Of Congo',
 'Angola Or Democratic Republic Of The Congo',
 'Angola, Zambia Or Democratic Republic Of Congo']

In [86]:
# Display the full list of country names so it can be read through
country_list

['Admiralty Islands',
 'Afghanistan',
 'Afghanistan Or Pakistan',
 'Afghanistan Or Uzbekistan',
 'Africa',
 'Albania',
 'Algeria',
 'Algeria, Mali, Or Niger',
 'American',
 'Anatolia',
 'Angola',
 'Angola / Drc',
 'Angola Or Democratic Republic Of Congo',
 'Angola Or Democratic Republic Of The Congo',
 'Angola, Zambia Or Democratic Republic Of Congo',
 'Aotearoa (New Zealand)',
 'Arabia',
 'Argentina',
 'Armenia',
 'Armenia Or Syria',
 'Austral Islands',
 'Australia',
 'Austria',
 'Austria (?)',
 'Austria Or Germany',
 'Austro-Hungarian Empire',
 'Azerbaijan',
 'Babylon',
 'Bali',
 'Balkan',
 'Balkans',
 'Bangladesh',
 'Banks Islands',
 'Bavaria',
 'Belgium',
 'Belgium (Flanders)',
 'Belgium Or France',
 'Benin',
 'Bhutan',
 'Bohemia',
 'Bohemia (?)',
 'Bokhara',
 'Bolivia',
 'Borneo',
 'Bosnia',
 'Botswana',
 'Brazil',
 'Britain',
 'Bukhara',
 'Bulgaria',
 'Burkina Faso',
 'Burkina Faso Or Côte D’Ivoire',
 'Burkina Faso Or Mali',
 'Burma',
 'Cambodia',
 'Cambodia Or Thailand',
 'Camer

In [69]:
# Split the rows by how many objects share their cleaned country value:
# countries with more than 500 objects versus those with 500 or fewer.
# The final line counts the raw `country` values among the smaller groups.
by_country = df.groupby('country_cleaned')
df_grouped = by_country.filter(lambda grp: len(grp) > 500)
df_remainder = by_country.filter(lambda grp: len(grp) <= 500)
df_remainder['country'].value_counts().head(20)

Austria                             499
Ghana                               424
Turkey                              364
Nigeria                             361
Uzbekistan                          328
Democratic Republic of the Congo    316
Korea                               306
Morocco                             303
Egypt                               275
Holland                             268
Myanmar (Burma)                     242
Mali                                240
Iran                                239
Russia                              232
Canada                              223
Syria                               222
Czechoslovakia                      218
Ireland                             218
Israel                              207
Indonesia                           197
Name: country, dtype: int64

In [64]:
# Country clean-up, entries under N: drop the leading article from the Netherlands
df.replace(
    to_replace={'country_cleaned': 'The Netherlands'},
    value='Netherlands',
    inplace=True,
)


In [None]:
# Collapse the Netherlands spelling variants in tableau_country.
# NOTE(review): the same replacement appears in the Tableau L-P cell further
# down; confirm whether this cell is still needed.
df.replace(
    to_replace={'tableau_country': ['The Netherlands', 'Nehterlands', 'Netherlands/Flanders']},
    value='Netherlands',
    inplace=True,
)

In [50]:
# Inspect the rows whose raw country value is exactly 'Flanders'
test = df.loc[df['country'] == 'Flanders']
test

Unnamed: 0,accession_number,artist,classification,continent,country,creditline,culture,dated,department,id,...,room,style,title,RBL,RB,X,loan,accession_year,tableau_country,country_cleaned
49,15.208,,Textiles,Europe,Flanders,Given in memory of Mrs. Thomas Lowry by her ch...,,c. 1580-1620,Textiles,55,...,Not on View,16th-17th century,Deer Hunt with the Story of Diana and Actaeon,,,,,15,Belgium,Flanders
100,16.721,,Textiles,Europe,Flanders,Gift of Mrs. C. J. Martin for the Charles Jair...,,c. 1460-1485,Textiles,109,...,Not on View,15th century,Esther and Ahasuerus,,,,,16,Belgium,Flanders
511,37.17,,Textiles,Europe,Flanders,The John R. Van Derlip Fund,,c. 1507-1540,Textiles,555,...,Not on View,16th century,The Journey and Temptations of the Prodigal Son,,,,,37,Belgium,Flanders
10684,60.30,,Textiles,Europe,Flanders,Gift of Philip W. Pillsbury,,late 16th century,Textiles,11631,...,Not on View,16th century,Offering to the Goddess Ceres,,,,,60,Belgium,Flanders
12612,21.205,,Textiles,Europe,Flanders,Gift of Countess Phelps Resse,,18th century,Textiles,13584,...,Not on View,18th century,Border,,,,,21,Belgium,Flanders
12728,"21.360a,b",,Textiles,,Flanders,Gift of Countess Phelps Resse,,19th century,Textiles,13724,...,Not on View,19th century,Lappet?,,,,,21,Belgium,Flanders
13157,23.60,,Textiles,Europe,Flanders,Gift of Mrs. John Washburn,,16th century,Textiles,14154,...,Not on View,16th century,Large-Leaf Verdure Tapestry Fragment with Birds,,,,,23,Belgium,Flanders
13230,25.415,,Textiles,Europe,Flanders,,,19th century,Textiles,14227,...,Not on View,19th century,Border,,,,,25,Belgium,Flanders
13236,25.414,,Textiles,Europe,Flanders,,,18th century,Textiles,14233,...,Not on View,18th century,Collar,,,,,25,Belgium,Flanders
13241,25.413,,Textiles,Europe,Flanders,,,19th century,Textiles,14238,...,Not on View,19th century,Edging,,,,,25,Belgium,Flanders


In [None]:
# Country clean-up for entries A-G: each (variants, canonical) pair is applied,
# in order, to the country_cleaned column.
# The French Polynesia merge (Austral / Marquesas / Society Islands) was left
# commented out in this cell; it is applied to tableau_country in the Tableau
# section instead.
cleaned_fixes_a_to_g = [
    ('East Bali', 'Bali'),
    ('Flanders/Belgium', 'Belgium (Flanders)'),
    ('Flemish', 'Flanders'),
    (['Jingdezhen, China', 'Southern China'], 'China'),
    ('Fiji Islands', 'Fiji'),
    (['Limoges', 'Lorraine'], 'France'),
    (['Germany (Saxony)', 'West Germany', 'East Germany', 'Bavaria',
      'Germany, Europe', 'Germany (Prussia)', 'Southern Germany'], 'Germany'),
    (['Greek Islands', 'Greek'], 'Greece'),
    ('Guatamala', 'Guatemala'),
]
for variants, canonical in cleaned_fixes_a_to_g:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)


In [None]:
# Country clean-up for entries I-K: each (variants, canonical) pair is applied,
# in order, to the country_cleaned column.
# The Indonesia, Iran and Iraq merges were left commented out in this cell;
# they are applied to tableau_country in the Tableau section instead.
cleaned_fixes_i_to_k = [
    ('Indian Export', 'India'),
    (['Persia (Iran)', 'Luristan, Persia', 'Iran (Persia (Khourvin))'],
     'Iran (Persia)'),
    (['Italy Or Sicily', 'Sicily', 'Itlay', 'Etruria', 'Italy, Ancient Etruria',
      'Italy Or Sardinia', 'Italy Or Sardina', 'Italy or Sicily',
      'Italy (Siculo-Campanian)', 'Northern Italy', 'Sardinia',
      'Sardinia Or Italy', 'Sicily Or Italy', 'Venice Or Padua, Italy'], 'Italy'),
]
for variants, canonical in cleaned_fixes_i_to_k:
    df.replace({'country_cleaned': variants}, canonical, inplace=True)

In [None]:
# Replacements for entries L-P, applied in order to tableau_country.
# NOTE(review): these pairs also appear in the Tableau L-P cell further down;
# confirm whether this cell should target country_cleaned or be removed.
draft_fixes_l_to_p = [
    ('Northwest Laos', 'Laos'),
    (['Southeastern Mexico', 'Tarascan'], 'Mexico'),
    (['Aotearoa (New Zealand)', 'New Zeleand'], 'New Zealand'),
    ('Fakara Lamedi', 'Niger'),
    ('North Nigeria', 'Nigeria'),
    (['Admiralty Islands', 'New Ireland'], 'Papua New Guinea'),
    ('Philippine Islands', 'Philippines'),
]
for variants, canonical in draft_fixes_l_to_p:
    df.replace({'tableau_country': variants}, canonical, inplace=True)

In [None]:
# Replacements for entries Q-S, applied in order to tableau_country.
# NOTE(review): these pairs also appear in the Tableau Q-S cell further down;
# confirm whether this cell should target country_cleaned or be removed.
draft_fixes_q_to_s = [
    (['Dagestan', 'Syberia'], 'Russia'),
    ('Korea', 'South Korea'),
    (['Canary Islands', 'Hispano-Moorish'], 'Spain'),
    ('Dutch Guiana (Suriname)', 'Suriname'),
]
for variants, canonical in draft_fixes_q_to_s:
    df.replace({'tableau_country': variants}, canonical, inplace=True)

In [None]:
# Replacements for entries T-Z, applied in order to tableau_country.
# NOTE(review): these pairs also appear in the Tableau T-Z cell further down;
# confirm whether this cell should target country_cleaned or be removed.
draft_fixes_t_to_z = [
    (['Siam', 'Central Thailand', 'Northern Thailand'], 'Thailand'),
    ('Tonga Islands', 'Tonga'),
    ('Anatolia', 'Turkey'),
    (['England', 'Scotland', 'Engalnd', 'Great Britain',
      'Northern Ireland', 'Scotland And England', 'Wales'], 'United Kingdom'),
    (['New England, United States', 'Unted States'], 'United States'),
    (['Bokhara', 'Bukhara', "Qoraqalpog'Iston / Uzbekistan",
      "Qoraqalpog'Iston", 'Uzbekestan'], 'Uzbekistan'),
    ('Banks Islands', 'Vanuatu'),
    ('Northern Vietnam', 'Vietnam'),
]
for variants, canonical in draft_fixes_t_to_z:
    df.replace({'tableau_country': variants}, canonical, inplace=True)

## Tableau Country Names
Pulled the data frame into Tableau and found 311 values that Tableau could not identify as countries. The countries Tableau didn't recognize were fixed using the following steps (unclear data was not changed, e.g. 'India or China'):
* Make test output file and read into Tableau
* Pull up unknown values from bottom corner of map, and update if possible
    * Created a new column for just Tableau values
    * Changed country name values to modern country if possible, updated culture with any extraneous values
* Divide by alphabet to help keep track of changes

In [6]:
# Tableau-friendly country names, entries A-C.
#
# No visible earlier cell creates `tableau_country`, and DataFrame.replace
# ignores dict keys that are not existing columns. On a fresh kernel every
# `tableau_country` replacement would therefore be a silent no-op and the
# column would be absent from the CSV written at the end of the notebook.
# Seed the column here when it is missing; an existing column is left as is.
# NOTE(review): seeding from `country_cleaned` is an assumption (the variant
# lists match title-cased values) - confirm against the original workflow.
if 'tableau_country' not in df.columns:
    df['tableau_country'] = df['country_cleaned']

# Each (variants, canonical) pair is applied, in order, to tableau_country.
tableau_fixes_a_to_c = [
    (['Flanders/Belgium', 'Belgium (Flanders)', 'Flemish', 'Flanders'], 'Belgium'),
    ('Dahomey', 'Benin'),
    ('Rapa Nui (Easter Island)', 'Chile'),
    (['Jingdezhen, China', 'Chinese Export', 'Southern China'], 'China'),
    ('Bohemia', 'Czechia'),
]
for variants, canonical in tableau_fixes_a_to_c:
    df.replace({'tableau_country': variants}, canonical, inplace=True)


In [7]:
# Tableau-friendly country names, entries F-G: each (variants, canonical)
# pair is applied, in order, to tableau_country.
tableau_fixes_f_to_g = [
    ('Fiji Islands', 'Fiji'),
    (['Limoges', 'Lorraine'], 'France'),
    (['Austral Islands', 'Marquesas Islands', 'Society Islands'],
     'French Polynesia'),
    (['Germany (Saxony)', 'West Germany', 'East Germany', 'Bavaria',
      'Germany, Europe', 'Germany (Prussia)', 'Southern Germany'], 'Germany'),
    (['Greek Islands', 'Greek'], 'Greece'),
    ('Guatamala', 'Guatemala'),
]
for variants, canonical in tableau_fixes_f_to_g:
    df.replace({'tableau_country': variants}, canonical, inplace=True)


In [8]:
# Tableau-friendly country names, entries I-K: each (variants, canonical)
# pair is applied, in order, to tableau_country.
tableau_fixes_i_to_k = [
    ('Indian Export', 'India'),
    (['Bali', 'East Bali', 'Java', 'Kalimanta', 'Lombok Island', 'Papua'],
     'Indonesia'),
    (['Persia (Iran)', 'Luristan, Persia', 'Iran (Persia)',
      'Iran (Persia (Khourvin))'], 'Iran'),
    (['Babylon', 'Mesopotamia (Iraq)', 'Sumeria (Iraq)'], 'Iraq'),
    (['Italy Or Sicily', 'Sicily', 'Itlay', 'Etruria', 'Italy, Ancient Etruria',
      'Italy Or Sardinia', 'Italy Or Sardina', 'Italy or Sicily',
      'Italy (Siculo-Campanian)', 'Northern Italy', 'Sardinia',
      'Sardinia Or Italy', 'Sicily Or Italy', 'Venice Or Padua, Italy'], 'Italy'),
]
for variants, canonical in tableau_fixes_i_to_k:
    df.replace({'tableau_country': variants}, canonical, inplace=True)

In [9]:
# Tableau-friendly country names, entries L-P: each (variants, canonical)
# pair is applied, in order, to tableau_country.
tableau_fixes_l_to_p = [
    ('Northwest Laos', 'Laos'),
    (['Southeastern Mexico', 'Tarascan'], 'Mexico'),
    (['The Netherlands', 'Nehterlands', 'Netherlands/Flanders'], 'Netherlands'),
    (['Aotearoa (New Zealand)', 'New Zeleand'], 'New Zealand'),
    ('Fakara Lamedi', 'Niger'),
    ('North Nigeria', 'Nigeria'),
    (['Admiralty Islands', 'New Ireland'], 'Papua New Guinea'),
    ('Philippine Islands', 'Philippines'),
]
for variants, canonical in tableau_fixes_l_to_p:
    df.replace({'tableau_country': variants}, canonical, inplace=True)


In [10]:
# Tableau-friendly country names, entries Q-S: each (variants, canonical)
# pair is applied, in order, to tableau_country.
tableau_fixes_q_to_s = [
    (['Dagestan', 'Syberia'], 'Russia'),
    ('Korea', 'South Korea'),
    (['Canary Islands', 'Hispano-Moorish'], 'Spain'),
    ('Dutch Guiana (Suriname)', 'Suriname'),
]
for variants, canonical in tableau_fixes_q_to_s:
    df.replace({'tableau_country': variants}, canonical, inplace=True)


In [11]:
# Tableau-friendly country names, entries T-Z: each (variants, canonical)
# pair is applied, in order, to tableau_country.
tableau_fixes_t_to_z = [
    (['Siam', 'Central Thailand', 'Northern Thailand'], 'Thailand'),
    ('Tonga Islands', 'Tonga'),
    ('Anatolia', 'Turkey'),
    (['England', 'Scotland', 'Engalnd', 'Great Britain',
      'Northern Ireland', 'Scotland And England', 'Wales'], 'United Kingdom'),
    (['New England, United States', 'Unted States'], 'United States'),
    (['Bokhara', 'Bukhara', "Qoraqalpog'Iston / Uzbekistan",
      "Qoraqalpog'Iston", 'Uzbekestan'], 'Uzbekistan'),
    ('Banks Islands', 'Vanuatu'),
    ('Northern Vietnam', 'Vietnam'),
]
for variants, canonical in tableau_fixes_t_to_z:
    df.replace({'tableau_country': variants}, canonical, inplace=True)


In [17]:
# Historical countries that map to multiple current countries, or cultures that span two countries:
# Arabia, Austro-Hungarian Empire, Balkans, Borneo, Czechoslovakia, Flanders (Europe),
# Graeco-Roman, Khmer, Kurdistan, Mesopotamia, Micronesian Islands, New Guinea,
# Nubia, Ottoman Empire, Persia, Ptolemaic Egypt, Roman, Tibet, Yugoslavia

In [18]:
# Write the objects with their updated country names to a new CSV that the
# visualizations read from.
output_datafile = 'resources/Mia_objects_country.csv'
df.to_csv(path_or_buf=output_datafile, encoding='utf-8')