# Mapping Countries & States to 2-Letter Codes with Python Dictionaries

September, 2021

I've been surprised by how many comments I get on my gists of Python dictionaries for mapping to 2-letter abbreviations, both for [states](https://gist.github.com/rogerallen/1583593) and for [countries](https://gist.github.com/rogerallen/1583606).  Who knows, this might be the most popular code I've ever written!  :-)

So, I decided to automate the process of keeping them up-to-date.  Countries are created periodically, states/provinces can change, and people always want slight tweaks to fit their own use cases.  Wikipedia itself will also change over time, requiring updates.

Looking around, I found that Wikipedia not only has a list of country mappings, but also links to every country's subdivisions.  Using `requests`, `BeautifulSoup` and `Pandas`, I was able to automate the process of scraping the site to keep these gists up-to-date.  

Feel free to use this code for your own purposes.  It is dedicated to the public domain.  To the extent possible under law, Roger Allen has waived all copyright and related or neighboring rights to this code.


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime

In [2]:
# URL of the gist holding this notebook; it is embedded in the header of
# every generated gist so readers can find the generator.
this_code_gist_url = "https://gist.github.com/rogerallen/d75440e8e5ea4762374dfd5c1ddf84e0"

# We can't get this data from ISO, so let's use Wikipedia instead.
site = "https://en.wikipedia.org"
iso3166_url = f"{site}/wiki/ISO_3166-1_alpha-2"

def get_iso3166_countries():
    """Scrape the ISO 3166-1 alpha-2 country table from Wikipedia.

    Downloads the page at ``iso3166_url``
    (https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) and reads the 3rd table
    on it, which is expected to be the one inside the section 'Officially
    assigned code elements' with columns labelled Code, Country name, Year,
    ccTLD, ISO 3166-2 and Notes.

    Returns:
        A Pandas DataFrame with one column per table header.  Each cell holds
        the text of the table cell, except for the 'ISO 3166-2' column, which
        holds the href of the first link in the cell (None when the cell has
        no link).  If the server answers with an HTTP error status, an error
        line is printed and None is returned.

    Raises:
        ValueError: if the page contains fewer tables than expected.
    """
    THE_TABLE = 2  # !!! Assumes required table is 3rd one !!!
    # grab web page
    url = iso3166_url
    r = requests.get(url)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        # The exception class has to be reached through the requests module;
        # a bare `HTTPError` is not defined anywhere in this notebook.
        print(f"ERROR: {r.status_code} accessing url: {url}")
        return None
    # convert to soup
    soup = BeautifulSoup(r.text, 'html.parser')
    # find THE_TABLE; fail loudly instead of silently parsing some other table
    tables = soup.find_all('table')
    if len(tables) <= THE_TABLE:
        raise ValueError(
            f"expected at least {THE_TABLE + 1} tables at {url}, "
            f"found {len(tables)}"
        )
    table = tables[THE_TABLE]
    rows = table.find_all('tr')
    # the first row carries the headers: column number -> column label
    header_cells = rows[0].find_all('th') if rows else []
    columns = {
        j: th.text.replace(' (using title case)', '').strip()
        for j, th in enumerate(header_cells)
    }
    # table's data, dict of arrays keyed by column label
    data = {col: [] for col in columns.values()}
    for tr in rows[1:]:
        # grab text for each td in tr, except ISO 3166-2, grab link
        for j, td in enumerate(tr.find_all('td')):
            col = columns[j]
            if col != 'ISO 3166-2':
                data[col].append(td.text)
            else:
                a = td.find('a')
                # keep the column aligned even when a cell has no link
                data[col].append(a.get('href') if a is not None else None)
    # convert dictionary to dataframe
    return pd.DataFrame(data)
        
def get_iso3166_subdivisions(country_df,country_code):
    """Scrape the ISO 3166-2 subdivision table for one country from Wikipedia.

    The country dataframe has links to Wikipedia subdivision tables for each
    country.  E.g. for the U.S. it has States & Territories.

    Args:
        country_df: DataFrame with a 'Code' column and an 'ISO 3166-2' column
            holding site-relative links, as built by get_iso3166_countries().
        country_code: value to look up in the 'Code' column, e.g. 'US'.

    Returns:
        A Pandas DataFrame with one column per header of the first table on
        the linked page; each cell holds the stripped cell text.  If the
        server answers with an HTTP error status, an error line is printed
        and None is returned.

    Raises:
        ValueError: if country_code is not present in country_df, if the
            matching row has no subdivision link, or if the linked page
            contains no table.
    """
    links = country_df[country_df['Code'] == country_code]['ISO 3166-2'].values
    if len(links) == 0:
        raise ValueError(f"unknown country code: {country_code!r}")
    href = links[0]
    if not isinstance(href, str):
        raise ValueError(f"no ISO 3166-2 link for country code: {country_code!r}")
    url = site + href
    r = requests.get(url)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        # The exception class has to be reached through the requests module;
        # a bare `HTTPError` is not defined anywhere in this notebook.
        print(f"ERROR: {r.status_code} accessing url: {url}")
        return None
    # convert to soup
    soup = BeautifulSoup(r.text, 'html.parser')
    # assume the first table on the page is the one containing the data.
    table = soup.find('table')
    if table is None:
        raise ValueError(f"no table found at {url}")
    rows = table.find_all('tr')
    # the first row carries the headers: column number -> column label
    header_cells = rows[0].find_all('th') if rows else []
    columns = {j: th.text.strip() for j, th in enumerate(header_cells)}
    # table's data, dict of arrays keyed by column label
    data = {col: [] for col in columns.values()}
    for tr in rows[1:]:
        # grab text for each td in tr
        for j, td in enumerate(tr.find_all('td')):
            data[columns[j]].append(td.text.strip())
    # convert dictionary to dataframe
    return pd.DataFrame(data)
        
def print_country_gist():
    """Print Python source for the country <-> two-letter code gist.

    Takes a timestamp, fetches the country table via get_iso3166_countries(),
    then writes to stdout: a comment header, a ``country_to_abbrev`` dict
    literal with one entry per table row ('Country name' -> 'Code'), and a
    footer that builds the inverted ``abbrev_to_country`` dict.
    """
    # Timestamp is taken before the download, so it marks when the run began.
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    country_df = get_iso3166_countries()
    # header: provenance comments plus the opening of the dict literal
    print(f"""# Python Dictionary to translate Countries to Two-Letter codes and vice versa.
#
# https://gist.github.com/rogerallen/1583606
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# {iso3166_url}
#
# Automatically Generated {now_str} via Jupyter Notebook from
# {this_code_gist_url} 

country_to_abbrev = {{""")
    # body: one dict entry per row of the country table
    for _, row in country_df.iterrows():
        name = row["Country name"]
        code = row["Code"]
        print(f'    "{name}": "{code}",')
    # footer: close the literal and emit the inversion statement
    print("""}
    
# invert the dictionary
abbrev_to_country = dict(map(reversed, country_to_abbrev.items()))

""")
    
def print_us_gist():
    """Print Python source for the US state <-> two-letter code gist.

    Takes a timestamp, fetches the country table and then the 'US'
    subdivision table, and writes to stdout: a comment header, a
    ``us_state_to_abbrev`` dict literal with one entry per subdivision row
    ('Subdivision name (en)' -> the part of 'Code' after the first dash), and
    a footer that builds the inverted ``abbrev_to_us_state`` dict.
    """
    # Timestamp is taken before the downloads, so it marks when the run began.
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    country_df = get_iso3166_countries()
    country_code = 'US'
    us_df = get_iso3166_subdivisions(country_df,country_code)
    # Rebuild the subdivision page URL so it can be cited in the header.
    is_selected = country_df['Code'] == country_code
    relative_link = country_df[is_selected]['ISO 3166-2'].values[0]
    url = site + relative_link
    # header: provenance comments plus the opening of the dict literal
    print(f"""# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# Canonical URL: https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# {url}
#
# Automatically Generated {now_str} via Jupyter Notebook from
# {this_code_gist_url} 

us_state_to_abbrev = {{""")
    # body: one dict entry per row of the subdivision table
    for _, row in us_df.iterrows():
        state = row["Subdivision name (en)"]
        # codes look like 'US-AL'; keep the piece after the country prefix
        abbrev = row["Code"].split('-')[1]
        print(f'    "{state}": "{abbrev}",')
    # footer: close the literal and emit the inversion statement
    print("""}
    
# invert the dictionary
abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))

""")

In [3]:
# Emit the generated country gist source; a later cell pastes this output
# back in to check it.
print_country_gist()

# Python Dictionary to translate Countries to Two-Letter codes and vice versa.
#
# https://gist.github.com/rogerallen/1583606
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
#
# Automatically Generated 2021-09-11 18:04:35 via Jupyter Notebook from
# https://gist.github.com/rogerallen/d75440e8e5ea4762374dfd5c1ddf84e0 

country_to_abbrev = {
    "Andorra": "AD",
    "United Arab Emirates": "AE",
    "Afghanistan": "AF",
    "Antigua and Barbuda": "AG",
    "Anguilla": "AI",
    "Albania": "AL",
    "Armenia": "AM",
    "Angola": "AO",
    "Antarctica": "AQ",
    "Argentina": "AR",
    "American Samoa": "AS",
    "Austria": "AT",
    "Australia": "AU",
    "Aruba": "AW",
    "Åland Islands": "AX",
    "Azerbaijan": "AZ",
    "Bosnia and Herzegovina": "BA",
    "Barbados": "BB",
    "Ba

# paste the code generated above to check

In [9]:
# Python Dictionary to translate Countries to Two-Letter codes and vice versa.
#
# https://gist.github.com/rogerallen/1583606
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
#
# Automatically Generated 2021-09-11 18:04:35 via Jupyter Notebook from
# https://gist.github.com/rogerallen/d75440e8e5ea4762374dfd5c1ddf84e0 

country_to_abbrev = {
    "Andorra": "AD",
    "United Arab Emirates": "AE",
    "Afghanistan": "AF",
    "Antigua and Barbuda": "AG",
    "Anguilla": "AI",
    "Albania": "AL",
    "Armenia": "AM",
    "Angola": "AO",
    "Antarctica": "AQ",
    "Argentina": "AR",
    "American Samoa": "AS",
    "Austria": "AT",
    "Australia": "AU",
    "Aruba": "AW",
    "Åland Islands": "AX",
    "Azerbaijan": "AZ",
    "Bosnia and Herzegovina": "BA",
    "Barbados": "BB",
    "Bangladesh": "BD",
    "Belgium": "BE",
    "Burkina Faso": "BF",
    "Bulgaria": "BG",
    "Bahrain": "BH",
    "Burundi": "BI",
    "Benin": "BJ",
    "Saint Barthélemy": "BL",
    "Bermuda": "BM",
    "Brunei Darussalam": "BN",
    "Bolivia (Plurinational State of)": "BO",
    "Bonaire, Sint Eustatius and Saba": "BQ",
    "Brazil": "BR",
    "Bahamas": "BS",
    "Bhutan": "BT",
    "Bouvet Island": "BV",
    "Botswana": "BW",
    "Belarus": "BY",
    "Belize": "BZ",
    "Canada": "CA",
    "Cocos (Keeling) Islands": "CC",
    "Congo, Democratic Republic of the": "CD",
    "Central African Republic": "CF",
    "Congo": "CG",
    "Switzerland": "CH",
    "Côte d'Ivoire": "CI",
    "Cook Islands": "CK",
    "Chile": "CL",
    "Cameroon": "CM",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Cuba": "CU",
    "Cabo Verde": "CV",
    "Curaçao": "CW",
    "Christmas Island": "CX",
    "Cyprus": "CY",
    "Czechia": "CZ",
    "Germany": "DE",
    "Djibouti": "DJ",
    "Denmark": "DK",
    "Dominica": "DM",
    "Dominican Republic": "DO",
    "Algeria": "DZ",
    "Ecuador": "EC",
    "Estonia": "EE",
    "Egypt": "EG",
    "Western Sahara": "EH",
    "Eritrea": "ER",
    "Spain": "ES",
    "Ethiopia": "ET",
    "Finland": "FI",
    "Fiji": "FJ",
    "Falkland Islands (Malvinas)": "FK",
    "Micronesia (Federated States of)": "FM",
    "Faroe Islands": "FO",
    "France": "FR",
    "Gabon": "GA",
    "United Kingdom of Great Britain and Northern Ireland": "GB",
    "Grenada": "GD",
    "Georgia": "GE",
    "French Guiana": "GF",
    "Guernsey": "GG",
    "Ghana": "GH",
    "Gibraltar": "GI",
    "Greenland": "GL",
    "Gambia": "GM",
    "Guinea": "GN",
    "Guadeloupe": "GP",
    "Equatorial Guinea": "GQ",
    "Greece": "GR",
    "South Georgia and the South Sandwich Islands": "GS",
    "Guatemala": "GT",
    "Guam": "GU",
    "Guinea-Bissau": "GW",
    "Guyana": "GY",
    "Hong Kong": "HK",
    "Heard Island and McDonald Islands": "HM",
    "Honduras": "HN",
    "Croatia": "HR",
    "Haiti": "HT",
    "Hungary": "HU",
    "Indonesia": "ID",
    "Ireland": "IE",
    "Israel": "IL",
    "Isle of Man": "IM",
    "India": "IN",
    "British Indian Ocean Territory": "IO",
    "Iraq": "IQ",
    "Iran (Islamic Republic of)": "IR",
    "Iceland": "IS",
    "Italy": "IT",
    "Jersey": "JE",
    "Jamaica": "JM",
    "Jordan": "JO",
    "Japan": "JP",
    "Kenya": "KE",
    "Kyrgyzstan": "KG",
    "Cambodia": "KH",
    "Kiribati": "KI",
    "Comoros": "KM",
    "Saint Kitts and Nevis": "KN",
    "Korea (Democratic People's Republic of)": "KP",
    "Korea, Republic of": "KR",
    "Kuwait": "KW",
    "Cayman Islands": "KY",
    "Kazakhstan": "KZ",
    "Lao People's Democratic Republic": "LA",
    "Lebanon": "LB",
    "Saint Lucia": "LC",
    "Liechtenstein": "LI",
    "Sri Lanka": "LK",
    "Liberia": "LR",
    "Lesotho": "LS",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Latvia": "LV",
    "Libya": "LY",
    "Morocco": "MA",
    "Monaco": "MC",
    "Moldova, Republic of": "MD",
    "Montenegro": "ME",
    "Saint Martin (French part)": "MF",
    "Madagascar": "MG",
    "Marshall Islands": "MH",
    "North Macedonia": "MK",
    "Mali": "ML",
    "Myanmar": "MM",
    "Mongolia": "MN",
    "Macao": "MO",
    "Northern Mariana Islands": "MP",
    "Martinique": "MQ",
    "Mauritania": "MR",
    "Montserrat": "MS",
    "Malta": "MT",
    "Mauritius": "MU",
    "Maldives": "MV",
    "Malawi": "MW",
    "Mexico": "MX",
    "Malaysia": "MY",
    "Mozambique": "MZ",
    "Namibia": "NA",
    "New Caledonia": "NC",
    "Niger": "NE",
    "Norfolk Island": "NF",
    "Nigeria": "NG",
    "Nicaragua": "NI",
    "Netherlands": "NL",
    "Norway": "NO",
    "Nepal": "NP",
    "Nauru": "NR",
    "Niue": "NU",
    "New Zealand": "NZ",
    "Oman": "OM",
    "Panama": "PA",
    "Peru": "PE",
    "French Polynesia": "PF",
    "Papua New Guinea": "PG",
    "Philippines": "PH",
    "Pakistan": "PK",
    "Poland": "PL",
    "Saint Pierre and Miquelon": "PM",
    "Pitcairn": "PN",
    "Puerto Rico": "PR",
    "Palestine, State of": "PS",
    "Portugal": "PT",
    "Palau": "PW",
    "Paraguay": "PY",
    "Qatar": "QA",
    "Réunion": "RE",
    "Romania": "RO",
    "Serbia": "RS",
    "Russian Federation": "RU",
    "Rwanda": "RW",
    "Saudi Arabia": "SA",
    "Solomon Islands": "SB",
    "Seychelles": "SC",
    "Sudan": "SD",
    "Sweden": "SE",
    "Singapore": "SG",
    "Saint Helena, Ascension and Tristan da Cunha": "SH",
    "Slovenia": "SI",
    "Svalbard and Jan Mayen": "SJ",
    "Slovakia": "SK",
    "Sierra Leone": "SL",
    "San Marino": "SM",
    "Senegal": "SN",
    "Somalia": "SO",
    "Suriname": "SR",
    "South Sudan": "SS",
    "Sao Tome and Principe": "ST",
    "El Salvador": "SV",
    "Sint Maarten (Dutch part)": "SX",
    "Syrian Arab Republic": "SY",
    "Eswatini": "SZ",
    "Turks and Caicos Islands": "TC",
    "Chad": "TD",
    "French Southern Territories": "TF",
    "Togo": "TG",
    "Thailand": "TH",
    "Tajikistan": "TJ",
    "Tokelau": "TK",
    "Timor-Leste": "TL",
    "Turkmenistan": "TM",
    "Tunisia": "TN",
    "Tonga": "TO",
    "Turkey": "TR",
    "Trinidad and Tobago": "TT",
    "Tuvalu": "TV",
    "Taiwan, Province of China": "TW",
    "Tanzania, United Republic of": "TZ",
    "Ukraine": "UA",
    "Uganda": "UG",
    "United States Minor Outlying Islands": "UM",
    "United States of America": "US",
    "Uruguay": "UY",
    "Uzbekistan": "UZ",
    "Holy See": "VA",
    "Saint Vincent and the Grenadines": "VC",
    "Venezuela (Bolivarian Republic of)": "VE",
    "Virgin Islands (British)": "VG",
    "Virgin Islands (U.S.)": "VI",
    "Viet Nam": "VN",
    "Vanuatu": "VU",
    "Wallis and Futuna": "WF",
    "Samoa": "WS",
    "Yemen": "YE",
    "Mayotte": "YT",
    "South Africa": "ZA",
    "Zambia": "ZM",
    "Zimbabwe": "ZW",
}
    
# invert the dictionary: reversed() flips each (name, code) item into
# (code, name), and dict() rebuilds the mapping keyed by code
abbrev_to_country = dict(map(reversed, country_to_abbrev.items()))

In [10]:
# quick check: look one entry up in each direction (code -> name, name -> code)
abbrev_to_country["DK"], country_to_abbrev["Denmark"]

('Denmark', 'DK')

In [6]:
# Emit the generated US state gist source; a later cell pastes this output
# back in to check it.
print_us_gist()

# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# Canonical URL: https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-2:US
#
# Automatically Generated 2021-09-11 18:04:36 via Jupyter Notebook from
# https://gist.github.com/rogerallen/d75440e8e5ea4762374dfd5c1ddf84e0 

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Ma

# paste the code generated above to check

In [11]:
# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# Canonical URL: https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-2:US
#
# Automatically Generated 2021-09-11 18:04:36 via Jupyter Notebook from
# https://gist.github.com/rogerallen/d75440e8e5ea4762374dfd5c1ddf84e0 

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}
    
# invert the dictionary: reversed() flips each (name, code) item into
# (code, name), and dict() rebuilds the mapping keyed by code
abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))

In [12]:
# quick check: look one entry up in each direction (code -> name, name -> code)
abbrev_to_us_state['WI'], us_state_to_abbrev["Wisconsin"]

('Wisconsin', 'WI')