In [1]:
import pandas as pd
import numpy as np
import requests
import time
import pprint

#API key pulled from a config.py in format of `prog_search_key` =  "your_key_here" 
from config import prog_search_key

## Attempt using country_codes_combined.csv

In [2]:
# Load the combined country-code lookup table shipped in the data folder.
country_codes_csv = pd.read_csv(filepath_or_buffer='data/country_codes_combined.csv')

In [3]:
# Keep just the alpha-2 code and the German ('de') / English ('en') name columns,
# then work on an independent copy so the trimmed frame is not mutated later.
country_codes_csv = country_codes_csv.loc[:, ['alpha2', 'de', 'en']]
country_codes_df = country_codes_csv.copy()
country_codes_df.head()

Unnamed: 0,alpha2,de,en
0,af,Afghanistan,Afghanistan
1,al,Albanien,Albania
2,dz,Algerien,Algeria
3,ad,Andorra,Andorra
4,ao,Angola,Angola


In [4]:
# Load the World Risk Index data and inspect one known-problematic Region value.
world_risk_index_csv = pd.read_csv(filepath_or_buffer='data/world_risk_index.csv')
world_risk_index_csv.loc[1858, 'Region']

'Korea Republic of 4.59'

In [5]:
# Report the size of both frames before merging; each label names the frame it counts.
print(f'Rows in country_codes_df: {len(country_codes_df.index)}')
print(f'Rows in world_risk_index_csv: {len(world_risk_index_csv.index)}')

Rows in country_codes_df: 193
Rows in country_codes_df: 1917


In [6]:
# Region holds a mix of English and German names, so look each row up against
# both name columns, stack the two results, and keep only fully populated rows.
merged_df_en = pd.merge(
    world_risk_index_csv, country_codes_df,
    how='left', left_on='Region', right_on='en',
)
merged_df_de = pd.merge(
    world_risk_index_csv, country_codes_df,
    how='left', left_on='Region', right_on='de',
)
merged_all = pd.concat([merged_df_en, merged_df_de])
merged_dropped = merged_all.dropna().copy()

In [7]:
# Right-join back onto the full index so every original row reappears
# (rows without a code match come back with empty code columns), then de-duplicate.
merged_world = pd.merge(merged_dropped, world_risk_index_csv, how='right')
merged_final = merged_world.drop_duplicates(ignore_index=True).copy()

In [8]:
# Rows that still have no alpha-2 code after the first lookup attempt.
merged_final.loc[merged_final['alpha2'].isna()]

Unnamed: 0,Region,WRI,Exposure,Vulnerability,Susceptibility,Lack of Coping Capabilities,Lack of Adaptive Capacities,Year,Exposure Category,WRI Category,Vulnerability Category,Susceptibility Category,alpha2,de,en
67,Kongo,7.71,12.19,63.28,50.98,87.39,51.45,2011,Medium,Medium,High,Very High,,,
73,Swasiland,7.37,11.98,61.56,48.56,83.10,53.02,2011,Medium,Medium,High,High,,,
102,Mazedonien,5.86,14.28,41.03,19.28,64.74,39.05,2011,Medium,Medium,Low,Low,,,
107,Botsuana,5.56,11.52,48.26,30.25,68.14,46.40,2011,Low,Low,Medium,Medium,,,
122,Tschechische Republik,4.15,11.00,37.75,14.37,66.67,32.22,2011,Low,Low,Low,Very Low,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1871,Libyan Arab Jamahiriya,3.79,7.80,48.65,25.03,78.33,42.58,2016,Very Low,Low,Medium,Medium,,,
1872,United States,3.76,12.25,30.68,16.35,48.24,27.46,2016,Medium,Low,Very Low,Very Low,,,
1873,Russia,3.58,9.38,38.15,21.53,59.12,33.81,2016,Low,Low,Low,Medium,,,
1876,United Kingdom,3.54,11.60,30.54,17.29,45.95,28.37,2016,Medium,Low,Very Low,Very Low,,,


## Attempt at Pulling From Additional Source

In [9]:
# Pull the German-language ISO code table from oenb.at (first table on the page).
ger_url = "https://www.oenb.at/Statistik/Klassifikationen/ISO-Codes/ISO-Code-Verzeichnis-fuer-Laender--und-Waehrungscodes.html"
ger_codes = pd.read_html(ger_url)
ger_code_draft = ger_codes[0].copy()
# Only the German name and the two-letter code are kept; missing cells become "".
ger_code_df = ger_code_draft.loc[:, ['Land', 'ISO-Code (Land)']].fillna("").copy()
# Discard rows whose code cell is the '一一一' marker instead of a real code.
ger_code_df_clean = ger_code_df.loc[ger_code_df['ISO-Code (Land)'].ne('一一一')]
ger_code_df_clean

Unnamed: 0,Land,ISO-Code (Land)
0,Afghanistan,AF
1,Ägypten,EG
2,Aland,AX
3,Albanien,AL
4,Algerien,DZ
...,...,...
262,Westsahara Eigenst.Staat,EH
263,Westsahara Eigenst.Staat,EH
264,Westsahara Eigenst.Staat,EH
265,Zentralafrikanische Republik,CF


In [10]:
# Preview how many Region values match a German name in the scraped table.
pd.merge(
    world_risk_index_csv, ger_code_df_clean,
    how='left', left_on='Region', right_on='Land',
)

Unnamed: 0,Region,WRI,Exposure,Vulnerability,Susceptibility,Lack of Coping Capabilities,Lack of Adaptive Capacities,Year,Exposure Category,WRI Category,Vulnerability Category,Susceptibility Category,Land,ISO-Code (Land)
0,Vanuatu,32.00,56.33,56.81,37.14,79.34,53.96,2011,Very High,Very High,High,High,Vanuatu,VU
1,Tonga,29.08,56.04,51.90,28.94,81.80,44.97,2011,Very High,Very High,Medium,Medium,Tonga,TO
2,Philippinen,24.32,45.09,53.93,34.99,82.78,44.01,2011,Very High,Very High,High,High,Philippinen,PH
3,Salomonen,23.51,36.40,64.60,44.11,85.95,63.74,2011,Very High,Very High,Very High,High,Salomonen,SB
4,Guatemala,20.88,38.42,54.35,35.36,77.83,49.87,2011,Very High,Very High,High,High,Guatemala,GT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2057,Grenada,1.42,3.13,45.39,24.54,68.82,42.82,2016,Very Low,Very Low,Medium,Medium,Grenada,GD
2058,Barbados,1.32,3.46,38.26,18.20,50.29,46.29,2016,Very Low,Very Low,Low,Low,Barbados,BB
2059,Saudi Arabia,1.14,2.93,38.96,14.80,65.01,37.07,2016,Very Low,Very Low,Low,Very Low,,
2060,Malta,0.60,1.65,36.25,15.97,59.33,33.44,2016,Very Low,Very Low,Low,Very Low,Malta,MT


In [11]:
# English-language code table from cloford.com; the fourth table on the page is used.
url = "https://cloford.com/resources/codes/index.htm"

country_code_import = pd.read_html(url)
country_code_draft = country_code_import[3].copy()
country_code_df = country_code_draft.loc[
    :, ['Country', 'ISO (2)', 'Continent', 'Region', 'Capital']
]

In [12]:
# NOTE(review): this cell repeats the oenb.at download and clean-up that an earlier
# cell in this notebook already performs; it re-fetches the page on every run.
ger_url = "https://www.oenb.at/Statistik/Klassifikationen/ISO-Codes/ISO-Code-Verzeichnis-fuer-Laender--und-Waehrungscodes.html"
ger_codes = pd.read_html(ger_url)
ger_code_draft = ger_codes[0].copy()
ger_code_df = ger_code_draft[['Land', 'ISO-Code (Land)']].fillna("").copy()
# Keep every row except those whose code column holds the '一一一' marker.
keep_row = ger_code_df['ISO-Code (Land)'] != '一一一'
ger_code_df_clean = ger_code_df[keep_row]
ger_code_df_clean

Unnamed: 0,Land,ISO-Code (Land)
0,Afghanistan,AF
1,Ägypten,EG
2,Aland,AX
3,Albanien,AL
4,Algerien,DZ
...,...,...
262,Westsahara Eigenst.Staat,EH
263,Westsahara Eigenst.Staat,EH
264,Westsahara Eigenst.Staat,EH
265,Zentralafrikanische Republik,CF


In [13]:
# Join the German and English code tables on the two-letter ISO code.
merged_import_codes = pd.merge(
    ger_code_df_clean, country_code_df,
    left_on='ISO-Code (Land)', right_on='ISO (2)',
).copy()
# The lookup table's 'Region' column would collide with the risk index's
# 'Region' column in later merges, so it is renamed to 'Area'.
merged_import_codes_rename = merged_import_codes.rename(columns={'Region': 'Area'}).copy()

In [141]:
# Same two-pass lookup as before, now against the scraped tables:
# match Region on the German name and on the English name separately.
merged_df_import_de = pd.merge(
    world_risk_index_csv, merged_import_codes_rename,
    how='left', left_on='Region', right_on='Land',
)
merged_df_import_en = pd.merge(
    world_risk_index_csv, merged_import_codes_rename,
    how='left', left_on='Region', right_on='Country',
)
# Stack both passes, keep complete rows, then right-join onto the full index
# so unmatched rows are restored with empty code columns.
import_merged_all = pd.concat([merged_df_import_en, merged_df_import_de])
import_merged_dropped = import_merged_all.dropna().copy()
import_merged_world = pd.merge(import_merged_dropped, world_risk_index_csv, how='right')
import_merged_final = import_merged_world.drop_duplicates(ignore_index=True).copy()
import_merged_final

Unnamed: 0,Region,WRI,Exposure,Vulnerability,Susceptibility,Lack of Coping Capabilities,Lack of Adaptive Capacities,Year,Exposure Category,WRI Category,Vulnerability Category,Susceptibility Category,Land,ISO-Code (Land),Country,ISO (2),Continent,Area,Capital
0,Vanuatu,32.00,56.33,56.81,37.14,79.34,53.96,2011,Very High,Very High,High,High,Vanuatu,VU,Vanuatu,VU,Oceania,Pacific,Port-Vila
1,Tonga,29.08,56.04,51.90,28.94,81.80,44.97,2011,Very High,Very High,Medium,Medium,Tonga,TO,Tonga,TO,Oceania,Pacific,Nuku'alofa
2,Philippinen,24.32,45.09,53.93,34.99,82.78,44.01,2011,Very High,Very High,High,High,Philippinen,PH,Philippines,PH,Asia,South East Asia,Manila
3,Salomonen,23.51,36.40,64.60,44.11,85.95,63.74,2011,Very High,Very High,Very High,High,Salomonen,SB,Solomon Islands,SB,Oceania,Pacific,Honiara
4,Guatemala,20.88,38.42,54.35,35.36,77.83,49.87,2011,Very High,Very High,High,High,Guatemala,GT,Guatemala,GT,Americas,Central America,Guatemala
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,Grenada,1.42,3.13,45.39,24.54,68.82,42.82,2016,Very Low,Very Low,Medium,Medium,Grenada,GD,Grenada,GD,Americas,West Indies,Saint George's
1913,Barbados,1.32,3.46,38.26,18.20,50.29,46.29,2016,Very Low,Very Low,Low,Low,Barbados,BB,Barbados,BB,Americas,West Indies,Bridgetown
1914,Saudi Arabia,1.14,2.93,38.96,14.80,65.01,37.07,2016,Very Low,Very Low,Low,Very Low,Saudi-Arabien,SA,Saudi Arabia,SA,Asia,South West Asia,Riyadh
1915,Malta,0.60,1.65,36.25,15.97,59.33,33.44,2016,Very Low,Very Low,Low,Very Low,Malta,MT,Malta,MT,Europe,Southern Europe,Valletta


## Attempt to finish cleaning by using custom google search API

Finally, after trying two different sources for resolving the German names, I found that the country names in the original dataset were themselves inconsistent. The following is how I solved this problem using Google's Custom Search API. 

Fortunately, I was able to cut the number of inconsistently named entries down to 33 unique countries. This fits within the Custom Search API's limit of 100 free searches per day. 

In [15]:
# Combine the results of both lookup attempts and coalesce their code columns:
# 'iso_code' takes 'ISO (2)' when present, otherwise falls back to 'alpha2'.
import_csv_merge = import_merged_final.merge(merged_final, how='outer').copy()
iso_codes_df = import_csv_merge[['ISO (2)', 'alpha2']]
import_csv_merge['iso_code'] = iso_codes_df.bfill(axis=1).iloc[:, 0]

# Rows that still have no code: keep the first row per distinct Region, drop
# rows without a Region, and strip digit runs from the name.
# The pattern is a raw string with regex=True spelled out, because newer pandas
# treats the pattern as a literal string unless regex=True is passed.
null_codes = (
    import_csv_merge.loc[import_csv_merge['iso_code'].isna()]
    .drop_duplicates(subset='Region')
    .dropna(subset=['Region'])
    .assign(Region=lambda frame: frame['Region'].str.replace(r'\d+', '', regex=True))
    .reset_index()
)

  null_codes['Region'] = null_codes['Region'].str.replace('\d+', '')


In [200]:
# Plain Python list of the Region names that still need a code.
null_codes_list = null_codes['Region'].to_list()

In [99]:
# Manual override: per the author's notes, the search kept resolving this legacy
# name to Yugoslavia, so it is mapped by hand to the current country name.
manual_name_overrides = {'T. f. Yugo. Rep. of Macedonia': "North Macedonia"}
null_codes_list = [manual_name_overrides.get(country, country) for country in null_codes_list]

In [149]:
# Pieces of the Custom Search request URL; the search term is appended after "q=".
api_url = "https://customsearch.googleapis.com/customsearch/v1?"
cx = "d3772df2249924485"  # search engine ID
key = prog_search_key  # API key loaded from config.py
num = 1  # value sent as the `num` parameter
site_search = "https://en.wikipedia.org/wiki/ISO_3166-2:"
search_filter = "i"
# NOTE(review): site_search and search_filter are interpolated WITHOUT a parameter
# name (the URL contains "&https://...&i"), so they are presumably ignored by the
# API. If they are meant to be `siteSearch=` / `siteSearchFilter=`, confirm against
# the API reference before changing, since the recorded results used this URL as-is.
query_url = "".join([
    api_url,
    f"cx={cx}",
    f"&key={key}",
    f"&num={num}",
    f"&{site_search}",
    f"&{search_filter}",
    "&q=",
])

# API Call 
## Please do not try to run this cell. I have set it to read-only. 

I have also commented it out as it should ONLY be used by Jacob McManaman, or by someone who knows what they are doing (or who is aware that *thinking* they know what they are doing can easily have consequences) and has willingly set up their Google API key for use with Google's Custom Search API. Anyone who has done so must also have acknowledged that there are only 100 free searches per day. Thankfully, this call will only run 33 searches.

API aside, running this cell will reset the `request_list` list, which *can* be incredibly annoying. I believe I have taken steps to prevent any accidents, but in the event that I have not taken enough precautions: should someone go to the effort of changing the cell from read-only and run it frivolously, they will make the writer of this markdown doomingly sad.

In [72]:
# Query the search API once per unresolved name and keep each raw JSON response.
# WARNING: running this cell resets `request_list` and spends API quota.
request_list = []
for counter, country in enumerate(null_codes_list, start=1):
    # A timeout stops a stalled connection from hanging the notebook forever.
    query = requests.get(f"{query_url}{country} iso code", timeout=10).json()
    print(f"Search Request {counter} of {len(null_codes_list)} : {country}")
    request_list.append(query)
    time.sleep(.5)  # brief pause between consecutive requests

Search Request 1 of 33 : Swasiland
Search Request 2 of 33 : Mazedonien
Search Request 3 of 33 : Tschechische Republik
Search Request 4 of 33 : Südkorea
Search Request 5 of 33 : Vereinigte Arabisch Emirate
Search Request 6 of 33 : Vereinigte Staaten v. A.
Search Request 7 of 33 : Surinam
Search Request 8 of 33 : Moldawien
Search Request 9 of 33 : Vereinigte Staaten von Amerika
Search Request 10 of 33 : Weißrussland
Search Request 11 of 33 : Zentralafrik. Republik
Search Request 12 of 33 : Ver. Arabische Emirate
Search Request 13 of 33 : United Republic of Tanzania
Search Request 14 of 33 : North Macedonia
Search Request 15 of 33 : Republic of Moldova
Search Request 16 of 33 : Korea Republic of
Search Request 17 of 33 : Libyan Arab Jamahiriya
Search Request 18 of 33 : Demokratische Rep. Kongo
Search Request 19 of 33 : Föd. Staaten von Mikronesien
Search Request 20 of 33 : Ver. Staaten von Amerika
Search Request 21 of 33 : São Tomé and Príncipe
Search Request 22 of 33 : St. Vincent u. die

 ### Reasons for and Mechanics of the API Call:
Originally, I had hoped that there was consistency with the original Dataset. I was very wrong and the German country/region names deviated from convention. Thankfully I was able to clean 98% (1884/1917) of the German region names using two external sources. 

I found that I could just google the final 2% (33) of country names and Google would correct the search to produce a country code provided from `de.wikipedia.org/`. A useful tool Google provides is the ability to filter by website, through a **site:`www.example.com`** query, or by creating a [Programmable Search Engine](https://programmablesearchengine.google.com/about/). This programmable engine can then be utilized by [Google's Custom Search API](https://developers.google.com/custom-search/v1/overview). Since the API is limited to 100 free searches a day, this project is very fortunate that only 33 of the names needed this treatment. The overview of the API is as follows:

`https://www.googleapis.com/customsearch/v1/siterestrict?cx=   &key=   &q=`

Where `?cx=` is the engine ID that is referenced for the search, the `&key=` is the API key that is used to make the call, and `&q=` is the query. 

And so this API call utilizes a programmable engine set to specifically filter websites by `de.wikipedia.org/`. While other websites did appear in the results, a call needed to be made individually for each erroneous name, so the German Wikipedia was preferable: its results returned the information page for the single country/region, while others returned a table with every other country code. Organically, the search would look something like this:

![title](data/images/organic_search.png)

Thanks to Google, the cleaning of poorly entered data is done for us by these requests. It's just up to us to clean the resulting responses.

## JSON cleaning

Once the API call is done, the resulting JSON is sent to a list and that list is cleaned in this code. As of now it still needs some polishing I believe.

In [78]:
# Sanity check on the first response: the code is read as the third ':'-separated
# piece of the top hit's link.
test_test_url = request_list[0]['items'][0]['link']
split_list = test_test_url.split(sep=':')
split_list[2]

'KR'

In [201]:
# Sort the raw responses into hits (a code could be read from the top result's
# link) and misses (no usable result), remembering the search term for each.
bad_requests = []       # search terms that produced no usable code
good_requests = []      # search terms that produced a code
all_country_codes = []  # one entry per request: the code, or the search term on a miss
country_codes = []      # codes only, aligned with good_requests
for request, response in enumerate(request_list):
    try:
        test_test_url = response['items'][0]['link']
        split_list = test_test_url.split(':')
        code = split_list[2]
    except (KeyError, IndexError):
        # KeyError: the response has no 'items'/'link'.
        # IndexError: 'items' is empty or the link has fewer than two ':' separators.
        search_terms = response['queries']['request'][0]['searchTerms']
        print(f"Skipped request {request}: {search_terms}")
        bad_requests.append(search_terms)
        all_country_codes.append(search_terms)
    else:
        print(f"Country Code: {code}")
        all_country_codes.append(code)
        country_codes.append(code)
        good_requests.append(response['queries']['request'][0]['searchTerms'])

Country Code: SZ
Country Code: MK
Country Code: CZ
Country Code: KR
Country Code: AE
Skipped request 5: Vereinigte Staaten v. A. iso code
Country Code: SR
Country Code: MD
Skipped request 8: Vereinigte Staaten von Amerika iso code
Country Code: BY
Skipped request 10: Zentralafrik. Republik iso code
Skipped request 11: Ver. Arabische Emirate iso code
Country Code: TZ
Country Code: MK
Country Code: MD
Country Code: KR
Country Code: LY
Skipped request 17: Demokratische Rep. Kongo iso code
Skipped request 18: Föd. Staaten von Mikronesien iso code
Skipped request 19: Ver. Staaten von Amerika iso code
Country Code: ST
Skipped request 21: St. Vincent u. die Grenadinen iso code
Country Code: AT
Country Code: DE
Country Code: NO
Skipped request 25: St. Vincent und d. Grenadinen iso code
Skipped request 26: Föd. Staaten v. Mikronesien iso code
Skipped request 27: St. Vincent u. d. Grenadinen iso code
Country Code: RO
Country Code: MN
Country Code: MK
Country Code: LA
Country Code: KR


In [202]:
# Hand-written English replacements for the search terms that returned nothing.
# Each rule is (substring to look for, replacement name); the rules are applied
# one after another over the whole list, in this order.
replacement_rules = [
    ('Vereinigte Staaten', "United States"),
    ('Ver. Staaten von Amerika', "United States"),
    ('Zentralafrik', "Central African Republic"),
    ('Arabische Emirate', "United Arab Emirates"),
    ('Kongo', "Democratic Republic of the Congo"),
    ('Mikronesien', "Federated States of Micronesia"),
    ('St. Vincent', "Saint Vincent and the Grenadines"),
]

fixed_requests = list(bad_requests)
for needle, english_name in replacement_rules:
    fixed_requests = [
        english_name if needle in country else country for country in fixed_requests
    ]

fixed_requests

['United States',
 'United States',
 'Central African Republic',
 'United Arab Emirates',
 'Democratic Republic of the Congo',
 'Federated States of Micronesia',
 'United States',
 'Saint Vincent and the Grenadines',
 'Saint Vincent and the Grenadines',
 'Federated States of Micronesia',
 'Saint Vincent and the Grenadines']

In [153]:
# Second pass: re-run the search with the hand-corrected English names.
# WARNING: running this cell resets `fixed_request_list` and spends API quota.
fixed_request_list = []
for counter, country in enumerate(fixed_requests, start=1):
    # A timeout stops a stalled connection from hanging the notebook forever.
    query = requests.get(f"{query_url}{country} iso code", timeout=10).json()
    print(f"Search Request {counter} of {len(fixed_requests)} : {country}")
    fixed_request_list.append(query)
    time.sleep(.5)  # brief pause between consecutive requests

Search Request 1 of 11 : United States
Search Request 2 of 11 : United States
Search Request 3 of 11 : Central African Republic
Search Request 4 of 11 : United Arab Emirates
Search Request 5 of 11 : Democratic Republic of the Congo
Search Request 6 of 11 : Federated States of Micronesia
Search Request 7 of 11 : United States
Search Request 8 of 11 : Saint Vincent and the Grenadines
Search Request 9 of 11 : Saint Vincent and the Grenadines
Search Request 10 of 11 : Federated States of Micronesia
Search Request 11 of 11 : Saint Vincent and the Grenadines


In [203]:
# Read the code out of each second-pass response.
test_bad_requests = []       # search terms that still produced no usable code
test_all_country_codes = []  # successfully read codes only
test_country_codes = []      # one entry per request: the code, or the search term on a miss
for request, response in enumerate(fixed_request_list):
    try:
        fixed_test_url = response['items'][0]['link']
        split_list = fixed_test_url.split(':')
        code = split_list[2]
    except (KeyError, IndexError):
        # KeyError: the response has no 'items'/'link'.
        # IndexError: 'items' is empty or the link has fewer than two ':' separators.
        search_terms = response['queries']['request'][0]['searchTerms']
        print(f"Skipped request {request}: {search_terms}")
        test_bad_requests.append(search_terms)
        # The search term is stored as a placeholder so this list keeps exactly
        # one entry per request.
        test_country_codes.append(search_terms)
    else:
        print(f"Country Code: {code}")
        test_all_country_codes.append(code)
        test_country_codes.append(code)

Country Code: US
Country Code: US
Country Code: CF
Country Code: AE
Country Code: CD
Country Code: FM
Country Code: US
Country Code: VC
Country Code: VC
Country Code: FM
Country Code: VC


In [204]:
# Remove the ' iso code' text that was appended to every query, recovering the
# original Region names for the missed and the successful searches.
iso_bad_request = [term.replace(' iso code', '') for term in bad_requests]
iso_good_request = [term.replace(' iso code', '') for term in good_requests]

In [205]:
# Codes recovered on the second pass, paired with the ORIGINAL Region names.
fixed_bad_codes_df = pd.DataFrame({
    'alpha2': test_country_codes,
    'Regions': iso_bad_request
})

# Codes recovered on the first pass.
fixed_good_codes_df = pd.DataFrame({
    'alpha2': country_codes,
    'Regions': iso_good_request
})

# Stack both sets of rows. A right merge on the shared columns would keep only
# the first-pass rows and silently discard every second-pass code.
joined_codes = pd.concat([fixed_good_codes_df, fixed_bad_codes_df], ignore_index=True)
joined_codes

Unnamed: 0,alpha2,Regions
0,SZ,Swasiland
1,MK,Mazedonien
2,CZ,Tschechische Republik
3,KR,Südkorea
4,AE,Vereinigte Arabisch Emirate
5,SR,Surinam
6,MD,Moldawien
7,BY,Weißrussland
8,TZ,United Republic of Tanzania
9,MK,North Macedonia


In [211]:
# Attach the search-derived codes to the main table by Region name; rows that
# did not need a search get empty 'alpha2' / 'Regions' values.
complete_df = pd.merge(
    import_merged_final, joined_codes,
    how='left', left_on='Region', right_on='Regions',
)
complete_df

Unnamed: 0,Region,WRI,Exposure,Vulnerability,Susceptibility,Lack of Coping Capabilities,Lack of Adaptive Capacities,Year,Exposure Category,WRI Category,...,Susceptibility Category,Land,ISO-Code (Land),Country,ISO (2),Continent,Area,Capital,alpha2,Regions
0,Vanuatu,32.00,56.33,56.81,37.14,79.34,53.96,2011,Very High,Very High,...,High,Vanuatu,VU,Vanuatu,VU,Oceania,Pacific,Port-Vila,,
1,Tonga,29.08,56.04,51.90,28.94,81.80,44.97,2011,Very High,Very High,...,Medium,Tonga,TO,Tonga,TO,Oceania,Pacific,Nuku'alofa,,
2,Philippinen,24.32,45.09,53.93,34.99,82.78,44.01,2011,Very High,Very High,...,High,Philippinen,PH,Philippines,PH,Asia,South East Asia,Manila,,
3,Salomonen,23.51,36.40,64.60,44.11,85.95,63.74,2011,Very High,Very High,...,High,Salomonen,SB,Solomon Islands,SB,Oceania,Pacific,Honiara,,
4,Guatemala,20.88,38.42,54.35,35.36,77.83,49.87,2011,Very High,Very High,...,High,Guatemala,GT,Guatemala,GT,Americas,Central America,Guatemala,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,Grenada,1.42,3.13,45.39,24.54,68.82,42.82,2016,Very Low,Very Low,...,Medium,Grenada,GD,Grenada,GD,Americas,West Indies,Saint George's,,
1913,Barbados,1.32,3.46,38.26,18.20,50.29,46.29,2016,Very Low,Very Low,...,Low,Barbados,BB,Barbados,BB,Americas,West Indies,Bridgetown,,
1914,Saudi Arabia,1.14,2.93,38.96,14.80,65.01,37.07,2016,Very Low,Very Low,...,Very Low,Saudi-Arabien,SA,Saudi Arabia,SA,Asia,South West Asia,Riyadh,,
1915,Malta,0.60,1.65,36.25,15.97,59.33,33.44,2016,Very Low,Very Low,...,Very Low,Malta,MT,Malta,MT,Europe,Southern Europe,Valletta,,


In [None]:
# parsed_url = urlparse(filtered_url)
# frag = urldefrag(filtered_url).fragment
# frag_split = frag.split(':')
# filtered_codes = frag_split[3]
# filtered_codes

In [None]:
# searched_codes = []
# filtered_list = []
# searched_codes.append(query)
# searched_codes[0]['items'][0]['title']
# filtered_list.append(searched_codes[0]['items'][0]['link'])
# filtered_url = "https://www.iso.org/obp/ui/#iso:code:3166:MD"
# parsed_url = urlparse(filtered_url)
# frag = urldefrag(filtered_url).fragment
# frag_split = frag.split(':')
# filtered_codes = frag_split[3]
# filtered_codes