In [None]:
import pandas as pd
import numpy as np
import requests
import json
import time
from urllib.parse import urlparse, urldefrag, parse_qs
import pprint

#API key pulled from a config.py in format of `prog_search_key` =  "your_key_here" 
from config import prog_search_key

## Attempt using country_codes_combined.csv

In [None]:
# Only empty cells count as missing here. With pandas' default NA strings the
# ISO alpha-2 code "NA" (Namibia) would be parsed as NaN and silently lose its
# code in every later merge on the 'alpha2' column.
country_codes_csv = pd.read_csv(
    'data/country_codes_combined.csv',
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# Narrow the lookup table to the code column plus the two name columns that
# the region names are matched against later ('de' and 'en').
country_codes_csv = country_codes_csv.loc[:, ['alpha2', 'de', 'en']]
country_codes_df = country_codes_csv.copy()
country_codes_df.head()

In [None]:
world_risk_index_csv = pd.read_csv('data/world_risk_index.csv')
# Spot-check the region name stored under row label 1858.
world_risk_index_csv.loc[1858, 'Region']

In [None]:
# Row counts of both inputs. The second label previously repeated
# 'country_codes_df' although it reports the risk-index row count.
print(f'Rows in country_codes_df: {len(country_codes_df.index)}')
print(f'Rows in world_risk_index_csv: {len(world_risk_index_csv.index)}')

In [None]:
# Match every risk-index region against the English names, then against the
# German names, and stack both attempts.
merged_df_en = world_risk_index_csv.merge(country_codes_df, how='left', left_on='Region', right_on='en')
merged_df_de = world_risk_index_csv.merge(country_codes_df, how='left', left_on='Region', right_on='de')
merged_all = pd.concat([merged_df_en, merged_df_de])

# Keep only rows that actually found a code. A bare dropna() would also throw
# away matched rows that merely have a gap in some unrelated column (any
# risk-index column, or the other language's name), so restrict the check to
# the join key and the code itself.
merged_dropped = merged_all.dropna(subset=['Region', 'alpha2']).copy()

In [None]:
# Right-join onto the full risk index (on all shared columns) so that every
# original row is present again, then collapse exact duplicate rows.
merged_world = pd.merge(merged_dropped, world_risk_index_csv, how='right')
merged_final = merged_world.drop_duplicates(ignore_index=True).copy()

In [None]:
# Rows that still have no alpha-2 code after both name matches.
merged_final.loc[merged_final['alpha2'].isna()]

## Attempt at Pulling From Additional Source

In [None]:
# Scrape the ISO code directory published on oenb.at; the first table on the
# page is used.
ger_url = "https://www.oenb.at/Statistik/Klassifikationen/ISO-Codes/ISO-Code-Verzeichnis-fuer-Laender--und-Waehrungscodes.html"
ger_codes = pd.read_html(ger_url)
ger_code_draft = ger_codes[0].copy()

# Keep the name and code columns, with missing cells turned into empty strings.
ger_code_df = ger_code_draft[['Land', 'ISO-Code (Land)']].fillna("").copy()

# Discard rows whose code cell holds the '一一一' placeholder.
has_real_code = ger_code_df['ISO-Code (Land)'].ne('一一一')
ger_code_df_clean = ger_code_df.loc[has_real_code]
ger_code_df_clean

In [None]:
# Preview: left-join the risk index onto the scraped names ('Region' vs 'Land').
pd.merge(world_risk_index_csv, ger_code_df_clean, how='left', left_on='Region', right_on='Land')

In [None]:
url = "https://cloford.com/resources/codes/index.htm"

# read_html returns a list with every table it finds on the page; the table at
# index 3 is the one used here.
country_code_import = pd.read_html(url)
country_code_draft = country_code_import[3].copy()
country_code_df = country_code_draft.loc[:, ['Country', 'ISO (2)', 'Continent', 'Region', 'Capital']]

In [None]:
# Repeats the earlier oenb.at scrape so this section can be run on its own:
# fetch the page and take the first table found on it.
ger_url = "https://www.oenb.at/Statistik/Klassifikationen/ISO-Codes/ISO-Code-Verzeichnis-fuer-Laender--und-Waehrungscodes.html"
ger_codes = pd.read_html(ger_url)
ger_code_draft = ger_codes[0].copy()

# Keep the name and code columns, with missing cells turned into empty strings.
ger_code_df = ger_code_draft[['Land', 'ISO-Code (Land)']].fillna("").copy()

# Discard rows whose code cell holds the '一一一' placeholder.
has_real_code = ger_code_df['ISO-Code (Land)'].ne('一一一')
ger_code_df_clean = ger_code_df.loc[has_real_code]
ger_code_df_clean

In [None]:
# Inner-join the two scraped tables on their two-letter code columns.
merged_import_codes = pd.merge(
    ger_code_df_clean,
    country_code_df,
    left_on='ISO-Code (Land)',
    right_on='ISO (2)',
).copy()

# The lookup table has its own 'Region' column; rename it so it does not clash
# with the risk index's 'Region' column in the merges that follow.
merged_import_codes_rename = merged_import_codes.rename(columns={'Region': 'Area'}).copy()

In [None]:
# Match every risk-index region against the German ('Land') and the English
# ('Country') names of the scraped lookup table, and stack both attempts.
merged_df_import_de = world_risk_index_csv.merge(merged_import_codes_rename, how='left', left_on='Region', right_on='Land')
merged_df_import_en = world_risk_index_csv.merge(merged_import_codes_rename, how='left', left_on='Region', right_on='Country')
import_merged_all = pd.concat([merged_df_import_en, merged_df_import_de])

# Keep only rows that actually found a code. A bare dropna() would also drop
# matched rows with a gap in an unrelated column (e.g. a missing 'Capital' or a
# missing risk-index value), so restrict the check to the join key and the code.
import_merged_dropped = import_merged_all.dropna(subset=['Region', 'ISO (2)']).copy()

# Right-join onto the full risk index so every original row is present again,
# then collapse exact duplicate rows.
import_merged_world = import_merged_dropped.merge(world_risk_index_csv, how='right')
import_merged_final = import_merged_world.drop_duplicates(ignore_index=True).copy()

## Attempt to finish cleaning by using custom google search API

Finally, after trying two different sources for resolving the German names, I found that the country names in the original dataset were themselves inconsistent. The following is how I solved this problem using Google's Custom Search API.

Fortunately, I was able to cut the number of inconsistently named entries down to 33 unique countries. This fits within the Custom Search API's limit of 100 free searches per day.

In [None]:
# Combine both matching attempts and derive a single ISO code per row.
import_csv_merge = import_merged_final.merge(merged_final, how='outer').copy()
iso_codes_df = import_csv_merge[['ISO (2)', 'alpha2']]
# Row-wise back-fill: use 'ISO (2)' when present, otherwise fall back to 'alpha2'.
import_csv_merge['iso_code'] = iso_codes_df.bfill(axis=1).iloc[:, 0]

# Rows still without a code, reduced to the first row per distinct,
# non-missing region name.
null_codes = (
    import_csv_merge[import_csv_merge['iso_code'].isnull()]
    .drop_duplicates(subset='Region')
    .dropna(subset=['Region'])
    .copy()
)

# Strip digits from the region names. The pattern must be a raw string
# ('\d' is an invalid escape otherwise) and regex=True must be explicit:
# pandas 2.0+ treats the pattern as a literal string by default, which would
# leave the digits in place.
null_codes['Region'] = null_codes['Region'].str.replace(r'\d+', '', regex=True)
null_codes = null_codes.reset_index()

In [None]:
# Plain Python list of the region names that still need a code lookup.
null_codes_list = null_codes['Region'].to_list()
null_codes_list

In [None]:
# Pieces of the Custom Search request; the search term is appended after "q=".
api_url = "https://www.googleapis.com/customsearch/v1/siterestrict"
cx = "d3772df2249924485"   # search engine ID
key = prog_search_key      # API key imported from config.py
num = 1                    # value sent as the `num` query parameter

query_params = [f"cx={cx}", f"key={key}", f"num={num}", "q="]
query_url = api_url + "?" + "&".join(query_params)

In [None]:
# Show the request prefix for inspection with the API key masked, so the key
# is not written into the notebook's saved cell output.
query_url.replace(f"key={key}", "key=<redacted>")

# API Call 
## Please do not try to run this cell. I have set it to read-only. 

I have also commented it out as it should ONLY be used by Jacob McManaman, or by someone who knows what they are doing (or who is aware that *thinking* they know what they are doing can easily have consequences) and has willingly set up their Google API key for use with Google's Custom Search API. Anyone who has done so must also have acknowledged that there are only 100 free searches per day. Thankfully, this call will only run 33 searches.

API aside, running this cell will reset the `request_list` list, which *can* be incredibly annoying. I believe I have taken steps to prevent any accidents, but in the event that I have not taken enough precautions: should someone go to the effort of changing the cell from read-only and then run it frivolously, they will make the writer of this markdown doomingly sad.

In [None]:
# Disabled on purpose: every loop iteration sends one Custom Search API request.
# When enabled, this rebuilds `request_list` from scratch with one JSON response
# per entry of `null_codes_list`, sleeping 0.5 s between requests.
# In the visible part of the notebook this is the only place `request_list` is
# assigned, so the cells below that read it need it to exist already.
# NOTE(review): the query is built as f"{query_url}{country}iso code", i.e. the
# country name is followed directly by "iso code" with no separator and no
# URL-encoding. Confirm this is intended before re-enabling.
# counter = 0
# request_list= []
# for country in null_codes_list:
#     counter = counter + 1
#     query = requests.get(f"{query_url}{country}iso code").json()
#     print(f"Search Request {counter} of {len(null_codes_list)} : {country}")
#     request_list.append(query)
#     time.sleep(.5)

In [None]:
# Pretty-print the raw API responses for inspection.
printer = pprint.PrettyPrinter()
printer.pprint(request_list)

In [None]:
# Try the parsing on a single response: first hit of the first request.
first_hit = request_list[0]['items'][0]
test_test_url = first_hit['link']
# Split the whole link on ':' and look at the third piece.
split_list = test_test_url.split(':')
split_list[2]

In [None]:
# For every response, take the first hit's link, split it on ':' and keep the
# third piece.
request_code_list = [
    response['items'][0]['link'].split(':')[2]
    for response in request_list
]
request_code_list

### Reasons for and Mechanics of the API Call:
Originally, I had hoped that there was consistency with the original Dataset. I was very wrong and the German country/region names deviated from convention. Thankfully I was able to clean 98% (1884/1917) of the German region names using two external sources. 

I found that I could just google the final 2% (33) of the country names, and Google would correct the search to produce a country code provided by `de.wikipedia.org/`. A useful tool Google provides is the ability to filter by website, either through a **site:`www.example.com`** query or by creating a [Programmable Search Engine](https://programmablesearchengine.google.com/about/). This programmable engine can then be used through [Google's Custom Search API](https://developers.google.com/custom-search/v1/overview). Since the API is limited to 100 free searches a day, this project is very fortunate that only 33 of the entries needed this treatment. The overview of the API is as follows:

`https://www.googleapis.com/customsearch/v1/siterestrict?cx=   &key=   &q=`

Where `?cx=` is the engine ID that is referenced for the search, the `&key=` is the API key that is used to make the call, and `&q=` is the query. 

And so this API call uses a programmable engine set to specifically filter results to `de.wikipedia.org/`. While other websites did show up, a call needed to be made individually for each erroneous entry, so the German Wikipedia was preferable: its results returned the information page of the single country/region, while other sites returned a table with every other country code. Organically, the search would look something like this:

![title](data/images/organic_search.png)

Thanks to Google, any sort of cleaning of poorly inputted data is done for us by these requests. It's just up to us to clean the results those requests return.

## JSON cleaning

Once the API call is done, the resulting JSON is sent to a list and that list is cleaned in this code. As of now it still needs some polishing I believe.

In [None]:
# Pull the country code out of the first search hit of every API response.
searched_codes = []   # first-hit link of each response
filtered_list = []
filtered_codes = []   # extracted code of each response
fragment_list = []
parsed_list = []      # URL fragment (the part after '#') of each link

for response in request_list:
    link = response['items'][0]['link']
    fragment = urldefrag(link).fragment
    searched_codes.append(link)
    parsed_list.append(fragment)

    # The code is the last ':'-separated token, e.g. the fragment
    # "iso:code:3166:MD" of "https://www.iso.org/obp/ui/#iso:code:3166:MD"
    # yields "MD". Links without a fragment fall back to the last
    # ':'-separated token of the whole link.
    # NOTE(review): the fallback assumes such links end in ":<code>" - verify
    # against the actual search results.
    code_source = fragment if fragment else link
    filtered_codes.append(code_source.split(':')[-1])

# One row per searched region. Both frames carry a fresh 0..n-1 index, so the
# region names line up with the codes by position.
filtered_codes_df = pd.DataFrame(filtered_codes)
filtered_codes_df['Country'] = null_codes['Region']
filtered_codes_df

In [None]:
# Scratch prototype (disabled): single-URL version of the fragment parsing.
# It reads `filtered_url`, which is only assigned inside commented-out code in
# the visible part of this notebook.
# parsed_url = urlparse(filtered_url)
# frag = urldefrag(filtered_url).fragment
# frag_split = frag.split(':')
# filtered_codes = frag_split[3]
# filtered_codes

In [None]:
# Scratch prototype (disabled): walks through the extraction for one
# hard-coded example link. For "https://www.iso.org/obp/ui/#iso:code:3166:MD"
# the fragment is "iso:code:3166:MD", so splitting on ':' and taking index 3
# gives "MD".
# `query` here is the variable assigned inside the disabled API-call loop.
# searched_codes = []
# filtered_list = []
# searched_codes.append(query)
# searched_codes[0]['items'][0]['title']
# filtered_list.append(searched_codes[0]['items'][0]['link'])
# filtered_url = "https://www.iso.org/obp/ui/#iso:code:3166:MD"
# parsed_url = urlparse(filtered_url)
# frag = urldefrag(filtered_url).fragment
# frag_split = frag.split(':')
# filtered_codes = frag_split[3]
# filtered_codes