In [1]:
import os

import urllib.request
import urllib.parse
import urllib.error
import http
import json
import time
import ssl
import sys

import pandas as pd

import geopandas as gpd
import fiona

fiona.drvsupport.supported_drivers['KML'] = 'rw'
fiona.drvsupport.supported_drivers['LIBKML'] = 'rw'

In [2]:
# Create directories
current_wd = os.getcwd() #Path of current working directory
try:
    os.mkdir('{}/post_processed_data'.format(os.path.dirname(current_wd)))
except:
    pass

In [3]:
# Load datasets
df_2020 = pd.read_csv('../processed_data/df_2020.csv',
                      usecols=['COUNTRYEXPORT', 'COUNTRYORIGIN', 
                               'CUSTOMSVALUE', 'EXCHANGE_RATE',
                               'VAT_PAID'])
df_2019 = pd.read_csv('../processed_data/df_2019.csv',
                      usecols=['COUNTRYEXPORT', 'COUNTRYORIGIN',
                               'CUSTOMSVALUE', 'EXCHANGE_RATE',
                               'VAT_PAID'])
df_2018 = pd.read_csv('../processed_data/df_2018.csv',
                      usecols=['COUNTRYEXPORT', 'COUNTRYORIGIN',
                               'CUSTOMSVALUE', 'EXCHANGE_RATE',
                               'VAT_PAID'])
df_2017 = pd.read_csv('../processed_data/df_2017.csv',
                      usecols=['COUNTRYEXPORT', 'COUNTRYORIGIN',
                               'CUSTOMSVALUE', 'EXCHANGE_RATE',
                               'VAT_PAID'])
df_2016 = pd.read_csv('../processed_data/df_2016.csv',
                      usecols=['COUNTRYEXPORT', 'COUNTRYORIGIN',
                               'CUSTOMSVALUE', 'EXCHANGE_RATE',
                               'VAT_PAID'])

In [4]:
# Remove outlier
df_2019.sort_values(by='VAT_PAID', ascending=False).head(1)

Unnamed: 0,COUNTRYORIGIN,COUNTRYEXPORT,CUSTOMSVALUE,EXCHANGE_RATE,VAT_PAID
1904201,AU,AUSTRALIA,40193796.89,52.313,27344626419


In [5]:
df_2019 = df_2019[-df_2019['VAT_PAID']==538483406146]

In [6]:
# Append all datasets
df_all = pd.DataFrame()
df_all = (df_all.append(
    df_2020.append(
        df_2019.append(
            df_2018.append(
            df_2017.append(df_2016))))))

In [7]:
# Add CUSTOMSVALUE_PHP
df_all['CUSTOMSVALUEPHP'] = df_all['CUSTOMSVALUE'] * df_all['EXCHANGE_RATE']

In [8]:
(df_all.groupby('COUNTRYEXPORT').size()
                                .to_frame()
                                .reset_index()
                                .rename(columns={0:'count'})
                                .sort_values(by='count', ascending=False))

Unnamed: 0,COUNTRYEXPORT,count
41,CHINA,2528531
101,JAPAN,1750781
175,SINGAPORE,957342
205,UNITED STATES,911239
90,HONG KONG,497924
...,...,...
15,BAHAMAS,1
73,Former Czechoslovakia,1
146,NIUE,1
162,Pitcairn,1


In [9]:
# Check countries with 1 transaction count
check = (df_all.groupby('COUNTRYEXPORT').size()
                                .to_frame()
                                .reset_index()
                                .rename(columns={0:'count'})
                                .sort_values(by='count', ascending=False))
check[check['count']==1]

Unnamed: 0,COUNTRYEXPORT,count
63,ERITREA,1
57,Democratic yemen,1
55,DOMINICA,1
126,MARTINIQUE,1
86,GUINEA-BISSAU,1
36,CAPE VERDE,1
32,Br.ind.oc.tr,1
183,ST. HELENA,1
15,BAHAMAS,1
73,Former Czechoslovakia,1


In [10]:
# Investigate countries
df_all[df_all['COUNTRYEXPORT'] == 'Br.ind.oc.tr'] 

Unnamed: 0,COUNTRYORIGIN,COUNTRYEXPORT,CUSTOMSVALUE,EXCHANGE_RATE,VAT_PAID,CUSTOMSVALUEPHP
158499,GB,Br.ind.oc.tr,2275.1,51.55,15480.0,117281.405


In [11]:
# Investigate countries
df_all[df_all['COUNTRYEXPORT'] == 'Former Czechoslovakia'] 

Unnamed: 0,COUNTRYORIGIN,COUNTRYEXPORT,CUSTOMSVALUE,EXCHANGE_RATE,VAT_PAID,CUSTOMSVALUEPHP
1276757,Former Czechoslovakia,Former Czechoslovakia,914.87,47.855,7248.0,43781.10385


In [12]:
# Get all unique countries
df_all['COUNTRYEXPORT'].unique()

array(['CHINA', 'HONG KONG', 'INDONESIA', 'RUSSIAN FEDERATION',
       'VIET NAM', 'MALAYSIA', 'KOREA, REPUBLIC OF', 'QATAR', 'JAPAN',
       'UKRAINE', nan, 'BRAZIL', 'TAIWAN, PROVINCE OF CHINA', 'AUSTRALIA',
       'SRI LANKA', 'GERMANY, FEDERAL REPUBLIC OF', 'UNITED STATES',
       'INDIA', 'THAILAND', 'SWITZERLAND', 'ITALY', 'NETHERLANDS',
       'SINGAPORE', 'FINLAND', 'CAMBODIA', 'BANGLADESH', 'LATVIA',
       'NEW ZEALAND', 'BELGIUM', 'UNITED KINGDOM', 'CANADA',
       'UNITED ARAB EMIRATES', 'SWEDEN', 'PORTUGAL', 'AUSTRIA', 'POLAND',
       'FRANCE', 'TURKEY', 'DENMARK', 'MEXICO', 'ISRAEL', 'SPAIN',
       'SAUDI ARABIA', 'HUNGARY', 'PAKISTAN', 'ROMANIA',
       'MYANMAR (former BURMA)', 'ARGENTINA', 'IRELAND', 'URUGUAY',
       'PERU', 'MACAU', 'SOLOMON ISLANDS', 'SOUTH AFRICA', 'MALDIVES',
       'CHILE', 'GREECE', 'EGYPT', 'LUXEMBOURG', 'TAJIKISTAN', 'PANAMA',
       'MADAGASCAR', 'NORWAY', "KOREA, DEM. PEOPLE'S REP.", 'KENYA',
       'JORDAN', 'FIJI', 'COSTA RICA', 'GEORGIA

In [13]:
# Investigate countries
df_all.loc[df_all['COUNTRYEXPORT']=='YUGOSLAVIA (former Fed. of)', 
           ['COUNTRYEXPORT']]

Unnamed: 0,COUNTRYEXPORT
78806,YUGOSLAVIA (former Fed. of)
78807,YUGOSLAVIA (former Fed. of)
78808,YUGOSLAVIA (former Fed. of)
78809,YUGOSLAVIA (former Fed. of)
78810,YUGOSLAVIA (former Fed. of)
...,...
1269353,YUGOSLAVIA (former Fed. of)
1269354,YUGOSLAVIA (former Fed. of)
1287940,YUGOSLAVIA (former Fed. of)
1452072,YUGOSLAVIA (former Fed. of)


In [14]:
# Rename Country for Geocoding
df_all.loc[df_all['COUNTRYEXPORT']=='YUGOSLAVIA (former Fed. of)', 
           ['COUNTRYEXPORT']] = 'YUGOSLAVIA'
df_all.loc[df_all['COUNTRYEXPORT']=='YUGOSLAVIA (former Fed. of)', 
           ['COUNTRYEXPORT']]


Unnamed: 0,COUNTRYEXPORT


In [15]:
# Investigate countries
df_all.loc[df_all['COUNTRYEXPORT']=='Democratic yemen', ['COUNTRYEXPORT']] 

Unnamed: 0,COUNTRYEXPORT
37786,Democratic yemen


In [16]:
# Rename Country for Geocoding
df_all.loc[df_all['COUNTRYEXPORT']=='Democratic yemen', 
           ['COUNTRYEXPORT']] = 'YEMEN'
df_all.loc[df_all['COUNTRYEXPORT']=='Democratic yemen', ['COUNTRYEXPORT']] 

Unnamed: 0,COUNTRYEXPORT


In [17]:
# Investigate country
df_all.loc[df_all['COUNTRYEXPORT']=='GERMANY, FEDERAL REPUBLIC OF']

Unnamed: 0,COUNTRYORIGIN,COUNTRYEXPORT,CUSTOMSVALUE,EXCHANGE_RATE,VAT_PAID,CUSTOMSVALUEPHP
330,DE,"GERMANY, FEDERAL REPUBLIC OF",767383.17,50.603,0.0,3.883189e+07
331,DE,"GERMANY, FEDERAL REPUBLIC OF",14389.97,50.760,0.0,7.304349e+05
332,DE,"GERMANY, FEDERAL REPUBLIC OF",18229.08,50.760,0.0,9.253081e+05
333,DE,"GERMANY, FEDERAL REPUBLIC OF",51199.79,50.760,0.0,2.598901e+06
334,DE,"GERMANY, FEDERAL REPUBLIC OF",9246.97,50.760,0.0,4.693762e+05
...,...,...,...,...,...,...
1647244,"GERMANY, FEDERAL REPUBLIC OF","GERMANY, FEDERAL REPUBLIC OF",1606.00,46.844,0.0,7.523146e+04
1647308,"GERMANY, FEDERAL REPUBLIC OF","GERMANY, FEDERAL REPUBLIC OF",1402.00,46.844,0.0,6.567529e+04
1647423,"GERMANY, FEDERAL REPUBLIC OF","GERMANY, FEDERAL REPUBLIC OF",20.15,47.394,0.0,9.549891e+02
1647442,"GERMANY, FEDERAL REPUBLIC OF","GERMANY, FEDERAL REPUBLIC OF",1400.00,47.193,0.0,6.607020e+04


In [18]:
# Rename Country for Geocoding
df_all.loc[df_all['COUNTRYEXPORT']=='GERMANY, FEDERAL REPUBLIC OF', 
           ['COUNTRYEXPORT']] = 'GERMANY'
df_all.loc[df_all['COUNTRYEXPORT']=='GERMANY', ['COUNTRYEXPORT']] 

Unnamed: 0,COUNTRYEXPORT
330,GERMANY
331,GERMANY
332,GERMANY
333,GERMANY
334,GERMANY
...,...
1647244,GERMANY
1647308,GERMANY
1647423,GERMANY
1647442,GERMANY


In [19]:
# Investigate Country
df_all.loc[df_all['COUNTRYEXPORT']=='MYANMAR (former BURMA)']

Unnamed: 0,COUNTRYORIGIN,COUNTRYEXPORT,CUSTOMSVALUE,EXCHANGE_RATE,VAT_PAID,CUSTOMSVALUEPHP
2191,MM,MYANMAR (former BURMA),38400.00,50.937,0.0,1.955981e+06
3052,MM,MYANMAR (former BURMA),106250.00,50.937,0.0,5.412056e+06
5003,MM,MYANMAR (former BURMA),149760.00,50.937,0.0,7.628325e+06
5080,MM,MYANMAR (former BURMA),9477.22,50.937,66032.0,4.827412e+05
8173,MM,MYANMAR (former BURMA),400760.91,1.000,50177.0,4.007609e+05
...,...,...,...,...,...,...
1550310,MYANMAR (former BURMA),MYANMAR (former BURMA),147941.31,46.833,0.0,6.928535e+06
1573783,MYANMAR (former BURMA),MYANMAR (former BURMA),213435.62,47.007,0.0,1.003297e+07
1626377,MYANMAR (former BURMA),MYANMAR (former BURMA),25280.54,46.844,0.0,1.184242e+06
1628843,MYANMAR (former BURMA),MYANMAR (former BURMA),43572.72,47.394,0.0,2.065085e+06


In [20]:
# Rename Country for Geocoding
df_all.loc[df_all['COUNTRYEXPORT']=='MYANMAR (former BURMA)', 
           ['COUNTRYEXPORT']] = 'MYANMAR'
df_all.loc[df_all['COUNTRYEXPORT']=='MYANMAR (former BURMA)', ['COUNTRYEXPORT']] 

Unnamed: 0,COUNTRYEXPORT


In [21]:
# Add continents for the following countries for Geocoding
df_all.loc[df_all['COUNTRYEXPORT']=='JORDAN', 
           ['COUNTRYEXPORT']] = 'JORDAN, ASIA'
df_all.loc[df_all['COUNTRYEXPORT']=='GEORGIA', 
           ['COUNTRYEXPORT']] = 'GEORGIA, ASIA'
df_all.loc[df_all['COUNTRYEXPORT']=='TOGO', 
           ['COUNTRYEXPORT']] = 'TOGO, AFRICA'

In [22]:
# Assign 1 as 1 transaction per row
df_all['transactions'] = 1

In [23]:
# Get total transaction and VAT paid per country and save to the path
(df_all.groupby('COUNTRYEXPORT')
       .sum()
       .reset_index()
       .rename(columns={0:'count'})
       .to_csv('../post_processed_data/count_per_country.csv',index=False))

In [24]:
# Remove countries that will be geocoded
countries_data = (df_all[(df_all['COUNTRYEXPORT'].notnull()) &
                         (df_all['COUNTRYEXPORT'] != 'Br.ind.oc.tr') &
                         (df_all['COUNTRYEXPORT'] != 'Neutral zone') &
                         (df_all['COUNTRYEXPORT'] != 'PHILIPPINES') &
                         (df_all['COUNTRYEXPORT'] != 'YUGOSLAVIA') &
                         (df_all['COUNTRYEXPORT'] != 
                          'Former Czechoslovakia') &
                         (df_all['COUNTRYEXPORT'] !=
                          'Former USSR (for reference)')]['COUNTRYEXPORT']
                  .to_list())
countries_data= list(set(countries_data))
countries_data.insert(0, 'PHILIPPINES')

### Geocoding

In [25]:
#!/usr/bin/env python
# coding: utf-8

# API Keys
api_key = False


if api_key is False:
    api_key = 42
    serviceurl = "http://py4e-data.dr-chuck.net/json?"
else :
    # api_key = 'AIzaSy___IDByT70'
    serviceurl = "https://maps.googleapis.com/maps/api/geocode/json?"

print('Service URL:',serviceurl)

# Additional detail for urllib
http.client.HTTPConnection.debuglevel = 1

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Data Load list (addresses)
locsize = len(countries_data)


print('Loading {} locations...'.format(locsize),'\n')

column_names = ['address', 'longhitude', 'latitude']
df = pd.DataFrame(columns = column_names)
count = 0
for address in countries_data:
    count+=1
    
    parms = dict()
    parms["address"] = address
    if api_key is not False: parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    print("Searching {}...({}%)".format(address,round((count/locsize*100),2)))
    uh = urllib.request.urlopen(url, context=ctx)
    data = uh.read().decode()
    
    try:
        js = json.loads(data)
    except:
        print(data)  # print in case unicode causes an error
        continue
    
    # If country cannot be searched
    if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') :
        print('==== Failure To Retrieve ====')
        print(data)
        break

    lat = js["results"][0]["geometry"]["location"]["lat"]
    lng = js["results"][0]["geometry"]["location"]["lng"]
    if lat == 0 or lng == 0 : continue

    df = df.append({'address':address,'longhitude':lng,'latitude':lat},ignore_index=True)
print('Done...')

df.columns = ['Origin','Origin_long', 'Origin_lat']
o,olong,olat = (df[df['Origin']=='PHILIPPINES']['Origin'].iloc[0],
                df[df['Origin']=='PHILIPPINES']['Origin_long'].iloc[0],
                df[df['Origin']=='PHILIPPINES']['Origin_lat'].iloc[0])
df['Destination'],df['Destination_long'],df['Destination_lat'] = o,olong,olat

print("Exporting to longlat.csv...")
df = df[df['Origin']!='Philippines']
df.to_csv('../post_processed_data/longlat.csv',index=False)
print("Done.")

Service URL: http://py4e-data.dr-chuck.net/json?
Loading 213 locations... 

Searching PHILIPPINES...(0.47%)
Searching COOK ISLANDS...(0.94%)
Searching BARBADOS...(1.41%)
Searching TAIWAN, PROVINCE OF CHINA...(1.88%)
Searching MARTINIQUE...(2.35%)
Searching VIET NAM...(2.82%)
Searching EGYPT...(3.29%)
Searching ISRAEL...(3.76%)
Searching TURKS AND CAICOS ISLANDS...(4.23%)
Searching KAZAKHSTAN...(4.69%)
Searching SOUTH AFRICA...(5.16%)
Searching UNITED STATES VIRGIN ISLANDS...(5.63%)
Searching PARAGUAY...(6.1%)
Searching ANGUILLA...(6.57%)
Searching KOREA, REPUBLIC OF...(7.04%)
Searching LUXEMBOURG...(7.51%)
Searching NEW CALEDONIA...(7.98%)
Searching KUWAIT...(8.45%)
Searching MALDIVES...(8.92%)
Searching DENMARK...(9.39%)
Searching ANTARCTICA...(9.86%)
Searching ARUBA...(10.33%)
Searching MOLDOVA...(10.8%)
Searching SAUDI ARABIA...(11.27%)
Searching MONTSERRAT...(11.74%)
Searching ANGOLA...(12.21%)
Searching FIJI...(12.68%)
Searching GERMANY...(13.15%)
Searching EQUATORIAL GUINEA...(13

In [26]:
# Create a kml file to create line string
df['kml'] = ('<Placemark><name>' + df['Destination'] + ' - ' +
             df['Origin'] + '</name><description>'  + df.index.astype(str) +
             '</description><LineString><coordinates>' +
             df['Destination_long'].astype(str) + ',' + 
             df['Destination_lat'].astype(str) + ' ' + 
             df['Origin_long'].astype(str) + ',' + 
             df['Origin_lat'].astype(str) + 
             '</coordinates></LineString></Placemark>')

kml = ('''<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
<Document>\n''' +
'\n'.join(df['kml'].to_list())
+
'''
</Document>
</kml>'''
)

with open('../post_processed_data/geocode.kml', 'w') as f:
    f.writelines(kml)

In [27]:
# Load kml file and convert to a spatial file
kml_f = gpd.read_file("../post_processed_data/geocode.kml")
kml_f.to_file('../post_processed_data/network.shp', driver='ESRI Shapefile')