In [None]:
import numpy as np
import pandas as pd
import math

In [None]:
def d2r(x):
    """Convert degrees to radians (plain multiplication, so NumPy arrays work too)."""
    return x * np.pi / 180

def getDistance(lat1, lon1, lat2, lon2, r=6371):
    """Calculate the great-circle distance between two (lat, lon) points using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    r : float, optional
        Sphere radius; the result is in the same unit. Defaults to 6371,
        the radius of the earth in km.

    Returns
    -------
    float
        Distance along the sphere's surface between the two points.
    """
    dLat = d2r(lat2 - lat1)
    dLon = d2r(lon2 - lon1)
    sin_half_dlat = math.sin(dLat / 2)
    sin_half_dlon = math.sin(dLon / 2)
    a = sin_half_dlat * sin_half_dlat + math.cos(d2r(lat1)) \
         * math.cos(d2r(lat2)) * sin_half_dlon * sin_half_dlon
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return r * c

In [None]:
# Read both input CSVs; the first row of each file holds the column names.
gold, cities = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('gold_reserves.csv', 'country-capitals.csv')
)

In [None]:
# Preview the first rows of the raw gold table.
gold.head()

In [None]:
# Keep only the identifying columns and the 2016 values. Take an explicit copy
# so later modifications act on an independent frame rather than a subset of `gold`.
gold_dropped = gold[['Country Name', 'Country Code', '2016']].copy()
gold_dropped.head()

In [None]:
# Number of non-null entries in each column.
gold_dropped.count()

In [None]:
# Index the table by country code. The guard makes the cell safe to re-run:
# once 'Country Code' is the index it is no longer a column, and calling
# set_index again would raise a KeyError.
if 'Country Code' in gold_dropped.columns:
    gold_dropped = gold_dropped.set_index('Country Code')
gold_dropped.head()

In [None]:
# Keep the 50 rows with the largest 2016 value. head() returns a slice, so copy
# it: this frame is assigned into with .loc further down, and writing into a
# slice can trigger pandas' SettingWithCopyWarning.
gold_dropped_sorted = gold_dropped.sort_values('2016', ascending=False).head(50).copy()
gold_dropped_sorted.head()

In [None]:
# Preview the first rows of the raw capitals table.
cities.head()

In [None]:
# The continent column is not needed for the rest of the notebook.
cities_dropped = cities.drop(columns=['ContinentName'])

In [None]:
# Preview the capitals table without the continent column.
cities_dropped.head()

In [None]:
# Join capitals to the top-50 gold rows on the country name (default inner join).
combined = cities_dropped.merge(
    gold_dropped_sorted,
    left_on='CountryName',
    right_on='Country Name',
)

In [None]:
# Preview the first ten merged rows.
combined.head(10)

In [None]:
# Check if any countries are missing: compare the non-null count per column
# with the 50 rows that were kept from the gold table.
combined.count()

In [None]:
# Order the merged rows by their 2016 value, smallest first.
combined = combined.sort_values('2016')

In [None]:
# Preview the ten merged rows with the smallest 2016 value.
combined.head(10)

In [None]:
# Show the full top-50 gold table for comparison with the merged result.
gold_dropped_sorted

In [None]:
# Names from the gold table that found no partner in the capitals table.
# Compare against the country-name column only: testing membership in
# `combined.values` would also match capital names, codes and any other cell.
is_unmatched = ~gold_dropped_sorted['Country Name'].isin(combined['CountryName'])
missing_countries = gold_dropped_sorted.loc[is_unmatched, 'Country Name'].tolist()
missing_countries # check which countries need to be fixed due to inconsistent naming

In [None]:
# Use the same spelling for Russia as the capitals table expects.
is_russian_federation = gold_dropped_sorted['Country Name'].eq('Russian Federation')
gold_dropped_sorted.loc[is_russian_federation, 'Country Name'] = 'Russia'

In [None]:
# Corrected spellings, listed in the same order as `missing_countries`.
missing_countries_corrected = ['Hong Kong', 'Russia', 'South Korea']
# The pairing below is purely positional. zip() would silently truncate (and
# pair the wrong names) if `missing_countries` had a different length, e.g.
# after re-running the detection cell once a name has already been fixed.
if len(missing_countries) != len(missing_countries_corrected):
    raise ValueError(
        'missing_countries has %d entries but %d corrections are listed; '
        'the positional pairing would be wrong'
        % (len(missing_countries), len(missing_countries_corrected))
    )
correction_dict = dict(zip(missing_countries, missing_countries_corrected))

In [None]:
# Apply the name corrections in one vectorised pass instead of mutating the
# frame while iterating over it; names without a correction are kept as they are.
gold_dropped_sorted['Country Name'] = gold_dropped_sorted['Country Name'].map(
    lambda name: correction_dict.get(name, name)
)

In [None]:
# re-merge the two tables after fixing the values (default inner join on the country name)
combined = cities_dropped.merge(
    gold_dropped_sorted,
    left_on='CountryName',
    right_on='Country Name',
)

In [None]:
# Show the full re-merged table.
combined

In [None]:
# manually enter the data for Hong Kong
is_hong_kong = combined['CountryName'] == 'Hong Kong'
combined.loc[is_hong_kong, 'CapitalName'] = 'Hong Kong'
combined.loc[is_hong_kong, 'CapitalLatitude'] = 22.3964
combined.loc[is_hong_kong, 'CapitalLongitude'] = 114.1095
# Display the patched row(s) by selection rather than by the hard-coded
# position 49, which only works if the merge produced exactly 50 rows with
# Hong Kong last.
combined.loc[is_hong_kong]

In [None]:
# Drop the duplicate name column and index by country code. The guard keeps the
# cell re-runnable: after the first run 'CountryCode' is the index and
# 'Country Name' is gone, so repeating the drop/set_index would raise a KeyError.
if 'CountryCode' in combined.columns:
    combined = (
        combined
        .drop(columns=['Country Name'])
        .set_index('CountryCode')
    )
# Largest 2016 value first.
combined = combined.sort_values('2016', ascending=False)
combined.head()


In [None]:
# Save the cleaned, merged table (including its index) to disk.
combined.to_csv('clean_capitals_gold.csv')

In [None]:
# make sure latitudes and longitudes are floats
combined.dtypes

In [None]:
# Header row for the distance matrix: a blank corner cell followed by every country name.
country_names = [' '] + combined['CountryName'].tolist()

In [None]:
# Capital coordinates as plain lists, in the same row order as `combined`.
latitudes = combined['CapitalLatitude'].tolist()
longitudes = combined['CapitalLongitude'].tolist()

In [None]:
# The matrix is a list of rows; the first row is the header of country names.
adjacency_matrix = [country_names] # initialize adjacency matrix, with header

In [None]:
# pair each latitude with its longitude as a (lat, lon) tuple
lat_lon_pairs = [(lat, lon) for lat, lon in zip(latitudes, longitudes)]
lat_lon_pairs

In [None]:
# calculate distances for each coordinates-pair, and add to the adjacency matrix
for index, pair1 in enumerate(lat_lon_pairs):
    temp = [country_names[index + 1]]
    for pair2 in lat_lon_pairs:
        temp.append(getDistance(pair1[0], pair1[1], pair2[0], pair2[1]))
    adjacency_matrix.append(temp)

In [None]:
# Show the finished matrix (header row first, then one labelled row per country).
adjacency_matrix

In [None]:
# write matrix to file, one row per line with the cells separated by ', '
with open('named_distances.csv', 'w') as out_file:
    out_file.writelines(
        ', '.join(map(str, matrix_row)) + '\n'
        for matrix_row in adjacency_matrix
    )