In [34]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd
import pycountry
import pypopulation
from geojson import FeatureCollection, dump
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid
from matplotlib.pyplot import figure

## Geocoding
Assigning countries coordinates

In [35]:
load_dotenv()
TOKEN=os.getenv("MAPBOX_TOKEN")
df = pd.read_csv("../output/processed/clean.csv") # ../output/processed/
origins = list(df.origin.unique())
destinations = list(df.destination.unique())

In [36]:
in_first = set(origins)
in_second = set(destinations)
in_second_but_not_in_first = in_second - in_first
countries = origins + list(in_second_but_not_in_first)

In [37]:
df = pd.DataFrame({"country":countries})

In [38]:
headers = {'Accept': 'application/json'}
lst = []
for i in countries:
    obj = {}
    obj["type"]="Feature"
    obj["properties"]={"name": pycountry.countries.get(alpha_2=i).name}
    pop = pypopulation.get_population_a2(i)
    url = f"https://api.mapbox.com/geocoding/v5/mapbox.places/{i}.json?&types=country&access_token={TOKEN}"
    r = requests.get(url)
    jason = r.json()
    obj["properties"]["code"] = i.lower()
    coords = jason["features"][0]["center"]
    obj["geometry"] = {"type":"Point", "coordinates": coords}
    lst.append(obj)
    df.loc[df["country"] == i, "lat"] = coords[0]
    df.loc[df["country"] == i, "lon"] = coords[1]
    df.loc[df["country"] == i, "name"] = obj["properties"]["name"]
    df.loc[df["country"] == i, "population"] = pop

In [39]:
with open('../output/processed/countries.geojson', 'w') as f:
    dump(FeatureCollection(lst), f)

In [58]:
df['population'] = df['population'].fillna(0).astype(int)

In [59]:
# filling one country manually
df.loc[df[df["name"]=="Saint Barthélemy"].index, 'population'] = 10.457

## Clustering


In [60]:
X = df[["lat","lon"]]

KeyError: "None of [Index(['lat', 'lon'], dtype='object')] are in the [columns]"

In [44]:
lst = []
s = len(X)
# 6 zoom levels
for i in range(5):
    s -= 23
    lst.append(s)
lst

[132, 109, 86, 63, 40]

In [45]:
for count,i in enumerate(lst):
    hierarchical_cluster = AgglomerativeClustering(n_clusters=i, metric='euclidean', linkage='ward')
    labels = hierarchical_cluster.fit_predict(X)
    clf = NearestCentroid()
    clf.fit(X, labels)
    df["temp"] = labels
    df[f"lat_{count+1}"] = df.apply(lambda x: clf.centroids_[x["temp"]][0], axis=1)
    df[f"lon_{count+1}"] = df.apply(lambda x: clf.centroids_[x["temp"]][1], axis=1)


In [46]:
df.rename(columns={"lon": "lon_0", "lat": "lat_0"}, inplace=True)
df.drop(["temp"],inplace=True, axis=1)
df

Unnamed: 0,country,lat_0,lon_0,name,population,lat_1,lon_1,lat_2,lon_2,lat_3,lon_3,lat_4,lon_4,lat_5,lon_5
0,ad,1.576766,42.548654,Andorra,77142.0,1.576766,42.548654,1.576766,42.548654,2.097777,45.186779,2.097777,45.186779,-2.105716,42.817352
1,ae,10.525771,1.612130,United Arab Emirates,9770529.0,10.525771,1.612130,10.525771,1.612130,10.525771,1.612130,8.912128,6.933350,8.912128,6.933350
2,af,66.026471,33.838806,Afghanistan,38041754.0,66.026471,33.838806,66.026471,33.838806,66.860854,31.012595,66.860854,31.012595,70.732796,28.074786
3,al,2.632388,28.163240,Albania,2854191.0,2.632388,28.163240,2.632388,28.163240,2.632388,28.163240,2.632388,28.163240,7.762685,27.790147
4,am,-97.922211,39.381266,Armenia,2957731.0,-97.922211,39.381266,-97.922211,39.381266,-97.922211,39.381266,-97.922211,39.381266,-97.922211,39.381266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,tj,71.042004,38.838508,Tajikistan,9321018.0,71.042004,38.838508,71.042004,38.838508,72.798800,40.151781,72.798800,40.151781,67.090625,41.867274
151,ug,32.386218,1.279964,Uganda,44269594.0,32.386218,1.279964,32.386218,1.279964,35.122050,0.904913,35.122050,0.904913,31.137000,0.212303
152,hk,114.162550,22.279356,Hong Kong,7507400.0,114.162550,22.279356,114.162550,22.279356,114.162550,22.279356,117.572165,18.433072,111.125326,16.577226
153,sa,44.652426,23.384784,Saudi Arabia,34268528.0,44.652426,23.384784,44.652426,23.384784,44.652426,23.384784,46.137178,19.661310,50.846777,23.131173


In [61]:
df

Unnamed: 0,country,lat_0,lon_0,name,population,lat_1,lon_1,lat_2,lon_2,lat_3,lon_3,lat_4,lon_4,lat_5,lon_5
0,ad,1.576766,42.548654,Andorra,77142.0,1.576766,42.548654,1.576766,42.548654,2.097777,45.186779,2.097777,45.186779,-2.105716,42.817352
1,ae,10.525771,1.612130,United Arab Emirates,9770529.0,10.525771,1.612130,10.525771,1.612130,10.525771,1.612130,8.912128,6.933350,8.912128,6.933350
2,af,66.026471,33.838806,Afghanistan,38041754.0,66.026471,33.838806,66.026471,33.838806,66.860854,31.012595,66.860854,31.012595,70.732796,28.074786
3,al,2.632388,28.163240,Albania,2854191.0,2.632388,28.163240,2.632388,28.163240,2.632388,28.163240,2.632388,28.163240,7.762685,27.790147
4,am,-97.922211,39.381266,Armenia,2957731.0,-97.922211,39.381266,-97.922211,39.381266,-97.922211,39.381266,-97.922211,39.381266,-97.922211,39.381266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,tj,71.042004,38.838508,Tajikistan,9321018.0,71.042004,38.838508,71.042004,38.838508,72.798800,40.151781,72.798800,40.151781,67.090625,41.867274
151,ug,32.386218,1.279964,Uganda,44269594.0,32.386218,1.279964,32.386218,1.279964,35.122050,0.904913,35.122050,0.904913,31.137000,0.212303
152,hk,114.162550,22.279356,Hong Kong,7507400.0,114.162550,22.279356,114.162550,22.279356,114.162550,22.279356,117.572165,18.433072,111.125326,16.577226
153,sa,44.652426,23.384784,Saudi Arabia,34268528.0,44.652426,23.384784,44.652426,23.384784,44.652426,23.384784,46.137178,19.661310,50.846777,23.131173


In [63]:
df.to_csv("../output/processed/countries.csv", index=False)