In [20]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd
import pycountry
import pypopulation
from geojson import FeatureCollection, dump
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid
from matplotlib.pyplot import figure

## Geocoding
Assigning countries coordinates

In [21]:
load_dotenv()
TOKEN=os.getenv("MAPBOX_TOKEN")
df = pd.read_csv("../output/processed/flows.csv") # ../output/processed/
origins = list(df.origin.unique())
destinations = list(df.destination.unique())

In [22]:
in_first = set(origins)
in_second = set(destinations)
in_second_but_not_in_first = in_second - in_first
countries = origins + list(in_second_but_not_in_first)

In [23]:
df = pd.DataFrame({"country":countries})

In [24]:
headers = {'Accept': 'application/json'}
lst = []
for i in countries:
    obj = {}
    obj["type"]="Feature"
    obj["properties"]={"name": pycountry.countries.get(alpha_2=i).name}
    pop = pypopulation.get_population_a2(i)
    url = f"https://api.mapbox.com/geocoding/v5/mapbox.places/{i}.json?&types=country&access_token={TOKEN}"
    r = requests.get(url)
    jason = r.json()
    obj["properties"]["code"] = i.lower()
    coords = jason["features"][0]["center"]
    obj["geometry"] = {"type":"Point", "coordinates": coords}
    lst.append(obj)
    df.loc[df["country"] == i, "lat"] = coords[1]
    df.loc[df["country"] == i, "lon"] = coords[0]
    df.loc[df["country"] == i, "name"] = obj["properties"]["name"]
    df.loc[df["country"] == i, "population"] = pop

In [25]:
with open('../output/processed/countries.geojson', 'w') as f:
    dump(FeatureCollection(lst), f)

In [26]:
df['population'] = df['population'].fillna(0).astype(int)

In [27]:
# filling one country manually
df.loc[df[df["name"]=="Saint Barthélemy"].index, 'population'] = 10457
df = df.astype({'population':'str'})

## Clustering


In [28]:
X = df[["lat","lon"]]

In [29]:
lst = []
s = len(X)
# 6 zoom levels
for i in range(5):
    s -= 23
    lst.append(s)
lst

[132, 109, 86, 63, 40]

In [30]:
for count,i in enumerate(lst):
    hierarchical_cluster = AgglomerativeClustering(n_clusters=i, metric='euclidean', linkage='ward')
    labels = hierarchical_cluster.fit_predict(X)
    clf = NearestCentroid()
    clf.fit(X, labels)
    df[f"label_{count+1}"] = labels
    df[f"lat_{count+1}"] = df.apply(lambda x: clf.centroids_[x[f"label_{count+1}"]][0], axis=1)
    df[f"lon_{count+1}"] = df.apply(lambda x: clf.centroids_[x[f"label_{count+1}"]][1], axis=1)


In [31]:
df.rename(columns={"lon": "lon_0", "lat": "lat_0"}, inplace=True)
df

Unnamed: 0,country,lat_0,lon_0,name,population,label_1,lat_1,lon_1,label_2,lat_2,lon_2,label_3,lat_3,lon_3,label_4,lat_4,lon_4,label_5,lat_5,lon_5
0,ad,42.548654,1.576766,Andorra,77142,97,42.548654,1.576766,97,42.548654,1.576766,7,45.186779,2.097777,15,45.186779,2.097777,13,42.817352,-2.105716
1,ae,1.612130,10.525771,United Arab Emirates,9770529,67,1.612130,10.525771,67,1.612130,10.525771,67,1.612130,10.525771,3,6.933350,8.912128,15,6.933350,8.912128
2,af,33.838806,66.026471,Afghanistan,38041754,93,33.838806,66.026471,93,33.838806,66.026471,14,31.012595,66.860854,30,31.012595,66.860854,4,28.074786,70.732796
3,al,28.163240,2.632388,Albania,2854191,9,28.163240,2.632388,20,28.163240,2.632388,41,28.163240,2.632388,41,28.163240,2.632388,1,27.790147,7.762685
4,am,39.381266,-97.922211,Armenia,2957731,15,39.381266,-97.922211,32,39.381266,-97.922211,32,39.381266,-97.922211,32,39.381266,-97.922211,32,39.381266,-97.922211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,as,-14.304405,-170.707833,American Samoa,55312,32,-14.304405,-170.707833,23,-14.071868,-171.235791,47,-14.071868,-171.235791,47,-14.071868,-171.235791,23,-14.071868,-171.235791
151,sd,13.831539,30.049948,Sudan,42813238,23,13.831539,30.049948,47,13.831539,30.049948,1,10.548463,30.052419,46,10.548463,30.052419,6,11.430667,36.176558
152,bl,53.539998,28.046788,Saint Barthélemy,10457,16,53.539998,28.046788,16,53.539998,28.046788,69,53.539998,28.046788,6,56.847104,26.141462,11,58.006786,24.932026
153,in,22.199166,78.476681,India,1366417754,21,22.199166,78.476681,21,22.199166,78.476681,44,22.199166,78.476681,44,22.199166,78.476681,4,28.074786,70.732796


In [32]:
df

Unnamed: 0,country,lat_0,lon_0,name,population,label_1,lat_1,lon_1,label_2,lat_2,lon_2,label_3,lat_3,lon_3,label_4,lat_4,lon_4,label_5,lat_5,lon_5
0,ad,42.548654,1.576766,Andorra,77142,97,42.548654,1.576766,97,42.548654,1.576766,7,45.186779,2.097777,15,45.186779,2.097777,13,42.817352,-2.105716
1,ae,1.612130,10.525771,United Arab Emirates,9770529,67,1.612130,10.525771,67,1.612130,10.525771,67,1.612130,10.525771,3,6.933350,8.912128,15,6.933350,8.912128
2,af,33.838806,66.026471,Afghanistan,38041754,93,33.838806,66.026471,93,33.838806,66.026471,14,31.012595,66.860854,30,31.012595,66.860854,4,28.074786,70.732796
3,al,28.163240,2.632388,Albania,2854191,9,28.163240,2.632388,20,28.163240,2.632388,41,28.163240,2.632388,41,28.163240,2.632388,1,27.790147,7.762685
4,am,39.381266,-97.922211,Armenia,2957731,15,39.381266,-97.922211,32,39.381266,-97.922211,32,39.381266,-97.922211,32,39.381266,-97.922211,32,39.381266,-97.922211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,as,-14.304405,-170.707833,American Samoa,55312,32,-14.304405,-170.707833,23,-14.071868,-171.235791,47,-14.071868,-171.235791,47,-14.071868,-171.235791,23,-14.071868,-171.235791
151,sd,13.831539,30.049948,Sudan,42813238,23,13.831539,30.049948,47,13.831539,30.049948,1,10.548463,30.052419,46,10.548463,30.052419,6,11.430667,36.176558
152,bl,53.539998,28.046788,Saint Barthélemy,10457,16,53.539998,28.046788,16,53.539998,28.046788,69,53.539998,28.046788,6,56.847104,26.141462,11,58.006786,24.932026
153,in,22.199166,78.476681,India,1366417754,21,22.199166,78.476681,21,22.199166,78.476681,44,22.199166,78.476681,44,22.199166,78.476681,4,28.074786,70.732796


In [33]:
df.to_csv("../output/processed/countries.csv", index=False)