In [1]:
# load python libraries
from dash import dcc, html, Dash, dash_table
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate
import plotly.express as px
import pandas as pd
import numpy as np
from dash_bootstrap_templates import load_figure_template
import dash_bootstrap_components as dbc
import country_converter as coco
import logging
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# read raw data
data = pd.read_csv("data/data_raw.csv")
# read reference table
ref_table = pd.read_csv("data/country_code_conversion.csv")


def _clean_numeric(column: pd.Series, dtype: str) -> pd.Series:
    """Strip double quotes and surrounding whitespace, then cast to `dtype`.

    Parameters
    ----------
    column : pd.Series
        Raw column as loaded from CSV (presumably holding quoted values,
        since quotes are stripped here -- confirm against the files).
    dtype : str
        Target pandas nullable dtype, e.g. "Int64" or "Float64".

    Returns
    -------
    pd.Series
        Parsed numeric values. Text that cannot be parsed as a number --
        including missing cells, which `astype(str)` renders as the text
        "nan" -- becomes <NA> instead of raising. The final cast can still
        fail if an integer dtype is requested for fractional values.
    """
    stripped = (
        column
        .astype(str)
        .str.replace('"', '', regex=False)
        .str.strip()
    )
    # go through to_numeric so "nan" / "4.0" style text does not break
    # the cast to a nullable integer dtype
    return pd.to_numeric(stripped, errors="coerce").astype(dtype)


# clean relevant numeric columns in `data` and `ref_table`
data["Area Code (M49)"] = _clean_numeric(data["Area Code (M49)"], "Int64")
ref_table["Numeric code"] = _clean_numeric(ref_table["Numeric code"], "Int64")
ref_table["Latitude (average)"] = _clean_numeric(ref_table["Latitude (average)"], "Float64")
ref_table["Longitude (average)"] = _clean_numeric(ref_table["Longitude (average)"], "Float64")

# build a one-row-per-code lookup before joining:
# - rows without a numeric code are dropped, because pandas matches null
#   join keys against each other
# - repeated codes keep their first row, so the left join below can never
#   multiply the rows of `data`
coord_lookup = (
    ref_table[["Numeric code", "Latitude (average)", "Longitude (average)"]]
    .dropna(subset=["Numeric code"])
    .drop_duplicates(subset="Numeric code")
)

# left join `data` and the lookup using each area's M49 code
# so that latitude and longitude information can be included in `data`
data = data.merge(
    coord_lookup,
    how="left",
    left_on="Area Code (M49)",
    right_on="Numeric code",
    validate="many_to_one",
)

# use coco library to tell which continent each area is located in
# (log level raised to ERROR to silence country_converter's lower-level
# messages -- presumably its warnings for unmatched codes)
logging.getLogger("country_converter").setLevel(logging.ERROR)
cc = coco.CountryConverter()

# convert each distinct M49 code once and map the result back onto the rows,
# instead of running one conversion per row
unique_codes = data["Area Code (M49)"].dropna().unique().tolist()
if unique_codes:
    continents = cc.convert(names=unique_codes, to="continent", src="UNnumeric")
    if len(unique_codes) == 1:
        # for a single input, convert() hands back the bare result, not a list
        continents = [continents]
else:
    continents = []
code_to_continent = dict(zip(unique_codes, continents))

data["Continent"] = data["Area Code (M49)"].map(code_to_continent)
data["Continent"] = data["Continent"].replace("not found", pd.NA)

# filter out those areas which cannot be matched with a continent
data = data[data["Continent"].notna()]
# select only relevant columns in `data`
data = data[['Area', 'Continent', 'Latitude (average)',
             'Longitude (average)', 'Year', 'Import', 'Export ',
             'Production', 'Consumption', 'Unit']]
# rename dirty column names (note the trailing space in the raw 'Export ' header)
data = data.rename(columns={
    'Latitude (average)': 'Latitude',
    'Longitude (average)': 'Longitude',
    'Export ': 'Export'
})

# display first 3 rows of `data`
data.head(3)

Unnamed: 0,Area,Continent,Latitude,Longitude,Year,Import,Export,Production,Consumption,Unit
0,Afghanistan,Asia,33.0,65.0,2014,283.85,21099.0,21500.0,684.85,t
1,Afghanistan,Asia,33.0,65.0,2015,1000.16,17340.0,18000.0,1660.16,t
2,Afghanistan,Asia,33.0,65.0,2016,814.88,8353.0,17333.33,9795.21,t


In [3]:
# preview the final 3 rows of the cleaned `data` frame
data.tail(n=3)

Unnamed: 0,Area,Continent,Latitude,Longitude,Year,Import,Export,Production,Consumption,Unit
47969,Zimbabwe,Africa,-20.0,30.0,2016,0.01,0.0,9.73,9.74,t
47970,Zimbabwe,Africa,-20.0,30.0,2019,0.02,0.0,9.88,9.9,t
47971,Zimbabwe,Africa,-20.0,30.0,2023,0.12,0.0,9.92,10.04,t
