# Cluster Countries

## 1. Setup Environment

In [None]:
# Install the notebook's dependencies with the %conda magic so they land in
# the environment of the current kernel; -y skips the confirmation prompt.
# Restart the kernel after this runs so the imports below pick them up.
%conda install pandas numpy scikit-learn plotly pandas-datareader -y

# import modules
import pandas as pd
import numpy as np
from pandas_datareader import wb
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express as px
import warnings

<div class="alert alert-block alert-info">
<b>Reminder:</b> Restart the kernel after installing any modules.
</div>

## 2. Collect and Clean Data

In [None]:
# Map World Bank indicator codes to the readable column names used downstream
indicators = {
    'NY.GDP.PCAP.CD': 'GDP_per_capita',
    'SL.UEM.TOTL.ZS': 'Unemployment_rate',
    'FP.CPI.TOTL.ZG': 'Inflation_rate',
    'SE.ADT.LITR.ZS': 'Literacy_rate',  # Proxy for Education index
    'SH.XPD.CHEX.GD.ZS': 'Healthcare_expenditure'
}

# Fetch 2020 values for every entity from the World Bank API, together with
# the entity metadata needed to tell real countries from aggregates.
# FutureWarning is suppressed only while these calls run.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    data = wb.download(indicator=list(indicators),  # indicator codes only
                       country='all',
                       start=2020,
                       end=2020).reset_index()
    country_info = wb.get_countries()

# country='all' also returns regional and income-group aggregates (e.g. "World").
# They are not countries, so exclude them before imputing and clustering;
# otherwise they skew the column means and the cluster centroids.
aggregate_names = country_info.loc[country_info['region'] == 'Aggregates', 'name']

# Drop the constant year column and switch to readable column names
data_pivot = (
    data.loc[~data['country'].isin(aggregate_names)]
        .drop(columns=['year'])
        .rename(columns={**indicators, 'country': 'Country'})
        .reset_index(drop=True)
)

# Impute missing values: replace each NaN with the mean of its column.
# NOTE(review): some indicators may be sparsely reported for a single year,
# in which case mean imputation flattens that feature - confirm this is acceptable.
data_cleaned = data_pivot.copy()

numeric_columns = data_cleaned.select_dtypes(include=[np.number]).columns

data_cleaned[numeric_columns] = data_cleaned[numeric_columns].fillna(
    data_cleaned[numeric_columns].mean()
)

# Save to CSV for later use
data_cleaned.to_csv('economic_indicators.csv', index=False)

## 3. Cluster Countries using K-means

In [None]:
# Load the cleaned indicators written by the data-collection step
data = pd.read_csv('economic_indicators.csv')

# Columns used as clustering features
feature_columns = [
    'GDP_per_capita',
    'Unemployment_rate',
    'Inflation_rate',
    'Literacy_rate',
    'Healthcare_expenditure',
]
features = data[feature_columns]

# Standardize the features so each one has zero mean and unit variance;
# K-means is distance based, so unscaled GDP values would otherwise dominate.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply K-Means clustering.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the cluster assignment the same across versions
# and avoids the FutureWarning some releases emit when it is omitted.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Attach the cluster label (0-4) to each row for plotting
data['Cluster'] = clusters

## 4. Show Clusters on a World Map

In [None]:
# One distinct color per cluster; the list length drives everything below
cluster_colors = ['#ee4035', '#f37736', '#fdf498', '#7bc043', '#0392cf']  # Example colors
n_colors = len(cluster_colors)

# Build a step-wise ("discrete") colorscale: each color occupies an equal
# band [i/n, (i+1)/n] of the continuous scale, so there is no blending.
discrete_colorscale = [
    [edge / n_colors, color]
    for i, color in enumerate(cluster_colors)
    for edge in (i, i + 1)
]

# Indicator columns shown in the hover tooltip
hover_columns = [
    'GDP_per_capita',
    'Unemployment_rate',
    'Inflation_rate',
    'Literacy_rate',
    'Healthcare_expenditure',
]

fig = px.choropleth(
    data_frame=data,
    locations='Country',
    locationmode='country names',
    color='Cluster',
    hover_name='Country',
    hover_data=hover_columns,
    color_continuous_scale=discrete_colorscale,
    # Center each integer cluster id inside its own color band
    range_color=(-0.5, n_colors - 0.5),
    title='Economic Clusters of Countries'
)

fig.update_layout(
    width=1000,
    height=700,
    margin=dict(l=50, r=50, t=50, b=50),
    geo=dict(
        projection=dict(type='natural earth'),
        showframe=False,
        showcoastlines=False,
        fitbounds='locations'
    ),
    # Cluster ids are categorical: show one tick per id on the color bar
    coloraxis_colorbar=dict(
        tickmode='array',
        tickvals=list(range(n_colors))
    )
)

fig.show()