Unsupervised learning project - World Bank dataset

Import libraries

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import geopandas as gpd


Načíst data a přejmenovat sloupce

In [None]:
df = pd.read_excel("CountryData.xlsx", sheet_name="Data")
df.columns = ["Country", "CountryName", "ForeignInvestment", "ElectricityAccess", "RenewableEnergy", "CO2Emission", "Inflation" - DOPLNIT]
df.head()

Úkol č. 1: Zkontrolovat chybějící hodnoty (NaN)

In [None]:
df.isna().sum()

Úkol č. 2: Doplnit chybějící hodnoty (např. průměrem daného sloupce)

In [None]:
X = df.iloc[:, 2:]
X = X.fillna(X.mean())
X.head()

Úkol č. 3: Standardizovat data

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

Úkol č. 4: Aplikovat shlukovací algoritmus na škálované proměnné, přiřadit label každé zemi, odpovídající shluku, ke kterému byla přiřazena.

In [None]:
# Clustering (k-means)
ks = range(1, 6)
# Inertia je to samé jako SSE
inertias = []
for k in ks:
    model = KMeans(n_clusters=k, n_init="auto")
    model.fit(X_scaled)
    inertias.append(model.inertia_)

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(ks, inertias, "-o")
ax.set_xlabel("Number of clusters, k")
ax.set_ylabel("Inertia")
ax.grid(True)
# ax.set_ylabel("SSE")
# ax.set_xticks(ks)
plt.show()

In [None]:
# Perform clustering
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_scaled)
y_pred = kmeans.labels_

In [None]:
# Set label to original data
df["label"] = y_pred

In [None]:
# Load world map
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Merge data with the world map
world = world.merge(df, left_on="iso_a3", right_on="Country", how="left")

In [None]:
cluster_labels = np.unique(y_pred)

Úkol č. 5: Vizualizovat výsledky

In [8]:
colors = ["red", "green", "blue"]
# Map clusters to colors
color_map = {cluster: colors[i] for i, cluster in enumerate(cluster_labels)}

# Assign colors to countries based on cluster labels, default to gray, if NaN
world["color"] = world["label"].map(color_map).fillna("lightgray")

# Plot the map with clusters
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
world.boundary.plot(ax=ax, linewidth=1)
world.plot(color=world["color"], ax=ax, legend=False)

# Create custom legend
legend_handles = [Line2D([0], [0], marker="o", color="w", markerfacecolor=color_map[cluster], markersize=10, label=f"Shluk {cluster}")
    for cluster in cluster_labels if not pd.isna(cluster)]
legend_handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="lightgray", markersize=10, label="Nepřiřazeno" ))
ax.legend(handles=legend_handles, title="Číslo shluku", loc="lower left")

# Add title
plt.title("Segmentace zemí podle k-means algoritmu")
plt.show()


NameError: name 'cluster_labels' is not defined

Úkol č. 6: Interpretovat výsledky - k jaké segmentaci došlo? Očekávali bychom jiné rozdělení?

Úkol č. 7: Zkuste aplikovat jiný algoritmus. Jak se změnily výsledky?

Úkol č. 8: Aplikujte PCA na škálované proměnné. Kolik variability je vysvětleno prvními dvěma komponentami?

In [None]:
# PCA model
model = PCA()
pca_features = model.fit_transform(X_scaled)
explained_variance = model.explained_variance_ratio

# Scree plot
plt.figure(figsize=(6, 4))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker="o", linestyle="--")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")
plt.title("Scree Plot")
plt.show()
# plt.plot(np.cumsum(model.explained_variance_ratio_))
# plt.xlabel("Number of components")
# plt.ylabel("Cumulative explained variance")
# plt.title("Scree Plot")
# plt.show()

# Improve x-axis tick labels
plt.xticks(ticks=np.arange(1, len(explained_variance) + 1), rotation=45)

# Add gridlines
plt.grid(True, linestyle="--", linewidth=0.5)
plt.show()


Úkol č. 9: Vizualizujte scatteplot bodů PCA1 vs. PCA2. Vybarvěte body podle přiřazeného shluku.

In [None]:
# Extract the labels
labels = df["label"]

unique_labels = np.unique(labels)
cmap = plt.cm.get_cmap("tab10", len(unique_labels))
# colors = sns.color_palette("colorblind", len(unique_labels))

# Create scatterplot
plt.figure(figsize=(8, 6))

Úkol č. 10: Se kterými z původních (škálovaných) proměnných je PCA1 silně korelována? (to samé provést i pro PCA2)

In [None]:
# Extract PCA1
pca1 = pca_features[:, 0]

# Compute correlations
correlations = np.corrcoef(pca1, X_scaled, rowvar=False)[0, 1:]

# Create a list of tuples (correlation, column name)
correlation_list = [(correlations[i], col) for i, col in enumerate(X.columns)]

# Sort the list of tuples in descending order of correlation
sorted_correlations = sorted(correlation_list, key=lambda x: x[0], reverse=True)

# Print sorted correlations
for corr, col in sorted_correlations:
    print(f"Correlation between PCA1 and {col}: {corr:.2f}")



In [None]:
# Extract PCA2
pca2 = pca_features[:, 1]

# Compute correlations
correlations = np.corrcoef(pca2, X_scaled, rowvar=False)[0, 1:]

# Create a list of tuples (correlation, column name)
correlation_list = [(correlations[i], col) for i, col in enumerate(X.columns)]

# Sort the list of tuples in descending order of correlation
sorted_correlations = sorted(correlation_list, key=lambda x: x[0], reverse=True)

# Print sorted correlations
for corr, col in sorted_correlations:
    print(f"Correlation between PCA2 and {col}: {corr:.2f}")


Úkol č. 11: Dokážeme interpretovat, jaká charakteristika se skrývá pod PCA1 a PCA2 ?