Unsupervised learning project - World Bank dataset

Import libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, accuracy_score
import geopandas as gpd


Načíst data a přejmenovat sloupce

In [None]:
# Load the "Data" sheet of the workbook into a DataFrame.
df = pd.read_excel("CountryData.xlsx", sheet_name="Data")

# Replace the sheet headers with short identifiers. The assignment is
# positional, so the sheet must contain exactly these 21 columns in this order.
df.columns = [
    "Country",
    "CountryName",
    "ForeignInvestment",
    "ElectricityAccess",
    "RenewableEnergy",
    "CO2Emission",
    "Inflation",
    "MobileSubscriptions",
    "InternetUse",
    "Exports",
    "Imports",
    "GDP",
    "MortalityMale",
    "MortalityFemale",
    "BirthRate",
    "DeathRate",
    "MortalityInfant",
    "LifeExpectancy",  # fixed spelling (was "LifeEcpectancy")
    "FertilityRate",
    "PopulationGrowth",
    "UrbanPopulation",
]
df.head()

Úkol č. 1: Zkontrolovat chybějící hodnoty (NaN)

In [None]:
# Count the missing (NaN) entries in every column of the loaded table.
missing_per_column = df.isna().sum()
missing_per_column

Úkol č. 2: Doplnit chybějící hodnoty (např. průměrem daného sloupce)

In [None]:
# Feature matrix: every column after the first two (Country, CountryName).
# A later cell appends a "label" column to df, so drop it here if present;
# otherwise re-running this cell would feed the cluster labels back into
# the features.
X = df.iloc[:, 2:].drop(columns="label", errors="ignore")

# Impute each missing value with the mean of its own column.
X = X.fillna(X.mean())
X.head()

Úkol č. 3: Standardizovat data

In [None]:
# Standardise the features so that variables measured on large scales do not
# dominate the distance-based steps (k-means, PCA) that follow.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Preview the first rows only instead of dumping the whole array.
X_scaled[:5]

Úkol č. 4: Aplikovat shlukovací algoritmus na škálované proměnné, přiřadit label každé zemi, odpovídající shluku, ke kterému byla přiřazena.
Přiřadit label k původním datům pomocí
data['label'] = y_pred

In [None]:
# Clustering (k-means), 1-5 clusters
ks = range(1, 6)

# Inertia (SSE - sum of squared errors, the lower the better), one value per k
inertias = []
for k in ks:
    # Fixed seed: without it the centroid initialisation is random and the
    # elbow curve changes from run to run.
    model = KMeans(n_clusters=k, n_init="auto", random_state=0)
    model.fit(X_scaled)
    inertias.append(model.inertia_)

In [None]:
# Elbow plot: inertia for each tested k; the bend in the curve suggests a
# suitable number of clusters.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(ks, inertias, linestyle="-", marker="o")
ax.set(xlabel="Number of clusters, k", ylabel="Sum of squared distance")  # SSE
ax.grid(True)
# Optional: ax.set_xticks(ks) puts one tick on every tested k.
plt.show()

In [None]:
# Fit k-means with the chosen number of clusters (seeded, so the result is
# repeatable) and keep the cluster index assigned to every row.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X_scaled)
y_pred = kmeans.labels_

In [None]:
# Perform clustering with k-means and calculate silhouette score.
# Change n_clusters to compare different numbers of clusters. The seed is
# fixed so that the labels and the score are the same on every run.
kmeans = KMeans(n_clusters=3, n_init="auto", random_state=0)
y_pred = kmeans.fit_predict(X_scaled)

# Silhouette score of the clustering on the scaled features
score = silhouette_score(X_scaled, y_pred)

print(f"Silhouette Score: {score}")

In [None]:
# Function to calculate the Purity score
def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference classes.

    Each cluster is credited with its most frequent true class; purity is
    the fraction of samples covered by those majority classes.

    Labels are treated as opaque categories, so negative values (for example
    a ``-1`` noise label) and non-numeric labels are handled correctly.

    Parameters
    ----------
    y_true : array-like
        True class label of every sample.
    y_pred : array-like
        Cluster label assigned to every sample; same length as ``y_true``.

    Returns
    -------
    float
        Purity in the interval (0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size == 0:
        raise ValueError("purity_score requires at least one sample")
    if y_true.size != y_pred.size:
        raise ValueError("y_true and y_pred must have the same length")

    # Re-code both label sets as 0..n-1 so they can index the matrix directly.
    _, class_idx = np.unique(y_true, return_inverse=True)
    _, cluster_idx = np.unique(y_pred, return_inverse=True)
    class_idx = class_idx.ravel()
    cluster_idx = cluster_idx.ravel()

    # Contingency matrix: rows are true classes, columns are clusters.
    contingency_matrix = np.zeros(
        (class_idx.max() + 1, cluster_idx.max() + 1), dtype=np.int64
    )
    # np.add.at accumulates repeated (row, column) pairs; plain fancy-index
    # assignment would count each pair only once.
    np.add.at(contingency_matrix, (class_idx, cluster_idx), 1)

    # Largest class count in each cluster, summed and normalised.
    purity = contingency_matrix.max(axis=0).sum() / contingency_matrix.sum()
    return purity

# Example usage - MUST BE ADAPTED: replace the toy arrays with real labels.
# demo_y_true is the array of true labels
# demo_y_pred is the array of predicted cluster labels
# The demo uses its own names on purpose: reusing `y_pred` / `score` would
# overwrite the k-means labels and the silhouette score from the cells above.
demo_y_true = np.array([1, 0, 0, 1, 1, 1, 0, 0, 0, 1])
demo_y_pred = np.array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1])

demo_purity = purity_score(demo_y_true, demo_y_pred)
print(f"Purity Score: {demo_purity}")

In [None]:
# Set label to original data.
# Read the labels from the fitted model rather than from `y_pred`, so the
# column always matches the clustering even if `y_pred` has been reassigned.
df["label"] = kmeans.labels_

In [None]:
# Load world map.
# Older geopandas releases bundle the "naturalearth_lowres" dataset.
# geopandas 1.0 removed it and raises AttributeError here, so fall back to
# the Natural Earth 110m countries archive.
try:
    world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
except AttributeError:
    world = gpd.read_file(
        "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
    )
    # NOTE(review): the upstream file is expected to use upper-case attribute
    # names (e.g. ISO_A3); lower-casing keeps the "iso_a3" join key used
    # below. Confirm against the downloaded file.
    world = world.rename(columns=str.lower)

# Left-join our country table onto the map by ISO alpha-3 code; map rows
# without a match keep NaN in the joined columns.
world = world.merge(df, left_on="iso_a3", right_on="Country", how="left")

In [None]:
# Distinct cluster ids produced by the fitted k-means model.
# Taken from the model itself, so a reassigned `y_pred` cannot change them.
cluster_labels = np.unique(kmeans.labels_)

Úkol č. 5: Vizualizovat výsledky

In [None]:
# Distinct cluster ids actually present on the map. Deriving them here keeps
# the cell runnable on its own; unmatched countries (NaN label) are excluded.
cluster_labels = np.sort(world["label"].dropna().unique())

# Choosing colors: the three hand-picked ones while they suffice, otherwise a
# generated palette so that any number of clusters can be drawn.
colors = ["yellow", "lightgreen", "lightblue"]
if len(cluster_labels) > len(colors):
    colors = sns.color_palette("pastel", len(cluster_labels)).as_hex()

# Map clusters to colors
color_map = {cluster: colors[i] for i, cluster in enumerate(cluster_labels)}

# Assign colors to countries based on cluster labels, default to gray, if NaN
world["color"] = world["label"].map(color_map).fillna("lightgray")

# Plot the map with clusters
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
world.boundary.plot(ax=ax, linewidth=1)
world.plot(color=world["color"], ax=ax, legend=False)

# Create custom legend: one marker per cluster plus one for unmatched
# countries. The left join turns labels into floats, hence int() for display.
legend_handles = [
    Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        markerfacecolor=color_map[cluster],
        markersize=10,
        label=f"Shluk {int(cluster)}",
    )
    for cluster in cluster_labels
]
legend_handles.append(
    Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        markerfacecolor="lightgray",
        markersize=10,
        label="Nepřiřazeno",
    )
)
ax.legend(handles=legend_handles, title="Číslo shluku", loc="lower left")

# Add title and show the plot
ax.set_title("Segmentace zemí podle k-means algoritmu")
plt.show()


NameError: name 'cluster_labels' is not defined

Úkol č. 6: Interpretovat výsledky - k jaké segmentaci došlo? Očekávali bychom jiné rozdělení?

Úkol č. 7: Zkuste aplikovat jiný algoritmus. Jak se změnily výsledky?

In [None]:
# Apply new algorithm


Úkol č. 8: Aplikujte PCA na škálované proměnné. Kolik variability je vysvětleno prvními dvěma komponentami?

Principal Component Analysis (PCA)
PCA: PCA is a technique used for reducing the dimensionality of large datasets while preserving as much variance as possible. It transforms the original variables into a new set of uncorrelated variables called principal components.

Purpose: It's commonly used to simplify data, speed up machine learning algorithms, and visualize high-dimensional data.

Scree Plot
Scree Plot: This is a graphical representation used in PCA to display the variance explained by each principal component. It plots the eigenvalues (variance) of each principal component in descending order.

Purpose: It's helpful in determining the optimal number of principal components to retain. Typically, you look for an "elbow" point in the plot, where the explained variance starts to level off. This indicates the point where adding more components yields diminishing returns.

In [None]:
# PCA model: keep all components so the full variance spectrum is available.
model = PCA()
pca_features = model.fit_transform(X_scaled)
# Fitted scikit-learn attributes end with an underscore; the attribute
# without it does not exist and raises AttributeError.
explained_variance = model.explained_variance_ratio_

# Scree plot: share of total variance explained by each component.
component_numbers = np.arange(1, len(explained_variance) + 1)
plt.figure(figsize=(6, 4))
plt.plot(component_numbers, explained_variance, marker="o", linestyle="--")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")
plt.title("Scree Plot")

# Tick labels and gridlines must be set before plt.show(); afterwards they
# would land on a new, empty figure.
plt.xticks(ticks=component_numbers, rotation=45)
plt.grid(True, linestyle="--", linewidth=0.5)
plt.show()
# For a cumulative view, plot np.cumsum(explained_variance) against
# component_numbers instead.


Úkol č. 9: Vizualizujte scatterplot bodů PCA1 vs. PCA2. Vybarvěte body podle přiřazeného shluku.

In [None]:
# Extract the labels
labels = df["label"]

unique_labels = np.unique(labels)
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in matplotlib 3.9.
cmap = plt.get_cmap("tab10", len(unique_labels))
# colors = sns.color_palette("colorblind", len(unique_labels))

# Create scatterplot of the first two principal components, one colour per
# cluster. Rows of pca_features are in the same order as rows of df.
plt.figure(figsize=(8, 6))
for i, cluster in enumerate(unique_labels):
    in_cluster = (labels == cluster).to_numpy()
    plt.scatter(
        pca_features[in_cluster, 0],
        pca_features[in_cluster, 1],
        color=cmap(i),
        label=f"Cluster {cluster}",
        alpha=0.8,
    )
plt.xlabel("PCA1")
plt.ylabel("PCA2")
plt.title("PCA1 vs. PCA2 by cluster")
plt.legend(title="Cluster")
plt.grid(True, linestyle="--", linewidth=0.5)
plt.show()

Úkol č. 10: Se kterými z původních (škálovaných) proměnných je PCA1 silně korelována? (to samé provést i pro PCA2)

In [None]:
# Scores of the first principal component (first column of the PCA output)
pca1 = pca_features[:, 0]

# Correlation matrix of [pca1, feature_1, ..., feature_p]; row 0 holds the
# correlations of pca1, and dropping column 0 removes its self-correlation.
corr_matrix = np.corrcoef(pca1, X_scaled, rowvar=False)
correlations = corr_matrix[0, 1:]

# Pair every correlation with the name of the feature it belongs to
correlation_list = list(zip(correlations, X.columns))

# Highest (most positive) correlation first
sorted_correlations = sorted(correlation_list, key=lambda pair: pair[0], reverse=True)

# Report one line per feature
for corr, col in sorted_correlations:
    print(f"Correlation between PCA1 and {col}: {corr:.2f}")



In [None]:
# Scores of the second principal component (second column of the PCA output)
pca2 = pca_features[:, 1]

# Correlation matrix of [pca2, feature_1, ..., feature_p]; row 0 holds the
# correlations of pca2, and dropping column 0 removes its self-correlation.
corr_matrix = np.corrcoef(pca2, X_scaled, rowvar=False)
correlations = corr_matrix[0, 1:]

# Pair every correlation with the name of the feature it belongs to
correlation_list = list(zip(correlations, X.columns))

# Highest (most positive) correlation first
sorted_correlations = sorted(correlation_list, key=lambda pair: pair[0], reverse=True)

# Report one line per feature
for corr, col in sorted_correlations:
    print(f"Correlation between PCA2 and {col}: {corr:.2f}")


Úkol č. 11: Dokážeme interpretovat, jaká charakteristika se skrývá pod PCA1 a PCA2 ?