In [348]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [349]:
data = pd.read_csv("./countries_economic_clusters.csv")
data = data.set_index("країна")
data

Unnamed: 0_level_0,ВВП_на_душу_населення_тис$*,рівень_безробіття*%,індекс_освіти,тривалість_життя,індекс_щастя
країна,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
США,68,4.5,0.92,79,7.0
Німеччина,60,3.5,0.93,81,7.2
Італія,55,4.0,0.94,82,7.5
Японія,50,2.8,0.91,84,6.9
Франція,45,7.0,0.9,82,6.8
Польща,18,5.5,0.85,77,6.0
Угорщина,16,6.0,0.83,76,5.8
Румунія,15,5.0,0.8,75,5.7
Індія,7,7.5,0.65,70,4.0
Нігерія,6,7.0,0.55,66,4.1


In [350]:
#chosen metric for distance - Euclidian distance

def euclidian_distance(x1: np.ndarray, x2: np.ndarray):
  return np.sqrt(np.sum([(x2_i - x1_i)**2 for x1_i, x2_i in zip(x1, x2)]))  

In [351]:
def get_cluster_len(cluster: tuple):
  counter = 0
  for element in cluster:
    if pd.isna(element):
      continue
    
    counter += 1
      
  return counter

In [352]:
def compute_distance_matrix(data, clusters):
  distance_matrix = pd.DataFrame(columns=pd.Index(clusters),
                                 index=pd.Index(clusters),
                                 data=np.zeros((len(clusters), len(clusters))))
  
  for row in distance_matrix.index:
    for column in distance_matrix.columns:
      if row == column:
        continue
      
      if get_cluster_len(row) > 1 and get_cluster_len(column) <= 1:
        distances = []
        for idx in range(get_cluster_len(row)):
          distances.append(
            euclidian_distance(np.array(data.loc[row[idx]]), 
                               np.array(data.loc[column[0]]))
          )
        distance_matrix.loc[row, column] = min(distances)
        continue
        
      if get_cluster_len(column) > 1 and get_cluster_len(row) <= 1:
        distances = []
        for idx in range(get_cluster_len(column)):
          distances.append(
            euclidian_distance(np.array(data.loc[row[0]]), 
                               np.array(data.loc[column[idx]]))
          )
        distance_matrix.loc[row, column] = min(distances)
        continue
      
      if get_cluster_len(column) > 1 and get_cluster_len(row) > 1:
        distances = []
        for row_idx in range(get_cluster_len(row)):
          for col_idx in range(get_cluster_len(column)):
            distances.append(
              euclidian_distance(np.array(data.loc[row[row_idx]]),
                                 np.array(data.loc[column[col_idx]]))
            )
        distance_matrix.loc[row, column] = min(distances)
        continue
      
      distance_matrix.loc[row, column] = euclidian_distance(
        np.array(data.loc[row[0]]), 
        np.array(data.loc[column[0]])
      )
  
  return distance_matrix

In [353]:
def find_min_distance(distance_matrix):
  min_distance = 10**10
  min_label_1 = None
  min_label_2 = None

  for row in distance_matrix.index:
    new_row = distance_matrix.loc[row].drop(row)
    minima = new_row.min()
    
    if minima > min_distance:
      continue
    
    min_distance = minima
    min_label_1 = row
    min_label_2 = new_row.idxmin()
  
  return (min_distance, [min_label_1, min_label_2])

In [354]:
def clean_labels(labels):
    cleaned_labels = []
    for label in labels:
        cleaned = tuple(x for x in label if not pd.isna(x))
        cleaned_labels.append(cleaned)
    return cleaned_labels

In [355]:
def delete_and_create_clusters(clusters, labels):
    cleaned_labels = clean_labels(labels)

    for label in cleaned_labels:
        if pd.isna(label):
            continue
        if label in clusters:
            clusters.remove(label)

    values = []
    for cluster in cleaned_labels:
        for element in cluster:
            if not pd.isna(element):
                values.append(element)

    clusters.append(tuple(values))
    return clusters


In [356]:
def clustering(data):
    clusters = [(index,) for index in data.index]
    clusters_history = []
    iter = 0

    while len(clusters) > 1:
        iter += 1
        print("=" * 60)
        print(f"ITERATION {iter}")
        print("=" * 60)
        
        print("\nCurrent Clusters:")
        for i, c in enumerate(clusters):
            print(f"  {i+1:>2}. {c}")

        # Compute distance matrix
        distance_matrix = compute_distance_matrix(data, clusters)
        
        print("\nDistance Matrix:")
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(distance_matrix.round(3))

        # Find minimum distance
        min_distance, labels = find_min_distance(distance_matrix)
        print(f"\nMinimum Distance: {min_distance:.4f}")
        print(f"Clusters to Merge: {labels}")

        # Update clusters
        clusters = delete_and_create_clusters(clusters, labels)
        clusters_history.append(clusters)

        print("\nUpdated Clusters:")
        for i, c in enumerate(clusters):
            print(f"  {i+1:>2}. {c}")
        print("\n")

    print("=" * 60)
    print("Final Clusters:")
    for i, c in enumerate(clusters):
        print(f"  {i+1:>2}. {c}")
    print("=" * 60)

    return clusters_history


In [357]:
history = clustering(data)

ITERATION 1

Current Clusters:
   1. ('США',)
   2. ('Німеччина',)
   3. ('Італія',)
   4. ('Японія',)
   5. ('Франція',)
   6. ('Польща',)
   7. ('Угорщина',)
   8. ('Румунія',)
   9. ('Індія',)
  10. ('Нігерія',)
  11. ('Пакистан',)
  12. ('Ефіопія',)

Distance Matrix:
              США Німеччина  Італія  Японія Франція  Польща Угорщина Румунія  \
США         0.000     8.309  13.360  18.759  23.330  50.060   52.122  53.169   
Німеччина   8.309     0.000   5.132  10.468  15.441  42.255   44.376  45.448   
Італія     13.360     5.132   0.000   5.550  10.464  37.397   39.546  40.660   
Японія     18.759    10.468   5.550   0.000   6.830  32.880   35.092  36.226   
Франція    23.330    15.441  10.464   6.830   0.000  27.512   29.648  30.890   
Польща     50.060    42.255  37.397  32.880  27.512   0.000    2.300   3.653   
Угорщина   52.122    44.376  39.546  35.092  29.648   2.300    0.000   1.735   
Румунія    53.169    45.448  40.660  36.226  30.890   3.653    1.735   0.000   
Індія   

  distance_matrix.loc[row, column] = euclidian_distance(
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = euclidian_distance(
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = euclidian_distance(
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = euclidian_distance(
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = euclidian_distance(
  distance_matrix.loc[row, column] = min(distances)
  distance_matrix.loc[row, column] = mi


Distance Matrix:
                              США Німеччина  Італія  Японія Франція   Індія  \
                              NaN       NaN     NaN     NaN     NaN     NaN   
                              NaN       NaN     NaN     NaN     NaN     NaN   
США       NaN      NaN      0.000     8.309  13.360  18.759  23.330  61.807   
Німеччина NaN      NaN      8.309     0.000   5.132  10.468  15.441  54.372   
Італія    NaN      NaN     13.360     5.132   0.000   5.550  10.464  49.725   
Японія    NaN      NaN     18.759    10.468   5.550   0.000   6.830  45.558   
Франція   NaN      NaN     23.330    15.441  10.464   6.830   0.000  39.952   
Індія     NaN      NaN     61.807    54.372  49.725  45.558  39.952   0.000   
Ефіопія   NaN      NaN     67.452    60.491  56.082  52.406  46.488   9.249   
Пакистан  Нігерія  NaN     63.465    56.241  51.747  47.808  42.242   4.156   
Румунія   Угорщина Польща  50.060    42.255  37.397  32.880  27.512   9.908   

                          Ефіопія

  distance_matrix.loc[row, column] = min(distances)
