# AGNES (agglomerative nesting) hierarchical clustering with single linkage

In [1]:
import pandas as pd
import numpy as np
from package.normalize import normalize_data
from package.distance import calcul_distances

In [2]:
def display_dist_matrix(labels: list, matrix: np.ndarray):
  """Show a distance matrix as a styled, labelled table.

  Args:
    labels: Names used for both the rows and the columns of the table.
    matrix: Values to show; wrapped in a DataFrame indexed by ``labels``
      on both axes.

  Relies on a ``display`` callable being available in the namespace (it is
  not imported in this file; presumably the IPython/Jupyter built-in).
  """
  frame = pd.DataFrame(matrix, index=labels, columns=labels)

  # Styling rules: larger header cells, centred 12px body cells.
  header_rules = [{'selector': 'th', 'props': [('font-size', '14px')]}]
  cell_css = {'text-align': 'center', 'font-size': '12px'}

  # Apply the steps in the same order as a fluent chain would.
  styler = frame.style
  styler = styler.set_caption('Matrix')
  styler = styler.set_table_styles(header_rules)
  styler = styler.set_properties(**cell_css)
  styler = styler.set_table_attributes('border="1"')

  display(styler)

In [3]:
def np_min_ignore_diagonal(z: np.ndarray):
  """Locate the smallest off-diagonal entry of a square matrix.

  Args:
    z: Square 2-D array-like with at least two rows. It is not modified.
      Entries are assumed to be ordinary comparable numbers (no NaN).

  Returns:
    A ``(row_index, col_index)`` tuple of Python ints with
    ``row_index != col_index``. When several entries share the minimum,
    the first one in row-major order is returned.

  Raises:
    ValueError: If ``z`` is not a square 2-D matrix or is smaller than
      2x2, in which case no off-diagonal entry exists.
  """
  arr = np.asarray(z)
  if arr.ndim != 2 or arr.shape[0] != arr.shape[1]:
    raise ValueError(f'expected a square 2-D matrix, got shape {arr.shape}')

  n = arr.shape[0]
  if n < 2:
    raise ValueError('matrix must be at least 2x2 to have an off-diagonal entry')

  # Flat (row-major) positions of every off-diagonal cell. Searching only
  # these keeps the diagonal out of the comparison without copying or
  # overwriting the matrix, and still works when every candidate is inf.
  off_diagonal = np.flatnonzero(~np.eye(n, dtype=bool))
  best = off_diagonal[np.argmin(arr.ravel()[off_diagonal])]

  # Convert the flat position back to 2-D coordinates as plain ints.
  row_index, col_index = divmod(int(best), n)
  return row_index, col_index

In [4]:
def agnes(df: pd.DataFrame, num_clusters: int = 2, options: "dict | None" = None):
    """Run AGNES (bottom-up) clustering with single linkage and print each step.

    Starting from one cluster per row, the two closest clusters are merged
    repeatedly until ``num_clusters`` clusters remain (or a single cluster
    is left). The distance matrix is displayed before every merge and once
    more at the end, followed by the final cluster labels.

    Args:
        df: Input data. Must contain an ``id`` column, which provides the
            initial cluster labels; an optional ``class`` column is dropped
            before distances are computed.
            NOTE(review): ``normalize_data(df, None)`` appears to normalize
            ``df`` in place (the frame is printed right after the call
            without being reassigned), so the caller's DataFrame is
            presumably modified - confirm against ``package.normalize``.
        num_clusters: Number of clusters at which merging stops.
        options: Currently unused; kept so existing callers keep working.

    Returns:
        None. Results are printed / displayed.
    """
    # Normalize data
    normalize_data(df, None)
    print('Normalized data:')
    print(df, end='\n\n')

    labels = list(map(str, df['id'].values))
    if 'class' in df.columns:
        df = df.drop(columns=['id', 'class'])
    else:
        df = df.drop(columns=['id'])
    Z = calcul_distances(df, labels)

    while len(Z) > 1 and len(labels) > num_clusters:
        display_dist_matrix(labels, Z)

        # Closest pair of clusters; the merged cluster keeps row_index.
        row_index, col_index = np_min_ignore_diagonal(Z)

        # Single linkage: the distance from the merged cluster to any other
        # cluster j is the smaller of the two previous distances.
        for j in range(len(Z)):
            if j == col_index:
                Z[row_index, j] = float('inf')
            elif j != row_index:
                Z[row_index, j] = min(Z[row_index, j], Z[col_index, j])
                Z[j, row_index] = Z[row_index, j]

        # Drop the absorbed cluster's row and column.
        Z = np.delete(Z, col_index, 0)
        Z = np.delete(Z, col_index, 1)

        labels[row_index] = f'{labels[row_index]}-{labels[col_index]}'
        # Delete in place so the labels stay plain Python strings; going
        # through np.delete would turn them into NumPy string scalars,
        # which NumPy 2 prints as np.str_('...') in the final summary.
        del labels[col_index]

    print("Final distance matrix:")
    display_dist_matrix(labels, Z)
    print("\nFinal clusters:", labels)

## Main

In [5]:
# Load the exam dataset; agnes() reads its 'id' column to name the clusters.
data = pd.read_csv('data/exam-data.csv')
# NOTE(review): `labels` is not passed to agnes(), which builds its own
# label list from df['id']; it is unused in the code visible here.
labels = data['id'].values

# Merge clusters until 2 remain, displaying the distance matrix at each step.
agnes(data, 2)

Normalized data:
   id     note1     note2
0  e1  0.333333  0.222222
1  e2  0.833333  0.666667
2  e3  1.000000  1.000000
3  e4  0.000000  0.000000
4  e5  0.666667  0.555556



Unnamed: 0,e1,e2,e3,e4,e5
e1,0.0,0.472222,0.722222,0.277778,0.333333
e2,0.472222,0.0,0.25,0.75,0.138889
e3,0.722222,0.25,0.0,1.0,0.388889
e4,0.277778,0.75,1.0,0.0,0.611111
e5,0.333333,0.138889,0.388889,0.611111,0.0


Unnamed: 0,e1,e2-e5,e3,e4
e1,0.0,0.333333,0.722222,0.277778
e2-e5,0.333333,0.0,0.25,0.611111
e3,0.722222,0.25,0.0,1.0
e4,0.277778,0.611111,1.0,0.0


Unnamed: 0,e1,e2-e5-e3,e4
e1,0.0,0.333333,0.277778
e2-e5-e3,0.333333,0.0,0.611111
e4,0.277778,0.611111,0.0


Final distance matrix:


Unnamed: 0,e1-e4,e2-e5-e3
e1-e4,0.0,0.333333
e2-e5-e3,0.333333,0.0



Final clusters: ['e1-e4', 'e2-e5-e3']
