# Extracting measures of the clusterings

In [1]:
import pandas as pd
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [2]:
# Smallest and largest number of clusters (k) to score; both bounds are inclusive.
k_minimum, k_maximum = 2, 7

In [3]:
# Load the CSV; its first column is used as the row index.
df_without_collinearity_standardized = pd.read_csv(
    'data/output/df_without_collinearity_standardized.csv',
    index_col=0,
)

In [4]:
# Column labels of the frame loaded above.
features_without_collinearity = df_without_collinearity_standardized.columns

# Step 1: every column listed here is renamed with a 'sqrt_' prefix;
# all other column names pass through unchanged.
features_to_transform = [
    'percentage_estimated_households_in_informal_settlements',
    'demographic_density_in_informal_settlements',
    'percentage_hospitalizations_diseases_inadequate_sanitation',
    'percentage_indigenous_population',
]
features_without_collinearity_transformed = [
    name if name not in features_to_transform else 'sqrt_' + name
    for name in features_without_collinearity
]

# Step 2: same renaming on the result of step 1, this time with a 'log_'
# prefix. 'demographic_density' is matched by exact name, so the
# 'sqrt_demographic_density_in_informal_settlements' entry is not affected.
features_to_transform = ['demographic_density', 'per_capita_income']
features_without_collinearity_transformed = [
    name if name not in features_to_transform else 'log_' + name
    for name in features_without_collinearity_transformed
]

In [5]:
# Score every saved clustering with three internal validity measures
# (silhouette, Calinski-Harabasz, Davies-Bouldin) and write one
# '<prefix>_measures.csv' table per dataset / covariance-type combination.

# Base names of the dataset files under data/output/ to evaluate.
list_dataset_names = ['df_standardized', 'df_normalized_min_max', 'df_normalized_l2', 'df_normalized_l1', 'df_normalized_max']

# Loop-invariant, so defined once instead of on every iteration.
# NOTE(review): presumably the covariance options of the model that produced
# the '*_clustering.csv' files -- confirm against the clustering notebook.
list_covariance_types = ['full', 'tied', 'diag', 'spherical']

# Every candidate number of clusters. Derived from k_minimum / k_maximum so
# the saved table always has exactly one row per score that was computed
# (previously the rows were hard-coded as 2..7).
list_k = list(range(k_minimum, k_maximum + 1))

for dataset_name in list_dataset_names:
    print("\n *** DATASET NAME: ", dataset_name)

    # Only the 'pca' inputs are scored in this run; iterate over
    # ['features', 'pca'] to score the plain feature tables as well.
    for dataset_type in ['pca']:
        print(dataset_type)

        # Only the 'without_collinearity' feature set is scored in this run;
        # iterate over ['all', 'without_collinearity'] to score both.
        for set_features in ['without_collinearity']:
            print(set_features)

            # Select the data matrix the cluster labels are evaluated against.
            if dataset_type == 'features':
                df_features = pd.read_csv(f'data/output/{dataset_name}.csv', index_col=0)
                if set_features == 'all':
                    df = df_features
                elif dataset_name == 'df_standardized':
                    # The standardized table is indexed with the untransformed column names.
                    df = df_features[features_without_collinearity]
                else:
                    # The other tables are indexed with the sqrt_/log_ prefixed column names.
                    df = df_features[features_without_collinearity_transformed]
            elif set_features == 'all':
                df = pd.read_csv(f'data/output/{dataset_name}_pca.csv', index_col=0)
            else:
                df = pd.read_csv(f'data/output/{dataset_name}_without_collinearity_pca.csv', index_col=0)

            for covariance_type in list_covariance_types:
                print(covariance_type)

                # Shared stem of the labels file (input) and the measures file (output).
                output_prefix = f'data/output/{dataset_name}_{dataset_type}_{set_features}_{covariance_type}'

                # One column of cluster labels per k; columns are named by str(k).
                df_y = pd.read_csv(output_prefix + '_clustering.csv', index_col=0)

                dict_k_sc = {}
                dict_k_ch = {}
                dict_k_db = {}

                for k in list_k:
                    print(k)
                    labels = df_y[str(k)]
                    dict_k_sc[k] = silhouette_score(df, labels)
                    dict_k_ch[k] = calinski_harabasz_score(df, labels)
                    dict_k_db[k] = davies_bouldin_score(df, labels)

                # One row per k with columns k, sc, ch, db.
                df_measures = pd.DataFrame(data=list_k, columns=['k'])
                df_measures['sc'] = df_measures.k.map(dict_k_sc)
                df_measures['ch'] = df_measures.k.map(dict_k_ch)
                df_measures['db'] = df_measures.k.map(dict_k_db)

                df_measures.to_csv(output_prefix + '_measures.csv', index=True)


 *** DATASET NAME:  df_standardized
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_min_max
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_l2
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_l1
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_max
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7
