# Running clustering on the sociodemographic dataset

In [1]:
import pandas as pd
from sklearn.mixture import GaussianMixture
import numpy as np

## Loading data

In [2]:
# Load the standardized dataset (the variant named "without collinearity"),
# using the first CSV column as the row index.
df_without_collinearity_standardized = pd.read_csv(
    'data/output/df_without_collinearity_standardized.csv',
    index_col=0,
)

In [3]:
# Column labels of the frame loaded above; used later to select feature columns.
features_without_collinearity = df_without_collinearity_standardized.columns

# Pass 1: labels listed here are renamed with a `sqrt_` prefix.
# NOTE(review): presumably the non-standardized datasets store these columns
# under their square-root-transformed names -- confirm against the files.
features_to_transform = [
    'percentage_estimated_households_in_informal_settlements',
    'demographic_density_in_informal_settlements',
    'percentage_hospitalizations_diseases_inadequate_sanitation',
    'percentage_indigenous_population',
]
features_without_collinearity_transformed = [
    column if column not in features_to_transform else 'sqrt_{}'.format(column)
    for column in features_without_collinearity
]

# Pass 2: labels listed here are renamed with a `log_` prefix. This runs on the
# output of pass 1, so labels already prefixed with `sqrt_` pass through as-is.
features_to_transform = ['demographic_density', 'per_capita_income']
features_without_collinearity_transformed = [
    column if column not in features_to_transform else 'log_{}'.format(column)
    for column in features_without_collinearity_transformed
]

## Parameters

In [4]:
# Number of initialisations passed to each GaussianMixture fit (`n_init`).
# Set to 1 for a quick smoke run.
total_executions = 200

# Inclusive range of component counts (k) to fit.
k_min, k_max = 2, 7

## Clustering

In [5]:
list_dataset_names = ['df_standardized', 'df_normalized_min_max', 'df_normalized_l2', 'df_normalized_l1', 'df_normalized_max']
list_covariance_types = ['full', 'tied', 'diag', 'spherical']

# Fixed (arbitrary) seed: GaussianMixture initialisation is random, so without
# it the labels, probabilities and saved parameters differ on every re-run.
random_seed = 42

# Fit a Gaussian mixture for every (dataset, covariance type, k) combination
# and persist, per fit, the membership probabilities and the model parameters,
# plus one table of hard labels per (dataset, covariance type).
for dataset_name in list_dataset_names:
    print("\n *** DATASET NAME: ", dataset_name)

    # Only the PCA projections are clustered in this run; add 'features' to the
    # list to also cluster the original feature columns.
    for dataset_type in ['pca']:
        print(dataset_type)

        # Only the 'without_collinearity' feature set is used in this run; add
        # 'all' to the list to also cluster the complete feature set.
        for set_features in ['without_collinearity']:
            print(set_features)

            if dataset_type == 'features':
                df_features = pd.read_csv(f'data/output/{dataset_name}.csv', index_col=0)
                if set_features == 'all':
                    df = df_features
                elif dataset_name == 'df_standardized':
                    # The standardized dataset is selected with the original labels.
                    df = df_features[features_without_collinearity]
                else:
                    # Every other dataset is selected with the sqrt_/log_ prefixed labels.
                    df = df_features[features_without_collinearity_transformed]
            elif set_features == 'all':
                df = pd.read_csv(f'data/output/{dataset_name}_pca.csv', index_col=0)
            else:
                df = pd.read_csv(f'data/output/{dataset_name}_without_collinearity_pca.csv', index_col=0)

            # Shared prefix of every file written for this dataset configuration.
            output_prefix = f'data/output/{dataset_name}_{dataset_type}_{set_features}'

            for covariance_type in list_covariance_types:
                print(covariance_type)
                dict_k_y = {}  # k -> hard cluster label of every row of df

                for k in range(k_min, k_max + 1):
                    print(k)
                    # n_init restarts are performed and the best fit is kept.
                    model = GaussianMixture(
                        k,
                        n_init=total_executions,
                        covariance_type=covariance_type,
                        random_state=random_seed,
                    )
                    y = model.fit_predict(df)
                    dict_k_y[k] = y
                    # Per-component membership probabilities, one row per sample.
                    df_probability_sociodemographic = pd.DataFrame(model.predict_proba(df), index=df.index)

                    filename_base = f'{output_prefix}_{covariance_type}_{k}'
                    df_probability_sociodemographic.to_csv(filename_base+'_probability.csv', index=True)
                    np.save(filename_base+'_weights.npy', model.weights_)
                    np.save(filename_base+'_means.npy', model.means_)
                    np.save(filename_base+'_precisions.npy', model.precisions_)

                # One column of labels per k, indexed like the input frame.
                df_clustering_sociodemographic = pd.DataFrame(dict_k_y, index=df.index)
                df_clustering_sociodemographic.to_csv(f'{output_prefix}_{covariance_type}_clustering.csv', index=True)


 *** DATASET NAME:  df_standardized
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_min_max
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_l2
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_l1
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7

 *** DATASET NAME:  df_normalized_max
pca
without_collinearity
full
2
3
4
5
6
7
tied
2
3
4
5
6
7
diag
2
3
4
5
6
7
spherical
2
3
4
5
6
7
