In [9]:
import os 
import pandas as pd 
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from itertools import combinations
from tqdm import tqdm
import numpy as np 
import pickle 

In [10]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(data_path, "y_train.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test.csv"))
y_test = pd.read_csv(os.path.join(data_path, "y_test.csv"))
# For unsupervising task we do not need to separate dataset into train and test sets|
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [11]:
# Total number of feature combinations
all_combinations = []
for r in range(18, len(X.columns) + 1):
    for combo in combinations(X.columns, r):
        all_combinations.append(list(combo))
print(f"Total number of feature combinations: {len(all_combinations)}")

Total number of feature combinations: 20


In [12]:
# Note on silhouette scoring 
# going to default with euclidean for the purposes of this paper
# distance_metrics = ['cityblock', 'cosine', 'euclidean', 'manhattan']

In [13]:
# Parameters
traits = ['Extraversion_bin', 'Agreeableness_bin', 'Conscientiousness_bin', 'Emotional Stability_bin', 'Openness_bin']
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
k_values = [2,3,4,5,6,7,8,9,10,15,20] # values are from 0 to 7 for each personality trait 

random_state = 27

# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "gmm_non_pca")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [14]:
output_metrics = []
for trait in traits:
    print(trait)
    y_labels = [label_mapping[label] for label in y[trait]]
    metrics = []
    ari_scores = []
    for perm in tqdm(all_combinations):
        selected_features = X[perm]
        for k in k_values:
            clusterer = GaussianMixture(n_components=k, random_state=random_state).fit(selected_features)
            cluster_labels = clusterer.predict(selected_features)
            metrics.append({"features": perm, "clusterer": clusterer, "optimal_k": k})
            ari_scores.append(adjusted_rand_score(y_labels, cluster_labels))
    top_features = metrics[np.argmax(ari_scores)]["features"]
    final_clusterer = metrics[np.argmax(ari_scores)]["clusterer"]
    optimal_k = metrics[np.argmax(ari_scores)]["optimal_k"]
    output_template = {
        "trait": trait, 
        "top_features": top_features,
        "optimal_k": optimal_k,
        "clusterer": final_clusterer,
        "ARI": max(ari_scores),
        "silh_score": silhouette_score(X[top_features], final_clusterer.fit_predict(X[top_features]))
        }
    output_metrics.append(output_template)

Extraversion_bin


100%|██████████| 20/20 [2:46:53<00:00, 500.66s/it] 


Agreeableness_bin


100%|██████████| 20/20 [4:37:56<00:00, 833.82s/it]  


Conscientiousness_bin


100%|██████████| 20/20 [1:20:36<00:00, 241.84s/it]


Emotional Stability_bin


100%|██████████| 20/20 [49:13<00:00, 147.65s/it] 


Openness_bin


100%|██████████| 20/20 [49:23<00:00, 148.18s/it] 


In [15]:
with open(os.path.join(specific_results_path, 'gmm_ari_driven.pkl'), 'wb') as file:
    pickle.dump(output_metrics, file)
# Test load
with open(os.path.join(specific_results_path, 'gmm_ari_driven.pkl'), 'rb') as file:
    output_metrics_test = pickle.load(file)

In [16]:
output_metrics_test

[{'trait': 'Extraversion_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
   'Arousal',
   'Valence',
   'Engagement',
   'Liking',
   'Familiarity'],
  'optimal_k': 4,
  'clusterer': GaussianMixture(n_components=4, random_state=27),
  'ARI': 0.020178779943133333,
  'silh_score': 0.1416910414653129},
 {'trait': 'Agreeableness_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
   'Valence',
   'Engagement',
   'Liking',
   'Familiarity'],
  'optimal_k': 7,
  'clusterer': GaussianMixture(n_components=7, random_state=27),
  'ARI': 0.009930626905005896,
  'silh_score': 0.08839394350495015},
 {'trait': 'Conscientiousness_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv