In [1]:
import os 
import pandas as pd 
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from itertools import combinations
from tqdm import tqdm
import numpy as np 
import pickle 

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train_pca.csv"))
y_train = pd.read_csv(os.path.join(data_path, "y_train_pca.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test_pca.csv"))
y_test = pd.read_csv(os.path.join(data_path, "y_test_pca.csv"))
# For unsupervising task we do not need to separate dataset into train and test sets|
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [3]:
# Total number of feature combinations
all_combinations = []
for r in range(1, len(X.columns) + 1):
    for combo in combinations(X.columns, r):
        all_combinations.append(list(combo))
print(f"Total number of feature combinations: {len(all_combinations)}")

Total number of feature combinations: 255


In [4]:
# Note on silhouette scoring 
# going to default with euclidean for the purposes of this paper
# distance_metrics = ['cityblock', 'cosine', 'euclidean', 'manhattan']

In [5]:
# Parameters
traits = ['Extraversion_bin', 'Agreeableness_bin', 'Conscientiousness_bin', 'Emotional Stability_bin', 'Openness_bin']
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
k_values = [2,3,4,5,6,7,8,9,10,15,20] # values are from 0 to 7 for each personality trait 

random_state = 27

# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "gmm")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [6]:
output_metrics = []
for trait in traits:
    print(trait)
    y_labels = [label_mapping[label] for label in y[trait]]
    metrics = []
    ari_scores = []
    for perm in tqdm(all_combinations):
        selected_features = X[perm]
        for k in k_values:
            clusterer = GaussianMixture(n_components=k, random_state=random_state).fit(selected_features)
            cluster_labels = clusterer.predict(selected_features)
            metrics.append({"features": perm, "clusterer": clusterer, "optimal_k": k})
            ari_scores.append(adjusted_rand_score(y_labels, cluster_labels))
    top_features = metrics[np.argmax(ari_scores)]["features"]
    final_clusterer = metrics[np.argmax(ari_scores)]["clusterer"]
    optimal_k = metrics[np.argmax(ari_scores)]["optimal_k"]
    output_template = {
        "trait": trait, 
        "top_features": top_features,
        "optimal_k": optimal_k,
        "clusterer": final_clusterer,
        "ARI": max(ari_scores),
        "silh_score": silhouette_score(X[top_features], final_clusterer.fit_predict(X[top_features]))
        }
    output_metrics.append(output_template)

Extraversion_bin


100%|██████████| 255/255 [1:50:00<00:00, 25.88s/it] 


Agreeableness_bin


100%|██████████| 255/255 [1:38:10<00:00, 23.10s/it]


Conscientiousness_bin


100%|██████████| 255/255 [1:45:51<00:00, 24.91s/it]


Emotional Stability_bin


100%|██████████| 255/255 [1:29:29<00:00, 21.06s/it]


Openness_bin


100%|██████████| 255/255 [1:24:38<00:00, 19.92s/it]


In [7]:
with open(os.path.join(specific_results_path, 'gmm_ari_driven.pkl'), 'wb') as file:
    pickle.dump(output_metrics, file)
# Test load
with open(os.path.join(specific_results_path, 'gmm_ari_driven.pkl'), 'rb') as file:
    output_metrics_test = pickle.load(file)

In [8]:
output_metrics_test

[{'trait': 'Extraversion_bin',
  'top_features': ['PC6', 'PC8'],
  'optimal_k': 3,
  'clusterer': GaussianMixture(n_components=3, random_state=27),
  'ARI': 0.025465123182580488,
  'silh_score': 0.34855100383348103},
 {'trait': 'Agreeableness_bin',
  'top_features': ['PC4', 'PC7'],
  'optimal_k': 4,
  'clusterer': GaussianMixture(n_components=4, random_state=27),
  'ARI': 0.021854955324873446,
  'silh_score': 0.3361261262925572},
 {'trait': 'Conscientiousness_bin',
  'top_features': ['PC2', 'PC4', 'PC6', 'PC7'],
  'optimal_k': 2,
  'clusterer': GaussianMixture(n_components=2, random_state=27),
  'ARI': 0.027207345843783575,
  'silh_score': 0.3039071512686172},
 {'trait': 'Emotional Stability_bin',
  'top_features': ['PC1', 'PC2', 'PC3', 'PC5'],
  'optimal_k': 3,
  'clusterer': GaussianMixture(n_components=3, random_state=27),
  'ARI': 0.09715646677611949,
  'silh_score': 0.4674857251163913},
 {'trait': 'Openness_bin',
  'top_features': ['PC1', 'PC3', 'PC5', 'PC7', 'PC8'],
  'optimal_k'