In [1]:
import os 
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from itertools import combinations
from tqdm import tqdm
import numpy as np 
import pickle 

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train_pca.csv"))
y_train = pd.read_csv(os.path.join(data_path, "y_train_pca.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test_pca.csv"))
y_test = pd.read_csv(os.path.join(data_path, "y_test_pca.csv"))
# For unsupervising task we do not need to separate dataset into train and test sets|
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [3]:
# Total number of feature combinations
all_combinations = []
for r in range(1, len(X.columns) + 1):
    for combo in combinations(X.columns, r):
        all_combinations.append(list(combo))
print(f"Total number of feature combinations: {len(all_combinations)}")

Total number of feature combinations: 255


In [4]:
# Note on silhouette scoring 
# going to default with euclidean for the purposes of this paper
# distance_metrics = ['cityblock', 'cosine', 'euclidean', 'manhattan']

In [5]:
# Parameters
traits = ['Extraversion_bin', 'Agreeableness_bin', 'Conscientiousness_bin', 'Emotional Stability_bin', 'Openness_bin']
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
k_values = [3] # values are from 0 to 7 for each personality trait 

random_state = 27

# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "kmeans")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [6]:
output_metrics = []
for trait in traits:
    print(trait)
    y_labels = [label_mapping[label] for label in y[trait]]
    ari_scores = []
    top_clusterers = []
    for perm in tqdm(all_combinations):
        selected_features = X[perm]
        silh_scores = []
        clusterers = []
        for k in k_values:
            clusterer = KMeans(n_clusters=k, random_state=random_state, n_init="auto")
            cluster_labels = clusterer.fit_predict(selected_features)
            silh_scores.append(silhouette_score(selected_features, cluster_labels, metric="euclidean"))
            clusterers.append(clusterer)
        clusterer_best = clusterers[np.argmax(silh_scores)]
        cluster_labels_best = clusterer_best.fit_predict(selected_features)
        ari_scores.append(adjusted_rand_score(y_labels, cluster_labels))
        top_clusterers.append(clusterer_best)
    top_features = all_combinations[np.argmax(ari_scores)]
    final_clusterer = top_clusterers[np.argmax(ari_scores)]
    output_template = {
        "trait": trait, 
        "top_features": top_features,
        "optimal_k": final_clusterer.n_clusters,
        "clusterer": final_clusterer,
        "ARI": ari_scores[np.argmax(ari_scores)],
        "silh_score": silhouette_score(X[top_features], final_clusterer.fit_predict(X[top_features]))
        }
    output_metrics.append(output_template)

Extraversion_bin


100%|██████████| 255/255 [01:51<00:00,  2.28it/s]


Agreeableness_bin


 11%|█         | 27/255 [00:10<01:45,  2.16it/s]

In [None]:
with open(os.path.join(specific_results_path, 'kmeans_static_output.pkl'), 'wb') as file:
    pickle.dump(output_metrics, file)
# Test load
with open(os.path.join(specific_results_path, 'kmeans_static_output.pkl'), 'rb') as file:
    output_metrics_test = pickle.load(file)