In [1]:
import os 
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from itertools import combinations
from tqdm import tqdm
import numpy as np 
import pickle 

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(data_path, "y_train.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test.csv"))
y_test = pd.read_csv(os.path.join(data_path, "y_test.csv"))
# For unsupervising task we do not need to separate dataset into train and test sets|
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [3]:
# Total number of feature combinations
all_combinations = []
for r in range(18, len(X.columns) + 1):
    for combo in combinations(X.columns, r):
        all_combinations.append(list(combo))
print(f"Total number of feature combinations: {len(all_combinations)}")

Total number of feature combinations: 20


In [4]:
# Note on silhouette scoring 
# going to default with euclidean for the purposes of this paper
# distance_metrics = ['cityblock', 'cosine', 'euclidean', 'manhattan']

In [5]:
# Parameters
traits = ['Extraversion_bin', 'Agreeableness_bin', 'Conscientiousness_bin', 'Emotional Stability_bin', 'Openness_bin']
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
k_values = [2,3,4,5,6,7,8,9,10,15,20,25,30,40,50] # values are from 0 to 7 for each personality trait 

random_state = 27

# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "kmeans_non_pca")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [6]:
output_metrics = []
for trait in traits:
    print(trait)
    y_labels = [label_mapping[label] for label in y[trait]]
    metrics = []
    ari_scores = []
    for perm in tqdm(all_combinations):
        selected_features = X[perm]
        for k in k_values:
            clusterer = KMeans(n_clusters=k, random_state=random_state, n_init="auto").fit(selected_features)
            cluster_labels = clusterer.labels_
            metrics.append({"features": perm, "clusterer": clusterer})
            ari_scores.append(adjusted_rand_score(y_labels, cluster_labels))
    top_features = metrics[np.argmax(ari_scores)]["features"]
    final_clusterer = metrics[np.argmax(ari_scores)]["clusterer"]
    output_template = {
        "trait": trait, 
        "top_features": top_features,
        "optimal_k": final_clusterer.n_clusters,
        "clusterer": final_clusterer,
        "ARI": max(ari_scores),
        }
    output_metrics.append(output_template)

Extraversion_bin


100%|██████████| 20/20 [07:26<00:00, 22.35s/it]


Agreeableness_bin


100%|██████████| 20/20 [07:50<00:00, 23.53s/it]


Conscientiousness_bin


100%|██████████| 20/20 [07:53<00:00, 23.65s/it]


Emotional Stability_bin


100%|██████████| 20/20 [07:42<00:00, 23.14s/it]


Openness_bin


100%|██████████| 20/20 [08:11<00:00, 24.56s/it]


In [7]:
with open(os.path.join(specific_results_path, 'kmeans_ari_driven.pkl'), 'wb') as file:
    pickle.dump(output_metrics, file)
# Test load
with open(os.path.join(specific_results_path, 'kmeans_ari_driven.pkl'), 'rb') as file:
    output_metrics_test = pickle.load(file)

In [8]:
output_metrics_test

[{'trait': 'Extraversion_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
   'Arousal',
   'Valence',
   'Engagement',
   'Familiarity'],
  'optimal_k': 2,
  'clusterer': KMeans(n_clusters=2, n_init='auto', random_state=27),
  'ARI': 0.015853707085133218},
 {'trait': 'Agreeableness_bin',
  'top_features': ['ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
   'Arousal',
   'Valence',
   'Engagement',
   'Liking',
   'Familiarity'],
  'optimal_k': 2,
  'clusterer': KMeans(n_clusters=2, n_init='auto', random_state=27),
  'ARI': 0.008993087142347476},
 {'trait': 'Conscientiousness_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
