In [1]:
import os 
import pandas as pd 
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from itertools import combinations
from itertools import product
from tqdm import tqdm
import numpy as np 
import pickle 

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(data_path, "y_train.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test.csv"))
y_test = pd.read_csv(os.path.join(data_path, "y_test.csv"))
# For unsupervising task we do not need to separate dataset into train and test sets|
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [3]:
# Total number of feature combinations
all_combinations = []
for r in range(18, len(X.columns) + 1):
    for combo in combinations(X.columns, r):
        all_combinations.append(list(combo))
print(f"Total number of feature combinations: {len(all_combinations)}")

Total number of feature combinations: 20


In [4]:
# Parameters
traits = ['Extraversion_bin', 'Agreeableness_bin', 'Conscientiousness_bin', 'Emotional Stability_bin', 'Openness_bin']
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
eps_values = np.arange(0.3, 1.0, 0.1)  
min_samples_values = range(2, 10)  
random_state = 27

# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "dbscan_non_pca")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [5]:
output_metrics = []
for trait in traits:
    print(trait)
    y_labels = [label_mapping[label] for label in y[trait]]
    # Iterate over all combinations of hyperparameters
    best_ari = -1
    best_params = {'features': None, 'eps': None, 'min_samples': None}
    for perm in tqdm(all_combinations):
        selected_features = X[perm]
        for eps, min_samples in product(eps_values, min_samples_values):
            db = DBSCAN(eps=eps, min_samples=min_samples).fit(selected_features)
            cluster_labels = db.labels_
            ari = adjusted_rand_score(y_labels, cluster_labels)
            if ari > best_ari:
                best_ari = ari
                best_params['features'] = perm
                best_params['eps'] = eps
                best_params['min_samples'] = min_samples
    final_clusterer = best_params['min_samples'] 
    output_template = {
        "trait": trait, 
        "top_features": best_params['features'],
        "eps": best_params["eps"],
        "ARI": best_ari,
        }
    output_metrics.append(output_template)

Extraversion_bin


100%|██████████| 20/20 [05:05<00:00, 15.28s/it]


Agreeableness_bin


100%|██████████| 20/20 [13:04<00:00, 39.21s/it]


Conscientiousness_bin


100%|██████████| 20/20 [14:29<00:00, 43.46s/it]


Emotional Stability_bin


100%|██████████| 20/20 [14:14<00:00, 42.73s/it]


Openness_bin


100%|██████████| 20/20 [14:22<00:00, 43.13s/it]


In [6]:
with open(os.path.join(specific_results_path, 'dbscan_ari_driven.pkl'), 'wb') as file:
    pickle.dump(output_metrics, file)
# Test load
with open(os.path.join(specific_results_path, 'dbscan_ari_driven.pkl'), 'rb') as file:
    output_metrics_test = pickle.load(file)

In [7]:
output_metrics_test

[{'trait': 'Extraversion_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
   'Valence',
   'Engagement',
   'Liking',
   'Familiarity'],
  'eps': 0.3,
  'ARI': 0.04200705793405078},
 {'trait': 'Agreeableness_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'hrv_6',
   'Arousal',
   'Engagement',
   'Liking',
   'Familiarity'],
  'eps': 0.4,
  'ARI': 0.012120885959031907},
 {'trait': 'Conscientiousness_bin',
  'top_features': ['ibi_5',
   'ibi_6',
   'hr_1',
   'hr_2',
   'hr_3',
   'hr_4',
   'hr_5',
   'hr_6',
   'hrv_1',
   'hrv_2',
   'hrv_3',
   'hrv_4',
   'hrv_5',
   'Arousal',
   'Valence',
   'Engagement',
   'Liking',
   'Familiarity'],
  'eps': 0.4,
  'ARI': 0.011153571582787444},
 {'trait': 'Emotional Stability_bin',
 