In [1]:
import pandas as pd
import numpy as np
import os
import hdbscan

# Load the original data
dataset_path = '/Users/leo/Programming/PLR/Leo/data/cleaned_data_SYMPTOMS_9_13_23.csv'
original_data = pd.read_csv(dataset_path)

# Load the encoded data
encoded_data_path = '/Users/leo/Programming/PLR/Leo/main/final/grid_autoencoder/final_embedding/encoded_data_5_5.npy'
data_encoded = np.load(encoded_data_path)

# Apply HDBSCAN with the best parameters
clusterer = hdbscan.HDBSCAN(min_cluster_size=37, min_samples=6)
cluster_labels = clusterer.fit_predict(data_encoded)

# Add the cluster labels to the original data
original_data['cluster'] = cluster_labels

In [2]:
# Create a new DataFrame with only 'Unnamed: 0' and 'cluster' columns
data_to_save = original_data[['Unnamed: 0', 'cluster']]

# Define the save directory and file name
save_dir = '/Users/leo/Programming/PLR/clusterings/leo'
os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist
save_path = os.path.join(save_dir, 'clustered_data.csv')

# Save the DataFrame to CSV
data_to_save.to_csv(save_path, index=False)
print(f"Data saved to {save_path}")

Data saved to /Users/leo/Programming/PLR/clusterings/leo/clustered_data.csv


In [3]:
for column in original_data.columns:
    print(column)

Unnamed: 0
Symptom_Memory_short
Symptom_Memory_long
Symptom_Memory_no_new_memories
Symptom_Memory_forget_tasks
Symptom_Memory_other
Symptom_Cognitive_Functioning_thinking
Symptom_Cognitive_Functioning_executive
Symptom_Cognitive_Functioning_problemsolving
Symptom_Cognitive_Functioning_slowedthoughts
Symptom_Cognitive_Functioning_fastthoughts
Symptom_Cognitive_Functioning_agnosia
Symptom_Cognitive_Functioning_attentionconcentration
Symptom_Cognitive_Functioning_other
Symptom_Speech_difficulty_finding_words
Symptom_Speech_difficulty_communicating_verbally
Symptom_Speech_difficulty_reading_processing_text
Symptom_Speech_difficulty_understanding_others_speech
Symptom_Speech_difficulty_speaking_complete_sentences
Symptom_Speech_speaking_unrecognizable_words
Symptom_Speech_difficulty_communicating_writing
Symptom_Speech_changes_to_secondary_languages
Symptom_Speech_other
Symptom_Headaches_behind_eyes
Symptom_Headaches_stiff_neck
Symptom_Headaches_brain_pressure
Symptom_Headaches_diffuse
Symp

In [4]:
original_data["Flag_POTS"].value_counts()

Flag_POTS
No, no tachycardia                        2459
Possible, had tachycardia/palpitations    1773
Likely, had higher bpm standing            901
Definite, measured bpm > 30                898
Name: count, dtype: int64

In [5]:
# Ensure 'cluster' is a categorical variable
original_data['cluster'] = original_data['cluster'].astype('category')

# Group by 'cluster'
grouped = original_data.groupby('cluster')

# Calculate the size of each group
cluster_sizes = grouped.size()

# Print the size of each cluster
print(cluster_sizes)

# Calculate the average size of the clusters
average_cluster_size = cluster_sizes.mean()
print(f"Average size per cluster: {average_cluster_size}")

cluster
-1      72
0      522
1      247
2      336
3      385
4     1303
5     1500
6      931
7      735
dtype: int64
Average size per cluster: 670.1111111111111


In [6]:
# Identify columns that are symptoms
symptom_columns = [col for col in original_data.columns if col.startswith('Symptom_')]

# Create a subset of data with only symptom columns and the cluster column
symptom_data = original_data[symptom_columns + ['cluster']]

# Compute the total count of symptoms for each row
symptom_data['symptom_count'] = symptom_data[symptom_columns].sum(axis=1)

# Group by cluster and calculate the average number of symptoms
average_symptoms_per_cluster = round(symptom_data.groupby('cluster')['symptom_count'].mean(),0)
# Display the result
print(average_symptoms_per_cluster)

cluster
-1    56.0
0     20.0
1     31.0
2     34.0
3     35.0
4     54.0
5     58.0
6     35.0
7     47.0
Name: symptom_count, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symptom_data['symptom_count'] = symptom_data[symptom_columns].sum(axis=1)


In [7]:
original_data["Demographics_Gender_Cleaned"].value_counts()

Demographics_Gender_Cleaned
Woman                                           4703
Man                                             1205
Non-binary/Genderqueer/Gender non-conforming     100
Name: count, dtype: int64

In [8]:
original_data['Demographics_Gender_Cleaned'] = original_data['Demographics_Gender_Cleaned'].apply(lambda x: 1 if x == 'Woman' else 0)

# Group by cluster and calculate the average value for 'Woman'
average_woman_per_cluster = round(original_data.groupby('cluster')['Demographics_Gender_Cleaned'].mean(),3)

# Display the result
print(average_woman_per_cluster)

cluster
-1    0.778
0     0.670
1     0.713
2     0.768
3     0.797
4     0.819
5     0.844
6     0.727
7     0.743
Name: Demographics_Gender_Cleaned, dtype: float64


In [12]:
original_data["Demographics_Age_Cleaned"].value_counts()

Demographics_Age_Cleaned
40-49    1725
50-59    1460
30-39    1352
60-69     736
18-29     526
70-79     213
80+        19
Name: count, dtype: int64

In [15]:
original_data['Age_numeric'] =original_data['Demographics_Age_Cleaned'].replace({'30-39': 34.5,'40-49': 44.5, '50-59':54.5, '60-69':64.5,
                                                    '70-79': 74.5, '80+': 80.0,'18-29' : 23.5})

In [19]:
# Group by cluster and calculate the average age
average_age_per_cluster = round(original_data.groupby('cluster')['Age_numeric'].mean(), 2)

# Display the result
print(average_age_per_cluster)

cluster
-1    48.82
0     48.31
1     49.52
2     47.13
3     47.57
4     45.55
5     45.73
6     45.91
7     46.80
Name: Age_numeric, dtype: float64


In [27]:
original_data["Cognitive_PEM_Severity"].value_counts()

Cognitive_PEM_Severity
8.0     772
7.0     724
0.0     610
5.0     603
6.0     565
10.0    520
9.0     479
4.0     346
3.0     298
2.0     216
1.0     196
Name: count, dtype: int64

In [25]:
# Normalize the Physical_PEM_Severity column
pem_min = original_data['Physical_PEM_Severity'].min()
pem_max = original_data['Physical_PEM_Severity'].max()
original_data['Normalized_Physical_PEM_Severity'] = (original_data['Physical_PEM_Severity'] - pem_min) / (pem_max - pem_min)

# Group by cluster and calculate the normalized average
normalized_avg_per_cluster = round(original_data.groupby('cluster')['Normalized_Physical_PEM_Severity'].mean(),3)

In [26]:
normalized_avg_per_cluster

cluster
-1    0.813
0     0.686
1     0.704
2     0.714
3     0.714
4     0.804
5     0.810
6     0.738
7     0.768
Name: Normalized_Physical_PEM_Severity, dtype: float64

In [29]:
# Normalize the Cognitive_PEM_Severity column
cog_pem_min = original_data['Cognitive_PEM_Severity'].min()
cog_pem_max = original_data['Cognitive_PEM_Severity'].max()
original_data['Normalized_Cognitive_PEM_Severity'] = (original_data['Cognitive_PEM_Severity'] - cog_pem_min) / (cog_pem_max - cog_pem_min)

# Group by cluster and calculate the normalized average
normalized_cog_avg_per_cluster = round(original_data.groupby('cluster')['Normalized_Cognitive_PEM_Severity'].mean(), 3)

# Display the result
print(normalized_cog_avg_per_cluster)

cluster
-1    0.617
0     0.265
1     0.327
2     0.343
3     0.330
4     0.646
5     0.654
6     0.578
7     0.620
Name: Normalized_Cognitive_PEM_Severity, dtype: float64
