In [20]:
import os, csv
import pandas as pd

In [None]:
# Extract healthy subjects and MDD patients from the dataset's demographics table.
# NOTE(review): the earlier comment called this the "HCP dataset", but the path
# below points at tcp_dataset/ds005237 — confirm the dataset name.
# The default location is specific to one cluster account; set the
# TCP_DEMOS_PATH environment variable to read the table from somewhere else
# without editing the notebook.
DEMOS_PATH = os.environ.get(
    "TCP_DEMOS_PATH",
    "/cluster/home/herminea/mental_health_project/tcp_dataset/ds005237/phenotype/demos.tsv",
)
# Directory (relative to the current working directory) that receives the
# subject-list text files.
OUT_DIR = "subjects_lists"
os.makedirs(OUT_DIR, exist_ok=True)  # exist_ok: re-running the cell is harmless

In [61]:
# Read manually with the csv module (due to malformed header): the first row of
# the file is skipped and the second row supplies the column names.
# newline="" is what the csv module requires of file objects it reads; without
# it, line breaks embedded inside quoted fields are not interpreted correctly.
# NOTE(review): the file carries a .tsv extension but is parsed as
# comma-delimited here — confirm this matches the file on disk.
with open(DEMOS_PATH, "r", encoding="ISO-8859-1", newline="") as f:
    reader = csv.reader(f, delimiter=",")
    lines = list(reader)

header = lines[1]
data = lines[2:]
df = pd.DataFrame(data, columns=header)

# Normalize key columns (string type, surrounding whitespace removed, upper
# case) so the comparisons below are insensitive to case and padding.
for col in ["subjectkey", "Primary_Dx", "Non-Primary_Dx", "Group"]:
    df[col] = df[col].astype(str).str.strip().str.upper()

# Split MDD types
# Current MDD is an exact match on the label "MDD"; longer labels that merely
# start with or contain "MDD" (e.g. MDD with specifiers, PMDD) are left out.
# NOTE(review): confirm that excluding those variants is intended.
mdd_now_mask  = df["Primary_Dx"] == "MDD"
# Past MDD is a prefix match, so qualified labels such as
# "PAST MDD (DUE TO SUD)" are included.
mdd_past_mask = df["Primary_Dx"].str.startswith("PAST MDD")
# Healthy controls: GENPOP group with "999" in both diagnosis columns.
# NOTE(review): "999" is presumably the "no diagnosis" code — verify against
# the data dictionary.
hc_mask       = (df["Group"] == "GENPOP") & (df["Primary_Dx"] == "999") & (df["Non-Primary_Dx"] == "999")

# Extract subject lists (unique subject keys, in order of first appearance)
mdd_now_subjects  = df.loc[mdd_now_mask, "subjectkey"].unique()
mdd_past_subjects = df.loc[mdd_past_mask, "subjectkey"].unique()
hc_subjects       = df.loc[hc_mask, "subjectkey"].unique()


In [65]:
# Write to text files
def save_list(subjects, filename, out_dir=None):
    """Write one subject identifier per line to a text file.

    Args:
        subjects: Iterable of subject identifiers; each item is converted
            with ``str()`` before being written.
        filename: Name of the file to create inside the output directory.
            An existing file of that name is overwritten.
        out_dir: Directory to write into. When omitted, the notebook-level
            ``OUT_DIR`` is used.

    Every line, including the last one, is terminated by a newline so that
    line-oriented consumers (``wc -l``, shell ``while read`` loops) see every
    subject. An empty ``subjects`` produces an empty file.
    """
    target_dir = OUT_DIR if out_dir is None else out_dir
    # Explicit encoding keeps the output independent of the platform default.
    with open(os.path.join(target_dir, filename), "w", encoding="utf-8") as f:
        f.writelines(f"{subject}\n" for subject in subjects)

# Persist each cohort's subject keys to its own text file.
for cohort_ids, cohort_file in (
    (mdd_now_subjects, "subjects_mdd_primary.txt"),
    (mdd_past_subjects, "subjects_mdd_past.txt"),
    (hc_subjects, "subjects_hc.txt"),
):
    save_list(cohort_ids, cohort_file)

# Report how many unique subjects ended up in each cohort.
for summary_prefix, cohort_ids in (
    ("MDD (current) subjects: ", mdd_now_subjects),
    ("MDD (past) subjects   : ", mdd_past_subjects),
    ("HC subjects            : ", hc_subjects),
):
    print(f"{summary_prefix}{len(cohort_ids)}")

MDD (current) subjects: 21
MDD (past) subjects   : 13
HC subjects            : 93


In [66]:
# Keep every row whose Primary_Dx label mentions "MDD" anywhere in the text.
has_mdd_label = df["Primary_Dx"].str.contains("MDD", na=False)
mdd_any_primary = df.loc[has_mdd_label]
print(f"Found {mdd_any_primary.shape[0]} rows with MDD in Primary_Dx")

# Tally how often each distinct label occurs among those rows.
mdd_label_counts = mdd_any_primary["Primary_Dx"].value_counts()
print("MDD types in Primary_Dx (any mention):")
print(mdd_label_counts)

Found 36 rows with MDD in Primary_Dx
MDD types in Primary_Dx (any mention):
Primary_Dx
MDD                                                     21
PAST MDD                                                12
PAST MDD (DUE TO SUD)                                    1
PMDD                                                     1
MDD (W/ MILD ANXIOUS DISTRESS, MELANCHOLIC FEATURES)     1
Name: count, dtype: int64


In [67]:
# Emit the heading followed by every column name, one item per line, in a
# single print call.
print("\nAll columns in the DataFrame:", *df.columns, sep="\n")


All columns in the DataFrame:
subjectkey
src_subject_id
interview_age
sex
Site
Primary_Dx
Non-Primary_Dx
Group
Active Psychosis
Age
Clinician_administered
Clin_admin_ndays
REDCap_data
REDCap_data_ndays
Test_My_Brain
TMB_ndays
Clinical_self
Clinical_self_ndays
Clinical_voice_recording 
XNAT_ Data
MRI_battery 
MRI_battery_ndays
post_scan_interview 
post_scan_interview_audio_recording
supplemental 
supplemental_ndays
year_birth
height#1_1
height#2_1
weight
marital
house
house_7_TEXT
live_with_whom
native_lang
native_lang_2_TEXT
age_learn_eng
occupation
occupation_9_TEXT
year_retire
grade_completed
ethnic
racial
hand_1
hand_2
hand_3
current_oc_1
future_career_1
parent1_gender
parent1_career_1
parent2_gender
parent2_career_1
physical_health
present_health
lost_conscious
post-meno
meds_yes_no
name_meds
dosage_meds
dosage_meds_1_TEXT
times_meds
pers_conditions_1
pers_conditions_2
pers_conditions_3
pers_conditions_4
pers_conditions_5
pers_conditions_6
pers_conditions_7
pers_conditions_8
pers_

In [68]:
# Columns to show: subject key, diagnosis label, and the columns related to timing.
timing_cols = ["subjectkey", "Primary_Dx", "Clin_admin_ndays", "REDCap_data_ndays"]

# Keep only the rows whose stripped, upper-cased Primary_Dx label begins with
# "PAST MDD".
primary_dx_normalized = df["Primary_Dx"].str.strip().str.upper()
mask_past_mdd = primary_dx_normalized.str.startswith("PAST MDD")
past_mdd_df = df.loc[mask_past_mdd]

# Show the selected columns for those subjects.
print("\nPAST MDD subjects and timing info:")
print(past_mdd_df[timing_cols])



PAST MDD subjects and timing info:
           subjectkey             Primary_Dx Clin_admin_ndays  \
53   NDAR_INVDG233EBR               PAST MDD                1   
79   NDAR_INVGZ602BF8               PAST MDD                9   
90   NDAR_INVTV991YAD               PAST MDD                6   
132  NDAR_INVGH969TWR               PAST MDD                9   
146  NDAR_INVXZ023ZLG               PAST MDD               41   
150  NDAR_INVBD216MCC               PAST MDD               51   
152  NDAR_INVEC746UWL               PAST MDD               53   
157  NDAR_INVZF290GFY               PAST MDD                8   
159  NDAR_INVDU085XVZ  PAST MDD (DUE TO SUD)                5   
192  NDAR_INVRR054KAM               PAST MDD                5   
214  NDAR_INVWF881BPQ               PAST MDD                4   
230  NDAR_INVPE293RXE               PAST MDD                0   
240  NDAR_INVFT463JPQ               PAST MDD                1   

    REDCap_data_ndays  
53                  1  
79   