In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import adjusted_rand_score
from sklearn.utils import shuffle
import umap.umap_ as umap
%matplotlib inline

In [None]:
# Build the phenotype table for the final sample:
#   1. read the raw tab-separated phenotype file and keep rows where INT_NUM == 1,
#   2. stack the youth and early adult SUBJID files,
#   3. inner-join on SUBJID so only subjects listed in those files remain.
phenotype_df = (
    pd.read_csv('[Insert filepath to phenotype data]', sep='\t', low_memory=False)
    .loc[lambda visits: visits['INT_NUM'] == 1]
)
SUBJID_df = pd.concat(
    [pd.read_csv(subjid_path) for subjid_path in ('../results/youth_SUBJIDs', '../results/early_adult_SUBJIDs')]
)
phenotype_df = phenotype_df.merge(SUBJID_df['SUBJID'], how='inner', on='SUBJID')

In [None]:
# Column-name lists, one per category of variable.
# ID_vars holds identifier/demographic columns; every other list holds the
# item columns belonging to one symptom domain.
ID_vars = ['SUBJID', 'INT_NUM', 'INT_TYPE', 'Sex']
ADD = ['ADD011', 'ADD012', 'ADD013', 'ADD014', 'ADD015', 'ADD016', 'ADD020', 'ADD021', 'ADD022']
AGR = ['AGR001', 'AGR002', 'AGR003', 'AGR004', 'AGR005', 'AGR006', 'AGR007', 'AGR008']
CDD = ['CDD001', 'CDD002', 'CDD003', 'CDD004', 'CDD005', 'CDD006', 'CDD007', 'CDD008']
DEP = ['DEP001', 'DEP002', 'DEP004', 'DEP006']
GAD = ['GAD001', 'GAD002']
MAN = ['MAN001', 'MAN002', 'MAN003', 'MAN004', 'MAN005', 'MAN006', 'MAN007']
OCD = [
    'OCD001', 'OCD002', 'OCD003', 'OCD004', 'OCD005', 'OCD006', 'OCD007', 'OCD008',
    'OCD011', 'OCD012', 'OCD013', 'OCD014', 'OCD015', 'OCD016', 'OCD017', 'OCD018', 'OCD019',
]
ODD = ['ODD001', 'ODD002', 'ODD003', 'ODD005', 'ODD006']
PAN = ['PAN001', 'PAN003', 'PAN004']
PHB = ['PHB001', 'PHB002', 'PHB003', 'PHB004', 'PHB005', 'PHB006', 'PHB007', 'PHB008']
PSY = ['PSY001', 'PSY029', 'PSY050', 'PSY060', 'PSY070', 'PSY071']
SIP = [
    'SIP003', 'SIP004', 'SIP005', 'SIP006', 'SIP007', 'SIP008',
    'SIP009', 'SIP010', 'SIP011', 'SIP012', 'SIP013', 'SIP014',
]
PTD = ['PTD001', 'PTD002', 'PTD003', 'PTD004', 'PTD006', 'PTD007', 'PTD008', 'PTD009']
SEP = ['SEP500', 'SEP508', 'SEP509', 'SEP510', 'SEP511']
SOC = ['SOC001', 'SOC002', 'SOC003', 'SOC004', 'SOC005']
SUI = ['SUI001', 'SUI002']

# Map each domain code to its item columns (insertion order = order in which
# the score columns are appended to phenotype_df below).
disorders = dict(
    ADD=ADD, AGR=AGR, CDD=CDD, DEP=DEP,
    GAD=GAD, MAN=MAN, OCD=OCD, ODD=ODD,
    PAN=PAN, PHB=PHB, PSY=PSY, SIP=SIP,
    PTD=PTD, SEP=SEP, SOC=SOC, SUI=SUI,
)

# Add one score column per domain, named after the domain code:
# SIP is scored as the row-wise mean of its items, every other domain as the row-wise sum.
for disorder, item_columns in disorders.items():
    item_responses = phenotype_df[item_columns]
    phenotype_df[disorder] = (
        item_responses.mean(axis=1) if disorder == 'SIP' else item_responses.sum(axis=1)
    )

In [None]:
# Partition the sample by interview type:
# INT_TYPE == 'MP' rows go to the youth frame, INT_TYPE == 'AP' rows to the early adult frame.
youth_phenotype_df = phenotype_df.loc[phenotype_df['INT_TYPE'].eq('MP')]
early_adult_phenotype_df = phenotype_df.loc[phenotype_df['INT_TYPE'].eq('AP')]

In [None]:
# Two-level label per subject: is the mean SIP item response exactly zero or not?
# NOTE(review): these lists are later used positionally as plot hues for the UMAP
# embeddings; this assumes the row order here matches the symptom matrices — confirm.
SIP_score_threshold_labels_youth = np.where(
    youth_phenotype_df['SIP'] == 0, 'Mean Response = 0', 'Mean Response > 0'
).tolist()
SIP_score_threshold_labels_early_adult = np.where(
    early_adult_phenotype_df['SIP'] == 0, 'Mean Response = 0', 'Mean Response > 0'
).tolist()

# Compute the Adjusted Rand Index (ARI) between the community detection results from the multiplex SBMs and simple SBMs

In [None]:
def permutation_test_ari(model_1_labels, model_2_labels, n_permutations=10000, random_state=1234, return_distribution=False):
    """Permutation test for the Adjusted Rand Index (ARI) between two labelings.

    The labels of model 1 are shuffled ``n_permutations`` times while the
    labels of model 2 are held fixed; the ARI of each shuffled labeling against
    model 2 forms the null distribution for a one-sided ("greater") test.

    Parameters
    ----------
    model_1_labels, model_2_labels : sequence
        Cluster/block labels of the same observations, given in the same order.
    n_permutations : int, default 10000
        Number of shuffles used to build the null distribution.
    random_state : int or None, default 1234
        Seed of a generator that is private to this call, so the global NumPy
        random state is left untouched.
    return_distribution : bool, default False
        If True, also return the permuted ARI values.

    Returns
    -------
    observed_ari : float
        ARI of the unshuffled labelings.
    p_value : float
        ``(1 + #{permuted ARI >= observed ARI}) / (1 + n_permutations)``.
        The +1 terms count the observed labeling as one of the permutations,
        which keeps the estimate from ever being exactly zero.
    permuted_aris : numpy.ndarray
        Only returned when ``return_distribution`` is True.
    """
    # Private legacy generator: yields the same seed sequence that
    # np.random.seed(random_state) + np.random.randint would, without
    # reseeding the global generator used by the rest of the notebook.
    rng = np.random.RandomState(random_state)
    observed_ari = adjusted_rand_score(model_1_labels, model_2_labels)

    permuted_aris = np.empty(n_permutations, dtype=float)
    for i in range(n_permutations):
        # Each shuffle gets its own integer seed drawn from the private generator.
        permuted_model_1_labels = shuffle(model_1_labels, random_state=rng.randint(0, 10**6))
        permuted_aris[i] = adjusted_rand_score(permuted_model_1_labels, model_2_labels)

    n_at_least_as_extreme = int(np.count_nonzero(permuted_aris >= observed_ari))
    p_value = (1 + n_at_least_as_extreme) / (1 + n_permutations)

    if return_distribution:
        return observed_ari, p_value, permuted_aris

    return observed_ari, p_value

In [None]:
def plot_umap_block_overlays(X, model_1_labels, model_2_labels, n_neighbors=15, min_dist=0.1, random_state=1234,
                             model_1_labels_title='Model 1 Labels', model_2_labels_title='Model 2 Labels', legend_loc='upper right',
                             filepath=None):
    """Plot one 2-D UMAP embedding twice, coloured by two different labelings.

    Parameters
    ----------
    X : array-like
        Passed to UMAP with ``metric='precomputed'``, i.e. it is treated as a
        matrix of pairwise distances between observations.
    model_1_labels, model_2_labels : sequence
        One label per observation; used as the hue of the left and right
        panel respectively.
    n_neighbors, min_dist : int, float
        Forwarded to ``umap.UMAP``.
    random_state : int
        Seed forwarded to ``umap.UMAP``.
    model_1_labels_title, model_2_labels_title : str
        Titles of the left and right panel.
    legend_loc : str
        Legend location used in both panels.
    filepath : str or None
        If truthy, the figure is written there (300 dpi, tight bounding box).

    Returns
    -------
    None
    """
    # n_neighbors / min_dist are forwarded explicitly; previously they were
    # accepted by this function but never reached UMAP.
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric='precomputed', random_state=random_state)
    X_embedded = reducer.fit_transform(X)

    fig, axs = plt.subplots(1, 2, figsize=(14, 6))

    # (axis, labels, title, extra legend options) for the left and right panel
    panels = [
        (axs[0], model_1_labels, model_1_labels_title, {}),
        (axs[1], model_2_labels, model_2_labels_title, {'ncol': 2}),
    ]
    for ax, labels, title, legend_options in panels:
        sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=labels, palette='tab10', ax=ax)
        ax.set_title(title)
        ax.set_xlabel("UMAP 1")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.legend(title='Block', loc=legend_loc, **legend_options)

    # Only the left panel carries a y-axis label and a left spine.
    axs[0].set_ylabel("UMAP 2")
    sns.despine(fig=fig)
    axs[1].spines['left'].set_visible(False)

    fig.tight_layout()

    # Save before showing: some backends release the figure once show() returns.
    if filepath:
        fig.savefig(filepath, bbox_inches='tight', dpi=300)

    plt.show()

In [None]:
def _read_memberships(csv_path):
    """Return the 'memberships' column of the CSV at ``csv_path`` as a plain list."""
    return pd.read_csv(csv_path)['memberships'].values.tolist()


# Memberships for the youth subset
youth_membership_list_filepath_multiplex_Pearson = '../results/youth_multiplex_Pearson_memberships.csv'
youth_membership_list_multiplex_Pearson = _read_memberships(youth_membership_list_filepath_multiplex_Pearson)

youth_membership_list_filepath_multiplex_Euclidean = '../results/youth_multiplex_Euclidean_memberships.csv'
youth_membership_list_multiplex_Euclidean = _read_memberships(youth_membership_list_filepath_multiplex_Euclidean)

youth_membership_list_filepath_simple = '../results/youth_simple_memberships.csv'
youth_membership_list_simple = _read_memberships(youth_membership_list_filepath_simple)

# Memberships for the early adult subset
early_adult_membership_list_filepath_multiplex_Pearson = '../results/early_adult_multiplex_Pearson_memberships.csv'
early_adult_membership_list_multiplex_Pearson = _read_memberships(early_adult_membership_list_filepath_multiplex_Pearson)

early_adult_membership_list_filepath_multiplex_Euclidean = '../results/early_adult_multiplex_Euclidean_memberships.csv'
early_adult_membership_list_multiplex_Euclidean = _read_memberships(early_adult_membership_list_filepath_multiplex_Euclidean)

early_adult_membership_list_filepath_simple = '../results/early_adult_simple_memberships.csv'
early_adult_membership_list_simple = _read_memberships(early_adult_membership_list_filepath_simple)

## Youth age group

In [None]:
# Youth age group: agreement (ARI, with permutation p-value) between the block
# assignments of the multiplex SBM and those of the simple SBM
observed_ari, p_value = permutation_test_ari(
    model_1_labels=youth_membership_list_multiplex_Pearson,
    model_2_labels=youth_membership_list_simple,
)
print(f"Observed ARI: {observed_ari}, p-value: {p_value}")

In [None]:
# Youth symptom matrix; the plotting helper hands it to UMAP as a precomputed metric
X = pd.read_csv("../results/youth_symptom_matrix.csv").to_numpy()

# Side-by-side UMAP panels coloured by multiplex-SBM and simple-SBM block membership
plot_umap_block_overlays(
    X,
    youth_membership_list_multiplex_Pearson,
    youth_membership_list_simple,
    model_1_labels_title='Multiplex SBM Clusters',
    model_2_labels_title='Simple SBM Clusters',
    legend_loc='lower left',
    filepath='../results/youth_community_detection_UMAP_comparison',
)

In [None]:
# UMAP embedding of the youth symptom matrix, coloured by whether each
# subject's mean SIP response is zero or greater than zero.
X = pd.read_csv("../results/youth_symptom_matrix.csv").to_numpy()
filepath = '../results/youth_SIP_score_UMAP'
legend_loc = 'lower left'
random_state = 1234

# X is supplied to UMAP as a precomputed metric
reducer = umap.UMAP(metric='precomputed', random_state=random_state)
X_embedded = reducer.fit_transform(X)

fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    hue=SIP_score_threshold_labels_youth,
    palette='tab10',
    ax=ax,
)
ax.set_title("Psychosis Risk (SIP) Domain")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
# Tick marks are dropped from both axes
ax.set_xticks([])
ax.set_yticks([])
ax.legend(loc=legend_loc)
sns.despine(ax=ax)

fig.savefig(filepath, bbox_inches='tight', dpi=300)

In [None]:
# Youth age group: ARI between the block assignments of two multiplex SBMs that
# differ only in how the neuroimaging layer was constructed
#   - model 1: pairwise Pearson dissimilarities
#   - model 2: pairwise Euclidean distances
observed_ari, p_value = permutation_test_ari(
    model_1_labels=youth_membership_list_multiplex_Pearson,
    model_2_labels=youth_membership_list_multiplex_Euclidean,
)
print(f"Observed ARI: {observed_ari}, p-value: {p_value}")

## Early adult age group

In [None]:
# Early adult age group: agreement (ARI, with permutation p-value) between the
# block assignments of the multiplex SBM and those of the simple SBM
observed_ari, p_value = permutation_test_ari(
    model_1_labels=early_adult_membership_list_multiplex_Pearson,
    model_2_labels=early_adult_membership_list_simple,
)
print(f"Observed ARI: {observed_ari}, p-value: {p_value}")

In [None]:
# Early adult symptom matrix; the plotting helper hands it to UMAP as a precomputed metric
X = pd.read_csv("../results/early_adult_symptom_matrix.csv").to_numpy()

# Side-by-side UMAP panels coloured by multiplex-SBM and simple-SBM block membership
plot_umap_block_overlays(
    X,
    early_adult_membership_list_multiplex_Pearson,
    early_adult_membership_list_simple,
    model_1_labels_title='Multiplex SBM Clusters',
    model_2_labels_title='Simple SBM Clusters',
    legend_loc='upper right',
    filepath='../results/early_adult_community_detection_UMAP_comparison',
)

In [None]:
# UMAP embedding of the early adult symptom matrix, coloured by whether each
# subject's mean SIP response is zero or greater than zero.
X = pd.read_csv("../results/early_adult_symptom_matrix.csv").to_numpy()
filepath = '../results/early_adult_SIP_score_UMAP'
legend_loc = 'upper right'
random_state = 1234

# X is supplied to UMAP as a precomputed metric
reducer = umap.UMAP(metric='precomputed', random_state=random_state)
X_embedded = reducer.fit_transform(X)

fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    hue=SIP_score_threshold_labels_early_adult,
    palette='tab10',
    ax=ax,
)
ax.set_title("Psychosis Risk (SIP) Domain")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
# Tick marks are dropped from both axes
ax.set_xticks([])
ax.set_yticks([])
ax.legend(loc=legend_loc)
sns.despine(ax=ax)

fig.savefig(filepath, bbox_inches='tight', dpi=300)

In [None]:
# Early adult age group: ARI between the block assignments of two multiplex SBMs
# that differ only in how the neuroimaging layer was constructed
#   - model 1: pairwise Pearson dissimilarities
#   - model 2: pairwise Euclidean distances
observed_ari, p_value = permutation_test_ari(
    model_1_labels=early_adult_membership_list_multiplex_Pearson,
    model_2_labels=early_adult_membership_list_multiplex_Euclidean,
)
print(f"Observed ARI: {observed_ari}, p-value: {p_value}")

# Compute the Adjusted Rand Index (ARI) between the community detection results from the multiplex SBMs and simple SBMs if the blocks with greater-than-zero mean SIP scores are consolidated

In [None]:
# Youth age group: ARI between the multiplex SBM and the simple SBM when the
# simple-SBM blocks with greater-than-zero mean SIP scores are consolidated.
# Simple-SBM membership 29 is relabelled 2 and every other membership is relabelled 1.
# NOTE(review): block id 29 is hard-coded and tied to one particular model fit — confirm it still applies.
youth_simple_consolidated = [
    1 if membership != 29 else 2
    for membership in youth_membership_list_simple
]
observed_ari, p_value = permutation_test_ari(
    youth_membership_list_multiplex_Pearson,
    youth_simple_consolidated,
)
print(f"Observed ARI: {observed_ari}, p-value: {p_value}")

In [None]:
# Early adult age group: ARI between the multiplex SBM and the simple SBM when the
# simple-SBM blocks with greater-than-zero mean SIP scores are consolidated.
# Simple-SBM membership 14 is relabelled 2 and every other membership is relabelled 1.
# NOTE(review): block id 14 is hard-coded and tied to one particular model fit — confirm it still applies.
early_adult_simple_consolidated = [
    1 if membership != 14 else 2
    for membership in early_adult_membership_list_simple
]
observed_ari, p_value = permutation_test_ari(
    early_adult_membership_list_multiplex_Pearson,
    early_adult_simple_consolidated,
)
print(f"Observed ARI: {observed_ari}, p-value: {p_value}")