In [24]:
import pandas as pd
import networkx as nx
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy
import nibabel as nib
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [23]:
def load_fc_matrix(file_path):
    """ Load functional connectivity matrix from a .pconn.nii file. """
    img = nib.load(file_path)
    fc_matrix = img.get_fdata()
    return fc_matrix

def create_knn_graph(fc_matrix, k=5):
    """ Create a graph from a functional connectivity matrix using k-nearest neighbors based on absolute values. """
    n = fc_matrix.shape[0]  # Number of nodes
    G = nx.Graph()
    for i in range(n):
        G.add_node(i)
    
    # For each node, add edges to the k-nearest neighbors based on absolute values of connectivity strengths
    for i in range(n):
        # Sort indices based on the absolute values, get the k highest values indices for each row
        indices = np.argsort(np.abs(fc_matrix[i]))[-k:]
        for j in indices:
            if i != j:  # Ensure no self-loops
                G.add_edge(i, j, weight=fc_matrix[i][j])
    
    return G
def create_threshold_graph(fc_matrix, std_multiplier=2):
    """
    Create a graph from a functional connectivity matrix by adding edges where the 
    absolute connection strength is above a threshold defined as a multiple of the
    standard deviation of the absolute values in the connectivity matrix.
    """
    n = fc_matrix.shape[0]  # Number of nodes
    G = nx.Graph()
    
    # Calculate the threshold as std_multiplier times the standard deviation of the absolute values
    threshold = std_multiplier * np.std(np.abs(fc_matrix))
    
    # Add nodes
    for i in range(n):
        G.add_node(i)
    
    # Add edges based on the threshold
    for i in range(n):
        for j in range(n):
            if i != j and np.abs(fc_matrix[i, j]) > threshold:  # Avoid self-loops and check threshold
                G.add_edge(i, j, weight=fc_matrix[i, j])
    
    return G

In [6]:
def compute_features(graph):
    features = {}
    
    # Degree distribution
    degrees = [d for n, d in graph.degree()]
    features['degree_mean'] = np.mean(degrees)
    features['degree_std'] = np.std(degrees)
    
    # Clustering coefficient
    clustering_coeffs = list(nx.clustering(graph).values())
    features['clustering_mean'] = np.mean(clustering_coeffs)
    features['clustering_std'] = np.std(clustering_coeffs)
    
    # Average shortest path length
    if nx.is_connected(graph):
        features['avg_shortest_path'] = nx.average_shortest_path_length(graph)
    else:
        features['avg_shortest_path'] = np.nan  # Use NaN for disconnected graph
    
    return features

def compute_feature_vector(graph):
    features = compute_features(graph)
    feature_vector = np.array([features['degree_mean'], features['degree_std'], 
                               features['clustering_mean'], features['clustering_std'], 
                               features['avg_shortest_path']])
    # Handle NaN values by replacing them with a large finite value
    feature_vector = np.nan_to_num(feature_vector, nan=1e6)
    return feature_vector

def compute_graph_distance(fv1, fv2):
    return euclidean(fv1, fv2)

# Example graphs
G1 = nx.erdos_renyi_graph(100, 0.05)
G2 = nx.erdos_renyi_graph(100, 0.1)

# Compute feature vectors
fv1 = compute_feature_vector(G1)
fv2 = compute_feature_vector(G2)

# Compute distance
distance = compute_graph_distance(fv1, fv2)
print(f'D-measure (distance) between G1 and G2: {distance}')
# Compute distance
distance = compute_graph_distance(fv2, fv2)
print(f'D-measure (distance) between G1 and G2: {distance}')

D-measure (distance) between G1 and G2: 999997.7169783808
D-measure (distance) between G1 and G2: 0.0


In [13]:
def compute_nnd(graph):
    if not nx.is_connected(graph):
        largest_cc = max(nx.connected_components(graph), key=len)
        graph = graph.subgraph(largest_cc).copy()
    
    N = len(graph.nodes)
    if N == 1:
        return 0
    
    d = nx.diameter(graph)
    distance_distribution = np.zeros(d + 1)
    
    for node in graph.nodes:
        lengths = nx.single_source_shortest_path_length(graph, node)
        for length in lengths.values():
            if length <= d:
                distance_distribution[length] += 1
    
    distance_distribution /= (N * (N - 1))
    distance_distribution = np.clip(distance_distribution, 1e-10, None)  # Avoid log(0)
    nnd = np.sum(distance_distribution * np.log(1 / distance_distribution)) / np.log(d + 1)
    
    return nnd

def degree_distribution(graph):
    degrees = [degree for node, degree in graph.degree()]
    hist, bins = np.histogram(degrees, bins=range(1, max(degrees) + 2), density=True)
    return hist

def pad_distributions(dist1, dist2):
    max_length = max(len(dist1), len(dist2))
    dist1 = np.pad(dist1, (0, max_length - len(dist1)), 'constant')
    dist2 = np.pad(dist2, (0, max_length - len(dist2)), 'constant')
    return dist1, dist2

def graph_dissimilarity(G, G_prime, w1=1, w2=1, w3=1):
    # Average node distance difference
    mu_G = np.mean([len(nx.single_source_shortest_path_length(G, source=i)) - 1 for i in G.nodes])
    mu_G_prime = np.mean([len(nx.single_source_shortest_path_length(G_prime, source=i)) - 1 for i in G_prime.nodes])
    mu_diff = abs(mu_G - mu_G_prime)
    
    # NND difference
    nnd_G = compute_nnd(G)
    nnd_G_prime = compute_nnd(G_prime)
    nnd_diff = abs(nnd_G - nnd_G_prime)
    
    # Jensen-Shannon divergence for node distance distributions
    degree_dist_G = degree_distribution(G)
    degree_dist_G_prime = degree_distribution(G_prime)
    degree_dist_G, degree_dist_G_prime = pad_distributions(degree_dist_G, degree_dist_G_prime)
    js_divergence = jensenshannon(degree_dist_G, degree_dist_G_prime)
    
    # Jensen-Shannon divergence for complement graphs
    G_c = nx.complement(G)
    G_prime_c = nx.complement(G_prime)
    degree_dist_G_c = degree_distribution(G_c)
    degree_dist_G_prime_c = degree_distribution(G_prime_c)
    degree_dist_G_c, degree_dist_G_prime_c = pad_distributions(degree_dist_G_c, degree_dist_G_prime_c)
    js_divergence_complement = jensenshannon(degree_dist_G_c, degree_dist_G_prime_c)
    
    dissimilarity = (w1 * mu_diff + w2 * nnd_diff + 
                     (w3 / 2) * js_divergence + (w3 / 2) * js_divergence_complement)
    
    return dissimilarity

# Compute D-measure dissimilarity
dissimilarity = graph_dissimilarity(G1, G2)
print(f'D-measure (distance) between G1 and G2: {dissimilarity}')

# Compute D-measure dissimilarity between G2 and itself
dissimilarity_self = graph_dissimilarity(G2, G2)
print(f'D-measure (distance) between G2 and G2: {dissimilarity_self}')

D-measure (distance) between G1 and G2: 2.556388957922372
D-measure (distance) between G2 and G2: 0.0


In [28]:
# Set the behavior path and list all files
behavior_path = '/home/tico/Desktop/master_classes/project/behavior/'
behavior_files = os.listdir(behavior_path)

# Read the first file and initialize the dataframe
behavior_source = pd.read_csv(behavior_path + behavior_files[0], sep='\t')
for behavior_file in behavior_files[1:]:
    curr_behavior_source = pd.read_csv(behavior_path + behavior_file, sep='\t')
    behavior_source = pd.concat([behavior_source, curr_behavior_source], axis=0)

# Select the relevant columns
behavior_source = behavior_source[["session_id", "Group"]]

# Group by 'Group' and sample 3 from each group
bags = behavior_source.groupby('Group').apply(lambda x: x.sample(n=3)).reset_index(drop=True)

# Filter out the samples in 'bags' from 'behavior_source' using 'session_id'
remaining_behavior_source = behavior_source[~behavior_source['session_id'].isin(bags['session_id'])]

# Group by 'Group' and sample 1 from each group
remaining_samples = remaining_behavior_source.groupby('Group').apply(lambda x: x.sample(n=1)).reset_index(drop=True)

# Combine session_ids from bags and remaining_samples to form pconn_files
session_ids = pd.concat([bags['session_id'], remaining_samples['session_id']])
pconn_files = [f"{session_id}.pconn.nii" for session_id in session_ids]

# Directory setup
directory = "/home/tico/Desktop/master_classes/project/BSNIP/pconn"

# Load graphs
graphs = {}
for file_name in pconn_files:
    file_path = os.path.join(directory, file_name)
    fc_matrix = load_fc_matrix(file_path)
    graphs[file_name] = create_threshold_graph(fc_matrix, std_multiplier=2)

# Compute similarities
similarity_matrix = pd.DataFrame(index=session_ids, columns=session_ids)
for session_id1 in session_ids:
    for session_id2 in session_ids:
        print(f'processing: {session_id1}, {session_id2}')
        if session_id1 != session_id2:
            graph1 = graphs[f"{session_id1}.pconn.nii"]
            graph2 = graphs[f"{session_id2}.pconn.nii"]
            similarity_matrix.loc[session_id1, session_id2] = graph_dissimilarity(graph1, graph2)
        else:
            similarity_matrix.loc[session_id1, session_id2] = 0  # Self similarity is zero

# Sort the matrix
sorted_similarity_matrix = similarity_matrix.sort_index(axis=0).sort_index(axis=1)


  bags = behavior_source.groupby('Group').apply(lambda x: x.sample(n=3)).reset_index(drop=True)
  remaining_samples = remaining_behavior_source.groupby('Group').apply(lambda x: x.sample(n=1)).reset_index(drop=True)


In [None]:
# Plot the matrix
plt.figure(figsize=(12, 10))
sns.heatmap(sorted_similarity_matrix, cmap="YlGnBu", annot=True, fmt=".2f")
plt.title("Graph Similarity Matrix")
plt.xlabel("Sessions")
plt.ylabel("Sessions")
plt.show()