In [1]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import os
from cdlib.algorithms import leiden
import nibabel as nib
import pandas as pd

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'infomap', 'wurlitzer', 'bayanpy'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw', 'pyclustering'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'infomap', 'wurlitzer'}


In [37]:
def read_behavior(path):
    # Reading
    behavior_path = path
    
    behavior_files = os.listdir(behavior_path)
    
    behavior_source = pd.read_csv(behavior_path+behavior_files[0], sep='\t')
    for behavior_file in behavior_files[1:]:
        curr_behavior_source = pd.read_csv(behavior_path+behavior_file, sep='\t')
        behavior_source = pd.concat([behavior_source, curr_behavior_source], axis=0)

    return behavior_source

def build_network(data, df, th_std, visualize = True):
    # Create a graph from the data
    # This example assumes 'data' is a square matrix where data[i][j] represents the connection
    # strength between region i and region j. Your data's structure may vary.
    G = nx.Graph()
    
    # Add nodes with the brain region names or indices if you don't have names
    num_regions = data.shape[0]
    G.add_nodes_from(range(num_regions))
    
    # Add edges based on connectivity data
    # Here we're simply adding an edge for every non-zero connection
    for i in range(num_regions):
        # Calculate the standard deviation of the specified row
        std_dev = df.iloc[i].std()
        threshold = th_std * std_dev
        for j in range(i+1, num_regions):  # Ensure i < j to avoid duplicating edges
            if abs(data[i, j]) > threshold:  # Assuming 0 means no connection
                G.add_edge(i, j, weight=data[i, j])
    
    if visualize:
        # You can also visualize the graph using matplotlib or similar libraries
        # This step requires matplotlib to be installed (`pip install matplotlib`)
        nx.draw(G, with_labels=True)
        plt.show()
        
    return G

def read_pconn(path):
    
    # Load the .pconn.nii file
    img = nib.load(path)
    
    # Extract the data array from the image
    # The data might need to be processed or reshaped depending on its structure
    data = img.get_fdata()

    df = pd.DataFrame(data)
    
    return data, df

In [3]:
base_path = '/Users/ahmet/Desktop/Study/second_semester/ds_project/BSNIP/pconn'

In [4]:
pconn_paths = os.listdir('/Users/ahmet/Desktop/Study/second_semester/ds_project/BSNIP/pconn')

In [5]:
session_ids = [path.split('.')[0] for path in pconn_paths]

In [6]:
behavior_path = '/Users/ahmet/Desktop/Study/second_semester/ds_project/behavior/'

In [39]:
behavior_df = read_behavior(behavior_path)

Important features found by feature selection process are ['X534', 'X484', 'X426', 'X284', 'X684']. These features corresponds to nodes 533, 483, 425, 283 and 683 because of 0 indexing. I will check if there is explanation to this nodes in the graphs.

In [7]:
important_by_selection = [533, 483, 425, 283, 683]

In [1]:
def check_hubs(G, group, std, session_id):
    
    degree_dict = dict(G.degree)
    average_degree = (2 * G.number_of_edges()) / G.number_of_nodes()
    imp_degree_dict = {key: degree_dict[key] for key in important_by_selection}
    
    hub = []
    for k, v in imp_degree_dict.items():
        if v > average_degree:
            hub.append({'session_id' : session_id, 'group' : group, 'std': std, 'avg_degree': average_degree, 'feature' : k, 'hub_flag': True})
        else:
            hub.append({'session_id' : session_id , 'group' : group, 'std': std, 'avg_degree': average_degree, 'feature' : k, 'hub_flag': False})
            
    return hub

def check_betweenness(G, group, std, th, session_id):
    
    bc_scores =  nx.betweenness_centrality(G)
    top = sorted(bc_scores, key=bc_scores.get, reverse=True)[:th]
    
    bet = []
    for feat in important_by_selection:
        if feat in top:
            bet.append({'session_id': session_id, 'group' : group, 'std': std, 'top' : th, 'feature' : feat, 'betweenness_flag': True})
        else:
            bet.append({'session_id': session_id, 'group': group, 'std': std,  'top' : th, 'feature' : feat, 'betweenness_flag': False})
            
    return bet

def check_communities(session_id, group, G, th):
    
    mapping = leiden(G).to_node_community_map()
    num_communities = len(leiden(G).communities)
    
    com = []
    for feat in important_by_selection:
        com.append({'session_id' : session_id, 'group' : group, 'std': th, 'feature' : feat, 'community': mapping[feat][0], 'num_communities': num_communities})
    
    return com

def check_own_community(df):
    
    ''''This function checks if features are in the same or different communities
    and assigns a flag feature.
    '''
    
    result = []
    for session_id, group in df.groupby('session_id'):
        for community, community_group in group.groupby('community'):
            if len(community_group) == 1:
                row = community_group.iloc[0]
                result.append((row['session_id'], row['group'], row['feature'], row['community'], True))
            else:
                for _, row in community_group.iterrows():
                    result.append((row['session_id'], row['group'], row['feature'], row['community'], False))
                    
    res_df = pd.DataFrame(result, columns=['session_id', 'group', 'feature', 'community', 'forms_own_community'])
    
    return res_df

In [None]:
community = []
hubs = []
betweenness = []
stds = [2, 3, 4.5]

count = 0
for std in stds:
    for session_id, pconn_path in zip(session_ids, pconn_paths):
        print(f'{count + 1}th Graph:')
        group = behavior_df[behavior_df['session_id'] == session_id]['Group'].values[0]
        path = os.path.join(base_path, pconn_path)
        data, df = read_pconn(path)
        G = build_network(data, df, std, visualize = False)
        try:
            hub = check_hubs(G, group, std, session_id)
            hubs.append(hub)
        except:
            print('Exception occured for hubs')
            pass
        
        try:
            bet = check_betweenness(G, group, std, 10, session_id)
            betweenness.append(bet)
        except:
            print('Exception occured for betweenness')
            pass
            
        try:
            com = check_communities(session_id, group, G, std)
            community.append(com)
        except:
            print('Exception occured for communities')
            pass
        count += 1

In [48]:
com_df = pd.DataFrame([i for com in community for i in com])
hub_df = pd.DataFrame([i for hub in hubs for i in hub])
bet_df = pd.DataFrame([i for bet in betweenness for i in bet])

In [58]:
com_2 = com_df[com_df['std'] == 2]
com_3 = com_df[com_df['std'] == 3]
com_4_5 = com_df[com_df['std'] == 4.5]

In [60]:
print('HUB RESULTS:')
print('-'*30)
display(hub_df.groupby(['std','feature']).sum('hub_flag')['hub_flag'] / 638)
print('BETWEENNESS RESULTS:')
print('-'*30)
display(bet_df.groupby(['std','feature']).sum('betweenness_flag')['betweenness_flag'] / 638)

HUB RESULTS:
------------------------------


std  feature
2.0  283        0.141066
     425        0.536050
     483        0.253918
     533        0.170846
     683        0.260188
3.0  283        0.117555
     425        0.399687
     483        0.197492
     533        0.172414
     683        0.133229
4.5  283        0.003135
     425        0.000000
     483        0.007837
     533        0.028213
     683        0.001567
Name: hub_flag, dtype: float64

BETWEENNESS RESULTS:
------------------------------


std  feature
2.0  283        0.003135
     425        0.001567
     483        0.003135
     533        0.000000
     683        0.021944
3.0  283        0.007837
     425        0.015674
     483        0.007837
     533        0.003135
     683        0.000000
4.5  283        0.000000
     425        0.000000
     483        0.000000
     533        0.000000
     683        0.000000
Name: betweenness_flag, dtype: float64

- There is no clear indicator that these 5 features appears as hubs in the graphs for any STD we picked. The results are 0 for 4.5 because most of nodes are going to be isolated in this case we can see that results also in community analysis later on. This means these 5 features didn't have any links to any other node. The results we would expect to say these are the hubs would be if they appear 80% or more time in the hubs.
- Also for betweenness, I've checked if these nodes are appears in the top 10 nodes according to their betweenness centrality scores and it's seen that their percentages are really low so this cannot be explained by betweenness centrality either.

In [61]:
print('HUB RESULTS:')
print('-'*30)
display(hub_df.groupby(['std','feature', 'group']).sum('hub_flag')['hub_flag'] / 638)
print('BETWEENNESS RESULTS:')
print('-'*30)
display(bet_df.groupby(['std','feature', 'group']).sum('betweenness_flag')['betweenness_flag'] / 638)

HUB RESULTS:
------------------------------


std  feature  group
2.0  283      BPP      0.037618
              CON      0.036050
              SADP     0.020376
              SCZP     0.047022
     425      BPP      0.128527
              CON      0.173981
              SADP     0.092476
              SCZP     0.141066
     483      BPP      0.057994
              CON      0.081505
              SADP     0.039185
              SCZP     0.075235
     533      BPP      0.032915
              CON      0.034483
              SADP     0.039185
              SCZP     0.064263
     683      BPP      0.065831
              CON      0.073668
              SADP     0.050157
              SCZP     0.070533
3.0  283      BPP      0.026646
              CON      0.040752
              SADP     0.025078
              SCZP     0.025078
     425      BPP      0.089342
              CON      0.141066
              SADP     0.073668
              SCZP     0.095611
     483      BPP      0.040752
              CON      0.070533
              SADP  

BETWEENNESS RESULTS:
------------------------------


std  feature  group
2.0  283      BPP      0.001567
              CON      0.001567
              SADP     0.000000
              SCZP     0.000000
     425      BPP      0.000000
              CON      0.001567
              SADP     0.000000
              SCZP     0.000000
     483      BPP      0.000000
              CON      0.000000
              SADP     0.000000
              SCZP     0.003135
     533      BPP      0.000000
              CON      0.000000
              SADP     0.000000
              SCZP     0.000000
     683      BPP      0.001567
              CON      0.009404
              SADP     0.006270
              SCZP     0.004702
3.0  283      BPP      0.001567
              CON      0.000000
              SADP     0.000000
              SCZP     0.006270
     425      BPP      0.001567
              CON      0.004702
              SADP     0.004702
              SCZP     0.004702
     483      BPP      0.000000
              CON      0.004702
              SADP  

When we check contriubtions from each group to the percentages we looked before, there is also no explanability found here. The results seems like pretty much random.

In [143]:
print('2 STD:')
res_2 = check_own_community(com_2)
res_2 = res_2.groupby(['feature']).sum('forms_own_community')['forms_own_community'].reset_index()
res_2['forms_own_community'] = res_2['forms_own_community'] / 638
display(res_2)

print('3 STD:')
res_3 = check_own_community(com_3)
res_3 = res_3.groupby(['feature']).sum('forms_own_community')['forms_own_community'].reset_index()
res_3['forms_own_community'] = res_3['forms_own_community'] / 638
display(res_3)

print('4.5 STD:')
res_4_5 = check_own_community(com_4_5)
res_4_5 = res_4_5.groupby(['feature']).sum('forms_own_community')['forms_own_community'].reset_index()
res_4_5['forms_own_community'] = res_4_5['forms_own_community'] / 638
display(res_4_5)

2 STD:


Unnamed: 0,feature,forms_own_community
0,283,0.326019
1,425,0.575235
2,483,0.363636
3,533,0.297806
4,683,0.299373


3 STD:


Unnamed: 0,feature,forms_own_community
0,283,0.88558
1,425,0.937304
2,483,0.873041
3,533,0.874608
4,683,0.907524


4.5 STD:


Unnamed: 0,feature,forms_own_community
0,283,0.04232
1,425,0.04232
2,483,0.04232
3,533,0.04232
4,683,0.04232


The results for community detection seen above. I basically checked if these features were in the same community or not. We would expect them to be in different communities if they provide different informations to the machine learning algorithms. These was the main motivation.

It's seen that for 2 STD the percentages again not that high. 3 STD seems perfect and what we would expect to see based on our assumptions but if we check the statistics for number of communities below, even with 3 STD most of the nodes are being isolated and forming its own community. That means because of high number of communities in average, these 5 features are in the different communities but it doesn't indicate that they form meaningful communities.
The percentages are so low and same for all feature in STD 4.5 because community detection algorithm throws an error almost everytime since most of the nodes are isolated (That means it would be TRUE for all features and it would make percentages to get closer to the 1). We normalized values by 638 that's why percentages are so small. Actually, they are pretty much close 1 but I didn't bother to fix it since it's not informative because of high number of communities.

In conclusion, if there was a result like 3 STD in 2 STD, we would conclude that these features forms its own communities. Unfortunately, any of these 3 statistics (hubs, betweenness centrality, community detection) didn't give explanation to the these 5 features.

In [157]:
print('2 STD:')
print('-'*30)
display(com_2.num_communities.describe())
print('3 STD:')
print('-'*30)
display(com_3.num_communities.describe())
print('4.5 STD:')
print('-'*30)
display(com_4_5.num_communities.describe())

2 STD:
------------------------------


count    3190.000000
mean       34.974922
std       120.974158
min         2.000000
25%         4.000000
50%         5.000000
75%         6.000000
max       698.000000
Name: num_communities, dtype: float64

3 STD:
------------------------------


count    3180.000000
mean      333.020440
std       179.551163
min        12.000000
25%       191.000000
50%       337.500000
75%       469.000000
max       707.000000
Name: num_communities, dtype: float64

4.5 STD:
------------------------------


count    135.000000
mean     707.407407
std        7.984403
min      680.000000
25%      706.000000
50%      710.000000
75%      713.000000
max      716.000000
Name: num_communities, dtype: float64