## Network analysis
I have already tried using the raw correlation values between brain regions as features for predicting ADHD diagnosis. I will now try a different approach: calculating network measures for each subject's connectivity matrix. Network measures are graph summary statistics that quantify properties such as functional integration, functional segregation, centrality, and resilience. NetworkX implements many of these.

In [1]:
from collections import defaultdict, OrderedDict
import os
import sys
import re

from bs4 import BeautifulSoup

import pandas as pd

from scipy.stats import pearsonr
import numpy as np

import networkx as nx

import pickle as pkl

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler, ADASYN

In [2]:
def get_conn_matrices(data_dir="data/ADHD200_CC200"):
    """
    Loads every subject's connectivity matrix found in `data_dir`.

    A file counts as a connectivity matrix when its name ends in
    "connectivity_matrix_file.txt" (preceded by at least one non-whitespace
    character). The subject id is made by dropping the last three
    "_"-separated pieces of the file name and concatenating what is left
    without a separator, e.g.
    "KKI_1018959_connectivity_matrix_file.txt" -> "KKI1018959".

    data_dir: directory to scan (defaults to the ADHD200 CC200 data folder)

    returns an OrderedDict of subject id -> 2-D float numpy array,
        ordered by file name
    """
    cm_file_re = re.compile(r"^\S+connectivity_matrix_file\.txt$")

    conn_matrices = OrderedDict()
    # os.listdir returns entries in arbitrary order; sorting makes the subject
    # order (and therefore the row order of anything derived from it)
    # reproducible across runs and machines.
    for file_name in sorted(os.listdir(data_dir)):
        if not cm_file_re.match(file_name):
            continue

        id_ = "".join(file_name.split("_")[:-3])

        # np.loadtxt parses whitespace-separated floats and sizes the array
        # from the file itself. This replaces the np.float alias (removed in
        # NumPy 1.24) and the pre-allocated 190x190 np.empty buffer, which
        # kept uninitialised values whenever a file had fewer rows.
        # ndmin=2 keeps a single-row file two-dimensional.
        conn_matrices[id_] = np.loadtxt(os.path.join(data_dir, file_name), ndmin=2)
    return conn_matrices

In [3]:
def get_regions():
    """
    Reads the region names, in the order they appear in the connectivity matrix.

    The names are taken from a single subject's file because every file lists
    the regions in the same order. Spaces inside a name are replaced with
    underscores. A region name can occur more than once (several points lie
    within the same region), so every name gets a running occurrence number
    appended: the first occurrence ends in "_0", the second in "_1", and so on.

    returns a list of strings
    """
    regions_path = "data/ADHD200_CC200/KKI_1018959_region_names_abbrev_file.txt"
    with open(regions_path, "r") as handle:
        raw_names = [line.strip().replace(" ", "_") for line in handle]

    # how many times each name has been seen so far
    seen_counts = {}
    labelled = []
    for name in raw_names:
        occurrence = seen_counts.get(name, 0)
        labelled.append("{}_{}".format(name, occurrence))
        seen_counts[name] = occurrence + 1
    return labelled

In [4]:
# region labels, in connectivity-matrix order (see get_regions)
region_names = get_regions()

In [5]:
# load each subject's connectivity matrix, keyed by subject id (see get_conn_matrices)
conn_matrices = get_conn_matrices()

## Calculate graph statistics

In [6]:
# Switch to True to reuse measures that an earlier run calculated and pickled,
# instead of starting from nothing.
load_measures = False
if load_measures:
    with open("conn_mat_measures_local.pkl", "rb") as cached_measures:
        measures = pkl.load(cached_measures)

In [7]:
def binarize(g, thresh):
    """
    Convert a weighted adjacency matrix into an unweighted (0/1) one.

    Entries strictly greater than `thresh` become 1; all other entries
    (including those exactly equal to `thresh`) become 0.
    """
    above_threshold = g > thresh
    return above_threshold.astype(int)

In [8]:
# binarize each matrix: keep an edge wherever the correlation exceeds .2
subjects = list(conn_matrices.keys())
binarized = OrderedDict()
# iterate over items() directly; the enumerate index was never used and the
# matrix does not need a second lookup by key
for subject, cm in conn_matrices.items():
    binarized[subject] = binarize(cm, .2)

In [11]:
def measure_graphs_local(binarized):
    """
    Calculate a fixed set of node-level graph measures for every graph and return them in a dict of lists.
    measures: degree centrality, closeness centrality, and nodal clustering coefficient

    binarized: dict whose values are the binarized adjacency matrices, one per subject

    returns a dict keyed by "degree_centrality", "closeness_centrality" and "clustering_coefficient";
        each value is a list with one entry per graph (in the iteration order of `binarized`),
        and each entry is the per-node result returned by the corresponding NetworkX function
    """
    # NOTE(review): the matrix diagonal is passed to nx.Graph unchanged; if the
    # diagonal entries survive binarization they become self-loops - confirm
    # that this is intended for degree centrality.
    degree = []
    closeness = []
    clustering = []

    print("calculating degree centralities")
    for adjacency in binarized.values():
        degree.append(nx.degree_centrality(nx.Graph(adjacency)))

    print("calculating closeness centralities")
    for adjacency in binarized.values():
        closeness.append(nx.closeness_centrality(nx.Graph(adjacency)))

    print("calculating clustering coefficients")
    for adjacency in binarized.values():
        clustering.append(nx.clustering(nx.Graph(adjacency)))

    measures = {}
    measures["degree_centrality"] = degree
    measures["closeness_centrality"] = closeness
    measures["clustering_coefficient"] = clustering
    return measures

In [12]:
# get measures for each of the connectivity matrices
# (degree centrality, closeness centrality and clustering coefficient per node).
# NOTE: this assignment is unconditional, so it replaces any `measures` that
# was loaded from the pickle when load_measures is True.
measures = measure_graphs_local(binarized)

calculating degree centralities
calculating closeness centralities
calculating clustering coefficients


In [13]:
# Reshape the measurements into table form (subjects x features): one column
# per (node, measure) pair, with one value appended per subject.
features = defaultdict(list)
for measure_name, per_subject in measures.items():  # degree centrality, closeness centrality, clustering
    for node_values in per_subject:  # one dict of node -> value for each subject
        for node_idx, value in node_values.items():
            column = "node_{}_{}".format(node_idx, measure_name)
            features[column].append(value)

In [14]:
# One row per subject, one column per "node_<idx>_<measure>" feature.
# Rows are labelled with the subject ids from conn_matrices, so this assumes
# the per-subject values in `features` are in the same order as conn_matrices.
local_node_features = pd.DataFrame(features, index=list(conn_matrices.keys()))

In [16]:
# Switch to False to skip writing the feature table to disk.
save_local_node_features = True
if save_local_node_features:
    # pickle the whole DataFrame so that it can be reloaded as-is later
    with open("all_local_node_features.pkl", "wb") as out_file:
        pkl.dump(local_node_features, out_file)