# Get the data
In this notebook the connection strength data will be processed into a nice dataframe to use in other notebooks

In [1]:
%matplotlib inline

In [2]:
from bs4 import BeautifulSoup
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd

import os
import re

import pickle as pkl

## Get metadata for each subject

In [3]:
with open("cm_table.html", "r") as f:
    table = f.read()

soup = BeautifulSoup(table, "html.parser")

rows = soup.find_all(class_="powerTable")[1].tbody.find_all("tr")[3:523]

In [4]:
cols = defaultdict(list)
for row in rows:
    text_list = list(row.stripped_strings)
    if len(text_list) == 13:
        text_list.insert(7, 'na') # insert so list is standard size when that column was empty on the webpage
    cols["study"].append(text_list[2])
    cols["id"].append(text_list[3].replace("_", ""))
    cols["age"].append(float(text_list[8]))
    cols["gender"].append(text_list[10])
    cols["label"].append(text_list[11])

In [5]:
metadata = pd.DataFrame(cols)

In [6]:
del cols
del soup
del rows

## Load the connectivity matrices

In [7]:
def get_conn_matrices():
    file_names = os.listdir("data/ADHD200_CC200")

    cm_file_re = r"^\S+connectivity_matrix_file\.txt$"

    conn_matrices = OrderedDict()
    for file_name in file_names:
        if re.match(cm_file_re, file_name):
            id_ = "".join(file_name.split("_")[:-3])
        
            cm = np.empty((190,190))
            with open("data/ADHD200_CC200/{}".format(file_name)) as f:
                for idx, row in enumerate(f):
                    row = row.strip().split(" ")
                    row = list(map(np.float, row))
                    cm[idx, :] = row
            
            less_than_zero_mask = cm < 0 # get indices of values less than zero
            cm[less_than_zero_mask] = 0 # set values less than zero to zero
            
            conn_matrices[id_] = cm
    return conn_matrices

In [8]:
def get_regions(abbrev=True):
    """
    Gets the names of the regions (in order of appearance in connectivity matrix). 
    All files have the same order of regions, so we only to need to get this once.
    Some region names are repeated because there are multiple points within that region,
        so numbers are appended to the region names to distinguish them.
    
    returns a list of strings
    """
    regions_path = "data/ADHD200_CC200/KKI_1018959_region_names_abbrev_file.txt" if abbrev else "data/ADHD200_CC200/KKI_1018959_region_names_full_file.txt"
    regions = []
    with open(regions_path, "r") as f:
        regions = [region.strip().replace(" ", "_") for region in f]
    names = defaultdict(int)
    distinct_region_names = []
    for region in regions:
        distinct_region_names.append(region+"_"+str(names[region]))
        names[region] += 1
    return distinct_region_names

In [9]:
region_names = get_regions()

conn_matrices = get_conn_matrices()

## data prep

In [10]:
def flatten_conn_matrices(conn_matrices_dict, region_names):
    """
    Flatten a cm dictionary (mapping subjects to connectivity matrices), such that each unique value in the
        connectivity matrix is a column feature in a row.
    Returns: 1) a numpy array where each row represents a subject with each column a feature;
             2) a list of the subject ids in the order they appear in the feature array;
             3) a list of the feature names in the order they appear in the feature array.
    The subjects list holds the row labels, feature_names list holds column labels.
    """
    subjects = list(conn_matrices_dict.keys())
    num_rows = len(subjects)
    features = np.empty((num_rows, 17955))
    
    # adjacency matrices have duplicate values, only need values from half of the matrix (and don't need diagonal)
    # np.tril_indices() returns indices of unique values
    row_idxs, col_idxs = np.tril_indices(190, k=-1)
    for idx, subject in enumerate(subjects):
        cm = conn_matrices_dict[subject]
        row = np.array([cm[row_idx, col_idx] for row_idx, col_idx in zip(row_idxs, col_idxs)])
        features[idx, :] = row
    
    feature_names = [region_names[row_idx]+"_to_"+region_names[col_idx] 
                     for row_idx, col_idx in zip(row_idxs, col_idxs)]
    
    return features, subjects, feature_names

In [11]:
features, subjects, feature_names = flatten_conn_matrices(conn_matrices, region_names)

In [12]:
# free up some memory
del conn_matrices

In [13]:
def sort_metadata(metadata, subjects):
    """
    Sorts a metadata dataframe so that the order is the same as the order of subjects in the subjects list.
    :arg metadata: dataframe with ADHD200 metadata
    :arg subjects: a list of subjects of specific order
    """
    metadata_ids = metadata["id"].values
    subjects_order_in_metadata = [np.where(metadata_ids==subject)[0][0] for subject in subjects]
    metadata_subject_sort = metadata.iloc[subjects_order_in_metadata, :]
    return metadata_subject_sort

In [14]:
# sort the metadata so that the order is the same as in the feature matrix
metadata_sorted = sort_metadata(metadata, subjects).reset_index()

In [15]:
del metadata

In [16]:
# add one hot vectors for each of the ADHD labels
adhd = [0 if label == "Typically Developing" else 1 for label in metadata_sorted["label"]]
metadata_sorted = metadata_sorted.assign(adhd=adhd)
metadata_sorted.drop(columns="label", inplace=True)

In [17]:
# check to make sure the metadata and feature table are in the same order (subjects is the order of features)
# list(zip(list(metadata_sorted["id"]), subjects))

[('NYU1567356', 'NYU1567356'),
 ('Peking12106109', 'Peking12106109'),
 ('Peking23562883', 'Peking23562883'),
 ('Peking11139030', 'Peking11139030'),
 ('NYU0010080', 'NYU0010080'),
 ('Peking24221029', 'Peking24221029'),
 ('NeuroIMAGE7339173', 'NeuroIMAGE7339173'),
 ('NYU0010095', 'NYU0010095'),
 ('NYU4079254', 'NYU4079254'),
 ('KKI2768273', 'KKI2768273'),
 ('Pittsburgh0016069', 'Pittsburgh0016069'),
 ('Pittsburgh0016067', 'Pittsburgh0016067'),
 ('Peking14053836', 'Peking14053836'),
 ('Peking23856956', 'Peking23856956'),
 ('KKI2641332', 'KKI2641332'),
 ('NYU0010029', 'NYU0010029'),
 ('Peking23248920', 'Peking23248920'),
 ('Peking21643780', 'Peking21643780'),
 ('Peking22207418', 'Peking22207418'),
 ('NeuroIMAGE2756846', 'NeuroIMAGE2756846'),
 ('NYU0010032', 'NYU0010032'),
 ('NYU3845761', 'NYU3845761'),
 ('Pittsburgh0016056', 'Pittsburgh0016056'),
 ('Pittsburgh0016058', 'Pittsburgh0016058'),
 ('KKI8337695', 'KKI8337695'),
 ('NeuroIMAGE8991934', 'NeuroIMAGE8991934'),
 ('Peking13390312', 'Pek

In [18]:
connection_features = pd.DataFrame(features, columns=feature_names, index=subjects)

In [19]:
connection_features["age"] = metadata_sorted["age"].values
connection_features["gender"] = [1 if g == "Male" else 0 for g in metadata_sorted["gender"].values]
connection_features["adhd"] = metadata_sorted["adhd"].values

In [20]:
# sort so adhd, age, gender columns are first
connection_features = connection_features[["adhd", "age", "gender"] + list(connection_features.columns[:-3])]

In [21]:
connection_features

Unnamed: 0,adhd,age,gender,RFP_0_to_BS_0,RFP_1_to_BS_0,RFP_1_to_RFP_0,RLG_0_to_BS_0,RLG_0_to_RFP_0,RLG_0_to_RFP_1,RIC_0_to_BS_0,...,RIFGpo_0_to_LITGpd_1,RIFGpo_0_to_LLOCid_2,RIFGpo_0_to_RCI_1,RIFGpo_0_to_LLOCsd_4,RIFGpo_0_to_RLOCsd_4,RIFGpo_0_to_RTP_3,RIFGpo_0_to_RCII_0,RIFGpo_0_to_LMTGpd_1,RIFGpo_0_to_LFP_8,RIFGpo_0_to_RCGad_2
NYU1567356,0,8.91,0,0.000000,0.000000,0.000000,0.173291,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.001152,0.000000,0.000000,0.084797,0.000000,0.000000,0.000000,0.000000
Peking12106109,0,9.92,0,0.285667,0.000000,0.000000,0.708364,0.402485,0.000000,0.000000,...,0.240733,0.000000,0.217633,0.000000,0.000000,0.416794,0.000000,0.334937,0.083589,0.073926
Peking23562883,0,11.25,1,0.219172,0.232962,0.034964,0.012224,0.153660,0.000000,0.287462,...,0.173660,0.000000,0.000000,0.022333,0.000000,0.000000,0.000000,0.267343,0.000000,0.000000
Peking11139030,0,11.33,0,0.401079,0.116047,0.007259,0.000000,0.261506,0.126393,0.429395,...,0.000000,0.000000,0.326461,0.000000,0.000000,0.437491,0.000000,0.000000,0.000000,0.000000
NYU0010080,0,17.86,0,0.000000,0.000000,0.184725,0.259696,0.000000,0.000000,0.397702,...,0.234918,0.000000,0.124058,0.000000,0.000000,0.130253,0.000000,0.020319,0.000000,0.065947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NYU0010112,0,14.79,1,0.423015,0.000000,0.000000,0.000000,0.000000,0.459780,0.000000,...,0.138709,0.160208,0.000000,0.000000,0.062983,0.165115,0.000000,0.000000,0.000000,0.000000
NYU1000804,0,7.29,1,0.000000,0.076278,0.000000,0.407724,0.000000,0.627842,0.283502,...,0.504621,0.000000,0.000000,0.000000,0.212321,0.000000,0.000000,0.000000,0.000000,0.000000
KKI8083695,0,8.62,1,0.153625,0.000000,0.085031,0.000000,0.223546,0.075935,0.207574,...,0.074787,0.000000,0.000000,0.000000,0.288287,0.000000,0.000000,0.000000,0.000000,0.313964
KKI3884955,0,11.84,1,0.615978,0.000000,0.000000,0.000000,0.000000,0.020968,0.000000,...,0.156700,0.000000,0.000000,0.000000,0.210738,0.280571,0.000000,0.000000,0.000000,0.000000


In [22]:
save_connection_features = True
if save_connection_features:
    with open("all_connection_features.pkl", "wb") as f:
        pkl.dump(connection_features, f)