# Sum connection strengths
Following the steps of [this](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5526681/) paper, I tried generating just two features from all of the connection strengths:
* sum of positive connections
* sum of negative connections

### Outcome:
This method was unsuccessful, ROC AUC and accuracy scores decreased significantly. This method is not used in my final modeling.

In [18]:
from collections import defaultdict, OrderedDict
import os
import re

from bs4 import BeautifulSoup

import pandas as pd

from scipy.stats import pearsonr
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_conn_matrices():
    file_names = os.listdir("data/ADHD200_CC200")

    cm_file_re = r"^\S+connectivity_matrix_file\.txt$"

    conn_matrices = OrderedDict()
    for file_name in file_names:
        if re.match(cm_file_re, file_name):
            id_ = "".join(file_name.split("_")[:-3])
        
            cm = np.empty((190,190))
            with open("data/ADHD200_CC200/{}".format(file_name)) as f:
                for idx, row in enumerate(f):
                    row = row.strip().split(" ")
                    row = list(map(np.float, row))
                    cm[idx, :] = row
        
            conn_matrices[id_] = cm
    return conn_matrices

In [3]:
def get_regions():
    """
    Gets the names of the regions (in order of appearance in connectivity matrix). 
    All files have the same order of regions, so we only to need to get this once.
    Some region names are repeated because there are multiple points within that region,
        so numbers are appended to the region names to distinguish them.
    
    returns a list of strings
    """
    regions_path = "data/ADHD200_CC200/KKI_1018959_region_names_full_file.txt"
    regions = []
    with open(regions_path, "r") as f:
        regions = [region.strip().replace(" ", "_") for region in f]
    names = defaultdict(int)
    distinct_region_names = []
    for region in regions:
        distinct_region_names.append(region+"_"+str(names[region]))
        names[region] += 1
    return distinct_region_names

In [4]:
region_names = get_regions()
conn_matrices = get_conn_matrices()

In [5]:
## Get subject data
with open("cm_table.html", "r") as f:
    table = f.read()

soup = BeautifulSoup(table, "html.parser")

rows = soup.find_all(class_="powerTable")[1].tbody.find_all("tr")[3:523]

cols = defaultdict(list)
for row in rows:
    text_list = list(row.stripped_strings)
    if len(text_list) == 13:
        text_list.insert(7, 'na') # insert so list is standard size when that column was empty on the webpage
    cols["study"].append(text_list[2])
    cols["id"].append(text_list[3].replace("_", ""))
    cols["age"].append(float(text_list[8]))
    cols["gender"].append(text_list[10])
    cols["label"].append(text_list[11])

subject_data = pd.DataFrame(cols)

In [6]:
def sort_subject_data(subject_data, subject_order):
    """
    Sorts a dataframe by the order given
    :arg subject_data: dataframe with ADHD200 subject data
    :arg subject_order: a list of subjects in specific order
    """
    subject_data_ids = subject_data["id"].values
    subjects_order_in_subject_data = [np.where(subject_data_ids==subject)[0][0] for subject in subject_order]
    subject_data_sort = subject_data.iloc[subjects_order_in_subject_data, :]
    return subject_data_sort

In [7]:
subject_order = list(conn_matrices.keys())
subject_data_sorted = sort_subject_data(subject_data, subject_order)
adhd = [0 if label == "Typically Developing" else 1 for label in subject_data_sorted["label"]]
subject_data_sorted = subject_data_sorted.assign(adhd=adhd).drop(columns="label")

In [8]:
def flatten_conn_matrices(conn_matrices_dict, region_names):
    """
    Flatten a cm dictionary (mapping subjects to connectivity matrices), such that each unique value in the
        connectivity matrix is a column feature in a row.
    Returns: 1) a numpy array where each row represents a subject with each column a feature;
             2) a list of the subject ids in the order they appear in the feature array;
             3) a list of the feature names in the order they appear in the feature array.
    The subjects list holds the row labels, feature_names list holds column labels.
    """
    subjects = list(conn_matrices_dict.keys())
    num_rows = len(subjects)
    features = np.empty((num_rows, 17955))
    
    # adjacency matrices have duplicate values, only need values from half of the matrix (and don't need diagonal)
    # np.tril_indices() returns indices of unique values
    row_idxs, col_idxs = np.tril_indices(190, k=-1)
    for idx, subject in enumerate(subjects):
        cm = conn_matrices_dict[subject]
        row = np.array([cm[row_idx, col_idx] for row_idx, col_idx in zip(row_idxs, col_idxs)])
        features[idx, :] = row
    
    feature_names = [region_names[row_idx]+"_to_"+region_names[col_idx] 
                     for row_idx, col_idx in zip(row_idxs, col_idxs)]
    
    return features, subjects, feature_names

In [9]:
features, subjects, feature_names = flatten_conn_matrices(conn_matrices, region_names)

In [10]:
def most_correlated_features(features, metadata, feature_names, p_val=.01):
    """
    returns a DataFrame with a subset of the features which have a correlation p value less than the specified cutoff
    :arg features: numpy feature matrix, sorted in the same order as the metadata.
    :arg target: DataFrame with target and ids, sorted in the same order as the feature matrix.
    :arg feature_names: the names of the features in the feature matrix, same order.
    :arg p_val: the maximum p value for a feature to be included.
    """
    # get the p values for correlations. lower is better!
    target=metadata["adhd"] 
    correlation_p_vals = np.array([pearsonr(features[:,col], target)[1] for col in range(features.shape[1])])
    # get the order of columns which are most correlated with having adhd
    corr_p_vals_argsort = correlation_p_vals.argsort()
    # the number of features with correlation p values less than the cutoff
    num_features = np.count_nonzero(correlation_p_vals < p_val)
    # get the indices of features of features with p vals less than the cutoff
    most_correlated = corr_p_vals_argsort[:num_features]
    
    features_most_correlated = features[:, most_correlated]
    feature_names_most_correlated = [feature_names[idx] for idx in most_correlated]
    
    # make features dataframe with the smaller features
    X = pd.DataFrame(features_most_correlated, columns=feature_names_most_correlated)
    X = X.assign(adhd=target.values)
    X = X.assign(id=metadata["id"].values)
    X = X.assign(gender=metadata["gender"].values)
    X = X.assign(gender=pd.get_dummies(X["gender"], drop_first=True)["Male"])
    X = X.assign(age=metadata["age"].values)
    cols = list(X.columns)
    col_order = [cols[-3]] + [cols[-4]] + [cols[-1]] + [cols[-2]] + cols[:-4]
    X = X[col_order]
    
    return X

In [11]:
data = most_correlated_features(features, subject_data_sorted, feature_names, p_val=.05)

## Sum connection values:
Two features will be created from all of the significant connections: a sum of the positive values, and a sum of the negative values. This method is recommended in https://www.nature.com/articles/nprot.2016.178

In [12]:
data.iloc[0,:][4:]

Right_Lateral_Occipital_Cortex_inferior_division_2_to_Cerebellum_Vermis_VI_0        -0.228602
Right_Occipital_Pole_2_to_Left_Cerebellum_Crus_II_0                                  0.185352
Left_Cingulate_Gyrus_posterior_division_0_to_Right_Frontal_Pole_0                    0.202713
Left_Lateral_Occipital_Cortex_superior_division_2_to_Right_Frontal_Pole_0           -0.128154
Left_Frontal_Pole_0_to_Right_Frontal_Pole_0                                         0.0335659
                                                                                      ...    
Left_Thalamus_0_to_Left_Cerebellum_VIIb_0                                            0.173973
Right_Frontal_Pole_2_to_Left_Frontal_Orbital_Cortex_0                               -0.216298
Left_Middle_Temporal_Gyrus_posterior_division_1_to_Left_Frontal_Orbital_Cortex_0     0.172591
Left_Lateral_Occipital_Cortex_superior_division_2_to_Left_Cerebellum_Crus_II_1       0.323969
Left_Frontal_Pole_7_to_Right_Supramarginal_Gyrus_posterior_d

In [13]:
data = data.assign(pos_sum=data.apply(lambda row: np.sum([val for val in row.values[4:] if val > 0]), axis=1))
data = data.assign(neg_sum=data.apply(lambda row: np.sum([val for val in row.values[4:] if val < 0]), axis=1))

In [14]:
X = data[["pos_sum", "neg_sum"]].values
y = data["adhd"].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=2)

In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
roc_auc = roc_auc_score(y_test, preds)

print("train acc = {:.3f}".format(model.score(X_train, y_train)))
print("test acc = {:.3f}".format(model.score(X_test, y_test)))
print("ROC AUC = {:.3f}".format(roc_auc))
print(confusion_matrix(y_test, preds))

train acc = 0.627
test acc = 0.615
ROC AUC = 0.496
[[62  4]
 [36  2]]


In [19]:
model = GaussianNB()
model.fit(X_train, y_train)
preds = model.predict(X_test)
roc_auc = roc_auc_score(y_test, preds)

print("train acc = {:.3f}".format(model.score(X_train, y_train)))
print("test acc = {:.3f}".format(model.score(X_test, y_test)))
print("ROC AUC = {:.3f}".format(roc_auc))
print(confusion_matrix(y_test, preds))

train acc = 0.630
test acc = 0.606
ROC AUC = 0.500
[[59  7]
 [34  4]]


### Predicting just based on sum of connection values doesn't work well at all