# Neural net
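Train a small feed-forward neural network on the combined connection-strength and local network-statistic features to predict ADHD.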

In [1]:
%matplotlib inline

In [29]:
from bs4 import BeautifulSoup
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
from sklearn.metrics import roc_auc_score, plot_roc_curve, make_scorer, roc_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
#from lightgbm import LGBMClassifier

from imblearn.over_sampling import RandomOverSampler, ADASYN

import os
import re

import pickle as pkl

import torch
import torch.nn as nn
import torch.nn.functional as F

## Load connection strength data
Load the data, then use Pearson correlation p-values (`pearsonr`) to select the features that correlate most strongly with having ADHD; only those features are included for modeling.

In [4]:
# Read the pickled connection-strength feature table into X.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
with open("all_connection_features.pkl", mode="rb") as pickle_file:
    X = pkl.load(pickle_file)

In [5]:
def most_correlated_features(X, target, p_val=.01, start_feature_idx=3):
    """
    Select the feature columns whose Pearson correlation with `target` is significant.

    Parameters
    ----------
    X : pandas.DataFrame
        Table whose columns from position `start_feature_idx` onwards are the candidate features.
    target : array-like
        Values every candidate feature is correlated against; one entry per row of `X`.
    p_val : float, default .01
        Cutoff; a feature is kept when the p value of its correlation is strictly below it.
    start_feature_idx : int, default 3
        Position in `X` of the first candidate feature column.

    Returns
    -------
    pandas.DataFrame
        The first three columns of `X` followed by the selected features, ordered from the
        smallest p value to the largest.

    NOTE(review): the three leading columns are always included, even when
    `start_feature_idx` is not 3 (e.g. 0), so they may then also appear among the selected
    features. Kept as-is so existing callers get the same number of columns.
    """
    # get the p values for correlations. lower is better!
    # Columns are looked up by position so duplicated column names cannot break the lookup.
    correlation_p_vals = np.array(
        [pearsonr(X.iloc[:, position].values, target)[1]
         for position in range(start_feature_idx, X.shape[1])],
        dtype=float,
    )
    # get the order of columns which are most correlated with having adhd
    corr_p_vals_argsort = correlation_p_vals.argsort()
    # the number of features with correlation p values less than the cutoff
    num_features = np.count_nonzero(correlation_p_vals < p_val)
    print(num_features, "features remaining with p value {}".format(p_val))
    # get the indices of features with p vals less than the cutoff.
    # argsort positions are relative to the first candidate column, so shift them by
    # start_feature_idx to turn them back into column positions of X.
    most_correlated = [int(offset) + start_feature_idx
                       for offset in corr_p_vals_argsort[:num_features]]

    features_most_correlated = X.iloc[:, [0, 1, 2] + most_correlated]

    return features_most_correlated

In [6]:
# ADHD labels as a plain array, then keep only the connection features whose
# correlation p value with the label is below 0.02.
target = X["adhd"].to_numpy()
X_correlated = most_correlated_features(X, target, p_val=0.02)

823 features remaining with p value 0.02


## Add local network statistic features
Load the local node features from a pickle file. These are calculated in the `project3 network analysis - local level.ipynb` notebook.

In [7]:
# Read the pickled local node statistics into node_measures.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
with open("all_local_node_features.pkl", mode="rb") as pickle_file:
    node_measures = pkl.load(pickle_file)

In [8]:
# Labels still come from X; assumes node_measures rows are in the same order as X — TODO confirm.
target = X["adhd"].to_numpy()
# Every column is a candidate here (start_feature_idx=0); the helper still prepends the
# first three columns of node_measures to whatever it selects.
node_measures_correlated = most_correlated_features(
    node_measures,
    target,
    start_feature_idx=0,
    p_val=0.03,
)

34 features remaining with p value 0.03


In [9]:
# Put the selected node statistics next to the selected connection features (rows are aligned on the index).
# NOTE(review): this rebinds X_correlated, so running the cell twice appends the node columns twice.
X_correlated = pd.concat(objs=[X_correlated, node_measures_correlated], axis="columns")

In [10]:
# Sanity check: (rows, columns) of the combined feature table, label column included.
X_correlated.shape

(520, 863)

## Split data into training and hold out

In [40]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the correlation-based feature selection above ran on the full data set before
# this split, so the held-out rows influenced which features were kept.
X_train, X_test, y_train, y_test = train_test_split(
    X_correlated.drop(columns="adhd"),
    X_correlated.loc[:, "adhd"],
    test_size=0.2,
    random_state=2,
)

In [41]:
# Standardise the features; the scaler is fitted on the training split only and then
# applied unchanged to the hold-out split.
scale = True
if scale:
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [42]:
# Convert the feature matrices to float32 tensors. nn.Linear parameters are float32 by
# default, so float64 inputs would be rejected by the model. Going through np.asarray lets
# this cell accept a DataFrame (scale = False), an ndarray, or an already-converted tensor,
# which also makes the cell safe to re-run.
X_train = torch.as_tensor(np.asarray(X_train), dtype=torch.float32)
X_test = torch.as_tensor(np.asarray(X_test), dtype=torch.float32)

In [43]:
# One-hot encode the integer class labels: row i gets a 1 in the column given by label i.
# Fail with a clear message if the cell is re-run on labels that are already encoded.
assert np.ndim(y_train) == 1 and np.ndim(y_test) == 1, \
    "labels are already one-hot encoded; re-run the train/test split cell first"
# Use one shared class count so both matrices have the same width even when a class is
# missing from one of the splits.
n_classes = int(max(np.max(y_train), np.max(y_test))) + 1
class_rows = np.eye(n_classes)
y_train = class_rows[np.asarray(y_train, dtype=int)]
y_test = class_rows[np.asarray(y_test, dtype=int)]

## Modeling

In [44]:
# Sanity check: (training rows, features); the second number must match the network's input size.
X_train.shape

torch.Size([416, 862])

In [18]:
class Net(nn.Module):
    """
    Small feed-forward classifier: Linear -> ReLU -> Linear.

    `forward` returns raw, unnormalised class scores (logits). No softmax layer is applied,
    because nn.CrossEntropyLoss (the loss used in this notebook) applies log-softmax itself;
    a softmax inside the network would squash the scores twice and flatten the gradients.
    Use `torch.softmax(scores, dim=1)` when probabilities are needed; `argmax` over the
    logits gives the same predicted class either way.

    Parameters
    ----------
    n_features : int, default 862
        Number of input features per sample.
    n_hidden : int, default 10
        Width of the hidden layer.
    n_classes : int, default 2
        Number of output classes.
    """

    def __init__(self, n_features=862, n_hidden=10, n_classes=2):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_classes))

    def forward(self, x):
        """Return the class logits for `x`, whose last dimension holds the features."""
        return self.layers(x)

In [20]:
# Seed PyTorch before building the network so the random weight initialisation is the same
# after every kernel restart.
torch.manual_seed(2)
model = Net()

In [27]:
# Cross-entropy loss for the classification target, minimised with Adam at its default settings.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [28]:
# Peek at the training labels.
# NOTE(review): the saved output below is a pandas Series, but the one-hot cell earlier in the
# notebook rebinds y_train to a 2-D NumPy array, so this output appears to come from an
# out-of-order run — re-run the notebook top to bottom to refresh it.
y_train

NYU2497695           1
KKI2917777           0
NYU1737393           0
KKI2344857           0
Peking13976121       1
                    ..
Peking16187322       0
Peking11879542       0
NeuroIMAGE7504392    1
NYU0010081           1
Peking13233028       0
Name: adhd, Length: 416, dtype: int64

In [None]:
# Train the network for a fixed number of epochs and score it on the hold-out split after
# every epoch.
# NOTE(review): there is no separate validation set in this notebook, so the "valid" numbers
# below are computed on X_test / y_test; do not use them for model selection.
epochs = 30
batch_size = 32  # number of samples per optimiser step


def _as_class_indices(labels):
    """Return `labels` as a 1-D LongTensor of class indices; one-hot rows are collapsed with argmax."""
    label_array = np.asarray(labels)
    if label_array.ndim == 2:
        label_array = label_array.argmax(axis=1)
    return torch.as_tensor(label_array, dtype=torch.long)


# The network weights are float32, so the model and its inputs are both cast to float32.
model.float()
train_inputs = torch.as_tensor(X_train).float()
valid_inputs = torch.as_tensor(X_test).float()
# nn.CrossEntropyLoss takes integer class indices as targets.
train_targets = _as_class_indices(y_train)
valid_targets = _as_class_indices(y_test)

# Per-epoch history, kept for plotting learning curves afterwards.
mean_train_losses = []
mean_valid_losses = []
valid_acc_list = []

for epoch in range(epochs):
    model.train()

    train_losses = []
    valid_losses = []
    # Visit the training samples in a new random order every epoch.
    shuffled = torch.randperm(train_inputs.size(0))
    for start in range(0, train_inputs.size(0), batch_size):
        batch_idx = shuffled[start:start + batch_size]
        optimizer.zero_grad()

        outputs = model(train_inputs[batch_idx])
        loss = loss_fn(outputs, train_targets[batch_idx])
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    model.eval()
    with torch.no_grad():
        # The hold-out split is small enough to evaluate in a single batch.
        outputs = model(valid_inputs)
        valid_losses.append(loss_fn(outputs, valid_targets).item())

        predicted = outputs.argmax(dim=1)
        correct = (predicted == valid_targets).sum().item()
        total = valid_targets.size(0)

    mean_train_losses.append(np.mean(train_losses))
    mean_valid_losses.append(np.mean(valid_losses))

    accuracy = 100*correct/total
    valid_acc_list.append(accuracy)
    print('epoch : {}, train loss : {:.4f}, valid loss : {:.4f}, valid acc : {:.2f}%'
          .format(epoch+1, np.mean(train_losses), np.mean(valid_losses), accuracy))
