In [90]:
import pandas as pd
import plotly.express as px

In [113]:
%load_ext autoreload
%autoreload 2

from train import train_model
from preprocess import preprocess
from explore import plot_pca, plot_pca_variance

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [92]:
# Name of the gender column; passed to identify_leakage as the column to check.
GENDER = 'Gender'

In [93]:
# Both CSV files are read with the 'Subject' column as the index.
behav = pd.read_csv('hcp_behavioral_data.csv', index_col='Subject')
mri = pd.read_csv('unrestricted_hcp_freesurfer.csv', index_col='Subject')

# Inner join on the index: only subjects present in both tables are kept.
# Column names that occur in both tables get a per-source suffix.
joined = mri.join(behav, how='inner', lsuffix='_mri', rsuffix='_behav')

# Target is the MRI table's gender column; both gender columns are removed
# from the feature matrix.
X = joined.drop(columns=['Gender_mri', 'Gender_behav'])
y = joined['Gender_mri']


In [94]:
# Separate the gender label from the MRI columns.
X = mri.drop(columns=['Gender'])
y = mri['Gender']

# preprocess returns the four train/test pieces; only the training features
# are handed to the PCA variance plot.
X_train, X_test, y_train, y_test = preprocess(X, y)
plot_pca_variance(X_train)

Index([], dtype='object')


In [133]:
%load_ext autoreload
%autoreload 2

from explore import plot_pca

y = mri['Gender']
X = mri.drop(['Gender'], axis=1)
X_train, X_test, y_train, y_test = preprocess(X, y)
plot_pca(X_train, y_train)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Index([], dtype='object')


In [134]:
%load_ext autoreload
%autoreload 2

from explore import plot_tsne

y = mri['Gender']
X = mri.drop(['Gender'], axis=1)
X_train, X_test, y_train, y_test = preprocess(X, y)
plot_tsne(X_train, y_train)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Index([], dtype='object')



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [96]:
def identify_leakage(df, col):
    """Plot how strongly each numeric column correlates with `col` being 'F'.

    A dummy column 'F' is built from `col` and correlated with every
    numeric/boolean column; the correlations are shown as a bar chart sorted
    from highest to lowest. Columns whose correlation is close to +/-1 are
    candidates for target leakage.

    Args:
        df (pd.DataFrame): The dataframe to check. It is not modified.
        col (str): The column to check for leakage. Rows equal to 'F' are
            coded as 1.0, all other rows as 0.0.
    """
    # Add the dummy coding on a new frame. Writing it into `df` itself would
    # leave an 'F' column in the caller's data, where any feature matrix
    # built from that frame afterwards would pick it up.
    coded = df.assign(F=(df[col] == 'F').astype(float))

    # Select numeric/boolean columns explicitly before correlating: recent
    # pandas versions raise on non-numeric columns (such as `col` itself)
    # instead of silently skipping them.
    numeric = coded.select_dtypes(include=['number', 'bool'])

    # calculate correlation matrix
    corr = numeric.corr()
    px.bar(corr['F'].sort_values(ascending=False)).show()
    


In [97]:
# Check the MRI table for columns that correlate with the gender column.
identify_leakage(df=mri, col=GENDER)





In [99]:
# Separate the gender label from the MRI columns.
X = mri.drop(columns=['Gender'])
y = mri['Gender']

X_train, X_test, y_train, y_test = preprocess(X, y)

# Call train_model once per model key on the same train/test pieces.
for model_name in ('rbf', 'xgboost', 'random forrest', 'tree'):
    train_model(X_train, X_test, y_train, y_test, model_name)

Index([], dtype='object')
Model: rbf, Training accuracy: 0.9865168539325843, Test accuracy: 0.8654708520179372
Model: xgboost, Training accuracy: 1.0, Test accuracy: 0.8878923766816144
Model: random forrest, Training accuracy: 1.0, Test accuracy: 0.8430493273542601
Model: tree, Training accuracy: 1.0, Test accuracy: 0.8340807174887892


In [141]:
# Model keys iterated over by the k-fold cells.
models = ['rbf', 'xgboost', 'random forrest', 'tree']

# One independent, initially empty list per result column; the k-fold cells
# append to these lists.
results = {column: [] for column in ('model', 'data', 'train_acc', 'test_acc')}

In [142]:
%load_ext autoreload
%autoreload 2
from kfold_train import k_fold_train_model

y = mri['Gender']
X = mri.drop(['Gender'], axis=1)

for model in models:
    train, test = k_fold_train_model(X, y, 5, model)
    results['model'].append(model)
    results['data'].append('mri')
    results['train_acc'].append(train)
    results['test_acc'].append(test)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Model: rbf, Training accuracy: 0.9865168539325843, Test accuracy: 0.8654708520179372
Model: rbf, Training accuracy: 0.9808988764044944, Test accuracy: 0.8923766816143498
Model: rbf, Training accuracy: 0.9865168539325843, Test accuracy: 0.852017937219731
Model: rbf, Training accuracy: 0.9842873176206509, Test accuracy: 0.9009009009009009
Model: rbf, Training accuracy: 0.9865319865319865, Test accuracy: 0.8468468468468469
Model: rbf, Average training accuracy: 0.9849503776844599, Average test accuracy: 0.8715226437199531


In [140]:
%load_ext autoreload
%autoreload 2
from kfold_train import k_fold_train_model

y = behav['Gender']
X = behav.drop(['Gender'], axis=1)

for model in models:
    train, test = k_fold_train_model(X, y, 5, model)
    results['model'].append(model)
    results['data'].append('behav')
    results['train_acc'].append(train)
    results['test_acc'].append(test)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Model: rbf, Training accuracy: 0.9875518672199171, Test accuracy: 0.9380165289256198
Model: rbf, Training accuracy: 0.9927461139896373, Test accuracy: 0.9336099585062241
Model: rbf, Training accuracy: 0.9896373056994818, Test accuracy: 0.8962655601659751
Model: rbf, Training accuracy: 0.9906735751295337, Test accuracy: 0.9253112033195021
Model: rbf, Training accuracy: 0.9927461139896373, Test accuracy: 0.9128630705394191
Model: rbf, Average training accuracy: 0.9906709952056415, Average test accuracy: 0.9212132642913481
Model: xgboost, Training accuracy: 0.9989626556016598, Test accuracy: 0.9752066115702479


KeyboardInterrupt: 

In [None]:
# Build the table under its own name so `results` stays the dict of lists
# that the k-fold cells append to; rebinding `results` to a DataFrame would
# break those cells if they were run again afterwards.
results_df = pd.DataFrame(results)

# Grouped bars: one group per model, one bar per data source, height = test
# accuracy.
px.bar(results_df, x='model', y='test_acc', color='data', barmode='group').show()