In [1]:
import pandas as pd
import pickle as pkl
from create_datasets.create_table_datasets import load_data
from ml_models.machine_learning_models import create_class_LOAD, main, baseline_model
import seaborn as sns
import matplotlib.pyplot as plt

# plt.style.use('ggplot')
# Apply seaborn's theme globally, using Helvetica as the font for all plots
# drawn later in the notebook.
sns.set(font="Helvetica")

**1. Create table datasets**

Create table datasets (with and without APOE) for the targets and genes of interest, to be used as input for the more standard (non-GNN) machine learning models.

In [2]:
# The metadata file is shared by every disease, so it is defined once
# outside the loop.
infile2 = 'data/LOAD/LOAD_metadata.tsv'
diseases = ['AD', 'ND']

for disease in diseases:
    # Per-disease input table and the location of its labeled counterpart.
    infile1 = f'data/table_datasets/{disease}_PPI_missense_LOAD.csv'
    outfile = f'data/table_datasets/{disease}_PPI_missense_LOAD_labeled.csv'

    # load_data receives the variant type, the disease and both input paths;
    # whatever it returns is written straight to CSV.
    table_dataset = load_data('missense', disease, infile1, infile2)
    table_dataset.to_csv(outfile)

In [3]:
target = 'LOAD'

# Labeled AD table dataset written by the dataset-creation cell above.
infile = 'data/table_datasets/AD_PPI_missense_LOAD_labeled.csv'
data = pd.read_csv(infile, index_col=0)
data_wclass = create_class_LOAD(data)

# The baseline uses the APOE column as its only feature; 'y' is the target.
x = data_wclass.drop(columns=['y'])['APOE']
y = data_wclass['y']

# Upper-case the sample identifiers of the feature series only (y keeps its
# original index) -- presumably to match the identifiers stored in the split
# file; TODO confirm against baseline_model.
x.index = x.index.str.upper()

# Use a context manager so the file handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code on load -- only load split files
# that were generated by this project.
with open(f'data/splits/split_{target}.pkl', 'rb') as f:
    split = pkl.load(f)

auc = baseline_model(split, x, y)
print(f'Baseline model {target}, AUC ROC:', auc)

Class distribution:
1    1014
0     585
Name: y, dtype: int64
0.66875 0.7553191489361702 0.7029702970297029 0.7282051282051281


Confusion matrix:
 [[36 23]
 [30 71]]

              precision    recall  f1-score   support

           0       0.55      0.61      0.58        59
           1       0.76      0.70      0.73       101

    accuracy                           0.67       160
   macro avg       0.65      0.66      0.65       160
weighted avg       0.68      0.67      0.67       160

Baseline model LOAD, AUC ROC: 0.6266152038932706


**2. Run no-GNN models**

Run the more standard (non-GNN) machine learning models 3 times, so that they can be compared fairly with the 3 runs of each GNN model.

In [4]:
diseases = ['AD', 'ND']

# Every (disease, repetition) pair, in the order AD 1-3 followed by ND 1-3.
runs = [(disease, i) for disease in diseases for i in range(1, 4)]

for disease, i in runs:
    results_path = f'results/GNNs_LOAD/results_{disease}_missense_LOAD_rep{i}'
    # result_ad is overwritten on every iteration; once the loop finishes it
    # holds whatever main returned for the final run (ND, repetition 3).
    result_ad = main('LOAD', disease, 'PPI', 'LOAD', 'data/table_datasets', results_path)

Class distribution:
1    1014
0     585
Name: y, dtype: int64

Classification with LOAD - Split 10.0% - GridSearchCV 5
      APOE  BDNF  EPHA1  HFE  INSR  MAPT  MTHFR   TF  HMOX1  GAPDHS  ...  \
0      0.0   0.0    0.0  0.0   0.0   0.0    1.0  1.0    0.0     0.0  ...   
1      0.0   0.0    0.0  0.0   0.0   0.0    1.0  0.0    0.0     0.0  ...   
2      0.0   0.0    0.0  0.0   0.0   1.0    1.0  1.0    0.0     0.0  ...   
3      0.0   1.0    0.0  0.0   0.0   1.0    1.0  0.0    0.0     0.0  ...   
4      0.0   0.0    0.0  0.0   0.0   1.0    1.0  1.0    0.0     0.0  ...   
...    ...   ...    ...  ...   ...   ...    ...  ...    ...     ...  ...   
1594   0.0   1.0    0.0  1.0   0.0   0.0    0.0  1.0    0.0     0.0  ...   
1595   0.0   0.0    0.0  0.0   0.0   0.0    1.0  1.0    0.0     0.0  ...   
1596   0.0   1.0    0.0  0.0   0.0   1.0    1.0  0.0    0.0     0.0  ...   
1597   0.0   0.0    0.0  0.0   0.0   1.0    1.0  0.0    0.0     0.0  ...   
1598   0.0   1.0    0.0  0.0   0.0   1.0    1

Joint results of the baseline, no-GNN, and GNN models are in [results/GNNs_comparison](results/GNNs_comparison/) for [PET](results/GNNs_comparison/2022_01_PET.csv) and [PET&DX](results/GNNs_comparison/2022_01_PETandDX.csv) targets.