In [None]:
# Computing model baselines on real diabetes data
# Using full size dataset (4901 samples x 72819 SNPs)

# Velina Kozareva

In [1]:
import argparse
import joblib
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

import matplotlib.pyplot as plt 

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

## Modifications to Divya's model baseline code

Once this is organized in repo, can make the corresponding changes there and redo notebook more cleanly.

In [13]:
class ModelSum:
    """Handy class to store all information about each model"""
    def __init__(self, pset, simnum):
        self.pset = pset
        self.simnum = int(simnum)

        self.data_type = None
        # self.log_reg_real = None
        # self.rand_forest_real = None
        # self.log_reg_rand = None
        # self.rand_forest_rand = None
        self.rho=None
        self.pve=None
        self.maf_frac=None
        self.obs=None
        self.k=None
        self.overlap=None
        self.class_imbalance = 0.5

        self.real_interp_auc = {}
        self.rand_interp_auc = {}

        self.real_class_auc = {}
        self.rand_class_auc = {}

        self.classification_aucs = {}
        ### Return importance scores as well
        self.vscores = {}

    def populate_pset_vars(self):
        # Set up simlation run
        sp = self.pset.split('-')

        if 'prop_case' in sp:
            self.class_imbalance = float(sp[sp.index('prop_case')+1])
        if 'rho' in sp:
            self.rho = float(sp[sp.index('rho')+1])
        if 'pve' in sp:
            self.pve = float(sp[sp.index('pve')+1])
        if 'maf_frac' in sp:
            self.maf_frac = float(sp[sp.index('maf_frac')+1])
        if 'obs' in sp:
            self.obs = float(sp[sp.index('obs')+1])
        if 'k' in sp:
            self.k = float(sp[sp.index('k')+1])

        if 'ind' in sp:
            self.overlap = True
        elif 'non_overlap_degree' in sp:
            self.overlap = False

# get interpretability and class ROC for one split (then average ROCs over 5 splits)

def split_data(X, y, test_size=0.1, rand_state=5):
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=rand_state,
                                                    shuffle = True, stratify = y)
    snp_labels = list(X.columns)
    return train_x, test_x, train_y, test_y, snp_labels


def train_model(mod_type, train_x, test_x, train_y, test_y, var_labels=None, causal_vars=None, var_type='snps', rand_state=5, calc_interpretability=True):
    # Setup classifier
    implemented_list = ['log_reg', 'rand_forest']
#     if not calc_interpretability:
    interp_auc = None

    assert mod_type in implemented_list, 'check mod type: {}'.format(mod_type)
    if mod_type == 'log_reg':
        mlmod = LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced', random_state=rand_state)
        mlmod.fit(train_x, np.ravel(train_y)) 
        test_class_auc = mlmod.score(test_x, test_y) #confirm that score is AUC

        if calc_interpretability:
            if var_type == 'snps':
                coefs = np.abs(mlmod.coef_) # check absolute value
#           elif causa_vars == 'pathways':
            vscores = pd.DataFrame(zip(var_labels, np.ravel(coefs)), columns=['var', 'score']).set_index('var')
            if causal_vars is not None:
                _, interp_auc = calc_ROC(vscores, causal_vars)
            
    elif mod_type == 'rand_forest':
        mlmod = RandomForestClassifier(class_weight='balanced', random_state=rand_state)
        mlmod.fit(train_x, np.ravel(train_y)) 
        test_class_auc = mlmod.score(test_x, test_y) #confirm that score is AUC

        if calc_interpretability:
            if var_type == 'snps':
                coefs = np.abs(mlmod.feature_importances_) # check absolute value
                vscores = pd.DataFrame(zip(var_labels, np.ravel(coefs)), columns=['var', 'score']).set_index('var')
            # elif var_type == 'patways':
            if causal_vars is not None:
                _, interp_auc = calc_ROC(vscores, causal_vars)
        
    ### Return vscores as well
    return mlmod, test_class_auc, interp_auc, vscores

# right now, using snps; modify so can use either SNps or genens
def calc_ROC(df_scores, true_causal): # layer: 'snp' or 'gene', list_pos: 'list_snp' or 'list_genes'
    # from Ashley
    assert true_causal is not None, 'check true_causal list'

    npips = len(df_scores)
    all_vals = list(df_scores.index)
    Pos = true_causal
    Negs = set(all_vals) - set(Pos)

    df_res = pd.DataFrame(columns=["TPR", "FPR", "FDR", "PWR"], dtype=object) 
    df_scores.sort_values(by=['score'], ascending=False, inplace=True)
    
    for i in range(1,npips+1):
        v = df_scores[0:i].index
        z = set(all_vals) -  set(v)
#         TP = len(intersection(set(v), set(Pos)))
        TP = len(set(v) & set(Pos))
        FP = len(set(v)-set(Pos))
#         TN = len(intersection(set(z), set(Negs)))
        TN = len(set(z) & set(Negs))
        FN = len(set(z)-set(Negs))
#         print(TPs,FP,TN,FN)        
        TPR = TP/(TP+FN)
            
        FPR = FP/(FP+TN) 
        FDR = FP/(FP+TP)
        PWR = 1-(1-TPR)
        df_res.loc[len(df_res)] = [TPR, FPR, FDR, PWR]
        
    # add AUC calc here    
    auc = metrics.auc(np.array(df_res['FPR']), np.array(df_res['TPR']))
    return df_res, auc

def score_cv(model_type, X, y, true_causal=None, calc_interpretability=False, cv=5):
    mods = []
    class_aucs = []
    interp_aucs = []
    ### Keep track of vscores as well
    vscores_l = []
    
    for i in range(cv):
        train_x, test_x, train_y, test_y, snp_labels = split_data(X, y, rand_state=5+i) #check random state set
        mlmod, test_class_auc, interp_auc, vscores = train_model(model_type, train_x, test_x, train_y, test_y, snp_labels, true_causal, calc_interpretability=calc_interpretability) 

        # print(i, test_class_auc, interp_auc)
        mods.append(mlmod)
        class_aucs.append(test_class_auc)
        interp_aucs.append(interp_auc)
        vscores_l.append(vscores)
        
    return mods, class_aucs, interp_aucs, vscores_l

## Result-generating code

In [3]:
# read in diabetes data

data_pd = joblib.load('/scratch/users/velina/sparse_nn/t1d_data/pathway_level_data_snps72819/data_pd.pkl')

In [34]:
# Reformat data
X = data_pd.drop(['label1', 'label2'], axis=1)
display(X.head())
y = data_pd[['label1']]
display(y.head())

Unnamed: 0,rs6658795,rs1016209,rs760967,rs760969,rs2235550,rs6686862,rs6687289,rs6694932,rs7534396,rs7542174,...,rs3865507,rs17771961,rs3928916,rs3928917,rs7251886,rs10423171,rs11883018,rs2701,rs3795018,rs11669653
ind0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,2.0
ind1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0
ind2,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0
ind3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,1.0,2.0,0.0,0.0,1.0,0.0,2.0
ind4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


Unnamed: 0,label1
ind0,0
ind1,0
ind2,0
ind3,0
ind4,0


In [14]:
%%time

# Run data through baseline models
ms = ModelSum('diabetes', 1)
ms.populate_pset_vars()

mod_list = ['log_reg', 'rand_forest']
cX = X
cY = y

for model in mod_list:
    mmod, mcauc, miauc, vscores = score_cv(model, cX, cY, true_causal=None, calc_interpretability=True)
    ms.real_class_auc[model]=mcauc
    ms.real_interp_auc[model]=miauc
    ms.vscores[model] = vscores

CPU times: user 51min 41s, sys: 34min 19s, total: 1h 26min
Wall time: 45min 35s


In [28]:
! mkdir diabetes_baselines/rand_forest
! mkdir diabetes_baselines/log_reg

In [31]:
# save these with appropriate names for next step (GSEA in R)
for model in mod_list:
    prefix = 'rf' if model == 'rand_forest' else 'lr'
    for seed in range(5):
        # Rename index for R function
        ms.vscores[model][seed].index.names = ['X']
        ms.vscores[model][seed].columns = ['coefs']
        ms.vscores[model][seed].to_csv(f'diabetes_baselines/{model}/{prefix}_s{seed}_coefs.csv')

In [33]:
# What are the classification AUCs here?
# Quite a bit lower than expected...
ms.real_class_auc

{'log_reg': [0.6191446028513238,
  0.6232179226069247,
  0.6272912423625254,
  0.6130346232179226,
  0.6374745417515275],
 'rand_forest': [0.604887983706721,
  0.6089613034623218,
  0.615071283095723,
  0.6089613034623218,
  0.6171079429735234]}