In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
from scipy import stats
import math
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.decomposition import PCA

In [88]:
def get_top_systems(drug_list, nest_map, all_df):
    """Build one ranked subsystem table per drug.

    Parameters
    ----------
    drug_list : iterable
        Values to match against the ``Drug`` column of ``all_df``.
    nest_map : mapping
        Maps each ``Term`` value to the text stored in the ``Name`` column.
        A term missing from the mapping raises ``KeyError``.
    all_df : pandas.DataFrame
        Must contain the columns ``Drug``, ``Term``, ``P_rho``, ``C_rho``
        and ``RLIPP``.

    Returns
    -------
    dict
        ``{drug: DataFrame}`` where each frame is sorted by ``P_rho``
        (descending), has a fresh 0..n-1 index, a 1-based ``Rank`` column and
        the columns ``Rank, Term, Name, P_rho, C_rho, RLIPP`` in that order.
    """
    column_order = ['Rank', 'Term', 'Name', 'P_rho', 'C_rho', 'RLIPP']

    ranked_by_drug = {}
    for drug in drug_list:
        ranked = all_df[all_df['Drug'] == drug].sort_values(
            by='P_rho', ascending=False, ignore_index=True
        )
        # The index was reset by the sort, so rank is simply position + 1.
        ranked['Rank'] = range(1, len(ranked) + 1)
        # Plain dict lookup (not Series.map) so unknown terms still raise.
        ranked['Name'] = pd.Series(
            [nest_map[term_id] for term_id in ranked['Term']],
            index=ranked.index,
            dtype=object,
        )
        ranked_by_drug[drug] = ranked[column_order]
    return ranked_by_drug

In [None]:
def bh(p_vals, alpha=0.05):
    """Return Benjamini-Hochberg (FDR) adjusted p-values.

    Parameters
    ----------
    p_vals : array-like of float
        Raw p-values; forwarded unchanged to ``multipletests``.
    alpha : float, optional
        Error rate forwarded to ``multipletests``. It defaults to 0.05 so that
        call sites passing only ``p_vals`` no longer fail with a ``TypeError``.
        Only the adjusted p-values are returned here; the rejection mask that
        ``multipletests`` also produces is discarded.

    Returns
    -------
    numpy.ndarray
        The second element of the ``multipletests`` result, i.e. the
        corrected p-values, in the same order as ``p_vals``.
    """
    res = multipletests(p_vals, alpha=alpha, method='fdr_bh')
    return res[1]

# Short display name for this correction method.
bh.__name__ = 'BH'

In [None]:
def bonferroni(p_vals):
    """Return Bonferroni-adjusted p-values for ``p_vals``.

    ``multipletests`` is run with ``alpha=0.05``; only its second return
    value (the corrected p-values) is handed back to the caller.
    """
    _, corrected_pvals, *_ = multipletests(p_vals, alpha=0.05, method='bonferroni')
    return corrected_pvals

# Short display name for this correction method.
bonferroni.__name__ = 'Bonferroni'

In [None]:
def system_significance(ont, dataset, drug, zscore_method, alpha=0.05):
    """Score each term's observed ``P_rho`` against replicate models.

    For every replicate ``i`` in 1..100 the five ``rlipp.out`` files of models
    ``i, i+100, ..., i+400`` under ``../models/mbb/`` are read and their
    ``P_rho`` values averaged per ``Term``, giving one reference value per
    term and replicate. The observed values come from
    ``../models/rlipp/<drug>.txt``.

    Parameters
    ----------
    ont, dataset, drug, zscore_method : str
        Concatenated (with ``_``) into the replicate model directory names;
        ``drug`` also names the observed-score file.
    alpha : float, optional
        Level forwarded to ``bh``. Added with a default because ``bh``
        requires it and the previous one-argument calls raised ``TypeError``.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``rlipp_dict`` maps each term to its list of replicate-averaged
        ``P_rho`` values. The frame is the observed-score table with two added
        columns, both BH-adjusted:

        * ``t_test``    - one-sided one-sample t-test (``alternative='less'``)
          of the replicate values against the observed ``P_rho``; a NaN
          p-value is replaced by 1.0.
        * ``Perm_test`` - fraction of replicate values that are >= the
          observed ``P_rho``.

    Raises
    ------
    KeyError
        If a term in the observed table never occurs in the replicate files.
    """
    rlipp_dict = dict()
    for i in range(1, 101):
        rlipp_file_list = []
        for j in range(1, 6):
            nf = i + 100*(j-1)
            modeldir = '../models/mbb/model_' + ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_' + str(nf)
            rlipp_df = pd.read_csv(modeldir + '/rlipp.out', sep='\t')[['Term', 'P_rho']]
            rlipp_file_list.append(rlipp_df)
        agg_df = pd.concat(rlipp_file_list, ignore_index=True)
        agg_rlipp_df = agg_df.groupby(['Term']).mean().reset_index()
        for term, mean_prho in zip(agg_rlipp_df['Term'], agg_rlipp_df['P_rho']):
            rlipp_dict.setdefault(term, []).append(mean_prho)

    main_rlipp_df = pd.read_csv('../models/rlipp/' + drug + '.txt', sep='\t')

    t_pvals = []
    perm_pvals = []
    for term, true_prho in zip(main_rlipp_df['Term'], main_rlipp_df['P_rho']):
        prho_list = rlipp_dict[term]

        result = stats.ttest_1samp(prho_list, true_prho, alternative='less', nan_policy='raise')
        pval = result.pvalue
        if math.isnan(pval):
            pval = 1.0
        t_pvals.append(pval)

        # Count directly instead of relying on a loop index after `break`:
        # the old form stopped at n-1 when no replicate fell below the
        # observed value, reporting (n-1)/100 where 1.0 was meant.
        n_total = len(prho_list)
        n_at_least = sum(1 for prho in prho_list if true_prho <= prho)
        perm_pvals.append(n_at_least / n_total if n_total else 1.0)

    main_rlipp_df['t_test'] = bh(np.asarray(t_pvals, dtype=float), alpha)
    main_rlipp_df['Perm_test'] = bh(np.asarray(perm_pvals, dtype=float), alpha)

    return rlipp_dict, main_rlipp_df

In [None]:
def system_significance_2(ont, dataset, drug, zscore_method, alpha):
    """Two-sample t-test of replicate ``P_rho`` values against the main models.

    Reads ``rlipp.out`` from 500 directories under ``../models/mbb/`` (model
    numbers 1..500) and from 5 directories ``../models/model_..._<1..5>``,
    collecting every ``P_rho`` per ``Term`` without averaging. For each term
    listed in ``../models/rlipp/<drug>.txt`` it runs
    ``stats.ttest_ind(mbb_values, main_values, alternative='less')``, replaces
    NaN p-values by 1.0 and BH-adjusts the column via ``bh(..., alpha)``.

    Returns
    -------
    (dict, pandas.DataFrame)
        The term -> list-of-``P_rho`` mapping gathered from the ``mbb``
        models, and the per-drug table with the adjusted ``t_test`` column.
    """
    run_tag = ont + '_' + dataset + '_' + drug + '_' + zscore_method + '_'

    def _collect_prho(modeldir, store):
        # Append every (Term, P_rho) pair of one rlipp.out file to `store`.
        scores = pd.read_csv(modeldir + '/rlipp.out', sep='\t')[['Term', 'P_rho']]
        for term_id, prho in zip(scores['Term'], scores['P_rho']):
            store.setdefault(term_id, []).append(prho)

    mbb_rlipp_dict = dict()
    vnn_rlipp_dict = dict()
    for fold in range(1, 6):
        first_model = 100 * (fold - 1) + 1
        for model_no in range(first_model, first_model + 100):
            _collect_prho('../models/mbb/model_' + run_tag + str(model_no), mbb_rlipp_dict)
        _collect_prho('../models/model_' + run_tag + str(fold), vnn_rlipp_dict)

    main_rlipp_df = pd.read_csv('../models/rlipp/' + drug + '.txt', sep='\t')
    main_rlipp_df['t_test'] = 0.0

    for row_label, term_id in main_rlipp_df['Term'].items():
        pval = stats.ttest_ind(
            mbb_rlipp_dict[term_id], vnn_rlipp_dict[term_id], alternative='less'
        ).pvalue
        main_rlipp_df.at[row_label, 't_test'] = 1.0 if math.isnan(pval) else pval

    main_rlipp_df['t_test'] = bh(main_rlipp_df['t_test'], alpha)

    return mbb_rlipp_dict, main_rlipp_df

In [None]:
def get_shuffled_rlipp():
    """Collect per-term mean ``P_rho`` values from the shuffled-input models.

    For each ``i`` in 1..1000 the files
    ``../models/shuffled_input/<j>_<i>/rlipp.out`` (``j`` in 1..5) are read,
    their ``P_rho`` values are averaged per ``Term``, and the mean is appended
    to that term's list.

    Returns
    -------
    dict
        ``{term: [mean P_rho for i = 1, 2, ...]}``.
    """
    rlipp_dict = {}
    for run_no in range(1, 1001):
        per_fold = [
            pd.read_csv(
                '../models/shuffled_input/' + str(fold) + '_' + str(run_no) + '/rlipp.out',
                sep='\t',
            )[['Term', 'P_rho']]
            for fold in range(1, 6)
        ]
        fold_mean = (
            pd.concat(per_fold, ignore_index=True)
            .groupby(['Term'])
            .mean()
            .reset_index()
        )
        for term_id, mean_prho in zip(fold_mean['Term'], fold_mean['P_rho']):
            rlipp_dict.setdefault(term_id, []).append(mean_prho)

    return rlipp_dict

In [None]:
def system_significance_shuffled_input(rlipp_dict, drug, alpha):
    """Score each term of a drug against a shuffled-input ``P_rho`` distribution.

    Parameters
    ----------
    rlipp_dict : dict
        ``{term: list of P_rho values}``, e.g. the result of
        ``get_shuffled_rlipp``.
    drug : str
        Selects the table ``../models/rlipp/<drug>.txt``.
    alpha : float
        Level forwarded to ``bh``.

    Returns
    -------
    pandas.DataFrame
        The per-drug table with two added, BH-adjusted columns:

        * ``t_test``    - one-sided one-sample t-test (``alternative='less'``)
          of the term's shuffled values against the reference value; NaN
          p-values are replaced by 1.0, matching the sibling
          ``system_significance*`` functions.
        * ``Perm_test`` - fraction of shuffled values ``v`` for which
          ``reference <= v + 1/1000``.

    Raises
    ------
    KeyError
        If a term of the table is missing from ``rlipp_dict``.
    """
    main_rlipp_df = pd.read_csv('../models/rlipp/' + drug + '.txt', sep='\t')

    # NOTE(review): the reference is the constant 0.5, not the term's own
    # observed P_rho (the original had `row['P_rho']` commented out). Kept
    # as-is; confirm this is intended.
    true_prho = 0.5
    # NOTE(review): tolerance kept verbatim from the original comparison;
    # presumably tied to the 1000 shuffles - confirm.
    tolerance = 1.0/1000

    t_pvals = []
    perm_pvals = []
    for term in main_rlipp_df['Term']:
        prho_list = rlipp_dict[term]

        pval = stats.ttest_1samp(prho_list, true_prho, alternative='less').pvalue
        if math.isnan(pval):
            pval = 1.0
        t_pvals.append(pval)

        # Count directly instead of reading a loop index after `break`: the
        # old form ended at n-1 when every value passed the test, and left
        # the index undefined (or stale) for an empty list.
        n_total = len(prho_list)
        n_at_least = sum(1 for prho in prho_list if true_prho <= prho + tolerance)
        perm_pvals.append(n_at_least / n_total if n_total else 1.0)

    main_rlipp_df['t_test'] = bh(np.asarray(t_pvals, dtype=float), alpha)
    main_rlipp_df['Perm_test'] = bh(np.asarray(perm_pvals, dtype=float), alpha)

    return main_rlipp_df

In [2]:
# Lookup from term id (dots in the `name` column replaced by dashes) to its annotation text.
nest_df = pd.read_csv('../data/NeST/NeST_node.csv', sep=',')
nest_map = {
    node_name.replace('.', '-'): annotation
    for node_name, annotation in zip(nest_df['name'], nest_df['Annotation'])
}

In [3]:
# Run configuration; these values are spliced into model/data paths below.
ont = 'ctg'        # ontology tag in model directory names
dataset = 'av'     # dataset tag in model and training-file paths
folds = 5          # model directories are numbered 1..folds
tasks = ['RS']     # task names; each has a task_list_<task>.txt file

In [4]:
# Drug master list: keep only the display name and the isomeric SMILES.
drug_info = pd.read_csv("../data/master_druglist_smiles_final.csv")[['name', 'isomeric_smiles']]
# Normalise names: spaces become dashes, double quotes are removed.
drug_info['name'] = drug_info['name'].str.replace(' ', '-').str.replace('"', '')

# SMILES string -> normalised drug name.
drug_smiles_map = dict(zip(drug_info['isomeric_smiles'], drug_info['name']))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [58]:
def exec_lm(X, y, pca_dim=2):
    """Fit PCA + ElasticNetCV on ``(X, y)`` and return the in-sample Pearson r.

    ``X`` is projected onto ``pca_dim`` principal components, an
    ``ElasticNetCV`` model with default settings is fitted on the projection,
    and the Pearson correlation between its predictions on that same
    projection and ``y`` is returned.
    """
    components = PCA(n_components=pca_dim).fit_transform(X)
    model = ElasticNetCV()
    model.fit(components, y)
    fitted = model.predict(components)
    return stats.pearsonr(fitted, y)[0]

In [86]:
def term_analysis(ont, dataset, task, term, drug_smiles_map, fold_size=5):
    """Relate one term's hidden values to the model predictions, per drug and fold.

    For each fold ``k`` in 1..``fold_size`` the function reads, from
    ``../models/Final/model_<ont>_<dataset>_<task>_<k>``:

    * ``test.txt``   - tab-separated, a ``cell_line`` column plus one column
      per drug (column names are looked up in ``drug_smiles_map``);
    * ``predict.txt`` - numeric matrix; column ``i`` is used for the ``i``-th
      drug column of ``test.txt``;
    * ``hidden/<term>.hidden`` - space-separated, no header.

    For every drug, rows whose test value is missing are dropped, and
    ``exec_lm`` is called with the hidden columns as ``X``, the predictions
    as ``y`` and ``pca_dim`` equal to the number of hidden columns.

    Returns
    -------
    dict
        ``{drug name: [correlation for fold 1, fold 2, ...]}``.
    """
    corr_map = {}
    for fold in range(1, fold_size + 1):
        modeldir = '../models/Final/model_' + ont + '_' + dataset + '_' + task + '_' + str(fold)
        test_df = pd.read_csv(modeldir + '/test.txt', sep='\t')
        predict_data = np.loadtxt(modeldir + '/predict.txt')
        system_hidden_df = pd.read_csv(modeldir + '/hidden/' + term + '.hidden', sep=' ', header=None)

        drug_list = list(test_df.columns)
        drug_list.remove('cell_line')

        for col_idx, smiles in enumerate(drug_list):
            drug_test_df = test_df[['cell_line', smiles]].copy()
            # Keep a prediction only where a measured value exists.
            observed = drug_test_df[smiles].notna()
            drug_test_df['pred'] = np.where(observed, predict_data[:, col_idx], np.nan)

            merged_df = pd.concat([drug_test_df, system_hidden_df], axis=1).reindex(drug_test_df.index)

            has_pred = ~np.isnan(merged_df['pred'])
            y = merged_df['pred'][has_pred]
            # Everything that is not one of the test/pred columns is a hidden column.
            hidden_values = np.array(merged_df.drop(drug_test_df.columns, axis=1))
            X = hidden_values[has_pred, :]
            corr = exec_lm(X, y, X.shape[1])

            corr_map.setdefault(drug_smiles_map[smiles], []).append(corr)

    return corr_map

In [67]:
# Per-fold correlations for a single term, printed one drug per line.
term = 'NEST:89'
for task in tasks:
    corr_map = term_analysis(ont, dataset, task, term, drug_smiles_map)
    for drug_name, fold_corrs in corr_map.items():
        print(drug_name, fold_corrs)

gemcitabine [0.5385409478718337, 0.6672219924592176, 0.597079293245657, 0.5665146544638067, 0.6461019524905314]
125316-60-1 [0.5432894519403573, 0.65814752617807, 0.5480436858027579, 0.5754099423030417, 0.6653666823952628]
Olaparib [0.539991966862683, 0.6803238298151559, 0.586554402418771, 0.5770620153002288, 0.641389749135507]
camptothecin [0.5122408091682337, 0.6702299481690666, 0.6028749968742091, 0.556496358867397, 0.685563736941464]
etoposide [0.5362461348630843, 0.6716073270983497, 0.5930170285906315, 0.5670986947316987, 0.6379251546940323]
dichloroplatinum-diammoniate [0.5054720101702788, 0.6787415682254571, 0.5922594037187643, 0.5740160032906966, 0.6635068395872064]


In [89]:
# Write one ranked-subsystem table per drug into each fold's model directory.
for task in tasks:
    drugs = pd.read_csv(
        '../data/training_files_' + dataset + '/task_list_' + task + '.txt',
        header=None,
        names=['D'],
    )['D'].tolist()
    for i in range(1, folds+1):
        modeldir = '../models/Final/model_' + ont + '_' + dataset + '_' + task + '_' + str(i)
        rlipp_df = pd.read_csv(modeldir + '/rlipp.out', sep='\t')
        df_map = get_top_systems(drugs, nest_map, rlipp_df)
        # Keys of df_map are the task-list entries; file names use the mapped drug name.
        for smiles, df in df_map.items():
            out_file = modeldir + '/subsystem_ranks_' +  drug_smiles_map[smiles] + '.txt'
            df.to_csv(out_file, sep='\t', index=False)

In [91]:
# Merging p_rho
#
# Step 1 (per drug): average P_rho / C_rho / RLIPP over the fold-level
#   subsystem_ranks_<drug>.txt files and write ../models/SI_scores/<...>_<drug>.txt.
# Step 2 (per task): average those per-drug tables over drugs and write
#   ../models/SI_scores/<ont>_<dataset>_<task>.txt.

for task in tasks:
    task_rlipp = []
    drugs = list(pd.read_csv('../data/training_files_' + dataset + '/task_list_' + task + '.txt', header=None, names=['D'])['D'])
    for drug in drugs:
        agg_terms = []
        for i in range(1, folds+1):
            modeldir = '../models/Final/model_' + ont + '_' + dataset + '_' + task + '_' + str(i)
            subsys_df = pd.read_csv(modeldir + '/subsystem_ranks_' +  drug_smiles_map[drug] + '.txt', sep='\t')[['Term', 'Name', 'P_rho', 'C_rho', 'RLIPP']]
            agg_terms.append(subsys_df)

        agg_df = pd.concat(agg_terms, ignore_index=True)
        # Name is part of the group key here, so only numeric columns are averaged.
        agg_rlipp_df = agg_df.groupby(['Term', 'Name']).mean().reset_index()
        agg_rlipp_df = agg_rlipp_df.sort_values(by='P_rho', ascending=False)
        agg_rlipp_df.to_csv('../models/SI_scores/' + ont + '_' + dataset + '_' + task + '_' + drug_smiles_map[drug] + '.txt', sep='\t', float_format='%.4f', index=False)
        task_rlipp.append(agg_rlipp_df)

    avg_df = pd.concat(task_rlipp, ignore_index=True)
    # avg_df still carries the string column `Name`. Grouping by Term alone and
    # calling .mean() on it raises TypeError on pandas >= 2.0; numeric_only=True
    # skips that column, matching what older pandas did implicitly.
    task_df = avg_df.groupby(['Term']).mean(numeric_only=True).reset_index()
    task_df = task_df.sort_values(by='P_rho', ascending=False)
    task_df.to_csv('../models/SI_scores/' + ont + '_' + dataset + '_' + task + '.txt', sep='\t', float_format='%.4f', index=False)

In [None]:
# Load the per-drug (Term, P_rho) tables for three drugs, each sorted by Term.
# NOTE(review): `modeldir_prefix` is not defined in the cells visible here -
# it must already exist in the kernel; confirm where it is set.
prho_avg = []
for drug in ('ML-210', '1035072-16-2', 'RSL3'):
    rlipp_path = modeldir_prefix + 'rlipp/' + ont + '_' + dataset + '_' + drug + '.txt'
    prho_avg.append(
        pd.read_csv(rlipp_path, sep='\t')[['Term', 'P_rho']].sort_values(by='Term')
    )

# Keep the individual frames available under their original names.
ML210_rlipp_df, ML162_rlipp_df, RSL3_rlipp_df = prho_avg

In [None]:
# Average P_rho per Term across the frames in prho_avg and save, highest first.
avg_df = pd.concat(prho_avg, ignore_index=True)
prho_df = (
    avg_df.groupby(['Term'])
    .mean()
    .reset_index()
    .sort_values(by='P_rho', ascending=False)
)
# NOTE(review): `drug` is whatever the previously run cell left behind
# ('RSL3' after the loading cell above), so the averaged file is named after
# that single drug - confirm this is intended.
avg_out_path = modeldir_prefix + 'rlipp/gpx4i_avg_' + ont + '_' + dataset + '_' + drug + '.txt'
prho_df.to_csv(avg_out_path, sep='\t', float_format='%.2e', index=False)