In [25]:
import argparse
import numpy as np
import sklearn.utils
import sklearn.metrics
from scipy import stats
from collections import Counter, OrderedDict
from pprint import pprint
import json

from DrVAE import DrVAE, DrVAEDataset, wrap_in_DrVAEDataset
import utils as utl

from utils import selectFromDict,subsetDict,concat_dicts,random_chance


In [26]:
# Load the dataset from an HDF5 file through the project helper `utl.load_from_HDF`.
# The path is relative, so it resolves against the kernel's current working directory.
data = utl.load_from_HDF("../workspace/datafiles/CTRPv2+L1000_FDAdrugs6h_v2.1.h5")

In [27]:
class Arguments:
    """Plain settings holder used by this notebook in place of parsed CLI arguments.

    Calling ``Arguments()`` with no arguments yields the notebook's default
    configuration. Any setting can be overridden by keyword, e.g.
    ``Arguments(fold=3)``, instead of editing the class body.

    Attributes (as used by the cells of this notebook):
        type_y: ``"discrete"`` selects the ``'y'`` labels; any other value
            selects ``'ycont'``.
        rseed: seed given to ``sklearn.utils.check_random_state``,
            ``np.random.seed`` and the baseline models.
        downlabel_to: forwarded to ``wrap_in_DrVAEDataset`` for the training set.
        semi_supervised: when False, ``remove_unlabeled=True`` is passed when
            wrapping the train/validation data.
        data_mode: forwarded to ``utl.split_data_sd``.
        fold: cross-validation fold forwarded to ``utl.split_data_sd``.
        clf_dataprior: when True the empirical label prior is used as
            ``prior_y``; otherwise ``'uniform'``.
        cuda: not read by any cell visible in this notebook section.
        pair_data_only: when True ``concat='pair_only'`` is used, else ``'both'``.
    """

    def __init__(self, *, type_y="discrete", rseed=24, downlabel_to=None,
                 semi_supervised=False, data_mode="strictC2C", fold=1,
                 clf_dataprior=False, cuda=False, pair_data_only=False):
        # Attribute order is kept identical to the original hard-coded version.
        self.type_y = type_y
        self.rseed = rseed
        self.downlabel_to = downlabel_to
        self.semi_supervised = semi_supervised
        self.data_mode = data_mode
        self.fold = fold
        self.clf_dataprior = clf_dataprior
        self.cuda = cuda
        self.pair_data_only = pair_data_only


# Default configuration used by the rest of the notebook.
args = Arguments()

In [28]:
# Display the entire loaded object (a dict of NumPy arrays, per the output below); the repr is long.
data

{'drug_drug': array(['axitinib', 'bortezomib', 'bosutinib', 'chlorambucil',
        'ciclosporin', 'cimetidine', 'clofarabine', 'crizotinib',
        'dasatinib', 'decitabine', 'dexamethasone', 'docetaxel',
        'erlotinib', 'etoposide', 'fluvastatin', 'fulvestrant',
        'gefitinib', 'gemcitabine', 'imatinib', 'itraconazole',
        'lovastatin', 'mitomycin', 'niclosamide', 'nilotinib',
        'omacetaxine mepesuccinate', 'paclitaxel', 'pazopanib', 'PLX-4032',
        'procarbazine', 'prochlorperazine', 'ruxolitinib', 'sildenafil',
        'simvastatin', 'sirolimus', 'sitagliptin', 'sorafenib',
        'tacrolimus', 'temozolomide', 'teniposide', 'thalidomide',
        'topotecan', 'tretinoin', 'trifluoperazine', 'valdecoxib',
        'vincristine', 'vorinostat'], dtype='<U25'),
 'drug_m': array([[0.52658381, 0.        , 0.35714286, ..., 0.        , 0.        ,
         0.        ],
        [0.20688762, 0.        , 0.71428571, ..., 0.        , 0.        ,
         0.        ],


In [29]:
## drug selection
# The 26 drug names considered by this notebook. `sorted` orders them by plain
# string comparison, so the upper-case 'PLX-4032' comes before the lower-case names.
drug_list_26 = sorted((
    'omacetaxine mepesuccinate',
    'bortezomib',
    'vorinostat',
    'paclitaxel',
    'docetaxel',
    'topotecan',
    'niclosamide',
    'valdecoxib',
    'teniposide',
    'vincristine',
    'prochlorperazine',
    'mitomycin',
    'lovastatin',
    'gemcitabine',
    'dasatinib',
    'fluvastatin',
    'clofarabine',
    'sirolimus',
    'etoposide',
    'sitagliptin',
    'decitabine',
    'PLX-4032',
    'fulvestrant',
    'bosutinib',
    'trifluoperazine',
    'ciclosporin',
))

In [30]:
# List the keys available in the loaded dict.
data.keys()

dict_keys(['drug_drug', 'drug_m', 'drug_threshold', 'labeled_pert_cellid', 'pair_cid', 'pair_conc', 'pair_drug', 'pair_dur', 'pair_m', 'pair_s', 'pair_tid', 'pair_x1', 'pair_x2', 'pair_y', 'pair_ycont', 'sing_cid', 'sing_s', 'sing_tid', 'sing_x1', 'sing_y', 'sing_ycont'])

In [31]:
# Shape of the "sing_x1" array; presumably (samples, features) — TODO confirm.
data["sing_x1"].shape

(927, 973)

In [32]:
# Run on a two-drug subset: entries at indices 1 and 2 of the sorted drug list.
drug_list = drug_list_26[1:3]

In [33]:
# Show which drugs were selected for this run.
drug_list

['bortezomib', 'bosutinib']

In [34]:
def compile_baseline_stats(type_y, tr, ev, svmkernel='rbf', rseed=None):
    """Collect chance-level reference values and X1->Y baseline-model metrics.

    Args:
        type_y: ``'discrete'`` to use the ``'y'`` entries and ``random_chance``;
            any other value uses the ``'ycont'`` entries and their mean.
        tr: training data mapping; ``'x1'``, ``'has_y'`` and ``'y'``/``'ycont'``
            are read. ``'has_y'`` is used to index the other arrays
            (presumably a boolean mask of labelled rows — TODO confirm).
        ev: evaluation data mapping with the same keys as ``tr``.
        svmkernel: forwarded to ``utl.run_baseline_models``.
        rseed: forwarded to ``utl.run_baseline_models``.

    Returns:
        The result of ``concat_dicts`` applied to the two reference entries
        ('Train|Random Chance|Y', 'Eval|Random Chance|Y') and the baseline
        metrics, whose keys are prefixed with 'X1->Y|'.

    Note:
        Only Y-related statistics are produced; the S random-chance and
        per-tissue count statistics are not computed here.
    """
    labelled_tr = tr['has_y']
    labelled_ev = ev['has_y']

    # Pick the label arrays and the matching "chance" reference for each split.
    if type_y == 'discrete':
        tr_ylab, ev_ylab = tr['y'][labelled_tr], ev['y'][labelled_ev]
        chance_tr = random_chance(tr_ylab)
        chance_ev = random_chance(ev_ylab)
    else:
        tr_ylab, ev_ylab = tr['ycont'][labelled_tr], ev['ycont'][labelled_ev]
        chance_tr = tr_ylab.mean()
        chance_ev = ev_ylab.mean()

    reference = OrderedDict([
        ('Train|Random Chance|Y', chance_tr),
        ('Eval|Random Chance|Y', chance_ev),
    ])

    ## Run baselines on the original X1 -> Y (labelled rows only)
    baseline = utl.run_baseline_models(
        type_y,
        tr['x1'][labelled_tr], tr_ylab,
        ev['x1'][labelled_ev], ev_ylab,
        rseed=rseed, svmkernel=svmkernel,
    )
    prefixed = OrderedDict(('X1->Y|' + name, value) for name, value in baseline.items())

    return concat_dicts(reference, prefixed)

In [35]:
# Baseline metrics per split and drug: all_stats[split][drug] -> stats mapping.
all_stats = {'train': dict(), 'valid': dict(), 'test': dict()}

## drugs that don't have enough perturbations are ignored
_drugs_to_skip = ("abiraterone", "azacitidine", "cyclophosphamide", "methotrexate",
                  "fluorouracil", "ifosfamide", "ciclopirox")

for selected_drug in drug_list:
    if selected_drug in _drugs_to_skip:
        print("Ignoring: ", selected_drug)
        continue

    ## initialize random state (re-seeded for every drug)
    rnds = sklearn.utils.check_random_state(args.rseed)
    np.random.seed(args.rseed)

    ### select the drug data and get CV-split of all data types
    y_unlab_token = -47  # value passed to split_data_sd as the unlabeled marker
    if args.type_y == 'discrete':
        y_key = 'y'
    else:
        y_key = 'ycont'
    sing, singv, singt, pair, pairv, pairt = utl.split_data_sd(
        data, selected_drug, args.data_mode,
        fold=args.fold, n_folds=5, rnds=rnds,
        noPairTest=True, unlab_token=y_unlab_token, verbose=False,
    )

    if args.pair_data_only:
        concat_flag = 'pair_only'
    else:
        concat_flag = 'both'
    train_dataset, train_ddict = wrap_in_DrVAEDataset(
        sing, pair, y_key, concat=concat_flag,
        downlabel_to=args.downlabel_to,
        remove_unlabeled=not args.semi_supervised,
    )
    valid_dataset, valid_ddict = wrap_in_DrVAEDataset(
        singv, pairv, y_key, concat=concat_flag,
        remove_unlabeled=not args.semi_supervised,
    )
    test_dataset, test_ddict = wrap_in_DrVAEDataset(singt, pairt, y_key, concat=concat_flag)

    ## dataset dimensions and the empirical label prior
    N = len(train_dataset)
    dim_x = sing['x1'].shape[1]
    dim_s = np.unique(sing['s']).shape[0]
    if args.type_y == 'discrete':
        class_sizes = np.bincount(sing['y'][sing['has_y']])
        print(class_sizes)
        dim_y = len(class_sizes)
        data_prior_y = class_sizes / float(class_sizes.sum())
    else:
        dim_y = 1
        _tmp_ycont = sing['ycont'][sing['has_y']]
        data_prior_y = np.array([_tmp_ycont.mean(), _tmp_ycont.std()])

    prior_y = data_prior_y if args.clf_dataprior else 'uniform'

    ## summary of the selected drug's data
    print("selected_drug: ", selected_drug)
    print(N, dim_x, dim_y)
    print("sensitivity data")
    if args.type_y == 'discrete':
        for _label, _split in (("    train data (Y, S):", sing),
                               ("    valid data (Y, S):", singv),
                               ("    test data  (Y, S):", singt)):
            print(_label, Counter(_split['y']), Counter(_split['s']))
    else:
        for _label, _split in (("    train data (Ycont):", sing),
                               ("    valid data (Ycont):", singv),
                               ("    test data  (Ycont):", singt)):
            # describe only the entries that are not the unlabeled marker
            _split_ycont = _split['ycont']
            print(_label, stats.describe(_split_ycont[_split_ycont != y_unlab_token]))

    ## run baselines: always fit on the training dict, evaluate on each split in turn
    for _split_name, _eval_ddict in (('train', train_ddict),
                                     ('valid', valid_ddict),
                                     ('test', test_ddict)):
        all_stats[_split_name][selected_drug] = compile_baseline_stats(
            args.type_y, tr=train_ddict, ev=_eval_ddict,
            svmkernel='rbf', rseed=args.rseed,
        )
    

[147 304]
selected_drug:  bortezomib
525 973 2
sensitivity data
    train data (Y, S): Counter({1: 304, 0: 147, -47: 115}) Counter({0.0: 566})
    valid data (Y, S): Counter({1: 99, 0: 50, -47: 36}) Counter({0.0: 185})
    test data  (Y, S): Counter({1: 94, 0: 46, -47: 36}) Counter({0.0: 176})


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[329 115]
selected_drug:  bosutinib
462 973 2
sensitivity data
    train data (Y, S): Counter({0: 329, 1: 115, -47: 113}) Counter({0.0: 557})
    valid data (Y, S): Counter({0: 110, 1: 39, -47: 37}) Counter({0.0: 186})
    test data  (Y, S): Counter({0: 109, 1: 38, -47: 37}) Counter({0.0: 184})


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [36]:
# Display the collected baseline metrics, organised as all_stats[split][drug].
all_stats

{'train': {'bortezomib': OrderedDict([('Train|Random Chance|Y',
                0.6761904761904762),
               ('Eval|Random Chance|Y', 0.6761904761904762),
               ('X1->Y|Acc|RF100', 1.0),
               ('X1->Y|AUROC|RF100', 1.0),
               ('X1->Y|AUPR|RF100', 1.0),
               ('X1->Y|Acc|Ridge', 0.6857142857142857),
               ('X1->Y|AUROC|Ridge', 0.8153769676884838),
               ('X1->Y|AUPR|Ridge', 0.8992530448881036),
               ('X1->Y|Acc|SVMrbf', 0.8838095238095238),
               ('X1->Y|AUROC|SVMrbf', 0.9983098591549296),
               ('X1->Y|AUPR|SVMrbf', 0.9992567004596069),
               ('X1->Y|Acc|SVMlin', 1.0),
               ('X1->Y|AUROC|SVMlin', 1.0),
               ('X1->Y|AUPR|SVMlin', 1.0)]),
  'bosutinib': OrderedDict([('Train|Random Chance|Y', 0.7510822510822511),
               ('Eval|Random Chance|Y', 0.7510822510822511),
               ('X1->Y|Acc|RF100', 1.0),
               ('X1->Y|AUROC|RF100', 0.9999999999999999),
 