In [24]:
import pickle
import random
from statistics import median

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.neighbors import KernelDensity

import cgan
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
def process_data(
    final_data_csv='/Users/kanetian7/omic-features-successful-targets/kane/final_data.csv',
):
    """Load the labelled feature table and return it as a single float matrix.

    Parameters
    ----------
    final_data_csv : str or path-like, optional
        CSV file to read. It must contain the columns ``GeneSym``,
        ``GeneID`` and ``class``; every remaining column is converted to
        ``float64`` and used as a feature. The default is the path that was
        previously hard-coded, so existing zero-argument calls are unchanged.

    Returns
    -------
    numpy.ndarray
        Array with one row per CSV row. All columns but the last hold the
        features after ``sklearn.preprocessing.scale``; the last column is
        ``1.0`` where ``class == 'positive'`` and ``0.0`` otherwise.
    """
    data = pd.read_csv(final_data_csv)

    # Binary target as an (n_rows, 1) column so it can be appended to X.
    labels = data['class'].eq('positive').to_numpy(dtype='float64').reshape(-1, 1)

    # Everything that is not an identifier or the label is a feature.
    raw_features = np.array(
        data.drop(columns=['GeneSym', 'class', 'GeneID']), dtype='float64'
    )
    # NOTE(review): the scaling statistics are computed over every row of the
    # file, i.e. before any train/validation/test split a caller performs --
    # confirm this is intended.
    features = preprocessing.scale(raw_features)

    return np.concatenate((features, labels), axis=1)

In [None]:
def _kde_log_likelihood(samples, X, bandwidth):
    """Fit a Gaussian KDE on `samples` and return the summed log-density of the rows of `X`."""
    if len(X) == 0:
        # No held-out rows of this class: contributes nothing to the total.
        return 0.0
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(samples)
    # score_samples yields one log-density per row; reduce to a scalar.
    return float(np.sum(kde.score_samples(X)))


def single_pass(X_tr, Y_tr, X_v, Y_v, params, n_examples=500, bandwidth=1.0):
    """Train one CGAN and score its samples against the validation set.

    A generator/discriminator pair is trained with ``cgan.train``. The
    generator is then sampled ``n_examples`` times for class 1 and class 0,
    a Gaussian kernel density estimate is fitted to each set of samples, and
    the validation rows of the matching class are evaluated under it.

    Parameters
    ----------
    X_tr, Y_tr
        Training features and labels, passed straight to ``cgan.train``.
    X_v, Y_v
        Validation features and labels. ``Y_v`` is compared with ``1`` and
        ``0`` to build row masks for ``X_v``.
    params
        Hyper-parameters forwarded unchanged to ``cgan.train``.
    n_examples : int, optional
        Number of samples drawn from the generator per class (default 500).
    bandwidth : float, optional
        Bandwidth of both kernel density estimates (default 1.0).

    Returns
    -------
    tuple
        ``(score, gen_pos, gen_neg, generator, discriminator)`` where
        ``score`` is a Python float: the sum of the log-densities of all
        positive validation rows under the positive-sample KDE plus the sum
        for all negative rows under the negative-sample KDE. Higher is better.
    """
    generator, discriminator = cgan.train(X_tr, Y_tr, params)

    gen_pos = cgan.sample_out(generator, n_examples=n_examples, cl=1)
    gen_neg = cgan.sample_out(generator, n_examples=n_examples, cl=0)

    # The two classes generally have different numbers of validation rows, so
    # the per-row log-densities cannot be added element-wise; each class is
    # reduced to a total first, which also gives callers a comparable scalar.
    loglik_pos = _kde_log_likelihood(gen_pos, X_v[Y_v == 1, :], bandwidth)
    loglik_neg = _kde_log_likelihood(gen_neg, X_v[Y_v == 0, :], bandwidth)

    score = loglik_pos + loglik_neg
    return score, gen_pos, gen_neg, generator, discriminator

In [None]:
def best_model(X_tr, Y_tr, X_v, Y_v, n_trials=100):
    """Random-search CGAN hyper-parameters and return the best-scoring trial.

    Every trial draws a fresh hyper-parameter tuple, hands it to
    ``single_pass`` and keeps the result if its score is strictly greater
    than the best score seen so far.

    Sampled hyper-parameters, in the order they appear in ``params``:

    * ``lr`` -- uniform in [1e-4, 1e-3]
    * ``b1`` -- uniform in [0.4, 0.95]
    * ``b2`` -- uniform in [0.9, 0.999]
      (``b1``/``b2`` are presumably optimizer betas -- confirm in ``cgan.train``)
    * ``latent_dim`` -- integer in [10, 150], both ends included
    * ``adversarial_loss`` -- a new ``torch.nn.MSELoss`` or ``torch.nn.BCELoss``

    Parameters
    ----------
    X_tr, Y_tr, X_v, Y_v
        Training and validation data, forwarded unchanged to ``single_pass``.
    n_trials : int, optional
        Number of random configurations to evaluate (default 100).

    Returns
    -------
    tuple
        ``(score, gen_pos, gen_neg, generator, discriminator)`` exactly as
        returned by ``single_pass`` for the winning trial.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1.
    RuntimeError
        If no trial produced a score greater than ``-inf`` (for example when
        every score is NaN).
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    # Start below every real score so the first valid trial is always kept;
    # without an initial value the first comparison would read an unbound name.
    best_score = float('-inf')
    best = None

    for _ in range(n_trials):
        lr = random.uniform(1e-4, 1e-3)
        b1 = random.uniform(0.4, 0.95)
        b2 = random.uniform(0.9, 0.999)
        latent_dim = random.randint(10, 150)
        adversarial_loss = random.choice((torch.nn.MSELoss, torch.nn.BCELoss))()
        # NOTE(review): no weight-decay value is part of `params`; add one
        # here if cgan.train accepts it -- confirm against cgan.
        params = (lr, b1, b2, latent_dim, adversarial_loss)

        trial = single_pass(X_tr, Y_tr, X_v, Y_v, params)
        score = trial[0]
        if score > best_score:
            best_score = score
            best = tuple(trial)

    if best is None:
        raise RuntimeError("no trial produced a score greater than -inf")
    return best

In [None]:
def max_disc_model(X_tr, Y_tr):
    max_auroc_v = 0
    max_model = None
    for C in np.arange(0.1, 10.1, 0.1):
        logreg = LogisticRegression(C=C, class_weight='balanced')
        logreg.fit(X_tr, Y_tr)
        scores_v = logreg.predict_proba(X_v)[:, 1]
        auroc_v = roc_auc_score(Y_v, scores_v)
        if auroc_v > max_auroc_v:
            max_auroc_v = auroc_v
            max_model = logreg
    return max_model

In [72]:
def pipeline(reps=30):
    max_models, max_aurocs, aps, max_models_gen, max_aurocs_gen, aps_gen = [], [], [], [], []
    processed_data = process_data()
    for rep in range(reps):
        np.random.shuffle(processed_data)
        tr_num = int(processed_data.shape[0] * 0.6)
        val_num = int(processed_data.shape[0] * 0.2)
        X_tr, Y_tr = processed_data[:tr_num, :-1], processed_data[:tr_num, -1]
        X_v, Y_v = processed_data[tr_num:tr_num + val_num, :-1], processed_data[tr_num:tr_num + val_num, -1]
        X_te, Y_te = processed_data[tr_num + val_num:, :-1], processed_data[tr_num + val_num:, -1]
            
        max_model = max_disc_model(X_tr, Y_tr)
        max_models.append(max_model)

        scores_te = max_model.predict_proba(X_te)[:, 1]
        auroc_te = roc_auc_score(Y_te, scores_te)
        ap_te = average_precision_score(Y_te, scores_te)
        max_aurocs.append(auroc_te)
        aps.append(ap_te)
        
        (s, gp, gn, g, d) = best_model(X_tr, Y_tr, X_v, Y_v)
        X_tr = np.concatenate((gp, gn, X_tr), axis=0)
        Y_tr = np.concatenate((np.ones(gp.shape[0]), np.zeros(gn.shape[0]), Y_tr), axis=0)
        
        max_model_gen = max_disc_model(X_tr, Y_tr)
        max_models.append(max_model_gen)
        
        scores_te = max_model_gen.predict_proba(X_te)[:, 1]
        auroc_te = roc_auc_score(Y_te, scores_te)
        ap_te = average_precision_score(Y_te, scores_te)
        max_aurocs_gen.append(auroc_te)
        aps_gen.append(ap_te)
        
    return max_models, max_aurocs, aps, max_models_gen, max_aurocs_gen, aps_gen

In [73]:
# Run the full experiment with the default number of repetitions and unpack
# the baseline results followed by the generator-augmented results.
(
    max_models,
    max_aurocs,
    aps,
    max_models_gen,
    max_aurocs_gen,
    aps_gen,
) = pipeline()

0.5659544159544159
0.807640667340467
