In [65]:
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score
from statistics import median
import pickle

In [66]:
def process_data():
    final_data_csv = '/Users/kanetian7/omic-features-successful-targets/kane/final_data.csv'
    data = pd.read_csv(final_data_csv)
    Y = np.expand_dims(np.array(data['class'].eq('positive').mul(1)), axis=1)
    X = preprocessing.scale(np.array(data.drop(columns=['GeneSym', 'class', 'GeneID']), dtype='float64'))
    processed_data = np.concatenate((X, Y), axis=1)
    return processed_data

In [67]:
def gen_data(X_tr, Y_tr, X_v, Y_v):
    # train model here, append results to data
    
    return X_tr, Y_tr

In [72]:
def pipeline(reps=30):
    max_models, max_aurocs, aps, max_aurocs_gen, aps_gen = [], [], [], [], []
    processed_data = process_data()
    for rep in range(reps):
        # discriminative stuff
        np.random.shuffle(processed_data)
        tr_num = int(processed_data.shape[0] * 0.6)
        val_num = int(processed_data.shape[0] * 0.2)
        X_tr, Y_tr = processed_data[:tr_num, :-1], processed_data[:tr_num, -1]
        X_v, Y_v = processed_data[tr_num:tr_num + val_num, :-1], processed_data[tr_num:tr_num + val_num, -1]
        X_te, Y_te = processed_data[tr_num + val_num:, :-1], processed_data[tr_num + val_num:, -1]

        max_auroc_v = 0
        max_model = None
        for C in np.arange(0.1, 10.1, 0.1):
            logreg = LogisticRegression(C=C, class_weight='balanced')
            logreg.fit(X_tr, Y_tr)
            scores_v = logreg.predict_proba(X_v)[:, 1]
            auroc_v = roc_auc_score(Y_v, scores_v)
            if auroc_v > max_auroc_v:
                max_auroc_v = auroc_v
                max_model = logreg
        max_models.append(max_model)

        scores_te = logreg.predict_proba(X_te)[:, 1]
        auroc_te = roc_auc_score(Y_te, scores_te)
        ap_te = average_precision_score(Y_te, scores_te)
        
        max_aurocs.append(auroc_te)
        aps.append(ap_te)
        
#         # generative stuff
#         X_tr, Y_tr = gen_data(X_tr, Y_tr, X_v, Y_v)
#         logreg.fit(X_tr, Y_tr)
        
#         scores_te = logreg.predict_proba(X_te)[:, 1]
#         auroc_te = roc_auc_score(Y_te, scores_te)
#         ap_te = average_precision_score(Y_te, scores_te)
        
#         max_aurocs_gen.append(auroc_te)
#         aps_gen.append(ap_te)
    return max_models, max_aurocs, aps, max_aurocs_gen, aps_gen

In [73]:
max_models, max_aurocs, aps, max_aurocs_gen, aps_gen = pipeline()
print(median(max_aurocs))
print(median(aps))

0.5659544159544159
0.807640667340467
