In [1]:
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import statistics
import math
from sklearn.model_selection import train_test_split
import random
import sklearn
from sklearn import ensemble
from itertools import chain
from typing import TextIO
import re

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import PredefinedSplit

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier
#from sklearn import svm
#from sklearn.svm import SVC
#from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, confusion_matrix

In [2]:
# data
genes = ['RPS14', 'CDC5L', 'POLR2I', 'RPS7', 'XAB2', 'RPS19BP1', 'RPL23A', 'SUPT6H', 'PRPF31', 'U2AF1', 'PSMD7',
         'Hsp10', 'RPS13', 'PHB', 'RPS9', 'EIF5B', 'RPS6', 'RPS11', 'SUPT5H', 'SNRPD2', 'RPL37', 'RPSA', 'COPS6',
         'DDX51', 'EIF4A3', 'KARS', 'RPL5', 'RPL32', 'SF3A1', 'RPS3A', 'SF3B3', 'POLR2D', 'RPS15A', 'RPL31', 'PRPF19',
         'SF3B2', 'RPS4X', 'CSE1L', 'RPL6', 'COPZ1', 'PSMB2', 'RPL7', 'PHB2', 'ARCN1', 'RPA2', 'NUP98', 'RPS3', 'EEF2',
         'USP39', 'PSMD1', 'NUP93', 'AQR', 'RPL34', 'PSMA1', 'RPS27A']

genes_filter_1 = ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B', 'RPL31',
       'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H', 'EEF2', 'RPS11',
       'SNRPD2', 'RPL37', 'SF3B3', 'DDX51', 'RPL7', 'RPS9', 'KARS',
       'SF3A1', 'RPL32', 'PSMB2', 'RPS7', 'EIF4A3', 'U2AF1', 'PSMA1',
       'PHB', 'POLR2D', 'RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2',
       'SUPT5H', 'RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1',
       'RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L', 'RPL5',
       'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']

gene_split_index = {}
for i in range(len(genes_filter_1)):
    gene = genes_filter_1[i]
    gene_split_index[gene]= math.floor(i/6)

base_positions = {
    'A': 0,
    'T': 1,
    'C': 2,
    'G': 3,
    0: 'A',
    1: 'T',
    2: 'C',
    3: 'G',
}


In [3]:
def create_gene_splits_filter1_kfold_noval(gene_strings, values_to_split: list, kfold, split):
    # use number [0, 1, 2, 3, 4,...] as index
    genes_filter_1 = ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B', 'RPL31',
       'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H', 'EEF2', 'RPS11',
       'SNRPD2', 'RPL37', 'SF3B3', 'DDX51', 'RPL7', 'RPS9', 'KARS',
       'SF3A1', 'RPL32', 'PSMB2', 'RPS7', 'EIF4A3', 'U2AF1', 'PSMA1',
       'PHB', 'POLR2D', 'RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2',
       'SUPT5H', 'RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1',
       'RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L', 'RPL5',
       'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
    assert split >= 0 and split < kfold
    if kfold == 9:
        #val_genes = genes_filter_1[split * 6: (split + 1) * 6]
        if split != 8:
            test_genes = genes_filter_1[((split + 1) * 6): (split + 2) * 6]
        else:
            test_genes = genes_filter_1[0:6]
    #print('val:', val_genes)
    print('test:', test_genes)

    #val_ids = list(chain(*[np.where(gene_strings == g)[0] for g in val_genes]))
    test_ids = list(chain(*[np.where(gene_strings == g)[0] for g in test_genes]))
    train_ids = list((set(range(len(gene_strings))) - set(test_ids)))

    train = [[arr[i] for i in train_ids] for arr in values_to_split]
    #val = [[arr[i] for i in val_ids] for arr in values_to_split]
    test = [[arr[i] for i in test_ids] for arr in values_to_split]

    return train, test

def normalize(a: np.ndarray):
    """
    :param a: numpy array of size N x D, where N is number of examples, D is number of features
    :return: a, normalized so that all feature columns are now between 0 and 1
    """
    a_normed, norms = sklearn.preprocessing.normalize(a, norm='max', axis=0, return_norm=True)
    print("Norms:", norms)
    return a_normed

def one_hot_encode_sequence(seq, pad_to_len=-1):
    output_len = len(seq)
    if pad_to_len > 0:
        assert pad_to_len >= output_len
        output_len = pad_to_len

    encoded_seq = np.zeros((output_len, 4), dtype=np.float32)
    for i, base in enumerate(seq):
        encoded_seq[i][base_positions[base]] = 1
    return encoded_seq

In [4]:
dataset_filtered_csv_path = '../../../data/integrated_guide_feature_filtered_f24_mismatch3_all_flanks.csv'

#dataset
dataframe = pd.read_csv(dataset_filtered_csv_path)
dataframe = dataframe[dataframe['gene'].isin(genes_filter_1)] #filter out 1 gene

num_examples = len(dataframe['gene'].values)
encoded_guides = [one_hot_encode_sequence(guide).flatten() for guide in dataframe['guide'].values]

# guide seq only classification

classes = dataframe['binary_relative_ratio_075f'].values

outputs = classes.astype(np.float32)
    
all_cols = [encoded_guides, outputs]

# group label to split
groups = dataframe['gene'].values

# predefined split index
for g in gene_split_index.keys():
    dataframe.loc[dataframe['gene']== g,'predefined split index']= gene_split_index[g]
ps = PredefinedSplit(dataframe['predefined split index'].values)
print(ps.get_n_splits())

9


In [5]:
len(all_cols[1])

119399

## hp tuning

In [6]:
# LogisticRegression, L1
logreg = LogisticRegression(penalty='l1',solver='saga',random_state=0,max_iter=10000)
grid = {'C': np.logspace(-5, 5, 11)}

#predefined splits
#gs = GridSearchCV(logreg, grid, cv=ps.split(),scoring='accuracy')
gs = GridSearchCV(logreg, grid, cv=ps.split(),scoring=['roc_auc','average_precision'],refit='roc_auc')
gs.fit(all_cols[0], all_cols[1])
print(gs.best_params_)
print(gs.best_score_) #best cv score
df_gridsearch = pd.DataFrame(gs.cv_results_)

df_gridsearch.to_csv('model_hp_results/guideonly_gene20_075f_classi_LogisticRegression_L1_hp.csv')

{'C': 0.1}
0.7214640628190946


In [7]:
# LogisticRegression, L2
logreg = LogisticRegression(penalty='l2',solver='saga',random_state=0,max_iter=10000)
grid = {'C': np.logspace(-5, 5, 11)}

#predefined splits
gs = GridSearchCV(logreg, grid, cv=ps.split(),scoring=['roc_auc','average_precision'],refit='roc_auc')
gs.fit(all_cols[0], all_cols[1])
print(gs.best_params_)
print(gs.best_score_) #best cv score
df_gridsearch = pd.DataFrame(gs.cv_results_)

df_gridsearch.to_csv('model_hp_results/guideonly_gene20_075f_classi_LogisticRegression_L2_hp.csv')

{'C': 0.01}
0.7214300947964725


In [8]:
# LogisticRegression, elasticnet
logreg = LogisticRegression(penalty='elasticnet',solver='saga',random_state=0,max_iter=10000)
grid = {'C': np.logspace(-4, 4, 9),'l1_ratio':np.linspace(0.1, 1, num=10)}

gs = GridSearchCV(logreg, grid, cv=ps.split(),scoring=['roc_auc','average_precision'],refit='roc_auc')
gs.fit(all_cols[0], all_cols[1])
print(gs.best_params_)
print(gs.best_score_) #best cv score
df_gridsearch = pd.DataFrame(gs.cv_results_)

df_gridsearch.to_csv('model_hp_results/guideonly_gene20_075f_classi_LogisticRegression_elasticnet_hp.csv')

{'C': 0.01, 'l1_ratio': 0.2}
0.7214917118945797


In [8]:
# https://www.programcreek.com/python/example/91158/sklearn.model_selection.GroupKFold
#random forest
clf = RandomForestClassifier(random_state=0)
grid = {'n_estimators':[100,200,400,800,1000,1200,1500],'max_features':['auto','sqrt','log2']}
gs = GridSearchCV(clf, grid, cv=GroupKFold(n_splits=5))
gs.fit(all_cols[0], all_cols[1], groups=groups)


KeyboardInterrupt: 

In [None]:
#GradientBoostingClassifier
gb = ensemble.GradientBoostingClassifier(random_state=0)
grid = {'learning_rate':np.logspace(-2, 0, 3),'n_estimators':[100,200,400,800,1000,1200,1500],'max_depth':[2,3,4,8],'max_features':['auto','sqrt','log2']}
gs = GridSearchCV(gb, grid, cv=GroupKFold(n_splits=5))
gs.fit(all_cols[0], all_cols[1], groups=groups)
print(gs.best_score_) #best cv score
print(gs.best_params_)
df_gridsearch = pd.DataFrame(gs.cv_results_)

df_gridsearch.to_csv('linearmodel_hp_results/classi_gb_hp.csv')

## Test models

In [7]:
def classification_analysis(model_name, split, y_pred,y_true):
    test_df = pd.DataFrame(list(zip(list(y_pred), list(y_true))),
               columns =['predicted_value', 'true_binary_label'])
    
    thres_list = [0.8, 0.9,0.95]
    tp_thres = []
    #print('thres_stats')
    for thres in thres_list:
        df_pre_good = test_df[test_df['predicted_value']>thres]
        true_good_label = df_pre_good['true_binary_label'].values
        num_real_gg = np.count_nonzero(true_good_label)
        if len(true_good_label)>0:
            gg_ratio = num_real_gg/len(true_good_label)
            tp_thres.append(gg_ratio)
            #print('true good guide percent '+str(gg_ratio))
        else:
            tp_thres.append('na')
    
    outputs = np.array(y_pred)
    labels = np.array(y_true)
    #plt.clf()
    #fig.suptitle('AUC and PRC')
    score = roc_auc_score(labels, outputs)
    fpr, tpr, _ = roc_curve(labels, outputs)
    #print('AUROC '+str(score))
    average_precision = average_precision_score(labels, outputs)
    precision, recall, thres_prc  = precision_recall_curve(labels, outputs)
    #print('AUPRC '+str(average_precision))
    #plt.savefig(fname='results/linear_models/'+str(model_name)+'precision-recall_'+str(split)+'.png',dpi=600,bbox_inches='tight')
    return score,average_precision,tp_thres
   

In [9]:
#LogisticRegression, little regularization
logreg = LogisticRegression(penalty='l1',solver='saga',random_state=0,max_iter=10000,C=100000000)
auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    logreg.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    #pred = logreg.predict(xt)
    pred = logreg.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('LogisticRegression-L1', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.7213890843218597
auroc_sd: 0.01830150463668443
auprc_mean: 0.3253363736827444
auprc_sd: 0.02084537819584787
tp_80_mean: 0.3
tp_80_sd: 0.4760952285695233


StatisticsError: mean requires at least one data point

In [20]:
# LogisticRegression, L1
logreg = LogisticRegression(penalty='l1',solver='saga',random_state=0,max_iter=10000,C=0.1)
auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    logreg.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    #pred = logreg.predict(xt)
    pred = logreg.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('LogisticRegression-L1', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.7214640628190947
auroc_sd: 0.018183248177440187
auprc_mean: 0.32520716564837393
auprc_sd: 0.020724462125722667
tp_80_mean: 0.3125
tp_80_sd: 0.4732423621500228


StatisticsError: mean requires at least one data point

In [23]:
# LogisticRegression, L2
logreg = LogisticRegression(penalty='l2',solver='saga',random_state=0,max_iter=10000,C=0.01)
auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    logreg.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    #pred = logreg.predict(xt)
    pred = logreg.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('LogisticRegression-L2', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.7214300947964725
auroc_sd: 0.0182392049171858
auprc_mean: 0.3252897453354815
auprc_sd: 0.02076884919984052
tp_80_mean: 0.3333333333333333
tp_80_sd: 0.5773502691896258


StatisticsError: mean requires at least one data point

In [29]:
# LogisticRegression, elasticnet
logreg = LogisticRegression(penalty='elasticnet',solver='saga',random_state=0,max_iter=10000,l1_ratio=0.50,C=0.1)
auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    logreg.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    #pred = logreg.predict(xt)
    pred = logreg.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('LogisticRegression-elasticnet', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.721435369649852
auroc_sd: 0.018237601384323694
auprc_mean: 0.3252827677556211
auprc_sd: 0.02078180094189064
tp_80_mean: 0.3125
tp_80_sd: 0.4732423621500228


StatisticsError: mean requires at least one data point

In [None]:
#SVM, linear


clf = svm.SVC(kernel='linear',probability=True,random_state=0,C=0.001)

#clf = LinearSVC(dual= False, random_state=0, max_iter=10000,C=1,penalty='l2')

auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    clf.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    pred = clf.predict_proba(xt)
    pred = pred[:,1]
    #pred = clf.predict(xt)
    auroc,auprc,tp_thres = classification_analysis('svm', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd)) 
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
#print('tp_80_mean: '+str(tp_80_mean))
#print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
#print('tp_90_mean: '+str(tp_90_mean))
#print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']


In [18]:
# random forest
#clf = RandomForestClassifier(n_estimators=32,min_samples_split=2, min_samples_leaf=2, max_features='auto',random_state=0)
clf = RandomForestClassifier(n_estimators=1500,max_features='auto',random_state=0)
auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    #tr, val, te = create_gene_splits_filter1_kfold(dataframe['gene'].values, all_cols, 9, args.split)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    clf.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    #pred = logreg.predict(xt)
    pred = clf.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('random forest', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.788279464458613
auroc_sd: 0.01428358067005656
auprc_mean: 0.4395069525801463
auprc_sd: 0.023567763312645224


StatisticsError: mean requires at least one data point

In [9]:
#GradientBoostingClassifier
clf = ensemble.GradientBoostingClassifier(random_state=0,max_depth=4,
                                         max_features='auto', n_estimators=1500)

auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []
#for s in range(11):
for s in range(9):
    #tr, val, te = create_gene_splits_kfold(dataframe['gene'].values, all_cols, 11, s)
    #tr, val, te = create_gene_splits_filter1_kfold(dataframe['gene'].values, all_cols, 9, args.split)
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    clf.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    pred = clf.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('GradientBoostingClassifier_hpnew', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.8419599405820187
auroc_sd: 0.01700381890902823
auprc_mean: 0.5374780612257355
auprc_sd: 0.0307901999759122
tp_80_mean: 0.813415325970473
tp_80_sd: 0.060331083529782635
tp_90_mean: 0.8539377289377289
tp_90_sd: 0.14257472730230836


In [10]:
print(auroc_l)
print(auprc_l)
print(tp_80)
print(tp_90)

[0.8584233873947144, 0.8178238022371683, 0.8167461147806936, 0.835408453266539, 0.8520322898053144, 0.8454747683961216, 0.8636994996978711, 0.8343741102400096, 0.8536570394197361]
[0.566563028022413, 0.4848862901265541, 0.4920029175065841, 0.5422250027559219, 0.556203199805487, 0.5660378943500952, 0.5615582396576324, 0.5413416546584848, 0.5264843241484469]
[0.8656716417910447, 0.676056338028169, 0.7966101694915254, 0.8414634146341463, 0.8536585365853658, 0.8666666666666667, 0.8387096774193549, 0.8041237113402062, 0.7777777777777778]
[0.9285714285714286, 0.8461538461538461, 0.5833333333333334, 0.9285714285714286, 0.875, 1.0, 1.0, 0.8571428571428571, 0.6666666666666666]


In [11]:
#GradientBoostingClassifier, hp2
clf = ensemble.GradientBoostingClassifier(random_state=0,max_depth=4,
                                         max_features='sqrt', n_estimators=1800)

auroc_l = []
auprc_l = []
tp_80 = []
tp_90 = []

for s in range(9):
    tr, te = create_gene_splits_filter1_kfold_noval(dataframe['gene'].values, all_cols, 9, s)
    # training input and output
    d_input = tr[0]
    d_output = tr[1]
    clf.fit(d_input, d_output) #fit models
    #test set
    xt =  te[0] 
    pred = clf.predict_proba(xt)
    pred = pred[:,1]
    auroc,auprc,tp_thres = classification_analysis('GradientBoostingClassifier_hpnew', s,pred,te[1])
    auroc_l.append(auroc)
    auprc_l.append(auprc)
    if tp_thres[0]!= 'na':
        tp_80.append(tp_thres[0])
    if tp_thres[1]!= 'na':
        tp_90.append(tp_thres[1])
    
auroc_mean = statistics.mean(auroc_l)
auroc_sd = statistics.stdev(auroc_l)
print('auroc_mean: '+str(auroc_mean))
print('auroc_sd: '+str(auroc_sd))
auprc_mean = statistics.mean(auprc_l)
auprc_sd = statistics.stdev(auprc_l)
print('auprc_mean: '+str(auprc_mean))
print('auprc_sd: '+str(auprc_sd))
        
tp_80_mean = statistics.mean(tp_80)
tp_80_sd = statistics.stdev(tp_80)
print('tp_80_mean: '+str(tp_80_mean))
print('tp_80_sd: '+str(tp_80_sd))
tp_90_mean = statistics.mean(tp_90)
tp_90_sd = statistics.stdev(tp_90)
print('tp_90_mean: '+str(tp_90_mean))
print('tp_90_sd: '+str(tp_90_sd))

test: ['RPL31', 'RPS3A', 'CSE1L', 'XAB2', 'PSMD7', 'SUPT6H']
test: ['EEF2', 'RPS11', 'SNRPD2', 'RPL37', 'SF3B3', 'DDX51']
test: ['RPL7', 'RPS9', 'KARS', 'SF3A1', 'RPL32', 'PSMB2']
test: ['RPS7', 'EIF4A3', 'U2AF1', 'PSMA1', 'PHB', 'POLR2D']
test: ['RPSA', 'RPL23A', 'NUP93', 'AQR', 'RPA2', 'SUPT5H']
test: ['RPL6', 'RPS13', 'SF3B2', 'RPS27A', 'PRPF31', 'COPZ1']
test: ['RPS4X', 'PSMD1', 'RPS14', 'NUP98', 'USP39', 'CDC5L']
test: ['RPL5', 'PHB2', 'RPS15A', 'RPS3', 'ARCN1', 'COPS6']
test: ['RPS6', 'PRPF19', 'RPL34', 'Hsp10', 'POLR2I', 'EIF5B']
auroc_mean: 0.8402434698783054
auroc_sd: 0.017096114410535924
auprc_mean: 0.5326705713945947
auprc_sd: 0.029089488378007556
tp_80_mean: 0.8210448665312134
tp_80_sd: 0.08843235627451937
tp_90_mean: 0.8753086419753087
tp_90_sd: 0.19982845866986979
