In [1]:
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_curve,average_precision_score
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
def cross_validate(x, y, folds=10, model=None):
    '''
    Run shuffled, stratified k-fold cross validation and collect per-fold results.

    Parameters
    ----------
    x : array indexable by integer index arrays (e.g. a numpy array); one row per sample.
    y : array-like of labels, one per sample. It is flattened to 1-D, so a
        single-column 2-D array (such as DataFrame.values) is accepted.
    folds : number of splits passed to StratifiedKFold (default 10).
    model : unfitted estimator exposing fit / predict_proba / score and
        compatible with sklearn.base.clone. Required despite the None default.

    Returns
    -------
    dict with one list per key, each holding one entry per fold:
        'estimators'        - the estimator fitted on that fold's training split
        'test_acc'          - model.score on the held-out split
        'precision'         - precision array from precision_recall_curve
        'recall'            - recall array from precision_recall_curve
        'average_precision' - average_precision_score on the held-out split

    Raises
    ------
    ValueError if model is None.
    '''
    # Local import so the function does not rely on an extra top-of-file import.
    from sklearn.base import clone

    if model is None:
        raise ValueError('cross_validate requires an estimator to be passed as `model`')

    # Flatten once so fitting and scoring all see 1-D labels.
    y = np.asarray(y).ravel()

    kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    cv_results = {
        'estimators': [],
        'test_acc': [],
        'precision': [],
        'recall': [],
        'average_precision': [],
    }
    print('cross validating...')

    for train_idx, test_idx in kfold.split(x, y):
        x_train, y_train = x[train_idx], y[train_idx]
        x_test, y_test = x[test_idx], y[test_idx]

        # Fit a fresh copy for every fold. Re-fitting the single `model` object
        # would make every entry of cv_results['estimators'] point at the same
        # instance, which only remembers the last fold.
        fold_model = clone(model)
        fold_model.fit(x_train, y_train)

        # Column 1 of predict_proba is used as the positive-class score.
        y_proba = fold_model.predict_proba(x_test)[:, 1]
        acc = fold_model.score(x_test, y_test)
        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        mean_precision = average_precision_score(y_test, y_proba)

        cv_results['estimators'].append(fold_model)
        cv_results['test_acc'].append(acc)
        cv_results['precision'].append(precision)
        cv_results['recall'].append(recall)
        cv_results['average_precision'].append(mean_precision)

    return cv_results

def get_best_model(cv_results):
    '''
    Select the estimator from the cross-validation fold with the highest
    average precision.

    Parameters
    ----------
    cv_results : dict as returned by cross_validate; the parallel lists under
        'estimators' and 'average_precision' are read (one entry per fold).

    Returns
    -------
    (best_score, best_estimator) : the highest per-fold average precision and
        the estimator stored for that fold. Both are None when there are no
        estimators. If several folds tie, the earliest one is kept.
    '''
    best_score = None
    best_estimator = None
    # Rank folds by their scalar average precision. The 'precision' entries are
    # whole precision-recall curves (arrays), which cannot be ordered directly.
    for estimator, score in zip(cv_results['estimators'], cv_results['average_precision']):
        if best_score is None or score > best_score:
            best_score = score
            best_estimator = estimator
    return best_score, best_estimator
    
    

## Cross-validating with balanced data
   * After `get_best_model` is run, the best model for the balanced data is saved to disk.

In [3]:

    
# Inputs for the balanced run: feature table, label table, and where to save the model.
data = "../data_test/MergedData_Balanced.tsv"
labels = "../data_test/MergedLabels_Balanced.tsv"
save_best = True
model_fn = "../models/RiboVsPoly_balanced.sav"

# Gene identifiers used to select the feature columns below.
classifier_genes = np.loadtxt('../data/ClassifierGenes.txt', dtype='str')

if '.tsv' in data:
    X = pd.read_csv(data, sep='\t', index_col=0)
    Y = pd.read_csv(labels, sep='\t', index_col=0)
else:
    raise ValueError('File does not appear to be tab delimited due to erronious extension. Make sure the file is tab delimited')

# Keep only the classifier genes (columns), in the order listed in the gene file,
# so the feature matrix matches the dimensionality the classifier expects.
X = X.loc[:, classifier_genes]
X = X.fillna(0)


model = RandomForestClassifier(n_estimators=1000, max_depth=5,random_state=42, oob_score=True, n_jobs=-1, verbose=1)
cv_results = cross_validate(X.values, Y.values, model=model)

cv_mean_precision = np.mean(cv_results['average_precision'])
print('10-Fold CV average precision: %.3f'%cv_mean_precision)

if save_best:
    best_score, best_estimator = get_best_model(cv_results)
    # Context manager guarantees the file is flushed and closed after pickling.
    with open(model_fn, 'wb') as model_file:
        pickle.dump(best_estimator, model_file)

#np.save('./results/RF_10-foldCV.npy',cv_results)



cross validating...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 770 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.0s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 420 tasks      | elapsed:    0.2s
[Parallel(n_jobs=15)]: Done 770 tasks      | elapsed:    0.3s
[Parallel(n_jobs=15)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elaps

10-Fold CV average precision: 0.998


## Cross-validating with unbalanced data
   * After `get_best_model` is run, the best model for the unbalanced data is saved to disk.

In [5]:

    
# Inputs for the unbalanced run: feature table, label table, and where to save the model.
# NOTE(review): "Unabalanced" in the two paths below looks like a misspelling of
# "Unbalanced" - confirm against the actual file names on disk before changing.
data = "../data_test/MergedData_Unabalanced.tsv"
labels = "../data_test/MergedLabels_Unabalanced.tsv"
save_best = True
model_fn = "../models/RiboVsPoly_unbalanced.sav"

# Gene identifiers used to select the feature columns below.
classifier_genes = np.loadtxt('../data/ClassifierGenes.txt', dtype='str')

if '.tsv' in data:
    X = pd.read_csv(data, sep='\t', index_col=0)
    Y = pd.read_csv(labels, sep='\t', index_col=0)
else:
    raise ValueError('File does not appear to be tab delimited due to erronious extension. Make sure the file is tab delimited')

# Keep only the classifier genes (columns), in the order listed in the gene file,
# so the feature matrix matches the dimensionality the classifier expects.
X = X.loc[:, classifier_genes]
X = X.fillna(0)


model = RandomForestClassifier(n_estimators=1000, max_depth=5,random_state=42, oob_score=True, n_jobs=-1, verbose=1)
cv_results = cross_validate(X.values, Y.values, model=model)

cv_mean_precision = np.mean(cv_results['average_precision'])
print('10-Fold CV average precision: %.3f'%cv_mean_precision)

if save_best:
    best_score, best_estimator = get_best_model(cv_results)
    # Context manager guarantees the file is flushed and closed after pickling.
    with open(model_fn, 'wb') as model_file:
        pickle.dump(best_estimator, model_file)

#np.save('./results/RF_10-foldCV.npy',cv_results)



cross validating...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 770 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.3s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 420 tasks      | elapsed:    0.2s
[Parallel(n_jobs=15)]: Done 770 tasks      | elapsed:    0.4s
[Parallel(n_jobs=15)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elaps

10-Fold CV average precision: 0.998
