In [1]:
import pandas as pd
import numpy as np
import pickle
import argparse
import sys
import os

### Following is code from RF_deploy.py

In [2]:
def gene_checker(input_file):
    '''
    Function checks the input file's genes.
    The intersection between the input file's genes and the predetermined classifier genes is taken.
    Then the function checks for genes are not present in the input file.
    For those genes, the expression vector is set to 0.
    A new dataframe is returned with the correct order of genes.

    Parameters
    ----------
    input_file; pandas df - shape should be (samples x genes)
    
    Returns
    ---------
    new_input_file; pandas df - shape (samples x genes)
    '''
    
    classifier_genes = np.loadtxt('../ClassifierGenes.txt', dtype='str')
    new_input_file = input_file.T.loc[classifier_genes].T # seleting classifier selected genes in the classifier determined order
    
    # will fill genes that do not exist in the input with zero
    # if no NAN values, none will be filled
    new_input_file = new_input_file.fillna(0) 
    
    return new_input_file

def deploy(input_file, model_path):
    classifier_genes = np.loadtxt('../ClassifierGenes.txt', dtype='str')

    expr_input = input_file
    
    print('reading input...') 
    expr_input = pd.read_csv(expr_input, sep='\t', index_col=0)

    print("before gene intersection...")
    print(expr_input.shape)
    expr_input = gene_checker(expr_input) # making sure genes are correct for classifier
    print("After gene intersection...")
    print(expr_input.shape)

    print('applying model...')
    model = pickle.load(open(model_path, 'rb'))
    print(model)
    
    predictions = model.predict(expr_input)
    predict_proba = model.predict_proba(expr_input)
    
    print("saving results...")
    predictions = pd.DataFrame(np.hstack([predictions.reshape(-1,1), predict_proba]), index=expr_input.index, columns=['Ribo', 'Proba_0', 'Proba_1'])

    return predictions

## Applying balanced model on openPBTA polyA samples

In [3]:
balanced_predictions = deploy("../data/pbta-gene-expression-rsem-tpm.polya.csv", "../RiboVsPoly_balanced.sav")

reading input...
before gene intersection...
(58, 58347)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


After gene intersection...
(58, 25924)
applying model...
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=1,
                       warm_start=False)


[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 420 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 770 tasks      | elapsed:    0.3s
[Parallel(n_jobs=15)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 420 tasks      | elapsed:    0.2s


saving results...


[Parallel(n_jobs=15)]: Done 770 tasks      | elapsed:    0.3s
[Parallel(n_jobs=15)]: Done 1000 out of 1000 | elapsed:    0.4s finished


In [4]:
# All samples were predicted to be polyA
balanced_predictions.Ribo.value_counts()

0.0    58
Name: Ribo, dtype: int64

## Applying unbalanced model on openPBTA polyA samples

In [5]:
unbalanced_predictions = deploy("../data/pbta-gene-expression-rsem-tpm.polya.csv", "../RiboVsPoly_unbalanced.sav")

reading input...
before gene intersection...
(58, 58347)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:    0.0s


After gene intersection...
(58, 25924)
applying model...
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=1,
                       warm_start=False)


[Parallel(n_jobs=15)]: Done 420 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 770 tasks      | elapsed:    0.2s
[Parallel(n_jobs=15)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.0s
[Parallel(n_jobs=15)]: Done 170 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 420 tasks      | elapsed:    0.1s
[Parallel(n_jobs=15)]: Done 770 tasks      | elapsed:    0.3s
[Parallel(n_jobs=15)]: Done 1000 out of 1000 | elapsed:    0.4s finished


saving results...


In [6]:
unbalanced_predictions.Ribo.value_counts()

0.0    58
Name: Ribo, dtype: int64

# All polyA samples were classified correctly