In [1]:
import argparse
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [2]:
stjude = "../data/THR24_lib_prep_details.tsv"

# Load the library-prep metadata and keep only the rows that
#   1. are not flagged as histone outliers (anything other than True is kept), and
#   2. are marked "Yes" in the inv10 column (included in the v10 compendium).
stjude_df = (
    pd.read_csv(stjude, sep='\t', index_col=0)
    .loc[lambda meta: meta["histone_outlier_flagged"].ne(True)]
    .loc[lambda meta: meta["inv10"].eq("Yes")]
)

In [3]:
# Preview the first rows of the filtered sample metadata.
stjude_df.head()

Unnamed: 0_level_0,site_sampleid,QC_Pass,original_reported_libPrep,histone_outlier_flagged,revised_lib_type,atypical_per_st_jude,inv10
THid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
THR24_1458_S01,SJACT006_D,True,polyA,False,,,Yes
THR24_1459_S01,SJAMLM7001_D,True,polyA,False,,,Yes
THR24_1460_S01,SJAMLM7003_D,True,polyA,False,,,Yes
THR24_1462_S01,SJAMLM7005_D,True,polyA,False,,,Yes
THR24_1463_S01,SJAMLM7006_D,True,polyA,False,,,Yes


In [4]:
# Load the tab-separated expression table; the first column becomes the row index.
# NOTE(review): rows are presumably sample IDs (they are looked up by THid below) - confirm.
v10_df = pd.read_csv("../data/Poly.tsv", index_col=0, sep='\t')


In [5]:
# Keep only the expression rows whose labels appear in the filtered metadata index.
stjudev10_expr = v10_df.loc[stjude_df.index]

In [8]:
def gene_checker(input_file, classifier_genes_path='../data_test/ClassifierGenes.txt'):
    '''
    Align an expression matrix to the classifier's gene list.

    The columns of the returned frame are exactly the classifier genes, in the
    order they appear in the gene-list file. Genes that the input does not
    contain are added as all-zero columns; genes the classifier does not use
    are dropped. Any NaN already present in a selected column is also
    replaced by 0.

    Parameters
    ----------
    input_file : pandas.DataFrame
        Expression matrix of shape (samples x genes); gene names are the columns.
        Gene column labels must be unique (reindexing a duplicated axis raises).
    classifier_genes_path : str, optional
        Path to a text file with one classifier gene per line. Defaults to the
        gene list shipped with the project.

    Returns
    -------
    new_input_file : pandas.DataFrame
        Shape (samples x classifier genes). The input frame is not modified.
    '''
    # ndmin=1 keeps a single-gene file as a 1-D array instead of a 0-d scalar.
    classifier_genes = np.loadtxt(classifier_genes_path, dtype='str', ndmin=1)

    # reindex selects the classifier genes in classifier order and creates NaN
    # columns for genes missing from the input. Unlike .loc with a list of
    # labels, it does not raise KeyError when some labels are absent.
    new_input_file = input_file.reindex(columns=classifier_genes)

    # Missing genes (and any pre-existing NaN values) become zero expression.
    new_input_file = new_input_file.fillna(0)

    return new_input_file

def deploy(input_file, model_path):
    '''
    Apply a pickled classifier to an expression matrix.

    Parameters
    ----------
    input_file : pandas.DataFrame or str or os.PathLike
        Expression matrix of shape (samples x genes), or a path to a
        tab-separated file holding one (first column is used as the index).
    model_path : str
        Path to a pickled model exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per sample (same index as the input) with columns
        ``Ribo`` (predicted label), ``Proba_0`` and ``Proba_1``.
        The three fixed column names require a model with exactly two classes.
    '''
    expr_input = input_file
    print('reading input...')
    if isinstance(expr_input, (str, os.PathLike)):
        expr_input = pd.read_csv(expr_input, sep='\t', index_col=0)

    print("before gene intersection...")
    print(expr_input.shape)
    expr_input = gene_checker(expr_input)  # align columns to the classifier's gene list
    print("After gene intersection...")
    print(expr_input.shape)

    print('applying model...')
    # NOTE(security): unpickling executes arbitrary code - only load model files
    # from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    print(model)

    predicted_labels = model.predict(expr_input)
    predicted_proba = model.predict_proba(expr_input)

    print("saving results...")
    # Stack the label column next to the per-class probabilities so each row
    # carries the prediction and its supporting probabilities.
    results = pd.DataFrame(
        np.hstack([predicted_labels.reshape(-1, 1), predicted_proba]),
        index=expr_input.index,
        columns=['Ribo', 'Proba_0', 'Proba_1'],
    )

    return results

## Applying balanced model on St. Jude samples

In [9]:
# Score the St. Jude expression subset with the balanced model; the result has
# one row per sample with columns Ribo, Proba_0 and Proba_1.
balanced_predictions = deploy(stjudev10_expr, "../models/RiboVsPoly_balanced.sav")

reading input...
before gene intersection...
(630, 25924)
After gene intersection...
(630, 5000)
applying model...
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


saving results...


In [10]:
# Tally how many samples the balanced model assigned to each predicted label.
balanced_predictions.Ribo.value_counts()

0.0    630
Name: Ribo, dtype: int64

## Applying unbalanced model on St. Jude samples

In [11]:
# Score the same St. Jude expression subset with the unbalanced model.
unbalanced_predictions = deploy(stjudev10_expr, "../models/RiboVsPoly_unbalanced.sav")

reading input...
before gene intersection...
(630, 25924)
After gene intersection...
(630, 5000)
applying model...
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=1, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


saving results...


In [12]:
# Tally how many samples the unbalanced model assigned to each predicted label.
unbalanced_predictions.Ribo.value_counts()

0.0    628
1.0      2
Name: Ribo, dtype: int64

## All St. Jude samples are expected to be PolyA
   * extract the misclassified samples
   * produce confusion matrices 

In [13]:
# List the samples the unbalanced model labelled 1, with their class probabilities.
# NOTE(review): label 1 presumably means ribo-depleted (column is named "Ribo") - confirm.
unbalanced_predictions[unbalanced_predictions.Ribo==1]

Unnamed: 0_level_0,Ribo,Proba_0,Proba_1
THid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
THR24_1858_S01,1.0,0.445577,0.554423
THR24_2125_S01,1.0,0.441047,0.558953


In [14]:
# Every St. Jude sample is expected to be PolyA, so the ground truth is all class 0.
true_labels = [0] * len(balanced_predictions)
# The Ribo column is stored as float (0.0 / 1.0); cast so labels match the int ground truth.
pred_labels = balanced_predictions.Ribo.astype(int).values

# Pin the label set so the matrix is always 2x2 (rows/cols: 0, 1). Without it,
# confusion_matrix only uses labels that actually occur, which collapses to a
# single 1x1 cell when every sample is predicted as class 0.
mat = confusion_matrix(true_labels, pred_labels, labels=[0, 1])

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(mat, annot=True, fmt='g', ax=ax)
ax.set_ylabel("True Label", fontsize=16)
ax.set_xlabel("Predicted Label", fontsize=16)
ax.set_title("Balanced Model on PolyA St.Jude", fontsize=17)
plt.show()


<Figure size 1000x700 with 2 Axes>