# Random Forests

In [1]:
%matplotlib inline
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 5)

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectFromModel

import os
import pickle

In [2]:
# Load the per-tissue expression tables and build the feature matrix / labels.
exp_dir = '/mnt/rna-seq-analysis/rna-seq-analysis/data/tissue-pairs'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# de-duplication below keeps the first occurrence of each sample column.
tissues = sorted(os.listdir(exp_dir))
tsv = 'combined-gtex-tcga-counts-protein-coding.tsv'
exp_df = pd.concat([pd.read_csv(os.path.join(exp_dir, t, tsv), sep='\t', index_col=0) for t in tissues], axis=1)
# Remove dupes: collapse columns that share the same sample name
exp_df = exp_df.T.groupby(level=0).first().T
# Subset by normal / tumor samples (GTEX-prefixed names, or names ending in 01 / 11)
samples = [x for x in exp_df.columns if x.startswith('GTEX') or x.endswith(('01', '11'))]
exp_df = exp_df[samples]
# Transpose so genes (features) are columns
exp_df = exp_df.T

# Get Y
# NOTE: pickle can execute arbitrary code on load; only use trusted files here.
with open('../../data/tissue_map.pickle', 'rb') as f:
    tissue_map = pickle.load(f)
y = np.array([tissue_map[x] for x in exp_df.index])

# UCSF Subset
with open('/mnt/rna-seq-analysis/rna-seq-analysis/metadata/UCSF-RNAPanel-Final-412-genes.csv', 'r') as f:
    ucsf_genes = [line.strip() for line in f]

# Pickles are binary data: open in 'rb' (text mode fails on Python 3 / Windows).
with open('../../data/gene_map.pickle', 'rb') as f:
    gene_map = pickle.load(f)
# Translate column ids through gene_map, keeping ids that have no mapping.
genes = [gene_map.get(x, x) for x in exp_df.columns]
exp_df.columns = genes
# Keep only the panel genes that are actually present in the expression data.
available_genes = set(genes)
ucsf_genes = [x for x in ucsf_genes if x in available_genes]
# NOTE(review): if gene_map sends several ids to the same name, this selection
# returns every matching column -- confirm the mapped names are unique.
ucsf_df = exp_df[ucsf_genes]

def test_clf(clf, X, y):
    """Cross-validate ``clf`` on three versions of the feature matrix.

    The classifier is scored with ``cross_val_score`` (default CV splitting,
    all cores) on:

    1. the raw values,
    2. ``log2(X + 1)`` transformed values,
    3. standardized values, with the scaler fitted inside each training fold.

    The mean and standard deviation of each set of fold scores is printed.

    Parameters
    ----------
    clf : scikit-learn estimator
        Unfitted classifier; it is cloned by ``cross_val_score``.
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(raw_scores, log_scores, scale_scores)``, one score per fold.
    """
    # Local imports keep this cell runnable without touching the import cell.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    def _report(label, scores):
        # Single-argument print works as both a Python 2 statement and a
        # Python 3 function call.
        print('{} {} {}'.format(label, np.mean(scores), np.std(scores)))

    # Raw
    raw_scores = cross_val_score(clf, X, y, n_jobs=-1)
    _report('Raw', raw_scores)
    # Log scale (element-wise, so it cannot leak information across folds)
    ln_X = np.log2(X + 1)
    log_scores = cross_val_score(clf, ln_X, y, n_jobs=-1)
    _report('log2', log_scores)
    # Feature Scaling: standardize inside the pipeline so the mean / variance
    # are estimated from each training fold only. Scaling the full matrix
    # up front would let the held-out samples influence the transform.
    scaled_clf = make_pipeline(StandardScaler(), clf)
    scale_scores = cross_val_score(scaled_clf, X, y, n_jobs=-1)
    _report('scale', scale_scores)
    return raw_scores, log_scores, scale_scores

## RandomForest

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Fix the seed so the forest (and the scores below) are reproducible on re-run.
clf = RandomForestClassifier(random_state=42)

In [5]:
# Score the current `clf` on the full expression matrix (raw, log2 and scaled inputs).
raw_rf, ln_rf, scale_rf = test_clf(clf, exp_df, y)

Raw 0.839888196127 0.0557017425924
log2 0.840533909584 0.0553764807566
scale 0.839888196127 0.0557017425924


In [6]:
# Same evaluation, restricted to the UCSF panel genes found in the data.
raw_rf_ucsf, ln_rf_ucsf, scale_rf_ucsf = test_clf(clf, ucsf_df, y)

Raw 0.791026355257 0.0662608180184
log2 0.792128077838 0.0654161513826
scale 0.791118207925 0.0661498338899


## ExtraTrees

In [7]:
from sklearn.ensemble import ExtraTreesClassifier

In [8]:
# Fix the seed so the randomized trees (and the scores below) are reproducible on re-run.
clf = ExtraTreesClassifier(random_state=42)

In [9]:
# Score the current `clf` on the full expression matrix (raw, log2 and scaled inputs).
raw_et, ln_et, scale_et = test_clf(clf, exp_df, y)

Raw 0.787157414782 0.0673018753215
log2 0.831152056753 0.0619894331496
scale 0.787157414782 0.0673018753215


In [10]:
# Same evaluation, restricted to the UCSF panel genes found in the data.
raw_et_ucsf, ln_et_ucsf, scale_et_ucsf = test_clf(clf, ucsf_df, y)

Raw 0.742786346135 0.0685745015462
log2 0.801669789498 0.0464330150942
scale 0.742786346135 0.0685745015462


## AdaBoost

In [11]:
from sklearn.ensemble import AdaBoostClassifier

In [12]:
# Fix the seed so the boosted ensemble (and the scores below) are reproducible on re-run.
clf = AdaBoostClassifier(random_state=42)

In [13]:
# Score the current `clf` on the full expression matrix (raw, log2 and scaled inputs).
raw_ad, ln_ad, scale_ad = test_clf(clf, exp_df, y)

Raw 0.271439645742 0.0583492633856
log2 0.271531854133 0.0584754966475
scale 0.26646039263 0.0515634900851


In [14]:
# Same evaluation, restricted to the UCSF panel genes found in the data.
raw_ad_ucsf, ln_ad_ucsf, scale_ad_ucsf = test_clf(clf, ucsf_df, y)

Raw 0.409128730937 0.0531409818434
log2 0.409128730937 0.0531409818434
scale 0.409128730937 0.0531409818434
