In [1]:
%matplotlib inline
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 5)

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectFromModel

import os
import pickle

In [2]:
exp_dir = '/mnt/rna-seq-analysis/rna-seq-analysis/data/tissue-pairs'
tsv = 'combined-gtex-tcga-counts-protein-coding.tsv'

# Every entry of exp_dir is expected to be a directory holding one `tsv` table.
# os.listdir() returns entries in arbitrary order; sorting makes the choice of
# which duplicate column survives the de-duplication below reproducible.
tissues = sorted(os.listdir(exp_dir))
exp_df = pd.concat(
    [pd.read_csv(os.path.join(exp_dir, t, tsv), sep='\t', index_col=0) for t in tissues],
    axis=1,
)
# Remove dupes: collapse columns that share a name, keeping the first
# non-null value per row (groupby also sorts the column labels).
exp_df = exp_df.T.groupby(level=0).first().T
# Subset by normal / tumor samples
# (presumably the '01' / '11' suffixes are TCGA sample-type codes -- TODO confirm)
samples = [x for x in exp_df.columns if x.startswith('GTEX') or x.endswith(('01', '11'))]
exp_df = exp_df[samples]
# Transpose so genes (features) are columns
exp_df = exp_df.T

# Get Y
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/tissue_map.pickle', 'rb') as f:
    tissue_map = pickle.load(f)
# One label per sample row; raises KeyError if a sample is missing from the map.
y = np.array([tissue_map[x] for x in exp_df.index])

# UCSF Subset
with open('/mnt/rna-seq-analysis/rna-seq-analysis/metadata/UCSF-RNAPanel-Final-412-genes.csv', 'r') as f:
    ucsf_genes = [line.strip() for line in f]

# Pickles are binary data, so open in 'rb' (text mode breaks on Python 3 / Windows).
with open('../../data/gene_map.pickle', 'rb') as f:
    gene_map = pickle.load(f)
# Rename columns through gene_map; identifiers without a mapping are kept as-is.
genes = [gene_map.get(x, x) for x in exp_df.columns]
exp_df.columns = genes
# Keep only the panel genes that are present in the expression matrix.
# A set gives O(1) membership tests instead of scanning the full gene list each time.
available_genes = set(genes)
ucsf_genes = [x for x in ucsf_genes if x in available_genes]
ucsf_df = exp_df[ucsf_genes]

def test_clf(clf, X, y):
    """Cross-validate ``clf`` on three preprocessing variants of ``X``.

    The classifier is scored with ``cross_val_score`` (default folds and
    scoring, all CPU cores) on, in order:

    1. the raw values,
    2. ``log2(x + 1)``-transformed values,
    3. values standardised with ``sklearn.preprocessing.scale``.

    For each variant a line ``<label> <mean> <std>`` of the fold scores is
    printed.

    Parameters
    ----------
    clf : estimator
        Passed unchanged to ``cross_val_score``.
    X : pandas.DataFrame or array-like
        Feature matrix; values must be greater than -1 for the log transform.
    y : array-like
        Target labels handed to ``cross_val_score`` together with ``X``.

    Returns
    -------
    tuple
        ``(raw_scores, log_scores, scale_scores)`` -- the per-fold score
        arrays returned by ``cross_val_score``.
    """
    def _score(label, features):
        # Score one variant and report mean / std of the fold scores.
        scores = cross_val_score(clf, features, y, n_jobs=-1)
        # A single pre-formatted string prints identically whether `print`
        # is a statement (Python 2) or a function (Python 3).
        print('%s %s %s' % (label, np.mean(scores), np.std(scores)))
        return scores

    # Raw
    raw_scores = _score('Raw', X)
    # Log scale: vectorised over the whole matrix; +1 keeps zero counts finite.
    log_scores = _score('log2', np.log2(X + 1))
    # Feature Scaling
    # NOTE(review): scale() is computed on all rows before the folds are split,
    # so held-out folds contribute to the scaling statistics. Use a Pipeline
    # with StandardScaler if a leakage-free estimate is required.
    scale_scores = _score('scale', scale(X))
    return raw_scores, log_scores, scale_scores

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# k-nearest-neighbours classifier constructed with the library's default settings.
clf = KNeighborsClassifier()

In [5]:
# Evaluate the classifier on the full expression matrix (all gene columns).
# The third result is bound to `scal`, which leaves the imported `scale` function unshadowed.
raw, ln, scal = test_clf(clf, exp_df, y)

Raw 0.719224629171 0.067722356217
log2 0.87137831082 0.0553835576314
scale 0.773951486398 0.0933382433949


In [6]:
# Same evaluation, restricted to the UCSF panel gene columns (ucsf_df).
raw_ucsf, ln_ucsf, scale_ucsf = test_clf(clf, ucsf_df, y)

Raw 0.629278815942 0.062727746354
log2 0.882490019985 0.0408082576634
scale 0.78384979218 0.0643725431969
