In [23]:
%matplotlib inline
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 5)

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

import os
import pickle

In [18]:
exp_dir = '/mnt/rna-seq-analysis/rna-seq-analysis/data/tissue-pairs'
tissues = os.listdir(exp_dir)
tsv = 'combined-gtex-tcga-counts-protein-coding.tsv'
exp_df = pd.concat([pd.read_csv(os.path.join(exp_dir, t, tsv), sep='\t', index_col=0) for t in tissues], axis=1)
# Remove dupes
exp_df = exp_df.T.groupby(level=0).first().T
# Subset by normal / tumor samples
samples = [x for x in exp_df.columns if x.startswith('GTEX') or (x.endswith('01') or x.endswith('11'))]
exp_df = exp_df[samples]
# Transpose so genes (features) are columns
exp_df = exp_df.T

# Get Y
tissue_map = pickle.load(open('../../data/tissue_map.pickle', 'rb'))
y = np.array([tissue_map[x] for x in exp_df.index])

# UCSF Subset
with open('/mnt/rna-seq-analysis/rna-seq-analysis/metadata/UCSF-RNAPanel-Final-412-genes.csv', 'r') as f:
    ucsf_genes = [x.strip() for x in f.readlines()]

gene_map = pickle.load(open('../../data/gene_map.pickle', 'r'))
genes = [gene_map[x] if x in gene_map else x for x in exp_df.columns]
exp_df.columns = genes
ucsf_genes = [x for x in ucsf_genes if x in genes]
ucsf_df = exp_df[ucsf_genes]

def test_clf(clf, X, y):
    # Raw
    raw_scores = cross_val_score(clf, X, y, n_jobs=-1)
    print 'Raw', np.mean(raw_scores), np.std(raw_scores)
    # Log scale
    ln_X = X.apply(lambda x: np.log2(x + 1))
    log_scores = cross_val_score(clf, ln_X, y, n_jobs=-1)
    print 'log2', np.mean(log_scores), np.std(log_scores)
    # Feature Scaling
    scale_X = scale(X)
    scale_scores = cross_val_score(clf, scale_X, y, n_jobs=-1)
    print 'scale', np.mean(scale_scores), np.std(scale_scores)
    return raw_scores, log_scores, scale_scores

In [19]:
from sklearn.linear_model import SGDClassifier

In [20]:
clf = SGDClassifier(loss="hinge", penalty="l2")

In [21]:
raw, ln, sca = test_clf(clf, exp_df, y)

Raw 0.794983802024 0.0605639907105
log2 0.831320329812 0.0632056122518
scale 0.870463301741 0.0644092770492


In [22]:
raw_ucsf, ln_ucsf, scale_ucsf = test_clf(clf, ucsf_df, y)

Raw 0.669619079012 0.042804635202
log2 0.811544414175 0.0359492988998
scale 0.854815134315 0.0629421208063


## Pipeline with Feature Selection

In [25]:
clf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC())),
  ('classification', SGDClassifier(loss='hinge', penalty='l2'))
])

In [None]:
p_score = cross_val_score(clf, scale(exp_df), y, n_jobs=-1)

In [30]:
print np.mean(p_score), np.std(p_score)

0.876627316106 0.0592648140207
