In [1]:
%matplotlib inline
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 5)

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectFromModel

import os
import pickle

# Tissue Classifier 

Goal: Build a classifier that calls what tissue a sample originated from, given a set of genes as input.

We'll test a myriad of options:

- Data normalization
- Model Selection
- Feature Selection
- Hyperparameter Tuning
- UCSF Oncogene subset (feature selection based on priors)

First, we'll prepare our dataframe, which in machine learning land is our data matrix $X$, organized _tidily_, i.e. **rows** are observations (samples) and **columns** are features (genes). 

In [3]:
exp_dir = '/mnt/rna-seq-analysis/rna-seq-analysis/data/tissue-pairs'
tissues = os.listdir(exp_dir)
tsv = 'combined-gtex-tcga-counts-protein-coding.tsv'
exp_df = pd.concat([pd.read_csv(os.path.join(exp_dir, t, tsv), sep='\t', index_col=0) for t in tissues], axis=1)
# Remove dupes
exp_df = exp_df.T.groupby(level=0).first().T
# Subset by normal / tumor samples
samples = [x for x in exp_df.columns if x.startswith('GTEX') or (x.endswith('01') or x.endswith('11'))]
exp_df = exp_df[samples]
# Transpose so genes (features) are columns
exp_df = exp_df.T

In [4]:
print exp_df.shape
exp_df.head()

(10864, 19797)


Unnamed: 0,ENSG00000116032.5,ENSG00000171174.13,...,ENSG00000167747.13,ENSG00000181518.3
GTEX-1117F-2226-SM-5N9CH,24.000759,259.998915,...,632.982848,0.0
GTEX-1117F-2426-SM-5EGGH,11.999642,57.000764,...,445.990389,0.0
GTEX-1117F-2826-SM-5GZXL,16.000438,81.002728,...,549.004648,0.0
GTEX-111CU-0126-SM-5GZWZ,3.999903,934.30948,...,913.982284,0.0
GTEX-111CU-0226-SM-5GZXC,22.000607,229.992148,...,1629.953574,1.0


### Unnormalized
Let's see how well the raw data performs before adding in normalization (guess: not great).

Since we're doing _supervised_ tissue classification, we'll need a y-vector that maps each sample to its corresponding tissue of origin. 

In [5]:
tissue_map = pickle.load(open('../../data/tissue_map.pickle', 'rb'))

In [6]:
y = np.array([tissue_map[x] for x in exp_df.index])

We'll start naively by selecting a model based on the [SKLearn cheat sheet](http://scikit-learn.org/stable/tutorial/machine_learning_map/), although we will reconsider model selection later on.

In [7]:
model = LinearSVC()

In [7]:
raw_score = cross_val_score(model, exp_df, y, n_jobs=-1)

In [8]:
raw_mean = np.mean(raw_score)
raw_std = np.std(raw_score)
print raw_mean, raw_std

0.843759973539 0.0598843137001


### Log Normalization
We'll start with log2(x + 1) normalization, which we don't necessarily expect to perform well given SVM's preference for data centered around 0. 

In [None]:
ln_df = exp_df.apply(lambda x: np.log2(x + 1))

In [None]:
ln_df.head()

In [11]:
ln_score = cross_val_score(model, ln_df, y, n_jobs=-1)

In [12]:
ln_mean = np.mean(ln_score)
ln_std = np.std(ln_score)
print ln_mean, ln_std

0.903411473996 0.0581555737476


### Scale

In [8]:
scale_df = scale(exp_df)

In [9]:
pd.DataFrame(scale_df).head()

Unnamed: 0,0,1,...,19795,19796
0,-0.062156,-0.275664,...,-0.610636,-0.180868
1,-0.17154,-0.726695,...,-0.649652,-0.180868
2,-0.135075,-0.673366,...,-0.628158,-0.180868
3,-0.244454,1.222552,...,-0.552005,-0.180868
4,-0.080387,-0.342335,...,-0.402617,0.782809


In [17]:
scale_score = cross_val_score(model, scale_df, y, n_jobs=-1)

In [18]:
scale_mean = np.mean(scale_score)
scale_std = np.std(scale_score)
print scale_mean, scale_std

0.874711009657 0.0677085124036


### Feature Selection

In [10]:
sfm = SelectFromModel(model)

In [12]:
new = sfm.fit_transform(scale_df, y)

In [13]:
new.shape

(10864, 9021)

In [15]:
fs_score = cross_val_score(model, new, y, n_jobs=-1)

In [19]:
fs_mean = np.mean(fs_score)
fs_std = np.std(fs_score)
print fs_mean, fs_std

0.884736283471 0.0626260771137


#### log df

In [None]:
new = sfm.fit_transform(ln_df, y)

In [None]:
new.shape

In [None]:
fs_score = cross_val_score(model, new, y, n_jobs=-1)

In [30]:
fs_mean = np.mean(fs_score)
fs_std = np.std(fs_score)
print fs_mean, fs_std

0.909842732963 0.0523932713832
