In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cashdir
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')


last updated: 2016-09-20 

CPython 3.5.2
IPython 5.1.0


In [2]:
#imports
import os
import pickle
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib notebook

from bokeh.charts import Line, output_notebook, show
from bokeh.models import Span

from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import IncrementalPCA

In [3]:
# Direct bokeh output to the notebook so that show() renders plots inline
output_notebook()

In [4]:
# Import data.
#
# The cleaned count matrix is cached on disk as an .npy file, with its column
# and row labels pickled alongside it. If any of the three files is missing,
# everything is rebuilt from the raw htseq table.
MATRIX_NPY = '../../output/htseq_cleaned_matrix.npy'
COLS_PKL = '../../output/htseq_cleaned_matrix_cols.pkl'
ROWS_PKL = '../../output/htseq_cleaned_matrix_rows.pkl'

if not all(os.path.exists(path) for path in (MATRIX_NPY, COLS_PKL, ROWS_PKL)):

    htseq = pd.read_csv('../../data/zhenxia/htseq_merge_10755.txt', sep='\t', index_col=0)
    print(htseq.shape)

    # Drop ERCC columns, the '__'-prefixed htseq columns, and columns that sum to 0
    htseq.drop([x for x in htseq.columns if x.startswith('ERCC')], inplace=True, axis=1)
    htseq.drop([x for x in htseq.columns if x.startswith('__')], inplace=True, axis=1)
    htseq.drop(htseq.columns[htseq.sum() == 0], inplace=True, axis=1)
    print(htseq.shape)

    # Save data matrix for memmap (np.save writes the .npy header + raw data)
    np.save(MATRIX_NPY, htseq.values)

    with open(COLS_PKL, 'wb') as OUT:
        pickle.dump(htseq.columns.tolist(), OUT)

    with open(ROWS_PKL, 'wb') as OUT:
        pickle.dump(htseq.index.tolist(), OUT)

    del htseq

# Grab a read-only memory-mapped view of the matrix.
# np.load(..., mmap_mode='r') parses the .npy header, so dtype, shape and data
# offset all come from the file itself. A raw np.memmap(path, shape=...) would
# default to uint8 at offset 0 and read the header bytes as data.
mm = np.load(MATRIX_NPY, mmap_mode='r')

# The label files are produced by this notebook; do not point these paths at
# untrusted pickles.
with open(COLS_PKL, 'rb') as IN:
    cols = np.array(pickle.load(IN))

with open(ROWS_PKL, 'rb') as IN:
    rows = np.array(pickle.load(IN))

# Fail early if the cached labels and the cached matrix are out of sync
assert mm.shape == (len(rows), len(cols)), \
    'cached matrix shape {} does not match labels ({}, {})'.format(mm.shape, len(rows), len(cols))

In [8]:
# Show the label vector shapes (columns first, then rows) followed by the
# matrix shape so the three can be compared by eye
for labels in (cols, rows):
    print(labels.shape)
mm.shape

(16995,)
(10755,)


(10755, 16995)

In [5]:
# Drop rows whose entries are all 0, keeping the row labels in step with the matrix
r_all_zero = np.ravel((mm == 0).all(axis=1))
keep_rows = np.logical_not(r_all_zero)
rows = rows[keep_rows]
# np.array(...) materialises the selection as an in-memory array
mm = np.array(mm[keep_rows, :])

In [6]:
# Drop columns whose entries are all 0, keeping the column labels in step with the matrix
c_all_zero = np.ravel((mm == 0).all(axis=0))
keep_cols = np.logical_not(c_all_zero)
cols = cols[keep_cols]
mm = np.array(mm[:, keep_cols])

In [7]:
# Standardize the data by centering the mean and scaling to unit standard deviation
scaled = scale(mm)
scaled.shape



(10726, 16995)

In [None]:
# Fit initial PCA without limiting the number of components, so the full
# explained-variance profile is available for choosing a cutoff below
pca = IncrementalPCA(whiten=True)
res = pca.fit(scaled)

In [None]:
# Inspect the dimensions of the fitted components matrix
res.components_.shape

In [None]:
# Plot cumulative explained variance to decide cutoff
cumulative_variance = np.cumsum(res.explained_variance_ratio_)
p = Line(
    cumulative_variance,
    legend=False,
    tools="pan,box_zoom,wheel_zoom,save,reset,crosshair"
)

# Dashed horizontal guide at 95% cumulative explained variance
ninety_five_pct = Span(
    location=.95,
    dimension='width',
    line_color='blue',
    line_dash='dashed'
)
p.renderers.append(ninety_five_pct)
show(p)

In [None]:
# Re-Run PCA setting the number of clusters to XXX
%%cache -s pca_reduced_htseq_counts.pkl reduced
pca = IncrementalPCA(n_clusters=XXX, whiten=True)
reduced = pca.fit_transform(scaled)