In [1]:
import copy
import os
import warnings
from collections import Counter

import matplotlib as mpl
import numpy as np
import pandas as pd
from hmmlearn import hmm
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import BayesianGaussianMixture

In [2]:
## Source: https://github.com/ShawnLYU/Quantile_Normalize/blob/master/quantile_norm.py

def quantileNormalize(df_input):
    """Quantile-normalise the columns of a DataFrame against each other.

    Every column is sorted, the sorted columns are averaged row-wise to give
    a reference distribution, and each original value is replaced by the
    reference value found at its position in its own sorted column.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric table; it is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df_input``, holding the normalised values.

    Notes
    -----
    Tied values within a column all receive the reference value at the
    lowest position of the tie (``np.searchsorted`` with its default
    ``side='left'``); they are not averaged.
    """
    df = df_input.copy()

    # Sort every column exactly once; the sorted arrays serve both the
    # reference distribution and the position look-up below.
    sorted_cols = {col: np.sort(df[col].values) for col in df}

    # Row-wise mean of the sorted columns = reference value for each rank.
    reference = pd.DataFrame(sorted_cols).mean(axis=1).values

    for col in df:
        positions = np.searchsorted(sorted_cols[col], df[col].values)
        df[col] = reference[positions]
    return df





# Load every '*matrix.bed' table found in FEATURE_DIR into one DataFrame.
# Each file starts with a tab-separated header line; in every line the first
# field is dropped and the remaining fields are kept as feature values.
FEATURE_DIR = 'features'

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of sample_df (and of Features.csv) the same from run to run.
files = sorted(f for f in os.listdir(FEATURE_DIR) if 'matrix.bed' in f)
if not files:
    raise IOError("no 'matrix.bed' files found in %r" % FEATURE_DIR)

samples = []
feature_names = None
for f in files:
    sample_name = f.replace('_matrix.bed', '')
    # 'with' closes the handle even if a line fails to parse.
    with open(os.path.join(FEATURE_DIR, f), 'r') as fn:
        header = fn.readline().rstrip().split('\t')[1:]
        if feature_names is None:
            feature_names = header
        elif header != feature_names:
            # All rows share one set of column labels, so a differing header
            # would silently mislabel this file's values.
            raise ValueError('header of %s differs from header of %s' % (f, files[0]))
        for line in fn:
            samples.append([sample_name] + line.rstrip().split('\t')[1:])

if not samples:
    raise ValueError('the matrix files in %r contain no data rows' % FEATURE_DIR)

# The first field of every row is the sample name; use it as the index.
sample_df = pd.DataFrame(samples)
sample_df.index = sample_df[0]
del sample_df[0]
sample_df.columns = np.array(feature_names)

sample_df = sample_df.astype('float')
sample_df.to_csv('Features.csv', sep='\t')

In [3]:
### Plot the raw feature table (sample_df) as a heat map.

plt.figure()
X = np.array(sample_df)
# Mask entries with |value| < 1e-3 so they are drawn in the 'bad' colour.
X = np.ma.masked_where(np.abs(X) < 1e-3, X)
# Work on a private copy: calling set_bad() on mpl.cm.jet itself would change
# the shared colormap object for every later plot that uses 'jet'.
cmap = copy.copy(mpl.cm.jet)
cmap.set_bad('white')
cax = plt.imshow(X, interpolation='nearest', cmap=cmap, aspect='auto')
plt.colorbar(cax)
plt.savefig('Feaures_Raw.png')



In [5]:
# Keep sample_df untouched; all later steps work on this copy.
data = sample_df.copy()


def _zscore(column):
    """Centre one column on its mean and scale it by its standard deviation."""
    return (column - np.mean(column)) / np.std(column)


# Quantile-normalise, log2-transform (1e-10 offset), then z-score each column.
X = np.log2(quantileNormalize(data) + 1e-10)
X = X.apply(_zscore, axis=0)

X.to_csv('Features_normalized.csv',sep='\t')

In [6]:

# Fit a 10-component PCA on the normalised feature matrix and print the share
# of variance carried by each component.
X = np.array(X)
pca = PCA(n_components=10).fit(X)
print(pca.explained_variance_ratio_)

# Coordinates of every row of X in principal-component space.
Y = pca.transform(X)

[ 0.08078187  0.03244479  0.02356611  0.02035036  0.01815709  0.01577288
  0.01508609  0.01330848  0.01262597  0.01163276]


In [8]:
# Sub-sample up to 3000 distinct rows of Y for t-SNE.
# replace=False: np.random.choice samples with replacement by default, which
# would feed duplicated rows to t-SNE. The fixed seed makes the selection
# reproducible, and min() keeps the draw valid when Y has fewer than 3000 rows.
idx = np.random.RandomState(0).choice(len(Y), min(3000, len(Y)), replace=False)

model = TSNE(n_components=2, random_state=0)
Z = model.fit_transform(Y[idx,:])

In [10]:
# Display the rows of Y (PCA coordinates) selected by idx.
# NOTE(review): the output saved with this cell has 2 columns although the PCA
# cell requests n_components=10 -- the output looks stale; re-run to confirm.
Y[idx,:]

array([[ -2.96828119,   0.02485503],
       [ -2.72430126,   0.02880321],
       [ -1.41933932,  -0.0556558 ],
       ..., 
       [ -0.03357866,   0.12995673],
       [ -1.61678278,  -0.08446209],
       [ 10.93606255,  -0.42815075]])

In [9]:
# Display the 2-D t-SNE embedding stored in Z.
Z

array([[ 13.60837024, -11.42311558],
       [-13.4452101 ,  -8.94927782],
       [  4.09377423,   3.98402401],
       ..., 
       [ -3.9211241 ,   8.03168165],
       [  9.06260523,  -3.11880229],
       [-10.53713215, -10.72338951]])

In [7]:
# Scatter of the first two principal components for up to 3000 distinct rows.
# replace=False avoids plotting the same row several times (np.random.choice
# samples with replacement by default); the fixed seed makes the selection
# reproducible.
idx = np.random.RandomState(0).choice(len(Y), min(3000, len(Y)), replace=False)

plt.figure()
plt.scatter(Y[idx,0], Y[idx,1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

In [8]:
# Explained-variance ratio of each fitted principal component.
# The x values are derived from the fitted PCA instead of a hard-coded
# range(20): with a different n_components the two lengths disagree and
# plt.plot raises a shape-mismatch error. Components are numbered from 1.
explained = pca.explained_variance_ratio_

plt.figure()
plt.plot(range(1, len(explained) + 1), explained)
plt.xlabel('Number of PCs')
plt.ylabel('Explained variance')
plt.savefig('Feaures_PC_explained.png')

In [18]:
# Embed up to 3000 distinct rows of Y with t-SNE and save the scatter plot.
# replace=False: np.random.choice samples with replacement by default, which
# would feed duplicated rows to t-SNE. The fixed seed makes the selection
# reproducible.
tsne_idx = np.random.RandomState(0).choice(len(Y), min(3000, len(Y)), replace=False)

model = TSNE(n_components=2, random_state=0)
Z = model.fit_transform(Y[tsne_idx])


plt.figure()
plt.scatter(Z[:,0], Z[:,1])
# NOTE(review): the file name says 'PC20' while the PCA cell uses
# n_components=10 -- confirm which is intended.
plt.savefig('Feaures_tSNE_space_PC20.png')

In [297]:
# Fit a 4-component Bayesian Gaussian mixture on X, predict a component label
# for every row, and store the labels next to the raw features in `data`.
model = BayesianGaussianMixture(n_components=4)
Y_ = model.fit(X).predict(X)
data['group'] = Y_

In [314]:
# Mean of every feature column within each mixture component; the last column
# of `data` is left out of the selection.
feature_cols = np.array(data.columns[:-1])
df_plot = data.groupby('group')[feature_cols].mean()

# Keep only the features whose spread across components exceeds the median spread.
spread = np.std(df_plot, axis=0)
df_plot = df_plot[df_plot.columns[spread > np.percentile(spread, 50)]]

# Relabel every row with the number of observations assigned to that component.
group_sizes = Counter(Y_)
df_plot.index = [group_sizes[t] for t in np.unique(df_plot.index)]


n_rows, n_cols = df_plot.shape
fig, ax = plt.subplots()
cax = ax.imshow(df_plot, interpolation='nearest')
ax.set_xticks(range(n_cols))
ax.set_xticklabels(df_plot.columns, rotation = 90)
ax.set_yticks(range(n_rows))
ax.set_yticklabels(df_plot.index)
fig.colorbar(cax)
plt.savefig('Feature_enrichment.png')

In [315]:
def plot_cov(covM, Y, gridXn = 3, gridYn = 4):
    """Draw one heat map per matrix in ``covM`` on a subplot grid and save it.

    The diagonal of every matrix is zeroed before plotting, entries with an
    absolute value below 1e-3 are drawn white, and all panels share one
    colour scale spanning the minimum and maximum of the zeroed matrices.

    Parameters
    ----------
    covM : array-like
        Stack of 2-D matrices indexed along the first axis
        (e.g. ``model.covariances_``). The input is copied, not modified.
    Y : iterable
        Labels; panel ``k`` is titled with the number of times ``k`` occurs
        in ``Y``.
    gridXn, gridYn : int
        Number of subplot rows and columns. Matrices with an index of
        ``gridXn * gridYn`` or more are not drawn.

    Returns
    -------
    None
        The figure is written to 'Features_esimated_covariance.png'.
    """
    # Private copy so that zeroing the diagonals never touches the caller's data.
    cov_matrix = np.array(covM)
    for k in range(len(cov_matrix)):
        np.fill_diagonal(cov_matrix[k], 0)

    # squeeze=False keeps `ax` two-dimensional even for a single row or column,
    # so ax[i][j] is always valid.
    fig, ax = plt.subplots(gridXn, gridYn, squeeze=False)

    # Copy the colormap before set_bad(); mutating mpl.cm.jet directly would
    # change the shared colormap object for every other plot.
    cmap = copy.copy(mpl.cm.jet)
    cmap.set_bad('white')

    # Loop-invariant values: label counts and the shared colour limits.
    label_counts = Counter(Y)
    if len(cov_matrix):
        vmin, vmax = np.min(cov_matrix), np.max(cov_matrix)

    cax = None
    for i in range(gridXn):
        for j in range(gridYn):
            k = i * gridYn + j
            if k < len(cov_matrix):
                masked = np.ma.masked_where(np.abs(cov_matrix[k]) < 1e-3, cov_matrix[k])
                cax = ax[i][j].imshow(masked, interpolation='nearest', vmin=vmin, vmax=vmax, cmap=cmap)
                ax[i][j].set_title(str(label_counts[k]))
            ax[i][j].set_xticks([])
            ax[i][j].set_yticks([])

    # Without any drawn panel there is no image to attach a colour bar to.
    if cax is not None:
        fig.colorbar(cax)

    fig.savefig('Features_esimated_covariance.png')

    return

In [316]:
# Plot the fitted mixture's covariance matrices on a 2 x 2 grid.
plot_cov(model.covariances_, Y_, gridXn=2, gridYn=2)