In [1]:
import sys
import miner2 
import miner2.preprocess
import miner2.mechanistic_inference
import pandas as pd
import os
import scanpy as sc
import numpy
import matplotlib.pyplot
import dill

%matplotlib inline

2019-08-09 14:12:55 	 hello from miner2 version 0.0.9


In [2]:
# read the AnnData object stored on disk
aData = sc.read_h5ad('./write/pbmc3k.h5ad')

# construct dataframe and add index/column names
# NOTE(review): assumes aData.X is a dense array; if the file stores a sparse
# matrix it would have to be densified first -- confirm against the writer.
df = pd.DataFrame(aData.X)
df.index = aData.obs.index
df.columns = aData.var.index

# clear the axis labels; assigning None works on every pandas version, whereas
# `del df.index.name` raises AttributeError on newer pandas releases where
# Index.name is a property without a deleter
df.columns.name = None
df.index.name = None

# transpose so that rows are labelled by aData.var and columns by aData.obs
df = df.T

In [3]:
# root folder for every figure and intermediate file written by this notebook;
# the trailing slash matters because later cells build paths by concatenation
# NOTE(review): the folder name refers to GSM3587977_AML707B while the input
# file read above is pbmc3k.h5ad -- confirm this is the intended location.
results_dir='./results/GSM3587977_AML707B/'

num_cores = 4          # required for coexpression
min_number_genes = 6   # required for coexpression
# NOTE(review): the value below is the documented bulk default; confirm it is
# the intended choice if the input is single cell data.
min_correlation = 0.2  # required for mechanistic inference. Bulk RNAseq default=0.2;single cell RNAseq default=0.05

# create the output tree; makedirs also creates the missing parent './results'
# and exist_ok makes the cell safe to re-run, even when results_dir already
# exists but one of its subfolders does not
os.makedirs(results_dir+'figures', exist_ok=True)
os.makedirs(results_dir+'info', exist_ok=True)


In [4]:
# preprocess data
# call the miner2 preprocessing entry point on df, then unpack the pair it
# returns into the expression matrix and the conversion table
preprocessing_output = miner2.preprocess.main(df)
expression_data, conversion_table = preprocessing_output

In [5]:
# post processed data plots
# Three QC figures are written to results_dir+'figures/': boxplots of the
# leading columns, a histogram of the first row and a histogram of the first
# column of expression_data. Each figure is cleared after saving, so nothing
# is rendered inline.

# --- figure 1: one boxplot per column, for at most the first 50 columns ---
# min() keeps the cell working when the data set has fewer than 50 columns
individual_expression_data = [expression_data.iloc[:,i] for i in range(min(50, expression_data.shape[1]))]
matplotlib.pyplot.boxplot(individual_expression_data)
matplotlib.pyplot.title("Sample expression profiles")
matplotlib.pyplot.ylabel("Relative expression")
matplotlib.pyplot.xlabel("Sample ID")
matplotlib.pyplot.xticks(fontsize=6)

figure_name=results_dir+'figures/boxplots.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

# --- figure 2: distribution of the first row across all columns ---
matplotlib.pyplot.hist(expression_data.iloc[0,:],bins=100,alpha=0.75)
matplotlib.pyplot.title("Expression of single gene")
matplotlib.pyplot.ylabel("Frequency")
matplotlib.pyplot.xlabel("Relative expression")

figure_name=results_dir+'figures/singleGene.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

# --- figure 3: distribution of the first column across all rows ---
matplotlib.pyplot.hist(expression_data.iloc[:,0],bins=200,color=[0,0.4,0.8],alpha=0.75)
matplotlib.pyplot.ylim(0,350)
# keyword must be lower-case `fontsize`; the mixed-case spelling `FontSize`
# is rejected by current matplotlib releases
matplotlib.pyplot.title("Expression of single sample",fontsize=14)
matplotlib.pyplot.ylabel("Frequency")
matplotlib.pyplot.xlabel("Relative expression")

figure_name=results_dir+'figures/singleSample.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()


<Figure size 432x288 with 0 Axes>

In [6]:
# STEP 1: clustering
# first pass: build the initial coexpression clusters using the size threshold
# and core count chosen in the configuration cell
initial_clusters = miner2.coexpression.cluster(
    expression_data,
    min_number_genes=min_number_genes,
    num_cores=num_cores,
)
# second pass: revise the initial clusters against the same expression matrix
revised_clusters = miner2.coexpression.revise_initial_clusters(
    initial_clusters,
    expression_data,
)

2019-08-09 14:12:59 	 coexpression
2019-08-09 14:12:59 	 working on coexpression step 1 out of 5
2019-08-09 14:13:00 	 working on coexpression step 2 out of 5
2019-08-09 14:13:01 	 working on coexpression step 3 out of 5
2019-08-09 14:13:02 	 working on coexpression step 4 out of 5
2019-08-09 14:13:03 	 working on coexpression step 5 out of 5
2019-08-09 14:13:03 	 genes clustered: 1338
2019-08-09 14:13:03 	 revising initial clusters
2019-08-09 14:13:04 	 revision completed
2019-08-09 14:13:04 	 genes clustered: 1338
2019-08-09 14:13:04 	 unique clusters: 112


In [8]:
# QC: visualize coexpression clusters
# Two heatmaps are saved for side-by-side comparison: the genes of the first
# clusters, and an equally sized random draw of genes as a visual control.

# retrieve first 10 clusters for visual inspection; cluster keys are strings,
# and keys that do not exist are skipped instead of raising KeyError
first_cluster_keys = [str(i) for i in range(10) if str(i) in revised_clusters]
first_clusters = numpy.hstack([revised_clusters[key] for key in first_cluster_keys])
# visualize first 10 clusters
matplotlib.pyplot.imshow(expression_data.loc[first_clusters,:],aspect="auto",cmap="viridis",vmin=-1,vmax=1)
matplotlib.pyplot.grid(False)
matplotlib.pyplot.ylabel("Genes")
matplotlib.pyplot.xlabel("Samples")
matplotlib.pyplot.title("First 10 coexpression clusters")
figure_name=results_dir+'figures/first.coexpression.clusters.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

# visualize the same number of randomly drawn genes (not clusters) as a control;
# a dedicated seeded generator makes the figure identical on every run and
# leaves numpy's global random state untouched
random_state = numpy.random.RandomState(0)
random_genes = random_state.choice(expression_data.index,len(first_clusters),replace=False)
matplotlib.pyplot.imshow(expression_data.loc[random_genes,:],aspect="auto",cmap="viridis",vmin=-1,vmax=1)
matplotlib.pyplot.grid(False)
matplotlib.pyplot.ylabel("Genes")
matplotlib.pyplot.xlabel("Samples")
matplotlib.pyplot.title("Random coexpression genes")
figure_name=results_dir+'figures/random.coexpression.clusters.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

<Figure size 432x288 with 0 Axes>

In [9]:
# STEP 2: mechanistic inference

# checkpoint: write the whole interpreter session to disk and read it back
# NOTE(review): dill files are pickles; only load a file this notebook wrote
session_file = results_dir+'info/bottle.dill'
dill.dump_session(session_file)
dill.load_session(session_file)

# get first principal component axes of clusters
axes = miner2.mechanistic_inference.get_principal_df(
    revised_clusters,
    expression_data,
    subkey=None,
    min_number_genes=1,
)

# analyze revised clusters for enrichment in relational database 
# NOTE(review): the enrichment call below is disabled in this notebook; the
# reason is not visible here -- confirm before re-enabling.
# mechanistic_output = miner2.mechanistic_inference.enrichment(axes,revised_clusters,expression_data,correlation_threshold=min_correlation,num_cores=num_cores)

2019-08-09 14:13:04 	 preparing mechanistic inference


In [12]:
# display the revised clusters: a dict mapping string cluster ids to lists of
# gene identifiers
# NOTE(review): this prints every cluster in full; a summary such as the
# number of genes per cluster would keep the notebook output short.
revised_clusters

{'0': ['ENSG00000023572',
  'ENSG00000049541',
  'ENSG00000050426',
  'ENSG00000055950',
  'ENSG00000059588',
  'ENSG00000066926',
  'ENSG00000069956',
  'ENSG00000087263',
  'ENSG00000089685',
  'ENSG00000091164',
  'ENSG00000095485',
  'ENSG00000110400',
  'ENSG00000111726',
  'ENSG00000116717',
  'ENSG00000120685',
  'ENSG00000122882',
  'ENSG00000123739',
  'ENSG00000125445',
  'ENSG00000131747',
  'ENSG00000133742',
  'ENSG00000134077',
  'ENSG00000135446',
  'ENSG00000135451',
  'ENSG00000135838',
  'ENSG00000143751',
  'ENSG00000147324',
  'ENSG00000154237',
  'ENSG00000156873',
  'ENSG00000164292',
  'ENSG00000164300',
  'ENSG00000168393',
  'ENSG00000170185',
  'ENSG00000172845',
  'ENSG00000173163',
  'ENSG00000173207',
  'ENSG00000173875',
  'ENSG00000177842',
  'ENSG00000178105',
  'ENSG00000178295',
  'ENSG00000183137',
  'ENSG00000184357',
  'ENSG00000186998',
  'ENSG00000189091',
  'ENSG00000214517',
  'ENSG00000215421',
  'ENSG00000233561',
  'ENSG00000281601'],
 '1': [