In [16]:
import sys
import miner2 
import miner2.preprocess
import miner2.mechanistic_inference
import pandas as pd
import os
import scanpy as sc
import numpy
import matplotlib.pyplot
import dill

%matplotlib inline

In [34]:
# Load the annotated expression matrix (AnnData object) from disk.
aData = sc.read_h5ad('./write/pbmc3k.h5ad')

# Build a dataframe whose rows are labelled by aData.obs and whose columns are
# labelled by aData.var.
#  - X is densified when it exposes .toarray() (e.g. a scipy sparse matrix), since
#    the DataFrame constructor expects a regular 2-D array.
#  - The indexes are copied so that clearing their names below leaves aData untouched.
df = pd.DataFrame(
    aData.X.toarray() if hasattr(aData.X, 'toarray') else aData.X,
    index=aData.obs.index.copy(),
    columns=aData.var.index.copy(),
)

# Remove the axis names. `del df.index.name` raises AttributeError on current pandas
# (Index.name is a property without a deleter); assigning None works on old and new
# versions alike.
df.index.name = None
df.columns.name = None

# Transpose: aData.var labels become the rows, aData.obs labels become the columns.
df = df.T

In [35]:
# Root folder for every figure and intermediate file written by this notebook.
results_dir='./results/GSM3587977_AML707B/'

num_cores = 4          # required for coexpression
min_number_genes = 6   # required for coexpression
# Required for mechanistic inference. Bulk RNAseq default=0.2; single cell RNAseq default=0.05.
# NOTE(review): the bulk default is used here although the sample labels look like
# single-cell barcodes - confirm this is intentional.
min_correlation = 0.2

# Create the output tree. makedirs also builds missing parents (e.g. ./results), and
# exist_ok=True makes the cell safe to re-run and fills in a missing subfolder even
# when results_dir itself already exists.
os.makedirs(results_dir+'figures', exist_ok=True)
os.makedirs(results_dir+'info', exist_ok=True)


In [36]:
# preprocess data
# miner2.preprocess.main receives the frame built above and returns two objects:
#  - expression_data: the frame every later cell plots, clusters and analyses;
#  - conversion_table: not used again in the visible part of this notebook
#    (presumably an identifier-mapping table - confirm against the miner2 docs).
expression_data, conversion_table = miner2.preprocess.main(df)

In [37]:
# post processed data plots
# Three QC figures are drawn one after another on the current pyplot figure; each is
# written to results_dir/figures as a PDF and the figure is then cleared.

# --- Figure 1: box plot of the leading columns (samples) ---
# At most 50 columns are shown; capping at the real column count avoids an IndexError
# on data sets with fewer than 50 samples.
individual_expression_data = [expression_data.iloc[:,i] for i in range(min(50, expression_data.shape[1]))]
matplotlib.pyplot.boxplot(individual_expression_data)
matplotlib.pyplot.title("Sample expression profiles")
matplotlib.pyplot.ylabel("Relative expression")
matplotlib.pyplot.xlabel("Sample ID")
matplotlib.pyplot.xticks(fontsize=6)

figure_name=results_dir+'figures/boxplots.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

# --- Figure 2: distribution of the first row (one gene across all samples) ---
matplotlib.pyplot.hist(expression_data.iloc[0,:],bins=100,alpha=0.75)
matplotlib.pyplot.title("Expression of single gene")
matplotlib.pyplot.ylabel("Frequency")
matplotlib.pyplot.xlabel("Relative expression")

figure_name=results_dir+'figures/singleGene.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

# --- Figure 3: distribution of the first column (one sample across all genes) ---
matplotlib.pyplot.hist(expression_data.iloc[:,0],bins=200,color=[0,0.4,0.8],alpha=0.75)
matplotlib.pyplot.ylim(0,350)
# The keyword must be lowercase `fontsize`: the MATLAB-style `FontSize` spelling relied
# on case-insensitive artist properties, which newer matplotlib releases reject.
matplotlib.pyplot.title("Expression of single sample",fontsize=14)
matplotlib.pyplot.ylabel("Frequency")
matplotlib.pyplot.xlabel("Relative expression")

figure_name=results_dir+'figures/singleSample.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()


<Figure size 432x288 with 0 Axes>

In [38]:
# STEP 1: clustering
# NOTE(review): miner2.coexpression is not imported explicitly in the import cell;
# presumably it is loaded as a side effect of another miner2 import - confirm.

# First pass: group genes into coexpression clusters.
initial_clusters = miner2.coexpression.cluster(
    expression_data,
    min_number_genes=min_number_genes,
    num_cores=num_cores,
)

# Second pass: revise the initial clusters against the same expression matrix.
revised_clusters = miner2.coexpression.revise_initial_clusters(
    initial_clusters,
    expression_data,
)

2019-08-09 13:06:58 	 coexpression
2019-08-09 13:06:58 	 working on coexpression step 1 out of 5
2019-08-09 13:06:59 	 working on coexpression step 2 out of 5
2019-08-09 13:07:00 	 working on coexpression step 3 out of 5
2019-08-09 13:07:01 	 working on coexpression step 4 out of 5
2019-08-09 13:07:01 	 working on coexpression step 5 out of 5
2019-08-09 13:07:02 	 genes clustered: 1338
2019-08-09 13:07:02 	 revising initial clusters
2019-08-09 13:07:02 	 revision completed
2019-08-09 13:07:02 	 genes clustered: 1338
2019-08-09 13:07:02 	 unique clusters: 112


In [39]:
# QC: visualize coexpression clusters
# Two heatmaps are saved for comparison: the genes of the leading clusters, and an
# equally sized random selection of genes acting as a visual control.

# retrieve the first 10 clusters (string keys "0".."9") for visual inspection;
# keys that do not exist are skipped so a run with fewer than 10 clusters does not
# raise KeyError (numpy.hstack still needs at least one cluster to be present)
first_clusters = numpy.hstack([
    revised_clusters[key]
    for key in numpy.arange(10).astype(str)
    if key in revised_clusters
])
# visualize first 10 clusters
matplotlib.pyplot.imshow(expression_data.loc[first_clusters,:],aspect="auto",cmap="viridis",vmin=-1,vmax=1)
matplotlib.pyplot.grid(False)
matplotlib.pyplot.ylabel("Genes")
matplotlib.pyplot.xlabel("Samples")
matplotlib.pyplot.title("First 10 coexpression clusters")
figure_name=results_dir+'figures/first.coexpression.clusters.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()
# visualize the same number of randomly chosen genes
# A dedicated, fixed-seed generator (seed 0, arbitrary) makes this control figure
# identical on every run without touching numpy's global random state.
matplotlib.pyplot.imshow(
    expression_data.loc[
        numpy.random.RandomState(0).choice(expression_data.index,len(first_clusters),replace=False),
        :,
    ],
    aspect="auto",cmap="viridis",vmin=-1,vmax=1,
)
matplotlib.pyplot.grid(False)
matplotlib.pyplot.ylabel("Genes")
matplotlib.pyplot.xlabel("Samples")
matplotlib.pyplot.title("Random coexpression genes")
figure_name=results_dir+'figures/random.coexpression.clusters.pdf'
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.savefig(figure_name)
matplotlib.pyplot.clf()

<Figure size 432x288 with 0 Axes>

In [40]:
# STEP 2: mechanistic inference

# Checkpoint: write the current interpreter session to results_dir/info/bottle.dill
# and immediately read it back.
# NOTE(review): reloading right after dumping restores the state that was just saved;
# presumably kept so this cell can be resumed from the checkpoint - confirm.
# dill files are pickle-based, so only load checkpoints from a trusted location.
dill.dump_session(results_dir+'info/bottle.dill')
dill.load_session(results_dir+'info/bottle.dill')

# get first principal component axes of clusters
# (subkey=None and min_number_genes=1 are passed explicitly; note that this differs
# from the notebook-level min_number_genes setting used for clustering)
axes = miner2.mechanistic_inference.get_principal_df(revised_clusters,expression_data,subkey=None,min_number_genes=1)

# analyze revised clusters for enrichment in relational database
# NOTE(review): the enrichment step below is disabled, so min_correlation and
# num_cores are not consumed by this cell - confirm whether it should be re-enabled.
# mechanistic_output = miner2.mechanistic_inference.enrichment(axes,revised_clusters,expression_data,correlation_threshold=min_correlation,num_cores=num_cores)

2019-08-09 13:07:07 	 preparing mechanistic inference


In [42]:
# Display the frame returned by get_principal_df for inspection: in the output below
# the rows are sample barcodes and the columns are cluster keys (0..111).
axes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
AML707B-D97_AAAAGGCTGAGA,-0.017481,-0.033952,-0.022605,-0.021802,-0.035471,-0.012336,-0.008253,-0.005856,-0.010369,-0.007843,...,-0.009633,-0.050666,-0.040271,-0.030241,0.002394,0.171356,-0.031493,0.893490,-0.054707,0.237860
AML707B-D97_AACTTTGTGCCG,0.003932,0.010068,-0.022840,-0.010385,-0.010720,-0.018704,-0.022334,0.019065,0.006514,0.038065,...,0.068837,-0.025414,-0.000305,-0.043155,-0.035950,-0.111306,0.865097,-0.033194,-0.058522,-0.049347
AML707B-D97_AATTGAGTGCTN,-0.051243,-0.039117,-0.006467,-0.011758,-0.061707,-0.005678,-0.007812,-0.002968,-0.010383,-0.015350,...,-0.021348,-0.048327,-0.051359,-0.015379,-0.014784,-0.091817,-0.030008,-0.044934,-0.044385,-0.062135
AML707B-D97_ACACTTGACTAA,0.006902,-0.001214,0.015125,0.048959,-0.088580,0.045293,0.037725,0.017796,0.098998,0.021411,...,-0.039127,0.026356,0.054463,0.036004,0.040465,0.238523,0.098480,-0.056335,-0.024705,-0.055006
AML707B-D97_ACCCGCGAAGAC,-0.022294,-0.042233,0.002596,-0.007120,-0.074943,-0.017598,-0.030652,-0.016923,-0.026847,-0.018035,...,-0.029057,0.096562,-0.062299,0.060367,-0.021895,0.083834,-0.033240,-0.043877,-0.036244,-0.053853
AML707B-D97_ACCGTCTTTGAC,0.032083,-0.031357,-0.031533,0.011528,-0.027950,-0.030169,0.001919,0.026753,-0.050181,0.012005,...,-0.077452,0.009086,-0.110034,-0.024548,-0.091205,0.199181,0.038326,0.002276,-0.016491,0.001597
AML707B-D97_ACTATTCAACGG,-0.022457,0.008643,-0.014701,-0.033618,-0.031482,-0.023554,-0.004293,-0.038418,0.010441,-0.039718,...,-0.001225,-0.050820,0.208876,0.011635,-0.014921,-0.027296,-0.035762,-0.047062,-0.044931,-0.062787
AML707B-D97_ACTTAGTCGGTC,-0.011181,-0.031275,-0.045271,0.030060,0.008418,-0.031339,-0.029466,-0.024069,0.009082,-0.025398,...,-0.054400,-0.007799,-0.085481,-0.028064,-0.060374,0.021937,0.015751,0.032552,0.096781,0.221545
AML707B-D97_AGAGATTGACAG,-0.011597,-0.030436,-0.035822,-0.046791,0.010878,0.009779,-0.014380,-0.023599,0.024025,-0.036829,...,0.030615,0.180220,-0.042387,-0.041817,0.009083,-0.013681,-0.037001,-0.059348,-0.059321,-0.088189
AML707B-D97_AGAGTTATCTCA,0.033618,-0.014660,-0.104427,-0.016395,0.022747,-0.049188,-0.030566,0.020268,-0.048248,0.016164,...,-0.036754,0.083629,-0.081597,-0.081333,-0.019865,0.124567,-0.058375,-0.043364,0.016285,-0.070996
