Simulating a gene expression matrix.

In [2]:
import numpy as np

genes = np.loadtxt('/data/jessez/Correct_Gene_Counts_No_Controls/genes.txt',dtype='str') # load genes

np.random.seed(42)

M = 326               # number of cells
N = np.size(genes)    # number of genes
d1_genes = ['Isl1','Drd1','Sfxn1','Nrxn1']
d2_genes = ['Drd2','Penk','Sp9','Gpr52','Gpr88']
n_d1_cells = 150
n_d2_cells = M-n_d1_cells

# Randomly choose a bunch of Poisson parameters (uniformly?)
poiss_lambdas = np.random.uniform(0.1,20,N)

# Randomly choose a bunch of proportions of D1 Poisson to D2 Poisson parameters
poiss_proportion = np.random.uniform(0,1,N)

# # Randomly choose 9 genes. 4 of these will be D1, the other 5 will be D2
# d1d2_genes = np.sort(np.random.choice(range(N),n_d1_genes+n_d2_genes,replace=False))
# d1_genes = np.sort(np.random.choice(d1d2_genes,n_d1_genes,replace=False))
# d2_genes = np.sort(np.setdiff1d(d1d2_genes,d1_genes))

# Randomly choose half the cells to be D1 (remaining ones will be D2)
d1_cells = np.sort(np.random.choice(range(M),n_d1_cells,replace=False))
d2_cells = np.sort(np.setdiff1d(range(M),d1_cells))

# Randomly generate an MxN expression matrix
X = np.zeros([M,N])
for i in range(M):
    for j in range(N):
        # The d2 genes need to have smaller Poisson lambda in D1 cells, and vice versa
        if (i in d1_cells and genes[j] in d1_genes) or (i in d2_cells and genes[j] in d2_genes):
            X[i,j] = np.random.poisson(poiss_lambdas[j])/poiss_proportion[j]
        else:
            X[i,j] = np.random.poisson(poiss_lambdas[j])

In [5]:
np.savetxt('/data/jessez/Correct_Gene_Counts_No_Controls/read_counts_rand.txt', X, fmt='%d', delimiter='\t')

In [2]:
# Look at correlation matrix of X
R = np.corrcoef(np.transpose(X))

In [27]:
# Look at correlations of D1 and D2 genes
d1d2_genes_as_str_d1_first = np.array([d1_genes.tolist()+d2_genes.tolist()]).astype(str)[0]
for gene1 in d1d2_genes_as_str_d1_first:
    print gene1 + ':'
    for gene2 in d1d2_genes_as_str_d1_first:
        if gene2 != gene1:
            print gene1 + ' and ' + gene2 + ' has a correlation of ' + str(R[gene1,gene2]) + '.'
    print ' '

2251:
2251 and 3715 has a correlation of 0.330229392073.
2251 and 5842 has a correlation of 0.139310448475.
2251 and 7541 has a correlation of -0.0337058139416.
2251 and 1742 has a correlation of -0.363653489208.
2251 and 2632 has a correlation of -0.146815977941.
2251 and 4147 has a correlation of -0.361996247846.
2251 and 5471 has a correlation of -0.136978198864.
2251 and 9447 has a correlation of -0.271109133651.
 
3715:
3715 and 2251 has a correlation of 0.330229392073.
3715 and 5842 has a correlation of 0.344535340524.
3715 and 7541 has a correlation of 0.133798079409.
3715 and 1742 has a correlation of -0.68796401238.
3715 and 2632 has a correlation of -0.206124339371.
3715 and 4147 has a correlation of -0.649452385429.
3715 and 5471 has a correlation of -0.305055679844.
3715 and 9447 has a correlation of -0.645737530458.
 
5842:
5842 and 2251 has a correlation of 0.139310448475.
5842 and 3715 has a correlation of 0.344535340524.
5842 and 7541 has a correlation of 0.073699182279

In [35]:
%matplotlib inline
import matplotlib.pyplot as plt

# How well do we do with just the d1d2 genes?
test_names = d1d2_genes_as_str_d1_first

ind = d1d2_genes_as_str_d1_first.astype(int)

# Truncate R
a = np.squeeze(R[:,ind],axis=1)
R_test = np.squeeze(a[ind,:],axis=0)

print np.shape(R_test)

# Biclustering on test data

from sklearn.cluster.bicluster import SpectralCoclustering
model = SpectralCoclustering(n_clusters=2, random_state=0)
model.fit(R_test)

fit_data = R_test[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.show()

ValueError: cannot select an axis to squeeze out which has size greater than one

In [42]:
np.array(range(N))[ind]

array([2251, 3715, 5842, 7541, 1742, 2632, 4147, 5471, 9447])