In [1]:

from sklearn.datasets import make_blobs
import numpy as np
import pandas as pd
import time
#from SMD_serial import SMD
from SMD import SMD
import ray
## using conda env cellrank from CBE, not local conda

In [None]:
## test an example

In [2]:
N = 1000
#make 5 clusters in 10 dimensions
X_s,y = make_blobs(N,n_features = 10,centers = 5)
D_s = 10

In [3]:
#Add 100 noisy dimensions, so D_s = 10, and D_n = 100. D/D_s = (D_N + D_s)/D_s = 11
X = np.hstack([X_s,np.random.randn(N,100)])
D_n = 20

In [4]:
X

array([[-1.46559589, -6.51524177, -6.7234546 , ...,  1.24334305,
         0.66576692, -0.2807417 ],
       [ 3.03432723, -5.0989274 , -1.55042815, ..., -0.07051904,
         0.28538962,  0.01845486],
       [ 8.10453605,  4.91450666,  2.44191981, ..., -1.02663853,
         0.81866514, -0.05601722],
       ...,
       [ 5.02430313,  0.16278266,  8.27158166, ..., -0.61265682,
        -0.25574222,  2.57170779],
       [ 2.77647331, -7.85640236, -0.73731338, ...,  0.47202092,
         0.08788283, -0.92072655],
       [-1.50675799, -6.0145573 , -6.74267367, ..., -0.80564964,
        -0.46813067, -0.02787389]])

In [5]:
X.shape

(1000, 110)

In [6]:
ray.init(ignore_reinit_error=True, num_cpus=25)

2023-05-09 17:25:29,307	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Python version:,3.7.12
Ray version:,2.2.0


In [9]:
#Try SMD, pretend we don't know that there are 5 clusters, and guess 6
z = SMD(X,k_guess = 6,trials = 100)

In [10]:
print(f'Good dimensions have an average Z-score: {z[:D_s].mean():.2f} +/- {z[:D_s].std():.2f}\n')
print(f'Noisy dimensions have an average Z-score: {z[D_s:].mean():.2f} +/- {z[D_s:].std():.2f}')

Good dimensions have an average Z-score: 18.79 +/- 7.79

Noisy dimensions have an average Z-score: -1.88 +/- 0.18


In [11]:
z[:D_s]

array([14.21702057, 19.9863236 , 22.52872833, 19.88853881,  3.94961686,
       17.63948847, 13.63031178, 16.6616405 , 35.92524556, 23.50657631])

In [12]:
z[D_s:]

array([-2.01525578, -1.91747098, -2.11304057, -1.81968618, -2.01525578,
       -1.62411659, -2.01525578, -2.01525578, -2.11304057, -2.01525578,
       -1.81968618, -1.72190138, -2.01525578, -1.91747098, -1.81968618,
       -1.72190138, -2.01525578, -2.01525578, -1.81968618, -1.62411659,
       -2.11304057, -1.91747098, -2.01525578, -1.81968618, -1.52633179,
       -2.01525578, -1.81968618, -2.11304057, -1.81968618, -1.1351926 ,
       -1.91747098, -2.01525578, -1.91747098, -2.11304057, -1.81968618,
       -2.11304057, -1.81968618, -2.01525578, -1.91747098, -1.72190138,
       -1.42854699, -1.81968618, -1.91747098, -2.11304057, -1.81968618,
       -1.72190138, -1.72190138, -2.01525578, -1.72190138, -2.11304057,
       -1.42854699, -2.01525578, -1.81968618, -1.91747098, -2.01525578,
       -2.01525578, -1.72190138, -2.01525578, -1.72190138, -1.72190138,
       -1.81968618, -1.91747098, -1.81968618, -2.01525578, -2.01525578,
       -2.01525578, -1.91747098, -2.01525578, -1.91747098, -2.01

In [None]:
## import the scRNA-seq data 

In [7]:
# import the scRNA-seq data, lognormalized of TF expression matrix, scaled and transposed 
tf_matrix = "../results/scRNAseq_R13547_10x_mNT_20220813/RA_symetryBreaking/sparse_featureSelection_d2_d2.5_d3_d3.5_d4_d5/exp_matrix_TFs_SPs_4SMD_12k.cells.csv"
expMat = pd.read_csv(tf_matrix, 
                index_col=None)
X = np.genfromtxt(tf_matrix, 
                  delimiter=',', 
                 skip_header= 1)

In [8]:
X

array([[-0.36614657,  2.69204041, -0.46025243, ..., -0.39374638,
        -0.26359799, -0.27150784],
       [-0.36614657, -0.35937716, -0.46025243, ..., -0.39374638,
        -0.26359799, -0.27150784],
       [-0.36614657, -0.35937716, -0.46025243, ..., -0.39374638,
        -0.26359799, -0.27150784],
       ...,
       [-0.36614657, -0.35937716, -0.46025243, ..., -0.39374638,
        -0.26359799, -0.27150784],
       [-0.36614657, -0.35937716, -0.46025243, ..., -0.39374638,
        -0.26359799, -0.27150784],
       [-0.3661466 , -0.3593772 , -0.4602524 , ..., -0.3937464 ,
        -0.263598  , -0.2715078 ]])

In [9]:
X.shape

(12000, 1144)

In [10]:
expMat.head()

Unnamed: 0,Sox17,Mybl1,Prex2,Sulf1,Ncoa2,Eya1,Terf1,Rab23,Zfp451,Pou3f3,...,Rarb,Hoxb4,Hoxb3,Hoxb2,Hoxb1,Hoxc5,Hoxc4,Robo2,Shh,Nkx6-1
0,-0.366147,2.69204,-0.460252,-0.619824,-0.608774,-0.273523,0.967464,-0.693987,-1.004996,-0.334225,...,-0.540034,-0.306788,-0.291582,-0.667937,-0.428313,-0.303389,-0.453652,-0.393746,-0.263598,-0.271508
1,-0.366147,-0.359377,-0.460252,-0.619824,-0.608774,-0.273523,0.568549,0.421005,-0.071639,-0.334225,...,-0.540034,-0.306788,-0.291582,-0.667937,-0.428313,-0.303389,-0.453652,-0.393746,-0.263598,-0.271508
2,-0.366147,-0.359377,-0.460252,-0.619824,0.68841,-0.273523,0.323709,-0.693987,-1.004996,-0.334225,...,-0.540034,-0.306788,-0.291582,-0.667937,-0.428313,-0.303389,-0.453652,-0.393746,-0.263598,-0.271508
3,-0.366147,0.713019,-0.460252,0.020696,0.093384,-0.273523,1.093011,0.450703,-0.046778,-0.334225,...,-0.540034,-0.306788,-0.291582,-0.667937,-0.428313,-0.303389,-0.453652,-0.393746,-0.263598,-0.271508
4,-0.366147,1.25812,-0.460252,0.346274,-0.608774,-0.273523,0.366736,0.962231,0.38142,-0.334225,...,-0.540034,-0.306788,-0.291582,-0.667937,-0.428313,-0.303389,-0.453652,-0.393746,-0.263598,-0.271508


In [11]:
ray.init(ignore_reinit_error=True, num_cpus=30)

2023-05-09 17:28:58,340	INFO worker.py:1370 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.7.12
Ray version:,2.2.0


In [12]:
#Try SMD, pretend we don't know that there are 5 clusters, and guess 6
start_time = time.time()

z = SMD(X,k_guess = 12, trials = 1000)

print("--- %s seconds ---" % (time.time() - start_time))

--- 6546.4992933273315 seconds ---


In [13]:
z;

In [14]:
np.argwhere(z > 1)

array([[   9],
       [ 114],
       [ 119],
       [ 121],
       [ 160],
       [ 163],
       [ 180],
       [ 252],
       [ 375],
       [ 425],
       [ 433],
       [ 547],
       [ 628],
       [ 721],
       [ 877],
       [ 883],
       [ 990],
       [1053],
       [1080],
       [1127],
       [1142]])

In [15]:
expMat.columns[np.argwhere(z > 1)]

  """Entry point for launching an IPython kernel.


array([['Pou3f3'],
       ['Mdk'],
       ['Pax6'],
       ['Meis2'],
       ['Cebpb'],
       ['Tfap2c'],
       ['Gpc3'],
       ['Pou3f2'],
       ['Peg10'],
       ['Peg3'],
       ['Apoe'],
       ['Rfx4'],
       ['Cdh1'],
       ['Nedd4'],
       ['Sox11'],
       ['Foxa1'],
       ['Dll1'],
       ['Cdh2'],
       ['Tshz1'],
       ['Cyp26a1'],
       ['Shh']], dtype=object)

In [16]:
df = pd.DataFrame({'gene':expMat.columns, 'SMD_z':z})
df

Unnamed: 0,gene,SMD_z
0,Sox17,-0.055222
1,Mybl1,-0.140760
2,Prex2,-0.140760
3,Sulf1,-0.140760
4,Ncoa2,-0.140760
...,...,...
1139,Hoxc5,-0.140760
1140,Hoxc4,-0.140760
1141,Robo2,-0.140760
1142,Shh,12.125111


In [17]:
df.to_csv('../results/scRNAseq_R13547_10x_mNT_20220813/RA_symetryBreaking/sparse_featureSelection_d2_d2.5_d3_d3.5_d4_d5/output_SMD_12k.cells_tfs.sps_v4.csv', sep='\t')