- 1000 cells: done
- 2000 cells: done
- 4000 cells: in progress

In [4]:
#STEP 1: importing all needed moduels

import os, glob, re, pickle
from functools import partial
from collections import OrderedDict
import operator as op
from cytoolz import compose

import pandas as pd
import seaborn as sns
import numpy as np
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt
import skmisc

from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_binarization, plot_rss

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell
from dask.diagnostics import ProgressBar
from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase

#STEP 1+: Set maximum number of jobs
# scanpy spells this setting `n_jobs`; assigning to `njobs` only creates an
# unused attribute and leaves scanpy's parallelism at its default.
sc.settings.n_jobs = 32

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:


Split each dataset by state, then run network reconstruction on each state separately.
- A is patient pretreatment pdx
- B is patient posttreatment pdx

In [5]:
#prepping
# Identifier of the dataset to process; it is substituted into the folder
# names and output file names built further down.
DATASET_ID = 'lx108'
# Number of highly variable genes to keep (passed as n_top_genes in STEP 7).
GENE_NUM = 2000

In [6]:
#STEP 1++: path constants shared by the rest of the notebook

# Folder holding the input AnnData file (rna.h5ad) for the selected dataset.
SOHRAB_RESOURCES_FOLDERNAME = "/work/shah/users/salehis/projects/cdm/data/sclc/{}".format(DATASET_ID)
# Per-dataset folder where the per-state expression matrices are written (STEP 12).
RESULTS_FOLDERNAME = "/home/linl5/project/SCLC/results/{}".format(DATASET_ID)
# NOTE(review): not referenced in the visible part of the notebook - presumably for plots.
FIGURES_FOLDERNAME = "/home/linl5/project/SCLC/figures"
# Same directory as DATABASE_FOLDER in STEP 15 (ranking databases), without the trailing slash.
AUXILLIARIES_FOLDERNAME = "/home/linl5/project/SCLC/auxilliaries"
# Folder with the TF list and the motif annotation table.
RESOURCES_FOLDERNAME = "/home/linl5/project/SCLC/resources"
# Per-dataset folder for adjacencies, modules, motifs and regulons.
DATA_FOLDERNAME = "/home/linl5/project/SCLC/data/{}".format(DATASET_ID)

In [7]:
#Downloaded from the pySCENIC github repo: https://github.com/aertslab/pySCENIC/tree/master/resources Aug-1-2023
#Note: the pySCENIC cancer patient tutorial uses lambert2018.txt; this notebook uses allTFs_hg38.txt instead.
HUMAN_TFS_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'allTFs_hg38.txt')

In [8]:
#STEP 2: Locating the expression matrix (with its metadata)
#input: a single h5ad file holding the expression matrix and the metadata

#cells x genes matrix; values are read counts of a gene in a cell
COUNTS_MTX_FNAME = os.path.join(SOHRAB_RESOURCES_FOLDERNAME, 'rna.h5ad')
# bare expression: shows the resolved path as the cell output
COUNTS_MTX_FNAME

'/work/shah/users/salehis/projects/cdm/data/sclc/lx108/rna.h5ad'

In [9]:
#STEP 3: Importing and Analyzing the rna DATA

# Read the H5AD file using anndata.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the real reader directly.
adata = ad.read_h5ad(COUNTS_MTX_FNAME)
# bare expression: shows which file was loaded as the cell output
COUNTS_MTX_FNAME

'/work/shah/users/salehis/projects/cdm/data/sclc/lx108/rna.h5ad'

In [10]:
#STEP4: Understanding Dataset

# summary of the AnnData object, followed by its dimensions
print(adata)
print("Shape of expression matrix: ", adata.shape)

# var describes the columns (genes), obs describes the rows (cells)
for heading, annotation in (
    ("Columns (variables):", adata.var),
    ("\nRows (observations):", adata.obs),
):
    print(heading)
    print(annotation)


AnnData object with n_obs × n_vars = 18481 × 33538
    obs: 'timepoint', 'datatag', 'batch', 'state'
    var: 'gene_ids', 'feature_types'
Shape of expression matrix:  (18481, 33538)
Columns (variables):
                    gene_ids    feature_types
MIR1302-2HG  ENSG00000243485  Gene Expression
FAM138A      ENSG00000237613  Gene Expression
OR4F5        ENSG00000186092  Gene Expression
AL627309.1   ENSG00000238009  Gene Expression
AL627309.3   ENSG00000239945  Gene Expression
...                      ...              ...
AC233755.2   ENSG00000277856  Gene Expression
AC233755.1   ENSG00000275063  Gene Expression
AC240274.1   ENSG00000271254  Gene Expression
AC213203.1   ENSG00000277475  Gene Expression
FAM231C      ENSG00000268674  Gene Expression

[33538 rows x 2 columns]

Rows (observations):
                              timepoint datatag batch state
AAACCCAAGTCGCTAT-1_Lx108_UUUU      UUUU   Lx108     0  UUUU
AAACCCAGTTGTGCCG-1_Lx108_UUUU      UUUU   Lx108     0  UUUU
AAACCCATCTCGTGAA-

In [11]:
#STEP 5: preprocessing and filtering

#make gene name unique
adata.var_names_make_unique()

# Record the unfiltered dimensions so the report below always describes the
# dataset that was actually loaded (a hard-coded shape goes stale when
# DATASET_ID changes).
pre_filter_shape = adata.shape

#pre-filter: drop cells with fewer than 200 detected genes and genes seen in fewer than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

#saving a copy of the filtered counts into raw
adata.raw = adata

print("Pre Filter: ", pre_filter_shape)
print("Post Filter: ", adata.shape)


Post Filter:  (18436, 22723)


LX599: 
- Pre-Filter: 33207 × 33538
- Post Filter:  (33108, 28701)


Lx108: 
- Pre-Filter: 18481 × 33538
- Post Filter: (18436, 22723)

Lx33:
- pre filter: 23691 × 33538
- Post Filter:  (23051, 22899)

In [12]:
# STEP 6: Prepping for timepoint splitting
# Store an independent snapshot of the raw counts. Without .copy() the layer
# would share the same matrix object as adata.X, so any later in-place
# transformation of X would also rewrite the "counts" layer.
adata.layers['counts'] = adata.X.copy()
adata.raw = adata


In [13]:
#STEP 7: select highly variable genes on the full sample, then split the cells by state
#(IDEA: filter by HVG in the original sample, split into states, then cluster and subsample per cluster)

# keep only the GENE_NUM most variable genes; selecting before the split means
# every state ends up with the same gene set
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=GENE_NUM,
    subset=True,
    flavor='seurat_v3',
)

# the distinct state labels present in the data
unique_state = adata.obs['state'].unique()
print(unique_state)

# state label -> independent AnnData copy holding only that state's cells
adata_by_state = {}
for state in unique_state:
    in_state = adata.obs['state'] == state
    adata_subset = adata[in_state].copy()
    adata_by_state[state] = adata_subset
    print(state, ": ", adata_subset.shape)

    

['UUUU', 'UUUT', 'UU', 'UTTU']
Categories (4, object): ['UTTU', 'UU', 'UUUT', 'UUUU']
UUUU :  (4128, 2000)
UUUT :  (4564, 2000)
UU :  (5035, 2000)
UTTU :  (4709, 2000)


lx33 (post cluster):
- UTTU :  (4021, 2000)
- UU :  (6176, 2000)
- UUUT :  (6619, 2000)
- UUUU :  (6235, 2000)

lx108
- UUUU :  (4128, 2000)
- UUUT :  (4564, 2000)
- UU :  (5035, 2000)
- UTTU :  (4709, 2000)

The Leiden algorithm improves upon Louvain by using the "agglomerative" approach to optimize a different quality function known as the "improved modularity." Improved modularity has a resolution parameter that allows Leiden to control the granularity of the clustering solution. It also uses a smart local move algorithm to avoid some of the resolution limit issues present in Louvain. A higher resolution parameter yields more (and therefore smaller) communities.

#The Leiden algorithm is a hierarchical clustering algorithm, that recursively merges communities into single nodes by greedily optimizing the modularity and the process repeats in the condensed graph.
#The Leiden algorithm improves upon Louvain by using the "agglomerative" approach to optimize a different quality function known as the "improved modularity." Improved modularity has a resolution parameter that allows Leiden to control the granularity of the clustering solution. It also uses a smart local move algorithm to avoid some of the resolution limit issues present in Louvain.

In [14]:
#STEP 8: cluster definition

def cluster_rna(bdata, resolution=2):
    """Normalize, embed and Leiden-cluster one AnnData object.

    The scanpy steps are applied directly to ``bdata`` (their return values
    are not used), so the object passed in is the one that gets modified.

    Parameters
    ----------
    bdata
        AnnData object to process. Filtering and highly-variable-gene
        selection are expected to have been done already on the main AnnData,
        so that every state keeps the same gene set.
    resolution
        Resolution passed to ``sc.tl.leiden``; higher values give more
        clusters. Defaults to 2, the value previously hard-coded here.

    Returns
    -------
    The same ``bdata`` object, after processing.
    """
    # library-size normalization followed by log(1 + x)
    sc.pp.normalize_total(bdata)
    sc.pp.log1p(bdata)
    # PCA -> neighbor graph -> UMAP embedding
    sc.pp.pca(bdata)
    sc.pp.neighbors(bdata)
    sc.tl.umap(bdata)
    # community detection on the neighbor graph
    sc.tl.leiden(bdata, resolution=resolution)
    return bdata

In [15]:
#STEP 9: Clustering

# run the clustering routine on every per-state AnnData and store the result
# back under the same state label
for state, state_adata in adata_by_state.items():
    adata_by_state[state] = cluster_rna(state_adata)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [20]:
#STEP 10: Subsampling

from tqdm import tqdm

# fraction of each cluster's cells to keep (1 keeps every cell of a retained cluster)
frac_cells = 1
# state label -> subsampled AnnData
data_sub = {}

# fixed seed so the random draws are repeatable
np.random.seed(0)

for state in unique_state:
    state_adata = adata_by_state[state]
    sub_cells = []
    for clust in tqdm(state_adata.obs['leiden'].unique()):
        # barcodes of the cells assigned to this Leiden cluster
        cells_in_clust = state_adata.obs_names[state_adata.obs['leiden'] == clust]
        # clusters with 5 or fewer cells are dropped entirely
        if len(cells_in_clust) <= 5:
            continue
        # draw frac_cells of the cluster's cells without replacement
        n_chosen = int(len(cells_in_clust) * frac_cells)
        chosen_cells = np.random.choice(cells_in_clust, size=n_chosen, replace=False)
        sub_cells.extend(chosen_cells)
    data_sub[state] = state_adata[sub_cells, :].copy()

100%|██████████| 21/21 [00:00<00:00, 1747.97it/s]
100%|██████████| 19/19 [00:00<00:00, 3192.91it/s]
100%|██████████| 22/22 [00:00<00:00, 2553.75it/s]
100%|██████████| 19/19 [00:00<00:00, 3171.18it/s]


In [21]:
#checking for subsampling population

# every state's gene set is compared against that of the second listed state
reference_state = unique_state[1]

for state in unique_state:
    state_adata = data_sub[state]
    print(state, state_adata.shape)
    shared_genes = set(state_adata.var_names) & set(data_sub[reference_state].var_names)
    print("Same Gene set check: ", state, reference_state, len(shared_genes))

print(COUNTS_MTX_FNAME)

UUUU (4128, 2000)
Same Gene set check:  UUUU UUUT 2000
UUUT (4564, 2000)
Same Gene set check:  UUUT UUUT 2000
UU (5035, 2000)
Same Gene set check:  UU UUUT 2000
UTTU (4706, 2000)
Same Gene set check:  UTTU UUUT 2000
/work/shah/users/salehis/projects/cdm/data/sclc/lx108/rna.h5ad


LX599

10%
UUa (715, 2000)
Same Gene set check:  UUa UUa  2000
UTb (515, 2000)
Same Gene set check:  UTb UUa  2000
UUb (991, 2000)
Same Gene set check:  UUb UUa  2000
UTa (1042, 2000)
Same Gene set check:  UTa UUa  2000

25%
UUa (1812, 2000)
Same Gene set check:  UUa UUa  2000
UTb (1307, 2000)
Same Gene set check:  UTb UUa  2000
UUb (2500, 2000)
Same Gene set check:  UUb UUa  2000
UTa (2618, 2000)
Same Gene set check:  UTa UUa  2000

50%
UUa (3633, 2000)
Same Gene set check:  UUa UUa  2000
UTb (2633, 2000)
Same Gene set check:  UTb UUa  2000
UUb (5011, 2000)
Same Gene set check:  UUb UUa  2000
UTa (5248, 2000)
Same Gene set check:  UTa UUa  2000

LX108

20%
UUUU (818, 2000)
Same Gene set check:  UUUU UUUT 2000
UUUT (904, 2000)
Same Gene set check:  UUUT UUUT 2000
UU (998, 2000)
Same Gene set check:  UU UUUT 2000
UTTU (934, 2000)
Same Gene set check:  UTTU UUUT 2000

40%
UUUU (1643, 2000)
Same Gene set check:  UUUU UUUT 2000
UUUT (1818, 2000)
Same Gene set check:  UUUT UUUT 2000
UU (2004, 2000)
Same Gene set check:  UU UUUT 2000
UTTU (1878, 2000)
Same Gene set check:  UTTU UUUT 2000

100%
UUUU (4128, 2000)
Same Gene set check:  UUUU UUUT 2000
UUUT (4564, 2000)
Same Gene set check:  UUUT UUUT 2000
UU (5035, 2000)
Same Gene set check:  UU UUUT 2000
UTTU (4706, 2000)
Same Gene set check:  UTTU UUUT 2000

LX33

15%
UTTU (594, 2000)
Same Gene set check:  UTTU UU 2000
UU (917, 2000)
Same Gene set check:  UU UU 2000
UUUT (981, 2000)
Same Gene set check:  UUUT UU 2000
UUUU (922, 2000)
Same Gene set check:  UUUU UU 2000

33%
UTTU (1316, 2000)
Same Gene set check:  UTTU UU 2000
UU (2026, 2000)
Same Gene set check:  UU UU 2000
UUUT (2171, 2000)
Same Gene set check:  UUUT UU 2000
UUUU (2044, 2000)
Same Gene set check:  UUUU UU 2000


66%
UTTU (2644, 2000)
Same Gene set check:  UTTU UU 2000
UU (4065, 2000)
Same Gene set check:  UU UU 2000
UUUT (4357, 2000)
Same Gene set check:  UUUT UU 2000
UUUU (4102, 2000)
Same Gene set check:  UUUU UU 2000

100%
UTTU (4021, 2000)
Same Gene set check:  UTTU UU 2000
UU (6176, 2000)
Same Gene set check:  UU UU 2000
UUUT (6619, 2000)
Same Gene set check:  UUUT UU 2000
UUUU (6235, 2000)
Same Gene set check:  UUUU UU 2000
/work/shah/users/salehis/projects/cdm/data/sclc/lx33/rna.h5ad

In [22]:
#STEP 11: copying over raw count

# put the untouched counts (saved in the 'counts' layer) back into X for every state
for state_adata in data_sub.values():
    state_adata.X = state_adata.layers['counts'].copy()

#check for sample output
data_sub[unique_state[1]].to_df()

Unnamed: 0,FAM87B,LINC00115,HES4,ISG15,ATAD3C,GABRD,AL590822.2,HES5,AL139246.5,AL139246.3,...,BACE2,TFF1,AP001626.1,ERVH48-1,FRGCA,AIRE,TRPM2,C21orf58,PCNT,MT-ND6
TATGTTCAGGGACACT-1_Lx108_UUUT,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
TTTGGAGGTCAGGAGT-1_Lx108_UUUT,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
ATTCCATAGGTTCCGC-1_Lx108_UUUT,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,6.0
AGGGTTTAGGGTGAGG-1_Lx108_UUUT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
TTCAATCTCGTCTCAC-1_Lx108_UUUT,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCATTGTTCATTGAGC-1_Lx108_UUUT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
CAATCGATCTTTACAC-1_Lx108_UUUT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACGTAGTAGTTAGTAG-1_Lx108_UUUT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
TTATTGCGTCTGTGTA-1_Lx108_UUUT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
#STEP 12: output Timepoint specific count matrix after subsampling by state
for state in unique_state:
    state_adata = data_sub[state]
    qc_basename = '{}.{}.{}.qc.tpm.csv'.format(DATASET_ID, state, frac_cells)
    EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, qc_basename)
    # tab-separated, and index=False means the cell barcodes are NOT written:
    # the file holds gene columns only, so readers must not treat the first
    # column as an index
    counts_table = state_adata.to_df()
    counts_table.to_csv(EXP_MTX_QC_FNAME, index=False, sep='\t')
    print("Finish with: ", EXP_MTX_QC_FNAME, " Shape: ", state_adata.shape)


Finish with:  /home/linl5/project/SCLC/results/lx108/lx108.UUUU.1.qc.tpm.csv  Shape:  (4128, 2000)
Finish with:  /home/linl5/project/SCLC/results/lx108/lx108.UUUT.1.qc.tpm.csv  Shape:  (4564, 2000)
Finish with:  /home/linl5/project/SCLC/results/lx108/lx108.UU.1.qc.tpm.csv  Shape:  (5035, 2000)
Finish with:  /home/linl5/project/SCLC/results/lx108/lx108.UTTU.1.qc.tpm.csv  Shape:  (4706, 2000)


In [None]:
# STEP 13: prepping for GRN, Loading in expression matrix and TF files

#Loading TF
tf_names = load_tf_names(HUMAN_TFS_FNAME)
print(HUMAN_TFS_FNAME, ": Size of TF list", len(tf_names))

#selecting !!!! CHANGE
# must match the fraction used when the expression matrices were written in STEP 12
frac_cells = 1

# Fixed seed for the GRNBoost2 regressors so that re-running the cell
# reproduces the same adjacencies.
GRN_SEED = 0

# Make sure the output folder exists before starting the long-running inference.
os.makedirs(DATA_FOLDERNAME, exist_ok=True)

#expression matrix
for state in unique_state:
    EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.{}.{}.qc.tpm.csv'.format(DATASET_ID, state, frac_cells))
    # the file was written without an index column, so every column is a gene
    ex_matrix = pd.read_csv(EXP_MTX_QC_FNAME, sep='\t', header=0)

    #Input Checking
    print("\nExpression matrix shape for", DATASET_ID, state, ex_matrix.shape)

    #STEP 14: Running GRNBOOST2 for coexpression modules
    adjacencies = grnboost2(expression_data=ex_matrix, tf_names=tf_names, seed=GRN_SEED, verbose=True)
    print("\nCOMPLETED GRNBOOST2 RUNNING FOR", DATASET_ID, state)
    print("\n", adjacencies.head())
    ADJACENCIES_FNAME = os.path.join(DATA_FOLDERNAME, "{}.{}.{}.adjacencies.tsv".format(DATASET_ID, state, frac_cells))
    adjacencies.to_csv(ADJACENCIES_FNAME, index=False, sep='\t')
    print("SUCCESSFUL WRITING TO", ADJACENCIES_FNAME, "\n")
    

/home/linl5/project/SCLC/resources/allTFs_hg38.txt : Size of TF list 1892

Expression matrix shape for lx108 UUUU (4128, 2000)
preparing dask client
parsing input
creating dask graph
8 partitions
computing dask graph


In [86]:
#STEP 15: Prepping for RCistarget: Loading Database and motif

DATABASE_FOLDER = "/home/linl5/project/SCLC/auxilliaries/"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg38_*.mc9nr.genes_vs_motifs.rankings.feather")

# glob returns matches in filesystem-dependent order; sort so the database
# list is the same on every run
db_fnames = sorted(glob.glob(DATABASES_GLOB))
if not db_fnames:
    # fail here with a clear message instead of later inside prune2df
    raise FileNotFoundError("No ranking databases found matching {}".format(DATABASES_GLOB))


def name(fname):
    """Return the base name of ``fname`` without its directory or final extension."""
    return os.path.splitext(os.path.basename(fname))[0]


# one ranking database object per matching feather file
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]

MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDERNAME,"motifs-v9-nr.hgnc-m0.001-o0.0.tbl")

In [87]:
#STEP 16: Running RCistarget

for state in unique_state:
    #reading necessary files for rcistarget
    ADJACENCIES_FNAME = os.path.join(DATA_FOLDERNAME, "{}.{}.{}.adjacencies.tsv".format(DATASET_ID, state, frac_cells))
    adjacencies = pd.read_csv(ADJACENCIES_FNAME, sep='\t')
    print("\nFINISHED READING ADJACENCIES FILE", ADJACENCIES_FNAME,"\n")
    EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.{}.{}.qc.tpm.csv'.format(DATASET_ID, state, frac_cells))
    # The expression matrix is written with index=False (STEP 12), so the file
    # contains gene columns only. Reading it with index_col=0 would turn the
    # first gene into the row index and silently drop it from the matrix; read
    # it the same way STEP 13 does.
    ex_matrix = pd.read_csv(EXP_MTX_QC_FNAME, sep='\t', header=0)
    print("\nFINISHED READING EXPRESSION MATRIX", EXP_MTX_QC_FNAME,"\n")
    MODULES_FNAME = os.path.join(DATA_FOLDERNAME, '{}.{}.{}.modules.p'.format(DATASET_ID, state, frac_cells))
    MOTIFS_FNAME = os.path.join(DATA_FOLDERNAME, '{}.{}.{}.motifs.csv'.format(DATASET_ID, state, frac_cells))
    REGULONS_FNAME = os.path.join(DATA_FOLDERNAME, '{}.{}.{}.regulons.p'.format(DATASET_ID, state, frac_cells))

    #making modules from adjacencies
    modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

    #writing modules object to file
    with open(MODULES_FNAME, 'wb') as f:
        pickle.dump(modules, f)
    print("\nCOMPLETED COEXPRESSION MODULE WRITING:", MODULES_FNAME,"\n")

    #running Rcistarget with progress bar: searching for enriched motifs and true candidate genes
    with ProgressBar():
        df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address="dask_multiprocessing")

    #writing enriched motifs with candidate target to file
    df.to_csv(MOTIFS_FNAME)
    print("\nCOMPLETED WRITING ENRICHED MOTIFS", MOTIFS_FNAME,"\n")
    print(df.head())

    #making regulon objects
    regulons = df2regulons(df)

    #writing regulon objects to file
    with open(REGULONS_FNAME, 'wb') as f:
        pickle.dump(regulons, f)
    print("\nCOMPLETED WRITING DISCOVERED REGULON", REGULONS_FNAME,"\n")


FINISHED READING ADJACENCIES FILE /home/linl5/project/SCLC/data/lx33/lx33.UTTU.0.33.adjacencies.tsv 




2023-08-07 16:31:28,203 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].



FINISHED READING EXPRESSION MATRIX /home/linl5/project/SCLC/results/lx33/lx33.UTTU.0.33.qc.tpm.csv 




2023-08-07 16:31:29,178 - pyscenic.utils - INFO - Creating modules.



COMPLETED COEXPRESSION MODULE WRITING: /home/linl5/project/SCLC/data/lx33/lx33.UTTU.0.33.modules.p 

[                                        ] | 0% Completed | 12.45 sms




[                                        ] | 0% Completed | 19.46 s




[                                        ] | 0% Completed | 19.97 s




[                                        ] | 0% Completed | 20.48 s





[                                        ] | 0% Completed | 25.75 s




[                                        ] | 0% Completed | 30.12 s




[                                        ] | 0% Completed | 30.42 s




[                                        ] | 0% Completed | 31.34 s




[                                        ] | 0% Completed | 52.48 s




[                                        ] | 0% Completed | 54.41 s




[                                        ] | 0% Completed | 55.83 s




[                                        ] | 0% Completed | 62.95 s




[                                        ] | 0% Completed | 64.27 s




[                                        ] | 0% Completed | 65.59 s




[                                        ] | 0% Completed | 66.30 s




[                                        ] | 0% Completed | 66.81 s




[                                        ] | 0% Completed | 71.49 s




[                                        ] | 0% Completed | 77.07 s




[                                        ] | 0% Completed | 77.38 s




[                                        ] | 0% Completed | 77.89 s





[                                        ] | 0% Completed | 80.53 s




[                                        ] | 0% Completed | 85.51 s




[                                        ] | 0% Completed | 87.85 s




[                                        ] | 0% Completed | 89.17 s




[                                        ] | 0% Completed | 95.37 s




[                                        ] | 0% Completed | 98.31 s




[                                        ] | 0% Completed | 106.14 s




[                                        ] | 0% Completed | 107.97 s




[                                        ] | 0% Completed | 108.48 s




[                                        ] | 0% Completed | 112.85 s




[                                        ] | 0% Completed | 119.36 s




[                                        ] | 0% Completed | 120.68 s




[                                        ] | 0% Completed | 121.29 s





[                                        ] | 0% Completed | 126.57 s




[                                        ] | 0% Completed | 129.31 s




[                                        ] | 0% Completed | 131.14 s




[                                        ] | 0% Completed | 132.97 s




[                                        ] | 0% Completed | 141.01 s




[                                        ] | 0% Completed | 142.94 s




[                                        ] | 0% Completed | 145.28 s




[                                        ] | 0% Completed | 151.28 s




[                                        ] | 0% Completed | 153.11 s




[                                        ] | 0% Completed | 154.33 s




[                                        ] | 0% Completed | 155.04 s




[                                        ] | 0% Completed | 155.65 s




[                                        ] | 0% Completed | 159.41 s




[                                        ] | 0% Completed | 166.53 s




[                                        ] | 0% Completed | 166.73 s




[                                        ] | 0% Completed | 168.16 s





[                                        ] | 0% Completed | 171.21 s




[                                        ] | 0% Completed | 174.26 s




[                                        ] | 0% Completed | 175.17 s




[                                        ] | 0% Completed | 175.68 s




[                                        ] | 0% Completed | 183.10 s




[                                        ] | 0% Completed | 184.53 s




[                                        ] | 0% Completed | 191.95 s




[                                        ] | 0% Completed | 193.78 s




[                                        ] | 0% Completed | 195.30 s




[########################################] | 100% Completed | 204.75 s

COMPLETED WRITING ENRICHED MOTIFS /home/linl5/project/SCLC/data/lx33/lx33.UTTU.0.33.motifs.csv 

                                         Enrichment            \
                                                AUC       NES   
TF    MotifID                                                   
ASCL2 dbcorrdb__RCOR1__ENCSR000EFG_1__m4   0.104411  3.002820   
      taipale__Ascl2_DBD_RRCAGCTGYY_repr   0.109456  3.239857   
      cisbp__M5987                         0.106915  3.120456   
ATF3  taipale__ATF4_DBD_NNATGAYGCAATN      0.106607  3.889506   
      cisbp__M5292                         0.108366  3.997711   

                                                                \
                                         MotifSimilarityQvalue   
TF    MotifID                                                    
ASCL2 dbcorrdb__RCOR1__ENCSR000EFG_1__m4              0.000098   
      taipale__Ascl2_DBD_RRCAGCTGYY_repr      


2023-08-07 16:35:02,305 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].



FINISHED READING EXPRESSION MATRIX /home/linl5/project/SCLC/results/lx33/lx33.UU.0.33.qc.tpm.csv 




2023-08-07 16:35:03,275 - pyscenic.utils - INFO - Creating modules.



COMPLETED COEXPRESSION MODULE WRITING: /home/linl5/project/SCLC/data/lx33/lx33.UU.0.33.modules.p 

[                                        ] | 0% Completed | 9.66 s ms




[                                        ] | 0% Completed | 10.27 s




[                                        ] | 0% Completed | 12.71 s




[                                        ] | 0% Completed | 15.15 s




[                                        ] | 0% Completed | 25.31 s




[                                        ] | 0% Completed | 32.63 s




[                                        ] | 0% Completed | 34.36 s




[                                        ] | 0% Completed | 51.23 s




[                                        ] | 0% Completed | 63.94 s




[                                        ] | 0% Completed | 68.93 s




[                                        ] | 0% Completed | 69.84 s




[                                        ] | 0% Completed | 75.43 s




[                                        ] | 0% Completed | 80.52 s




[                                        ] | 0% Completed | 83.16 s




[                                        ] | 0% Completed | 92.31 s




[                                        ] | 0% Completed | 93.22 s




[                                        ] | 0% Completed | 116.92 s




[                                        ] | 0% Completed | 118.34 s




[                                        ] | 0% Completed | 122.51 s




[                                        ] | 0% Completed | 124.34 s




[                                        ] | 0% Completed | 133.69 s




[                                        ] | 0% Completed | 141.01 s




[                                        ] | 0% Completed | 145.38 s




[                                        ] | 0% Completed | 157.19 s




[                                        ] | 0% Completed | 165.12 s




[                                        ] | 0% Completed | 167.35 s




[                                        ] | 0% Completed | 168.27 s




[                                        ] | 0% Completed | 172.54 s




[                                        ] | 0% Completed | 179.05 s




[                                        ] | 0% Completed | 182.60 s




[                                        ] | 0% Completed | 191.75 s




[                                        ] | 0% Completed | 192.67 s




[########################################] | 100% Completed | 217.18 s

COMPLETED WRITING ENRICHED MOTIFS /home/linl5/project/SCLC/data/lx33/lx33.UU.0.33.motifs.csv 

                                         Enrichment            \
                                                AUC       NES   
TF    MotifID                                                   
ASCL1 transfac_pro__M01302                 0.107635  3.989219   
      swissregulon__hs__MYFfamily.p2       0.110273  4.123076   
ASCL2 dbcorrdb__RCOR1__ENCSR000EFG_1__m4   0.134790  3.670968   
      taipale__Ascl2_DBD_RRCAGCTGYY_repr   0.124685  3.248325   
      cisbp__M5987                         0.122565  3.159653   

                                                                \
                                         MotifSimilarityQvalue   
TF    MotifID                                                    
ASCL1 transfac_pro__M01302                            0.000986   
      swissregulon__hs__MYFfamily.p2            


2023-08-07 16:38:49,738 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].



FINISHED READING EXPRESSION MATRIX /home/linl5/project/SCLC/results/lx33/lx33.UUUT.0.33.qc.tpm.csv 




2023-08-07 16:38:50,727 - pyscenic.utils - INFO - Creating modules.



COMPLETED COEXPRESSION MODULE WRITING: /home/linl5/project/SCLC/data/lx33/lx33.UUUT.0.33.modules.p 

[                                        ] | 0% Completed | 15.75 sms




[                                        ] | 0% Completed | 26.31 s




[                                        ] | 0% Completed | 39.51 s




[                                        ] | 0% Completed | 44.79 s




[####                                    ] | 10% Completed | 48.25 s




[####                                    ] | 10% Completed | 50.68 s





[####                                    ] | 10% Completed | 50.99 s




[####                                    ] | 10% Completed | 53.02 s




[####                                    ] | 10% Completed | 54.85 s




[####                                    ] | 10% Completed | 56.57 s




[####                                    ] | 10% Completed | 61.35 s




[####                                    ] | 10% Completed | 62.06 s




[####                                    ] | 10% Completed | 63.99 s




[####                                    ] | 10% Completed | 64.49 s




[####                                    ] | 10% Completed | 108.75 s




[####                                    ] | 10% Completed | 111.09 s




[####                                    ] | 10% Completed | 111.80 s




[####                                    ] | 10% Completed | 118.00 s




[####                                    ] | 10% Completed | 118.91 s




[####                                    ] | 10% Completed | 119.32 s




[####                                    ] | 10% Completed | 119.62 s




[####                                    ] | 10% Completed | 120.84 s




[####                                    ] | 10% Completed | 121.15 s




[####                                    ] | 10% Completed | 124.60 s




[####                                    ] | 10% Completed | 125.82 s




[####                                    ] | 10% Completed | 126.12 s




[####                                    ] | 10% Completed | 135.27 s




[####                                    ] | 10% Completed | 135.58 s




[####                                    ] | 10% Completed | 138.02 s




[####                                    ] | 10% Completed | 140.66 s




[####                                    ] | 10% Completed | 143.10 s




[####                                    ] | 10% Completed | 143.92 s




[####                                    ] | 10% Completed | 146.15 s




[####                                    ] | 10% Completed | 146.66 s




[####                                    ] | 10% Completed | 153.07 s




[####                                    ] | 10% Completed | 162.02 s




[####                                    ] | 10% Completed | 173.20 s




[####                                    ] | 10% Completed | 181.84 s




[####                                    ] | 10% Completed | 183.47 s





[####                                    ] | 10% Completed | 183.77 s




[####                                    ] | 10% Completed | 187.13 s




[####                                    ] | 10% Completed | 189.57 s




[####                                    ] | 10% Completed | 190.38 s




[####                                    ] | 10% Completed | 194.35 s




[####                                    ] | 10% Completed | 195.16 s




[####                                    ] | 10% Completed | 195.77 s




[####                                    ] | 10% Completed | 196.28 s




[####                                    ] | 10% Completed | 196.78 s




[####                                    ] | 10% Completed | 199.53 s




[####                                    ] | 10% Completed | 201.56 s




[############################            ] | 70% Completed | 213.82 s




[############################            ] | 70% Completed | 214.73 s




[############################            ] | 70% Completed | 215.13 s




[############################            ] | 70% Completed | 215.34 s




[############################            ] | 70% Completed | 215.74 s




[############################            ] | 70% Completed | 216.45 s




[############################            ] | 70% Completed | 220.91 s




[############################            ] | 70% Completed | 221.12 s




[############################            ] | 70% Completed | 227.31 s




[############################            ] | 70% Completed | 228.12 s




[############################            ] | 70% Completed | 228.73 s




[############################            ] | 70% Completed | 230.86 s




[############################            ] | 70% Completed | 234.41 s




[############################            ] | 70% Completed | 236.34 s




[############################            ] | 70% Completed | 240.40 s




[############################            ] | 70% Completed | 240.91 s




[########################################] | 100% Completed | 242.58 s

COMPLETED WRITING ENRICHED MOTIFS /home/linl5/project/SCLC/data/lx33/lx33.UUUT.0.33.motifs.csv 

                                                  Enrichment            \
                                                         AUC       NES   
TF    MotifID                                                            
ASCL2 dbcorrdb__TCF12__ENCSR000BGZ_1__m1            0.110078  4.259958   
      transfac_pro__M02737                          0.089683  3.116124   
ATF3  swissregulon__hs__FOS_FOS_B_L1__JUN_B_D_.p2   0.080327  3.217880   
      hocomoco__FOS_HUMAN.H11MO.0.A                 0.078251  3.036769   
      hocomoco__JUNB_MOUSE.H11MO.0.A                0.079459  3.142146   

                                                                         \
                                                  MotifSimilarityQvalue   
TF    MotifID                                                             
ASCL2 dbcorrd


2023-08-07 16:43:02,268 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].



FINISHED READING EXPRESSION MATRIX /home/linl5/project/SCLC/results/lx33/lx33.UUUU.0.33.qc.tpm.csv 




2023-08-07 16:43:03,114 - pyscenic.utils - INFO - Creating modules.



COMPLETED COEXPRESSION MODULE WRITING: /home/linl5/project/SCLC/data/lx33/lx33.UUUU.0.33.modules.p 

[                                        ] | 0% Completed | 11.99 sms




[                                        ] | 0% Completed | 17.78 s




[                                        ] | 0% Completed | 19.81 s




[                                        ] | 0% Completed | 21.64 s




[                                        ] | 0% Completed | 23.26 s





[                                        ] | 0% Completed | 24.17 s




[                                        ] | 0% Completed | 24.57 s




[                                        ] | 0% Completed | 27.40 s




[                                        ] | 0% Completed | 27.61 s




[                                        ] | 0% Completed | 27.81 s




[                                        ] | 0% Completed | 34.80 s




[                                        ] | 0% Completed | 35.91 s




[                                        ] | 0% Completed | 38.66 s




[                                        ] | 0% Completed | 40.18 s




[                                        ] | 0% Completed | 41.50 s




[                                        ] | 0% Completed | 42.62 s




[                                        ] | 0% Completed | 43.53 s





[                                        ] | 0% Completed | 43.94 s




[                                        ] | 0% Completed | 44.65 s




[                                        ] | 0% Completed | 45.05 s




[####                                    ] | 10% Completed | 47.02 s




[####                                    ] | 10% Completed | 110.88 s




[####                                    ] | 10% Completed | 125.69 s




[####                                    ] | 10% Completed | 126.91 s





[####                                    ] | 10% Completed | 127.32 s




[####                                    ] | 10% Completed | 133.00 s




[####                                    ] | 10% Completed | 136.05 s




[####                                    ] | 10% Completed | 140.11 s




[####                                    ] | 10% Completed | 144.17 s




[####                                    ] | 10% Completed | 145.70 s





[####                                    ] | 10% Completed | 146.61 s




[####                                    ] | 10% Completed | 148.44 s





[####                                    ] | 10% Completed | 148.85 s




[####                                    ] | 10% Completed | 150.37 s





[####                                    ] | 10% Completed | 150.68 s




[####                                    ] | 10% Completed | 151.39 s




[####                                    ] | 10% Completed | 153.63 s




[####                                    ] | 10% Completed | 155.76 s




[####                                    ] | 10% Completed | 156.27 s




[####                                    ] | 10% Completed | 160.24 s




[####                                    ] | 10% Completed | 167.26 s




[####                                    ] | 10% Completed | 172.95 s




[####                                    ] | 10% Completed | 176.10 s




[####                                    ] | 10% Completed | 180.88 s




[####                                    ] | 10% Completed | 184.94 s




[####                                    ] | 10% Completed | 185.65 s





[####                                    ] | 10% Completed | 186.87 s





[####                                    ] | 10% Completed | 187.48 s




[####                                    ] | 10% Completed | 188.39 s





[####                                    ] | 10% Completed | 189.61 s




[####                                    ] | 10% Completed | 190.63 s




[####                                    ] | 10% Completed | 191.54 s




[####                                    ] | 10% Completed | 191.75 s




[####                                    ] | 10% Completed | 195.61 s




[####                                    ] | 10% Completed | 198.96 s





[####                                    ] | 10% Completed | 200.69 s




[####                                    ] | 10% Completed | 201.71 s




[####                                    ] | 10% Completed | 203.23 s





[####                                    ] | 10% Completed | 203.84 s




[####                                    ] | 10% Completed | 205.77 s




[####                                    ] | 10% Completed | 207.70 s




[####                                    ] | 10% Completed | 208.31 s





[####                                    ] | 10% Completed | 208.82 s




[##########                              ] | 25% Completed | 210.05 s





[################                        ] | 40% Completed | 210.47 s




[################                        ] | 40% Completed | 210.87 s




[################                        ] | 40% Completed | 211.28 s





[################                        ] | 40% Completed | 211.69 s





[################                        ] | 40% Completed | 212.09 s




[################                        ] | 40% Completed | 212.80 s





[################                        ] | 40% Completed | 213.11 s




[################                        ] | 40% Completed | 214.94 s




[################                        ] | 40% Completed | 215.35 s




[################                        ] | 40% Completed | 216.36 s




[################                        ] | 40% Completed | 217.58 s




[################                        ] | 40% Completed | 220.42 s




[########################################] | 100% Completed | 250.36 s

COMPLETED WRITING ENRICHED MOTIFS /home/linl5/project/SCLC/data/lx33/lx33.UUUU.0.33.motifs.csv 

                                                   Enrichment            \
                                                          AUC       NES   
TF    MotifID                                                             
ASCL2 taipale_cyt_meth__MYOD1_NAACANNTGTYN_FL_meth   0.105471  3.310529   
      dbcorrdb__TCF12__ENCSR000BGZ_1__m1             0.117075  3.864414   
      taipale_cyt_meth__TCF21_NAACAGCTGYYN_eDBD      0.124886  4.237286   
      taipale_cyt_meth__MSC_NNACAGCTGTNN_FL_repr     0.100500  3.073247   
ATF3  transfac_pro__M07414                           0.094132  3.596068   

                                                                          \
                                                   MotifSimilarityQvalue   
TF    MotifID                                                              
AS

After grouping the data by timepoints
- How is clustering in this case different from just making an AnnData subset selected by timepoint (pin)?
- Do I do PCA after clustering? -> PCA changes the expression matrix, so what is passed to GENIE3 is not the PCA output, just the raw, unnormalized, unperturbed data (apart from basic filtering).
- If you have more than one condition, it’s often helpful to perform integration to align the cells -> and then within each timepoint there would be three batches, so I should find a way to remove this batch effect, right?