# 1 preprocessing 

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import csv
import gzip
import anndata as ad
import glob

sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

-----
anndata     0.8.0
scanpy      1.9.3
-----
MulticoreTSNE               NA
PIL                         9.1.0
appnope                     0.1.2
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.0
cloudpickle                 2.1.0
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2022.9.0
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
fsspec                      2022.8.2
h5py                        3.7.0
hypergeom_ufunc             NA
igraph                      0.10.1
ipykernel                   6.9.1
ipython_genutils            0.2.0
ipywidgets                  7.6.5
jedi                        0.18.1
jinja2                      3.0.3
joblib   

In [192]:
sample = 'SF11815'

### Reading in expression data: 10x Genomics mtx files

In [6]:
def reprocess(sample): 
    output_folder = "/Users/lidiayung/project/resource/specimens"
    resource_folder = "/Users/lidiayung/project/resource/GSE174554_RAW/"

    matrix_path = glob.glob(f"{resource_folder}/GSM*_{sample}_matrix.mtx.gz")[0]

    features_path = glob.glob(f"{resource_folder}/GSM*_{sample}_features.tsv.gz")[0]

    barcodes_path = glob.glob(f"{resource_folder}/GSM*_{sample}_barcodes.tsv.gz")[0]



    output_path = os.path.join(output_folder, sample)


# path to unfiltered loom file (this will be created in the optional steps below)
    f_loom_path_unfilt = "unfiltered.loom" # test dataset, n=500 cells

# # path to loom file with basic filtering applied (this will be created in the "initial filtering" step below). Optional.
    #f_loom_path_scenic = "filtered_scenic.loom"

# path to anndata object, which will be updated to store Scanpy results as they are generated below
    f_anndata_path = "reprocessanndata.h5ad"

# path to pyscenic output
    #f_pyscenic_output = "output.loom"
#pyscenic_output.loom
# loom output, generated from a combination of Scanpy and pySCENIC results:
    #f_final_loom = 'scenic_integrated-output.loom'

    mat = scipy.io.mmread(matrix_path)
    feature_ids = [row[0] for row in csv.reader(gzip.open(features_path, mode="rt"), delimiter="\t")]

    gene_names = [row[0] for row in csv.reader(gzip.open(features_path, mode="rt"), delimiter="\t")]

    feature_types = [row[0] for row in csv.reader(gzip.open(features_path, mode="rt"), delimiter="\t")]

    barcodes = [row[0] for row in csv.reader(gzip.open(barcodes_path, mode="rt"), delimiter="\t")]

    matrix = pd.DataFrame.sparse.from_spmatrix(mat)

    matrix.columns = barcodes

    matrix=matrix.transpose() 

    matrix.columns = gene_names

    matrix

    matrix.shape

# convert the index and columns to DataFrame objects
    obs_df = matrix.index.to_frame(index=False)
    var_df = matrix.columns.to_frame(index=False)

# create an AnnData object from the DataFrame
    adata = ad.AnnData(X=matrix.values, obs=obs_df, var=var_df)

# print the resulting AnnData object
    print(adata)
    row_attrs = { 
        "Gene": np.array(var_df[0]) ,}
    col_attrs = { 
        "CellID":  np.array(matrix.index) ,
        "nGene": np.array( np.sum(adata.X.transpose()>0 , axis=0)).flatten() ,
        "nUMI": np.array( np.sum(adata.X.transpose() , axis=0)).flatten() ,}
    lp.create( f_loom_path_unfilt, adata.X.transpose(), row_attrs, col_attrs )


    info = adata.X.transpose()

    print(info.shape)

    print('loaded an expression matrix of {} cells by {} genes'.format(info.shape[1], info.shape[0]))

    print('number of genes: {}'.format(len(adata.var.index)))

    print('number of cells: {}'.format(len(adata.obs.index)))

# read unfiltered data from a loom file
    adata = sc.read_loom( f_loom_path_unfilt )
    adata

    nCountsPerGene = np.sum(adata.X, axis=0)

    nCellsPerGene = np.sum(adata.X>0, axis=0)

# Show info
    print("Number of counts (in the dataset units) per gene:", nCountsPerGene.min(), " - " ,nCountsPerGene.max())
    print("Number of cells in which each gene is detected:", nCellsPerGene.min(), " - " ,nCellsPerGene.max())


    nCells=adata.X.shape[0]

# pySCENIC thresholds
    minCountsPerGene=3*.01*nCells # 3 counts in 1% of cells
    print("minCountsPerGene: ", minCountsPerGene)

    minSamples=.01*nCells # 1% of cells
    print("minSamples: ", minSamples)

# simply compute the number of genes per cell (computers 'n_genes' column)
    sc.pp.filter_cells(adata, min_genes=0)
# mito and genes/counts cuts
    mito_genes = adata.var_names.str.startswith('MT-')
# for each cell compute fraction of counts in mito genes vs. all genes
    adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1

# initial cuts
    sc.pp.filter_cells(adata, min_genes=200 )
#sc.pp.filter_genes(adata, min_cells=3 )
#adata = adata[adata.obs['n_genes'] < 4000, :]
    adata = adata[adata.obs['percent_mito'] <=0.025, :]


    adata.write( f_anndata_path )

# create basic row and column attributes for the loom file:
    #row_attrs = {"Gene": np.array(adata.var_names) ,}
    #col_attrs = {
    #    "CellID": np.array(adata.obs_names) ,
    #    "nGene": np.array( np.sum(adata.X.transpose()>0 , axis=0)).flatten() ,
    #    "nUMI": np.array( np.sum(adata.X.transpose() , axis=0)).flatten() ,}

    #lp.create( f_loom_path_scenic, adata.X.transpose(), row_attrs, col_attrs)


    info = adata.X.transpose()

    print(info.shape)

    print('storing for SCENIC an expression matrix of {} cells by {} genes'.format(info.shape[1], info.shape[0]))

    print('number of genes: {}'.format(len(adata.var_names)))

    print('number of cells: {}'.format(len(adata.obs_names)))
    
    return adata.obs.nUMI

In [7]:
sample='SF2777'
reprocess(sample)

  adata = ad.AnnData(X=matrix.values, obs=obs_df, var=var_df)


AnnData object with n_obs × n_vars = 4183 × 33694
    obs: 0
    var: 0
(33694, 4183)
loaded an expression matrix of 4183 cells by 33694 genes
number of genes: 33694
number of cells: 4183
Number of counts (in the dataset units) per gene: 0.0  -  605365.0
Number of cells in which each gene is detected: 0  -  4181
minCountsPerGene:  125.49
minSamples:  41.83
(33694, 3708)
storing for SCENIC an expression matrix of 3708 cells by 33694 genes
number of genes: 33694
number of cells: 3708


CellID
AAACCCAAGGGATCAC-1     579.0
AAACCCAGTCGATTTG-1     402.0
AAACCCAGTCGTCAGC-1    1518.0
AAACCCAGTTGTAAAG-1     891.0
AAACGAAGTAGACAAT-1    2795.0
                       ...  
TTTGTTGAGTACCATC-1    1559.0
TTTGTTGCACTGGCCA-1     942.0
TTTGTTGGTACTCAAC-1    1253.0
TTTGTTGTCCCAAGTA-1    1574.0
TTTGTTGTCGGATTAC-1     843.0
Name: nUMI, Length: 3708, dtype: float32

In [252]:
sample ='SF10514'
nUMI_result = reprocess(sample)

  adata = ad.AnnData(X=matrix.values, obs=obs_df, var=var_df)


AnnData object with n_obs × n_vars = 3356 × 33694
    obs: 0
    var: 0
(33694, 3356)
loaded an expression matrix of 3356 cells by 33694 genes
number of genes: 33694
number of cells: 3356
Number of counts (in the dataset units) per gene: 0.0  -  681102.0
Number of cells in which each gene is detected: 0  -  3355
minCountsPerGene:  100.67999999999999
minSamples:  33.56
(33694, 3355)
storing for SCENIC an expression matrix of 3355 cells by 33694 genes
number of genes: 33694
number of cells: 3355


### Carry out the filtering steps:

In [284]:
def percalculator(sample):
    
    file = "/Users/lidiayung/project/resource/GSE174554_RAW/GSE174554_Tumor_normal_metadata.txt"

    metadata= pd.read_csv(file,sep=' ')

    metadata.head()
    new_df = metadata[metadata["Sample#"] == sample].copy()

    
    new_df['Barcode'] = new_df['Barcode'].str.replace('\.\d', '')
    new_df['Barcode'] = new_df['Barcode']+'-1'
    print(new_df)
    
    intersection_barcodes = set(new_df['Barcode']).intersection(nUMI_result.index)
    tumor = new_df[new_df['Barcode'].isin(intersection_barcodes) & (metadata['Tumor_Normal_annotation'] == 'Tumor')]

    percentage = "{:.2%}".format(len(tumor)/len(nUMI_result))
    print("Percentage: {:.2%}".format(len(tumor)/len(nUMI_result)))
    post_filtering = f"{len(tumor)}/{len(nUMI_result) - len(tumor)}"
    print(post_filtering)

In [285]:
samples= ['SF11981', 'SF9372','SF10108']

In [316]:
sample = 'SF10108'
nUMI_result = reprocess(sample)
nUMI_result

  adata = ad.AnnData(X=matrix.values, obs=obs_df, var=var_df)


AnnData object with n_obs × n_vars = 4446 × 33694
    obs: 0
    var: 0
(33694, 4446)
loaded an expression matrix of 4446 cells by 33694 genes
number of genes: 33694
number of cells: 4446
Number of counts (in the dataset units) per gene: 0.0  -  1481281.0
Number of cells in which each gene is detected: 0  -  4423
minCountsPerGene:  133.38
minSamples:  44.46
(33694, 4405)
storing for SCENIC an expression matrix of 4405 cells by 33694 genes
number of genes: 33694
number of cells: 4405


CellID
AAACCCAAGTCTGGTT-1    2036.0
AAACCCACAGTAACGG-1    4961.0
AAACCCAGTGTTTGCA-1    1319.0
AAACCCATCGGCAGTC-1    2995.0
AAACCCATCTTGGAAC-1    5099.0
                       ...  
TTTGTTGCATCGCCTT-1    2203.0
TTTGTTGGTTTCACTT-1    3774.0
TTTGTTGTCATCTATC-1    4960.0
TTTGTTGTCCTGTACC-1    6780.0
TTTGTTGTCTCATTAC-1    6461.0
Name: nUMI, Length: 4405, dtype: float32

In [317]:
    
file = "/Users/lidiayung/project/resource/GSE174554_RAW/GSE174554_Tumor_normal_metadata.txt"

metadata= pd.read_csv(file,sep=' ')

metadata.head()


new_df = metadata[metadata["Sample#"] == 'SF10108'].copy()
new_df['Barcode'] = new_df['Barcode'].str.replace('\.\d', '')
new_df['Barcode'] = new_df['Barcode']+'-1'
print(new_df)
    

        Sample#             Barcode Tumor_Normal_annotation
194895  SF10108  AAAGAACCAGTAGAGC-1                  Normal
194896  SF10108  AACGAAAGTCAGGTAG-1                  Normal
194897  SF10108  AATTTCCCAGTCCCGA-1                  Normal
194898  SF10108  ACAACCACACAAGCTT-1                  Normal
194899  SF10108  ACGGAAGGTAGAGACC-1                  Normal
...         ...                 ...                     ...
199336  SF10108  GTGGAAGTCGCCGATG-1                  Normal
199337  SF10108  TAAGTCGGTGGTAATA-1                  Normal
199338  SF10108  AGACAGGTCATTGTGG-1                   Tumor
199339  SF10108  CATGGATAGACCTTTG-1                  Normal
199340  SF10108  GGCACGTTCTGGTCAA-1                   Tumor

[4446 rows x 3 columns]


  new_df['Barcode'] = new_df['Barcode'].str.replace('\.\d', '')


In [318]:
intersection_barcodes = set(new_df['Barcode']).intersection(nUMI_result.index)
tumor = new_df[new_df['Barcode'].isin(intersection_barcodes) & (metadata['Tumor_Normal_annotation'] == 'Tumor')]

percentage = "{:.2%}".format(len(tumor)/len(nUMI_result))
print("Percentage: {:.2%}".format(len(tumor)/len(nUMI_result)))
post_filtering = f"{len(tumor)}/{len(nUMI_result) - len(tumor)}"
print(post_filtering)
print({'Sample': sample, 'Post-filtering': post_filtering,'Percentage':percentage})

Percentage: 64.04%
2821/1584
{'Sample': 'SF10108', 'Post-filtering': '2821/1584', 'Percentage': '64.04%'}


  tumor = new_df[new_df['Barcode'].isin(intersection_barcodes) & (metadata['Tumor_Normal_annotation'] == 'Tumor')]


## Further pre-processing of expression data

In [None]:
# save a copy of the raw data
adata.raw = adata

# Total-count normalize (library-size correct) to 10,000 reads/cell
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

# log transform the data.
sc.pp.log1p(adata)

# identify highly variable genes.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)

# keep only highly variable genes:
adata = adata[:, adata.var['highly_variable']]

# regress out total counts per cell and the percentage of mitochondrial genes expressed
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'] ) #, n_jobs=args.threads)

# scale each gene to unit variance, clip values exceeding SD 10.
sc.pp.scale(adata, max_value=10)

# update the anndata file:
adata.write( f_anndata_path )

### 1 PCA

In [None]:
# adata = sc.read_h5ad( f_anndata_path )
# principal component analysis
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True)
adata.write( f_anndata_path )

### 2 Visualization of highly variable genes

In [None]:
# neighborhood graph of cells (determine optimal number of PCs here)
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
# compute UMAP
sc.tl.umap(adata)
# tSNE
tsne = TSNE( n_jobs=2 )
adata.obsm['X_tsne'] = tsne.fit_transform( adata.X )
adata.write( f_anndata_path )

### 3 Clustering

In [None]:
# cluster the neighbourhood graph
sc.tl.louvain(adata, resolution=0.4)

sc.pl.umap(adata, color=['louvain'] )

In [None]:
# find marker genes
sc.tl.rank_genes_groups(adata, 'louvain', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

# sc.tl.rank_genes_groups(adata, 'louvain', method='logreg')
# sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(10)
adata.write( f_anndata_path )

In [None]:
### 4 distribution of genes per cell

In [None]:
nGenesDetectedPerCell = np.sum(adata.X>0, axis=1)
nGenesDetectedPerCell = pd.Series(nGenesDetectedPerCell)
percentiles = nGenesDetectedPerCell.quantile([0.01, 0.05, 0.10, 0.50, 1])
print(percentiles)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150)
sns.distplot(nGenesDetectedPerCell, norm_hist=False, kde=False, bins='fd')
for i,x in enumerate(percentiles):
    fig.gca().axvline(x=x, ymin=0,ymax=1, color='red')
    ax.text(x=x, y=ax.get_ylim()[1], s=f'{int(x)} ({percentiles.index.values[i]*100}%)', color='red', rotation=30, size='x-small',rotation_mode='anchor' )
ax.set_xlabel('# of genes')
ax.set_ylabel('# of cells')
fig.tight_layout()

## SCENIC

In [None]:
f_tfs = "/home/jing/scratch/resource/allTFs_hg38.txt"

In [None]:
!pyscenic grn {f_loom_path_scenic} {f_tfs} -o adj.csv --num_workers 8

In [None]:
adjacencies = pd.read_csv("adj.csv", index_col=False, sep='\t')

In [None]:
adjacencies.head()

In [None]:
adjacencies.head()

In [None]:
import glob
# ranking databases
f_db_glob = "/home/jing/scratch/resource/*feather"
f_db_names = ' '.join( glob.glob(f_db_glob) )

# motif databases
f_motif_path = "/home/jing/scratch/resource/motifs-v9-nr.hgnc-m0.001-o0.0.tbl"

In [None]:
!pyscenic ctx adj.csv \
    {f_db_names} \
    --annotations_fname {f_motif_path} \
    --expression_mtx_fname {f_loom_path_scenic} \
    --output reg.csv \
    --mask_dropouts \
    --num_workers 8

In [None]:
nGenesDetectedPerCell = np.sum(adata.X>0, axis=1)
nGenesDetectedPerCell = pd.Series(nGenesDetectedPerCell)
percentiles = nGenesDetectedPerCell.quantile([0.01, 0.05, 0.10, 0.50, 1])
print(percentiles)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150)
sns.distplot(nGenesDetectedPerCell, norm_hist=False, kde=False, bins='fd')
for i,x in enumerate(percentiles):
    fig.gca().axvline(x=x, ymin=0,ymax=1, color='red')
    ax.text(x=x, y=ax.get_ylim()[1], s=f'{int(x)} ({percentiles.index.values[i]*100}%)', color='red', rotation=30, size='x-small',rotation_mode='anchor' )
ax.set_xlabel('# of genes')
ax.set_ylabel('# of cells')
fig.tight_layout()

In [None]:
!pyscenic aucell \
    {f_loom_path_scenic} \
    reg.csv \
    --output {f_pyscenic_output} \
    --num_workers 8

In [None]:
import json
import zlib
import base64
import loompy as lp
import pandas as pd

# collect SCENIC AUCell output
lf = lp.connect(f_pyscenic_output, mode='r+', validate=False)
auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
lf.close()

In [None]:
auc_mtx.shape

In [None]:
sns.clustermap(auc_mtx, figsize=(12,12))

In [None]:
import umap

# UMAP
runUmap = umap.UMAP(n_neighbors=10, min_dist=0.4, metric='correlation').fit_transform
dr_umap = runUmap( auc_mtx )
pd.DataFrame(dr_umap, columns=['X', 'Y'], index=auc_mtx.index).to_csv( "scenic_umap.txt", sep='\t')
# tSNE
tsne = TSNE( n_jobs=8 )
dr_tsne = tsne.fit_transform( auc_mtx )
pd.DataFrame(dr_tsne, columns=['X', 'Y'], index=auc_mtx.index).to_csv( "scenic_tsne.txt", sep='\t')

In [None]:
pdf = pd.DataFrame(dr_umap, columns=['X', 'Y'], index=auc_mtx.index)
pdf.head()

In [None]:
plt.plot(pdf['X'], pdf['Y'], 'o', alpha=1/13)
plt.show()

In [None]:
pdf = pd.DataFrame(dr_tsne, columns=['X', 'Y'], index=auc_mtx.index)
plt.plot(pdf['X'], pdf['Y'], 'o', alpha=1/13)
plt.show()

In [None]:
# scenic output
lf = lp.connect( f_pyscenic_output, mode='r+', validate=False )
meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))
#exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID)
auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
regulons = lf.ra.Regulons
dr_umap = pd.read_csv( 'scenic_umap.txt', sep='\t', header=0, index_col=0 )
dr_tsne = pd.read_csv( 'scenic_tsne.txt', sep='\t', header=0, index_col=0 )
###

In [None]:
auc_mtx.columns = auc_mtx.columns.str.replace('\(','_(')
regulons.dtype.names = tuple( [ x.replace("(","_(") for x in regulons.dtype.names ] )
# regulon thresholds
rt = meta['regulonThresholds']
for i,x in enumerate(rt):
    tmp = x.get('regulon').replace("(","_(")
    x.update( {'regulon': tmp} )

In [None]:
tsneDF = pd.DataFrame(adata.obsm['X_tsne'], columns=['_X', '_Y'])

Embeddings_X = pd.DataFrame( index=lf.ca.CellID )
Embeddings_X = pd.concat( [
        pd.DataFrame(adata.obsm['X_umap'],index=adata.obs.index)[0] ,
        pd.DataFrame(adata.obsm['X_pca'],index=adata.obs.index)[0] ,
        dr_tsne['X'] ,
        dr_umap['X']
    ], sort=False, axis=1, join='outer' )
Embeddings_X.columns = ['1','2','3','4']

Embeddings_Y = pd.DataFrame( index=lf.ca.CellID )
Embeddings_Y = pd.concat( [
        pd.DataFrame(adata.obsm['X_umap'],index=adata.obs.index)[1] ,
        pd.DataFrame(adata.obsm['X_pca'],index=adata.obs.index)[1] ,
        dr_tsne['Y'] ,
        dr_umap['Y']
    ], sort=False, axis=1, join='outer' )
Embeddings_Y.columns = ['1','2','3','4']

In [None]:
metaJson = {}

metaJson['embeddings'] = [
    {
        "id": -1,
        "name": f"Scanpy t-SNE (highly variable genes)"
    },
    {
        "id": 1,
        "name": f"Scanpy UMAP  (highly variable genes)"
    },
    {
        "id": 2,
        "name": "Scanpy PC1/PC2"
    },
    {
        "id": 3,
        "name": "SCENIC AUC t-SNE"
    },
    {
        "id": 4,
        "name": "SCENIC AUC UMAP"
    },
]

metaJson["clusterings"] = [{
            "id": 0,
            "group": "Scanpy",
            "name": "Scanpy louvain default resolution",
            "clusters": [],
        }]

metaJson["metrics"] = [
        {
            "name": "nUMI"
        }, {
            "name": "nGene"
        }, {
            "name": "Percent_mito"
        }
]

metaJson["annotations"] = [
    {
        "name": "Louvain_clusters_Scanpy",
        "values": list(set( adata.obs['louvain'].astype(np.str) ))
    },
    #{
    #    "name": "Genotype",
    #    "values": list(set(adata.obs['Genotype'].values))
    #},
    #{
    #    "name": "Timepoint",
    #    "values": list(set(adata.obs['Timepoint'].values))
    #},
    #{
    #    "name": "Sample",
    #    "values": list(set(adata.obs['Sample'].values))
    #}
]

# SCENIC regulon thresholds:
metaJson["regulonThresholds"] = rt

for i in range(max(set([int(x) for x in adata.obs['louvain']])) + 1):
    clustDict = {}
    clustDict['id'] = i
    clustDict['description'] = f'Unannotated Cluster {i + 1}'
    metaJson['clusterings'][0]['clusters'].append(clustDict)
    
clusterings = pd.DataFrame()
clusterings["0"] = adata.obs['louvain'].values.astype(np.int64)

In [None]:
def dfToNamedMatrix(df):
    arr_ip = [tuple(i) for i in df.values]
    dtyp = np.dtype(list(zip(df.dtypes.index, df.dtypes)))
    arr = np.array(arr_ip, dtype=dtyp)
    return arr

In [None]:
col_attrs = {
    "CellID": np.array(adata.obs.index),
    "nUMI": np.array(adata.obs['n_counts'].values),
    "nGene": np.array(adata.obs['n_genes'].values),
    "Louvain_clusters_Scanpy": np.array( adata.obs['louvain'].values ),
    #"Genotype": np.array(adata.obs['Genotype'].values),
    #"Timepoint": np.array(adata.obs['Timepoint'].values),
    #"Sample": np.array(adata.obs['Sample'].values),
    "Percent_mito": np.array(adata.obs['percent_mito'].values),
    "Embedding": dfToNamedMatrix(tsneDF),
    "Embeddings_X": dfToNamedMatrix(Embeddings_X),
    "Embeddings_Y": dfToNamedMatrix(Embeddings_Y),
    "RegulonsAUC": dfToNamedMatrix(auc_mtx),
    "Clusterings": dfToNamedMatrix(clusterings),
    "ClusterID": np.array(adata.obs['louvain'].values)
}

row_attrs = {
    "Gene": lf.ra.Gene,
    "Regulons": regulons,
}

attrs = {
    "title": "sampleTitle",
    "MetaData": json.dumps(metaJson),
    "Genome": 'hg38',
    "SCopeTreeL1": "",
    "SCopeTreeL2": "",
    "SCopeTreeL3": ""
}

# compress the metadata field:
attrs['MetaData'] = base64.b64encode(zlib.compress(json.dumps(metaJson).encode('ascii'))).decode('ascii')

In [None]:
lp.create(
    filename = "f_final_loom" ,
    layers=lf[:,:],
    row_attrs=row_attrs, 
    col_attrs=col_attrs, 
    file_attrs=attrs
)
lf.close() # close original pyscenic loom file