In [1]:
import os
import matplotlib.pyplot as plt

import scvelo as scv
import scanpy as sc
import cellrank as cr
import numpy as np
import pandas as pd
import anndata as ad
import scipy as sci

# Global scanpy / matplotlib configuration for this notebook.
sc.settings.verbosity=0 # scanpy log level: errors (0), warnings (1), info (2), hints (3)
sc.settings.n_jobs=4  # number of parallel jobs scanpy may use
# Small, white-background figures without axis frames.
sc.settings.set_figure_params(dpi=50, facecolor="white", frameon=False, figsize=(4,4))
# Font type 42 (TrueType) in PDF output, so text stays editable in vector-graphics tools.
plt.rcParams['pdf.fonttype'] = 42
# Render matplotlib figures inline in the notebook.
%matplotlib inline



# Load velocity count files (STARsolo Velocyto output)

In [2]:
def starsolo_velocity_anndata(input_dir):
    """Build an AnnData object from a directory of velocity count matrices.

    ``input_dir`` must contain ``barcodes.tsv``, ``features.tsv`` and the three
    Matrix Market files ``spliced.mtx``, ``ambiguous.mtx`` and ``unspliced.mtx``.
    Each file may be given either uncompressed or gzip-compressed (same name
    with a ``.gz`` suffix); the uncompressed file is preferred when both exist.

    Parameters
    ----------
    input_dir : str
        Directory holding the five input files.

    Returns
    -------
    anndata.AnnData
        ``X`` is the spliced matrix; ``layers`` holds ``'spliced'``,
        ``'ambiguous'`` and ``'unspliced'``. Variable names are made unique.

    Raises
    ------
    FileNotFoundError
        If one of the required files is present in neither form. The message
        names the missing file.
    """
    # Local import: `import scipy` alone does not guarantee that the `io` and
    # `sparse` submodules are loaded.
    from scipy import io, sparse

    def _resolve(filename):
        # Return the path of `filename` inside input_dir, plain or gzip-compressed.
        plain_path = os.path.join(input_dir, filename)
        for candidate in (plain_path, plain_path + ".gz"):
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            "Neither {0} nor {0}.gz found in {1}".format(filename, input_dir)
        )

    # Cell identifiers; the index name is removed to comply with the anndata format.
    obs = pd.read_csv(_resolve("barcodes.tsv"), header=None, index_col=0)
    obs.index.name = None

    # Gene annotation, indexed by the column at position 1.
    var = pd.read_csv(
        _resolve("features.tsv"),
        sep="\t",
        names=("gene_ids", "feature_types"),
        index_col=1,
    )
    var.index.name = None

    # Each matrix is transposed before wrapping so that rows line up with `obs`
    # and columns with `var`, as AnnData requires.
    layers = {
        layer_name: sparse.csr_matrix(io.mmread(_resolve(layer_name + ".mtx")).T)
        for layer_name in ("spliced", "ambiguous", "unspliced")
    }

    adata = ad.AnnData(X=layers["spliced"], obs=obs, var=var, layers=layers)
    adata.var_names_make_unique()
    return adata


In [1]:
# Location of the re-processed public 10x data and the GEO series to load.
REPROCESSED_ROOT = '/nfs/team298/ls34/reprocess_public_10x/'
GEO_SERIES = 'GSE273559'
series_dir = os.path.join(REPROCESSED_ROOT, GEO_SERIES)

# One AnnData per sample folder, keyed by folder name.
adata_dict = {}
for FILE in sorted(os.listdir(series_dir)):
    # Only entries named like GEO sample accessions are loaded
    # (presumably one folder per sample — verify against the directory layout).
    if not FILE.startswith("GSM"):
        continue
    print(FILE)
    NEW_PATH = os.path.join(series_dir, FILE, 'output', 'Velocyto', 'filtered')
    try:
        adata_i = starsolo_velocity_anndata(NEW_PATH)
    except Exception as err:
        # Best effort: report the sample and the reason, then move on.
        if "log" not in FILE:
            print("fail with", FILE, "-", repr(err))
        continue
    adata_i.obs["DonorID"] = FILE
    adata_dict[FILE] = adata_i
    print(adata_i.shape)

# Names of the samples that were loaded successfully.
list(adata_dict)

In [4]:
# Concatenate all per-sample objects into one; join='outer' keeps the union of variables.
# (Not used: label='sample_id', keys=list(adata_dict.keys()).)
ldata = ad.concat(list(adata_dict.values()), join='outer')


  utils.warn_names_duplicates("obs")


In [5]:
# Store the observation names in a regular column so they can be matched on later.
ldata.obs["barcode"] = ldata.obs_names

In [7]:
# Persist the concatenated velocity object to disk as .h5ad.
ldata.write('/nfs/team298/ls34/disease_atlas/final_fb_adatas/ldata_all.h5ad')

# Merge velocity layers into the annotated AnnData

In [None]:
# Boolean mask over the cells of `adata`: True where the barcode also occurs in `ldata`.
shared_barcodes = adata.obs["barcode"].isin(ldata.obs["barcode"])

# Keep only the `ldata` cells whose barcode occurs in `adata`, as an independent copy.
_ldata_in_adata = ldata.obs["barcode"].isin(adata.obs["barcode"])
ldata_subset = ldata[_ldata_in_adata].copy()
ldata_subset



In [None]:
# Apply scvelo's observation-name cleaning to both objects, velocity subset first.
for _obj in (ldata_subset, adata):
    scv.utils.clean_obs_names(_obj)

In [None]:
# Combine the annotated object with the velocity subset via scvelo's merge helper;
# the result replaces `adata`.
adata = scv.utils.merge(adata, ldata_subset)
# Display the merged object.
adata

In [None]:
# Write the merged object next to the input file, using PATH with a ".velo" suffix.
# NOTE(review): PATH must already be defined when this cell runs; in this notebook
# it is assigned in a later cell — confirm the intended execution order.
adata_path_w_ldata =  PATH + ".velo"
adata.write(adata_path_w_ldata)
# Show the output path.
adata_path_w_ldata

In [None]:
# FULL GENE VERSION
# TODO: set PATH to the full-gene .h5ad file to analyse.
PATH = ""
if not PATH:
    # Fail early with a clear message instead of an obscure read error.
    raise ValueError("PATH is empty: set it to the full-gene .h5ad file before running this cell")

adata = sc.read_h5ad(PATH)
print(adata.shape)
adata

In [None]:
# Filter genes with scanpy using a minimum-count threshold of 50; `adata` is modified in place.
sc.pp.filter_genes(adata, min_counts=50)
# Shape after filtering.
adata.shape

In [None]:
# Plot scvelo's count proportions, grouped by the 'test12' annotation.
scv.pl.proportions(adata, groupby='test12')


In [None]:
# Restrict to lesional cells whose 'test12' label starts with one of the selected prefixes.
# NOTE(review): prefix matching means "F1" would also match a label such as "F10" —
# confirm this is intended.
SELECTED_PREFIXES = ("F1", "F2", "F3", "F6")

# Cast to str so missing labels become "nan" and are simply excluded.
_labels = adata.obs["test12"].astype(str)
_in_selected = _labels.map(lambda label: label.startswith(SELECTED_PREFIXES)).astype(bool)
_is_lesional = adata.obs["Site_status_binary"] == "Lesional"

# .copy() turns the subset into a standalone object rather than a view, so the
# downstream steps that write results into `adata` operate on real data.
adata = adata[(_in_selected & _is_lesional).to_numpy()].copy()

In [None]:
# Neighbour graph with 30 neighbours, built on the 'X_scvi' representation
# (presumably an scVI latent embedding stored in adata.obsm — confirm).
scv.pp.neighbors(adata, n_neighbors=30, use_rep = 'X_scvi')  # alternatives tried: n_pcs=10, random_state=0


In [None]:
# Compute scvelo moments with the same neighbourhood size as the graph above; no PCA size is passed.
scv.pp.moments(adata, n_pcs=None, n_neighbors=30)    # neighbor_key="n" was tried and left out

In [None]:
# Save the object, including the computed moments, next to the input using a ".moments" suffix.
adata.write(PATH +".moments")
# Show the output path.
PATH +".moments"