In [1]:
import squidpy as sq
import scanpy as sc
import scanpy as sc
import anndata as ad
import os
import sys
import pandas as pd
import numpy as np
import tifffile as tf



In [2]:
# Directory listings of the per-sample inputs.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the sample order (and hence the row order after concatenation) reproducible.
# Only '.h5ad' entries are kept for the AnnData list, so stray files such as
# '.DS_Store' are never handed to preprocess_adata.
adats_list = sorted(
    name
    for name in os.listdir('../data/non_denoised/anndata/')
    if name.endswith('.h5ad')
)
img_list = sorted(os.listdir('../data/non_denoised/img/'))
masks_list = sorted(os.listdir('../data/non_denoised/masks/'))

In [3]:
# Accumulators for the per-sample AnnData objects and image containers.
adatas, imgs = [], []

def preprocess_adata(sample):
    """Load one sample's AnnData and attach spatial coordinates, image and mask.

    Parameters
    ----------
    sample : str
        File name (or bare stem) of the sample. Everything after the first
        '.' is dropped; the remaining stem is used as ``library_id`` and to
        build the ``.h5ad`` / ``.tiff`` paths under ``../data/non_denoised/``.

    Returns
    -------
    tuple
        ``(adata, library_id)``. ``adata`` has renamed ``obs_names``
        (``'s<token>_<number>'``), ``obs['Cell_ID']``, ``obs['library_id']``,
        ``obsm['spatial']`` and a freshly built ``uns`` (any previous ``uns``
        content is replaced).
    """
    sample = sample.split('.')[0]
    filepath = '../data/non_denoised/anndata/' + sample + '.h5ad'
    imgpath = '../data/non_denoised/img/' + sample + '.tiff'
    maskpath = '../data/non_denoised/masks/' + sample + '.tiff'
    library_id = sample
    # NOTE(review): the prefix uses only the second '_'-separated token, so two
    # files sharing that token would produce colliding obs_names once the
    # samples are concatenated — confirm there is one file per token.
    sample_id = 's' + sample.split('_')[1]

    adata = sc.read(filepath)

    # Original obs names are split on whitespace and the second token is reused
    # both as the integer Cell_ID and as the suffix of the new obs name.
    # Cell_ID must be derived before obs_names is overwritten.
    new_obs_names = [sample_id + '_' + name.split()[1] for name in adata.obs_names]
    adata.obs['Cell_ID'] = adata.obs_names.str.split().str[1].astype('int')
    adata.obs_names = new_obs_names
    adata.obs['library_id'] = library_id

    # NOTE(review): column 0 of obsm['spatial'] is 'centroid-0'. scanpy/squidpy
    # plotting treats column 0 as x; if 'centroid-0' is the row (y) coordinate,
    # overlays on the image will be transposed — confirm the intended order.
    adata.obsm['spatial'] = adata.obs[['centroid-0', 'centroid-1']].to_numpy()

    # Move the first axis of the image stack to the end (presumably
    # channels-first -> channels-last — TODO confirm the TIFF layout).
    img = np.transpose(tf.imread(imgpath), (1, 2, 0))
    mask = tf.imread(maskpath)

    scalefactors = {
        'tissue_hires_scalef': 1,
        'spot_diameter_fullres': 1,
    }
    adata.uns = {
        'spatial': {
            library_id: {
                'metadata': {
                    'source_image_path': imgpath,
                    'source_mask_path': maskpath,
                },
                'images': {
                    'hires': img,
                    'segmentation': mask,
                },
                # scanpy/squidpy look up the plural key 'scalefactors'; the
                # original singular key is kept so existing readers still work.
                'scalefactors': dict(scalefactors),
                'scalefactor': dict(scalefactors),
            }
        }
    }
    return adata, library_id


def preprocess_images(adata):
    """Build a squidpy ImageContainer from the paths recorded in ``adata.uns``.

    Parameters
    ----------
    adata : AnnData
        Object whose ``uns['spatial']`` has at least one library entry with
        ``metadata['source_image_path']`` and ``metadata['source_mask_path']``.
        Only the first library key is used.

    Returns
    -------
    squidpy.im.ImageContainer
        Container with the image in layer ``'img'`` and the mask in layer
        ``'mask'``, both added with ``lazy=False``.
    """
    library_id = list(adata.uns['spatial'].keys())[0]
    metadata = adata.uns['spatial'][library_id]['metadata']
    img = sq.im.ImageContainer(metadata['source_image_path'],
                               layer='img',
                               lazy=False)
    img.add_img(metadata['source_mask_path'],
                layer='mask',
                lazy=False)
    return img
    
    
    
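# Earlier draft of the loading loop, kept for reference only; the working
# version is the loop over adats_list in the following cell.
# for sample in adats_list:
#     adatas.append(preprocess_adata(sample))
#     adatas[sample_id] = adata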

In [4]:
# Start from empty accumulators so re-running this cell does not duplicate entries.
adatas, imgs, library_ids = [], [], []

# Load every sample's AnnData and build the matching image container from it.
for sample in adats_list:
    adata, library_id = preprocess_adata(sample)
    adatas.append(adata)
    library_ids.append(library_id)
    imgs.append(preprocess_images(adata))
    



In [5]:
# Stack all samples along the observation axis. uns_merge="only" keeps uns
# entries that occur in exactly one input, i.e. each sample's own
# uns['spatial'][library_id] block.
adata = ad.concat(adatas, uns_merge="only")


In [6]:
# Keep only observations whose obs['area'] is greater than 5.
# Boolean indexing returns a view of the AnnData; .copy() materialises it so the
# assignments to X / layers / obs / var in the following cells modify a real
# object instead of triggering an implicit copy (and the accompanying warning).
adata = adata[adata.obs['area'] > 5].copy()

In [7]:
def read_reindex(file):
    """Read a CSV table and align its rows to the global ``adata``.

    Parameters
    ----------
    file : str
        Base name (without ``.csv``) of a table in
        ``../data/non_denoised/csv_tables/``.

    Returns
    -------
    pandas.DataFrame
        The table, indexed by its first column and reindexed to
        ``adata.obs_names``; names missing from the CSV become all-NaN rows.
    """
    csv_path = '../data/non_denoised/csv_tables/' + file + '.csv'
    table = pd.read_csv(csv_path, index_col=[0])
    return table.reindex(adata.obs_names)

In [8]:
# Tables that are aligned to adata.obs_names via read_reindex.
exprs, counts, logcounts = (
    read_reindex(name) for name in ('exprs', 'counts', 'logcounts')
)
observations_metadata = read_reindex('observations_metadata')
fastMNN, UMAP_mnnCorrected, PCA = (
    read_reindex(name) for name in ('fastMNN', 'UMAP_mnnCorrected', 'PCA')
)

# The feature table is read as-is (it is not reindexed to the observations).
features_metadata = pd.read_csv(
    '../data/non_denoised/csv_tables/features_metadata.csv',
    index_col=[0],
)

In [9]:
# Fill the main matrix and two layers from the CSV tables. Each table was
# reindexed to adata.obs_names by read_reindex, so rows line up with obs.
# NOTE(review): column order is assumed to match adata.var — confirm.
adata.X = counts.to_numpy()
adata.layers['exprs'] = exprs.to_numpy()
adata.layers['logcounts'] = logcounts.to_numpy()

  adata.layers['exprs'] = exprs.to_numpy()


In [10]:
# Inspect the per-observation table before the external metadata is merged in.
adata.obs

Unnamed: 0,area,centroid-0,centroid-1,axis_major_length,axis_minor_length,eccentricity,Cell_ID,library_id
s50286_1,70,2.928571,524.428571,12.020755,7.820918,0.759405,1,ROI001_50286_ROI_1
s50286_2,82,3.817073,535.573171,11.156443,9.580750,0.512372,2,ROI001_50286_ROI_1
s50286_3,59,5.254237,543.711864,12.363587,6.369913,0.857061,3,ROI001_50286_ROI_1
s50286_4,26,2.423077,613.807692,6.332965,5.390113,0.524971,4,ROI001_50286_ROI_1
s50286_5,113,6.955752,733.938053,14.364921,10.194509,0.704523,5,ROI001_50286_ROI_1
...,...,...,...,...,...,...,...,...
s55199_3766,77,1937.714286,962.857143,12.507086,8.004374,0.768386,3766,ROI001_55199_ROI_1
s55199_3767,42,1936.357143,1342.571429,7.717018,7.599590,0.173787,3767,ROI001_55199_ROI_1
s55199_3768,34,1936.500000,706.294118,6.964070,6.154864,0.467860,3768,ROI001_55199_ROI_1
s55199_3769,111,1939.423423,926.288288,12.365452,11.708776,0.321545,3769,ROI001_55199_ROI_1


In [11]:
# List the columns currently in adata.obs (the merge in the next cell skips
# metadata columns with these names).
adata.obs.columns.tolist()

['area',
 'centroid-0',
 'centroid-1',
 'axis_major_length',
 'axis_minor_length',
 'eccentricity',
 'Cell_ID',
 'library_id']

In [12]:
# Append to adata.obs only the metadata columns it does not already contain;
# the two frames are concatenated column-wise on their index.
already_in_obs = observations_metadata.columns.isin(adata.obs.columns.tolist())
observations_metadata = pd.concat(
    [adata.obs, observations_metadata.loc[:, ~already_in_obs]],
    axis=1,
    ignore_index=False,
)
adata.obs = observations_metadata

In [13]:
# Replace the variable names and the variable annotation table with the CSV
# feature table.
# NOTE(review): assumes features_metadata rows are in the same order as the
# columns of adata.X — confirm.
adata.var_names = features_metadata.index
adata.var = features_metadata

In [14]:
# Attach the PCA, fastMNN and UMAP_mnnCorrected tables (already reindexed to
# adata.obs_names) as obsm entries.
# NOTE(review): these are custom keys; scanpy functions that default to keys
# such as 'X_pca' / 'X_umap' would need them passed explicitly — confirm
# downstream usage.
adata.obsm['PCA'] = PCA.to_numpy()
adata.obsm['fastMNN'] = fastMNN.to_numpy()
adata.obsm['UMAP_mnnCorrected'] = UMAP_mnnCorrected.to_numpy()


In [15]:
# Save the assembled AnnData to disk.
# NOTE(review): uns['spatial'] still carries the image and mask arrays set in
# preprocess_adata, so they are presumably written into this file too — confirm
# that is intended.
adata.write('../data/non_denoised/spe_minor_celltypes.h5ad')

In [16]:
# Inspect the merged per-observation table.
# NOTE(review): the rendered output includes columns such as 'MRN' and
# 'DFCI_id', which look like patient identifiers — confirm they may be shared
# before committing this notebook with its outputs.
adata.obs

Unnamed: 0,area,centroid-0,centroid-1,axis_major_length,axis_minor_length,eccentricity,Cell_ID,library_id,sample_id,ObjectNumber,...,tissue_id,slide_id,ROI,width_px,height_px,major_celltype,major_cell_type,minor_cell_type,DFCI_id,MRN
s50286_1,70,2.928571,524.428571,12.020755,7.820918,0.759405,1,ROI001_50286_ROI_1,ROI001_50286_ROI_1,1,...,BS-21-N50286,29,ROI001,2000,2000,Classical Monocytes,Myeloid,Classical Monocytes,19,903927
s50286_2,82,3.817073,535.573171,11.156443,9.580750,0.512372,2,ROI001_50286_ROI_1,ROI001_50286_ROI_1,2,...,BS-21-N50286,29,ROI001,2000,2000,Endothelial,Endothelial,Endothelial,19,903927
s50286_3,59,5.254237,543.711864,12.363587,6.369913,0.857061,3,ROI001_50286_ROI_1,ROI001_50286_ROI_1,3,...,BS-21-N50286,29,ROI001,2000,2000,Macrophages-hla,Myeloid,Macrophages-hla,19,903927
s50286_4,26,2.423077,613.807692,6.332965,5.390113,0.524971,4,ROI001_50286_ROI_1,ROI001_50286_ROI_1,4,...,BS-21-N50286,29,ROI001,2000,2000,Classical Monocytes,Myeloid,Classical Monocytes,19,903927
s50286_5,113,6.955752,733.938053,14.364921,10.194509,0.704523,5,ROI001_50286_ROI_1,ROI001_50286_ROI_1,5,...,BS-21-N50286,29,ROI001,2000,2000,Macrophages,Myeloid,Macrophages,19,903927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s55199_3766,77,1937.714286,962.857143,12.507086,8.004374,0.768386,3766,ROI001_55199_ROI_1,ROI001_55199_ROI_1,3766,...,BS-22-M55199,21,ROI001,1673,1948,Megakaryocytes,Megakaryocytes,Megakaryocytes,21,542444
s55199_3767,42,1936.357143,1342.571429,7.717018,7.599590,0.173787,3767,ROI001_55199_ROI_1,ROI001_55199_ROI_1,3767,...,BS-22-M55199,21,ROI001,1673,1948,CD8 Temra,T,CD8 Temra,21,542444
s55199_3768,34,1936.500000,706.294118,6.964070,6.154864,0.467860,3768,ROI001_55199_ROI_1,ROI001_55199_ROI_1,3768,...,BS-22-M55199,21,ROI001,1673,1948,CD56 dim NK,NK,CD56 dim NK,21,542444
s55199_3769,111,1939.423423,926.288288,12.365452,11.708776,0.321545,3769,ROI001_55199_ROI_1,ROI001_55199_ROI_1,3769,...,BS-22-M55199,21,ROI001,1673,1948,CD56 dim NK,NK,CD56 dim NK,21,542444


In [17]:
import pickle

# Serialise the list of per-sample image containers; opening with 'wb'
# creates the file (or overwrites an existing one).
with open('../data/non_denoised/images.pkl', 'wb') as pkl_file:
    pickle.dump(imgs, pkl_file)