In [None]:
import anndata
import scanpy as sc
import pandas as pd
import numpy as np

In [None]:
# Build one AnnData object per sample and save each to its own .h5ad file.
#
# Every sample is expected to ship three files sharing a common prefix:
#   <prefix>_features.tsv, <prefix>_matrix.mtx, <prefix>_barcodes.tsv
# List the prefixes to process here.
files = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

for fileID in files:
    print(f'working on {fileID}...')

    # --- var: one row per feature -------------------------------------------------
    # The first column of the features file becomes the index; the remaining two
    # columns are given names below. It is worth opening the features file once
    # to decide on names that suit your data (set_axis raises if the file does
    # not have exactly two remaining columns).
    var = (
        pd.read_csv(f'{fileID}_features.tsv', sep='\t', header=None, index_col=0)
        .rename_axis('GeneID')
        .set_axis(['Gene_Name', 'Info'], axis=1)
    )

    # --- X: the count matrix -------------------------------------------------------
    # The matrix is transposed after reading so that it lines up with the barcodes
    # (obs) and features (var) assigned below. The explicit copy() is required:
    # without it, downstream steps do not work correctly.
    adata = sc.read_mtx(f'{fileID}_matrix.mtx', dtype='float64').T.copy()

    # --- obs: one row per barcode --------------------------------------------------
    obs = pd.read_csv(f'{fileID}_barcodes.tsv', sep='\t', header=None, index_col=0)
    # Name the index, then append the sample letter/lane to every barcode so the
    # barcodes match the ones used in the metadata file.
    obs.index = obs.index.rename('barcode') + fileID

    # Attach the annotations and make sure both axes carry unique names.
    adata.obs = obs
    adata.var = var
    adata.var_names_make_unique()
    adata.obs_names_make_unique()

    # Persist this sample on its own.
    adata.write_h5ad(f'sample_{fileID}.h5ad', compression='gzip')

In [None]:
# Load the per-sample AnnData files that are going to be joined together.
# The files are read in alphabetical sample order and bound to
# adata_A ... adata_H, which the concatenation step refers to by name.
adata_A, adata_B, adata_C, adata_D, adata_E, adata_F, adata_G, adata_H = (
    sc.read(f'sample_{sample_id}.h5ad') for sample_id in 'ABCDEFGH'
)

In [None]:
# Concatenate every sample into a single AnnData object.
#
# Keep track of the order used here: a 'batch' column is added to obs whose value
# is the position of each sample in the join (A is 0, B is 1, ...), and that
# order is what the metadata is matched against later on.
# join='outer' keeps everything from all samples; index_unique=None leaves the
# barcodes exactly as they are (they already carry the sample letter).
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in
# favour of anndata.concat -- confirm against the installed version before upgrading.
other_samples = [adata_B, adata_C, adata_D, adata_E, adata_F, adata_G, adata_H]
adata_all = adata_A.concatenate(
    *other_samples,
    join='outer',
    index_unique=None,
)


In [None]:
# Load the metadata table, using its first column as the index and naming that
# index 'barcode'.
# Inspect the displayed table to confirm which lane each sample belongs to. If the
# data has patient IDs rather than lanes, confirm instead which patient ID in the
# metadata corresponds to each sample file.
df_meta = (
    pd.read_csv('metadata_pre_CD8_withclusters.csv', index_col=0)
    .rename_axis('barcode')
)
df_meta

In [None]:
# Translate the batch number of the joined AnnData object into the lane (or patient
# ID) used by the metadata table, so the merge in the next step lines up correctly.
# The position of each label must match the order used when the samples were joined.
lane_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
batch_to_lane = dict(enumerate(lane_labels))

# Add a 'lane' column to obs that should match the metadata's 'lane' column exactly.
batch_numbers = adata_all.obs['batch'].astype(int)
adata_all.obs['lane'] = batch_numbers.map(batch_to_lane)

In [None]:
# Keep only the metadata rows that have a lane assigned (rows whose 'lane' is NaN
# are discarded).
df_meta_drop_nan = df_meta.dropna(subset=['lane'])

In [None]:
# Attach the metadata columns to the per-cell annotations (obs) of the big AnnData object.
#
# The join is keyed on ['barcode', 'lane'], the two columns that should match exactly
# between the two tables. 'barcode' is the index on both sides, so reset_index() is
# used to turn it into a regular column to merge on. rename_axis('barcode') makes sure
# that column is really called 'barcode' on the obs side even if the index name was not
# carried through the concatenation.
#
# A LEFT join is used on purpose. The merged frame is later assigned back to
# adata_all.obs, and that assignment is positional, so the result must have exactly one
# row per cell, in the same order as the rows of the matrix. An outer join breaks both
# requirements: pandas sorts the join keys lexicographically for how='outer' (shuffling
# the rows relative to the matrix) and appends metadata rows that have no matching cell.
# With a left join every metadata *column* still ends up in the result; only metadata
# rows without a matching cell are left out, and those cannot be stored in obs anyway.
#
# validate='many_to_one' makes pandas raise a MergeError if the metadata contains the
# same (barcode, lane) pair more than once, which would otherwise duplicate cells.
test_merge_reset = (
    adata_all.obs
    .rename_axis('barcode')
    .reset_index()
    .merge(
        df_meta_drop_nan.reset_index(),
        how='left',
        on=['barcode', 'lane'],
        validate='many_to_one',
    )
)

# Fail loudly if the merged rows no longer line up one-to-one with the cells.
assert test_merge_reset['barcode'].tolist() == adata_all.obs_names.tolist(), (
    'merged metadata is not aligned with adata_all.obs_names'
)
test_merge_reset

In [None]:
# Turn the 'barcode' column of the merged table back into its index.
test_merge = test_merge_reset.set_index(keys='barcode')

In [None]:
# Replace the obs field of the big AnnData object with the merged metadata table.
#
# Assigning a DataFrame to .obs is positional: row i of the frame becomes the annotation
# of row i of the matrix, and the frame's index becomes the new obs_names. The merged
# table therefore has to list exactly the same barcodes, in exactly the same order, as
# adata_all.obs_names. That is checked here instead of being left to manual inspection.
cell_barcodes = adata_all.obs_names
if test_merge.index.equals(cell_barcodes):
    obs_with_meta = test_merge
else:
    # The merged table is ordered differently and/or has rows for barcodes that are not
    # in the matrix. Realign it to the matrix row order. reindex raises a ValueError if
    # test_merge contains repeated barcodes, which is the case that needs manual review.
    n_extra = int((~test_merge.index.isin(cell_barcodes)).sum())
    print(f'realigning merged metadata to the matrix row order; '
          f'dropping {n_extra} row(s) with no matching cell')
    obs_with_meta = test_merge.reindex(cell_barcodes).rename_axis('barcode')
adata_all.obs = obs_with_meta

In [None]:
# Save the combined, metadata-annotated AnnData object to disk as an .h5ad file.
adata_all.write_h5ad('all_samples_clusters.h5ad')