### Put the data sets into a common format

In [1]:
import os
import random

import pandas as pd
import scanpy as sc
import scipy
import scipy.io
import scipy.sparse



In [2]:
# Dataset identifier; every path below is derived from it
ds_id = 'DeZuani2024'

# Dataset root folder (base1) and its raw-data sub-folder (base)
base1 = f'D:/GitHub/Data/NSCLC/{ds_id}/'
base = f'D:/GitHub/Data/NSCLC/{ds_id}/Data/'

# Raw input files: one sub-folder of `base` per sample
samples = ['P10_B1', 'P17_B2', 'P18_B2'] # These are 3 random baseline samples (due to memory issues)
mtxs = [f'{base}{name}/matrix.mtx' for name in samples]
barcodes = [f'{base}{name}/{name}-barcodes.tsv.gz' for name in samples]
features = [f'{base}{name}/features.tsv' for name in samples]

# Output location of the combined AnnData file
save_as = base1 + ds_id + '_anndata.h5ad'

In [3]:
# Define functions to format each piece of data
def load_mtxs(files):
    """Read one Matrix Market file per sample and join them column-wise.

    Parameters
    ----------
    files : sequence of str
        Paths of the Matrix Market files, one per sample. All matrices must
        have the same number of rows, because they are placed side by side.

    Returns
    -------
    scipy sparse matrix (COO format)
        The columns of every input matrix, concatenated in the order of
        `files`.

    Raises
    ------
    ValueError
        If `files` is empty, or if a matrix has a different number of rows
        than the first one.
    """
    files = list(files)
    if not files:
        raise ValueError('load_mtxs() needs at least one matrix file')

    matrices = []
    for file in files:
        matrix = scipy.io.mmread(file).tocsr()
        # Fail early and name the offending file instead of leaving it to
        # hstack, whose dimension error does not say which input is wrong.
        if matrices and matrix.shape[0] != matrices[0].shape[0]:
            raise ValueError(
                f'{file} has {matrix.shape[0]} rows but {files[0]} has '
                f'{matrices[0].shape[0]}; the matrices cannot be stacked column-wise'
            )
        matrices.append(matrix)
    return scipy.sparse.hstack(matrices, format='coo')
def load_features(files):
    """Read the feature table and verify it is identical for every sample.

    Only the first two tab-separated columns of each file are used; any
    further columns are ignored.

    Parameters
    ----------
    files : sequence of str
        Paths of the header-less, tab-separated feature files, one per sample.

    Returns
    -------
    pandas.DataFrame
        Table read from the first file, with columns 'ENSG_id' and
        'gene_name' and indexed by the values of the second column.

    Raises
    ------
    ValueError
        If `files` is empty, or if any file's first two columns differ from
        those of the first file. The count matrices are stacked on the
        assumption of one shared feature order, so a mismatch must not pass
        silently.
    """
    files = list(files)
    if not files:
        raise ValueError('load_features() needs at least one features file')

    def _read_two_columns(path):
        # usecols keeps columns 0 and 1 regardless of how many follow them
        return pd.read_csv(path, sep='\t', header=None, usecols=[0, 1])

    ftrs = _read_two_columns(files[0])
    for file in files[1:]:
        if not _read_two_columns(file).equals(ftrs):
            raise ValueError(
                f'Need to reformat: features in {file} differ from those in {files[0]}'
            )
    # The index is taken from the second column before the columns are renamed
    ftrs.index = ftrs.iloc[:, 1].values
    ftrs.columns = ['ENSG_id', 'gene_name']
    return ftrs
def load_barcodes(files):
    """Read the barcode list of every sample into one table.

    The sample label is taken from the file name: everything before the
    first '-' of the base name (e.g. 'P10_B1-barcodes.tsv.gz' -> 'P10_B1').
    Sample names that themselves contain '-' are therefore truncated.

    Parameters
    ----------
    files : sequence of str or os.PathLike
        Paths of the header-less, single-column barcode files, one per sample.

    Returns
    -------
    pandas.DataFrame
        Columns 'barcode' and 'sample', rows in the order of `files`, indexed
        by the barcode values. The index is not made unique here.

    Raises
    ------
    ValueError
        If `files` is empty.
    """
    files = list(files)
    if not files:
        raise ValueError('load_barcodes() needs at least one barcodes file')

    frames = []
    for file in files:
        bar_df = pd.read_csv(file, sep='\t', header=None)
        bar_df.columns = ['barcode']
        # based on file naming convention; basename() also accepts path objects
        bar_df['sample'] = os.path.basename(file).split('-')[0]
        frames.append(bar_df)
    bar_df = pd.concat(frames)
    bar_df.index = bar_df.iloc[:, 0].values
    return bar_df

In [4]:
# Run formatting functions
# Count matrices of all samples, joined column-wise into one COO matrix
mtx_processed = load_mtxs(mtxs)
# Feature table with columns 'ENSG_id' and 'gene_name'
features_processed = load_features(features)
# One row per barcode with columns 'barcode' and 'sample'
barcodes_processed = load_barcodes(barcodes)

In [5]:
# Add metadata
# No extra metadata is joined here: `meta` is a second name for the barcode
# table itself (the same DataFrame object, not a copy).
meta = barcodes_processed

In [6]:
# Assemble the AnnData object: the stacked matrix is transposed and converted
# to CSR so that its rows pair with `meta` (obs) and its columns with the
# feature table (var)
adata = sc.AnnData(
    X=mtx_processed.T.tocsr(),
    obs=meta,
    var=features_processed,
)
# Make obs and var names unique (the two calls are independent of each other)
adata.obs_names_make_unique()
adata.var_names_make_unique()
# Store every obs column with the pandas 'category' dtype
adata.obs = adata.obs.astype('category')

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [7]:
# Save AnnData object
# Writes the object to the path held in `save_as`.
# NOTE(review): the recorded output lists 20,384,640 observations for three
# samples -- presumably the barcode lists are unfiltered; confirm whether
# barcodes should be filtered before saving.
adata.write(save_as)
# Bare last expression so the notebook displays the AnnData summary
adata

AnnData object with n_obs × n_vars = 20384640 × 33538
    obs: 'barcode', 'sample'
    var: 'ENSG_id', 'gene_name'