### Put the data sets into a common format

In [136]:
import os
import pandas as pd
import random
import scipy
import scanpy as sc

In [134]:
# Identifier of the dataset; reused in the folder and output file names
ds_id = 'Miller2023'

# Root folder of this dataset
base = f'D:/GitHub/Data/NSCLC/{ds_id}/'

# Folder that holds the raw data
ds_path = f'{base}Data/'

# Where the AnnData object will be written
save_as = f'{base}{ds_id}_anndata.h5ad'

# Collect the paths to all raw data files.
# ds_path contains multiple entries; list them once and sort them into the
# three groups by their name suffix.
entries = os.listdir(ds_path)
# Entries ending in '.mtx' / 'features.tsv' are extended with the name of the
# file expected inside them; barcode entries are used as they are.
mtxs = [ds_path + entry + '/matrix.mtx' for entry in entries if entry.endswith('.mtx')]
features = [ds_path + entry + '/features.tsv' for entry in entries if entry.endswith('features.tsv')]
barcodes = [ds_path + entry for entry in entries if entry.endswith('barcodes.tsv.gz')]

# Due to memory constraints, only a small random subset of samples is used.
# The result should remain helpful to demonstrate the workflow.
n_samples_to_use = 5

# Sample names are determined from the naming convention: the parent folder
# name of each matrix path, minus its last '_'-separated token.
samples = ['_'.join(path.split('/')[-2].split('_')[:-1]) for path in mtxs]

# random.sample raises ValueError when asked for more items than are
# available, so never request more samples than were found.
rand_ind = random.sample(range(len(mtxs)), min(n_samples_to_use, len(mtxs)))
samples_to_use = [samples[i] for i in rand_ind]
def select_values_containing_samples_to_use(lst, substring):
    """Return the entries of `lst` that contain any of the given substrings.

    Parameters
    ----------
    lst : list of str
        Candidate values (e.g. file paths).
    substring : list of str
        Substrings to look for (e.g. sample names).

    Returns
    -------
    list of str
        Matches grouped by substring, in the order of `substring`; within a
        group the order of `lst` is kept. A value that contains several of
        the substrings appears once per matching substring.

    Notes
    -----
    Matching is plain containment (`in`), so a substring that is a prefix of
    another name (e.g. 'S1' and 'S10') also matches the longer name.
    """
    selected_values = []
    for needle in substring:
        selected_values.extend(value for value in lst if needle in value)
    return selected_values
# Keep only the paths that belong to the chosen samples; all three lists are
# filtered the same way, so they end up ordered by samples_to_use.
mtxs, features, barcodes = (
    select_values_containing_samples_to_use(paths, samples_to_use)
    for paths in (mtxs, features, barcodes)
)

In [128]:
# Define functions to format each piece of data
def load_mtxs(files):
    """Read Matrix Market files and join them side by side.

    Parameters
    ----------
    files : list of str
        Paths to Matrix Market (.mtx) files, in the order in which their
        columns should appear in the result.

    Returns
    -------
    scipy.sparse matrix in COO format
        Horizontal concatenation of all loaded matrices.
    """
    # Import the submodules explicitly: a bare `import scipy` is not
    # guaranteed to expose `scipy.io` / `scipy.sparse` unless something else
    # has already imported them.
    import scipy.io
    import scipy.sparse

    matrices = [scipy.io.mmread(file).tocsr() for file in files]
    return scipy.sparse.hstack(matrices, format='coo')
def load_features(files):
    """Load the feature (gene) table shared by all samples.

    The first file supplies the table. Every further file is only compared
    against it; 'Need to reformat' is printed for each file that differs.

    Parameters
    ----------
    files : list of str
        Paths to tab-separated, header-less feature files. Only the first
        two columns are used.

    Returns
    -------
    pandas.DataFrame
        Columns 'ENSG_id' and 'gene_name', indexed by the values of the
        second column. The index is not de-duplicated here.
    """
    def read_table(path):
        # Read just the first two columns instead of dropping column 2
        # afterwards: identical for three-column files, and two-column
        # files no longer fail with a KeyError.
        return pd.read_csv(path, sep='\t', header=None, usecols=[0, 1])

    ftrs = read_table(files[0])
    for file in files[1:]:
        if not read_table(file).equals(ftrs):
            print('Need to reformat')
    ftrs.index = ftrs.iloc[:, 1].values
    ftrs.columns = ['ENSG_id', 'gene_name']
    return ftrs
def load_barcodes(files):
    """Build one barcode table from several barcode files.

    Parameters
    ----------
    files : list of str
        Paths ('/'-separated) to tab-separated, header-less files with a
        single column of barcodes.

    Returns
    -------
    pandas.DataFrame
        Columns 'barcode' and 'sample', indexed by barcode. The sample label
        is taken from the file name: its '_'-separated tokens without the
        first and the last one.
    """
    frames = []
    for path in files:
        frame = pd.read_csv(path, sep='\t', header=None)
        frame.columns = ['barcode']
        # based on file naming convention
        file_name = path.split('/')[-1]
        frame['sample'] = '_'.join(file_name.split('_')[1:-1])
        frames.append(frame)
    combined = pd.concat(frames)
    # Index the combined table by the barcode column
    combined.index = combined.iloc[:, 0].values
    return combined

In [129]:
# Run formatting functions
# Count matrices of the selected samples, joined side by side
mtx_processed = load_mtxs(mtxs)
# Feature table taken from the first features file
features_processed = load_features(features)
# Barcodes of all selected samples, each labelled with its sample
barcodes_processed = load_barcodes(barcodes)

In [131]:
# Add metadata
# The barcode table serves as the per-observation metadata (passed as `obs`
# when the AnnData object is created). This is an alias, not a copy.
meta = barcodes_processed

In [132]:
# Create AnnData object
# mtx_processed is transposed before being used as X so that its rows line up
# with `meta` (obs) and its columns with `features_processed` (var).
adata = sc.AnnData(X=mtx_processed.T.tocsr(), obs=meta, var=features_processed)
# Make repeated variable / observation names unique
adata.var_names_make_unique()
adata.obs_names_make_unique()
# Store every obs column as a categorical
adata.obs = adata.obs.astype('category')

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [135]:
# Save AnnData object to the path held in save_as
adata.write(save_as)
# Show the object summary as the cell output
adata

AnnData object with n_obs × n_vars = 28909 × 33538
    obs: 'barcode', 'sample'
    var: 'ENSG_id', 'gene_name'