In [4]:
import pandas as pd
import random
import numpy as np
import scanpy as sc
# import scipy
from tqdm import tqdm

data_path = '/data2/hratch/immune_CCI/covid/covid_atlas/'
load_h5 = True

In [None]:
pbmc_covid = sc.read_mtx(data_path + 'raw/GSE158055_covid19_counts.mtx.gz') # raw counts
if load_h5: 
    pch5 = sc.read_h5ad(data_path + 'raw/COVID19_ALL.h5ad') # load dataset
# pbmc_covid = sc.read_10x_mtx(data_path + 'raw/counts/')

print('Finished loading covid datasets')

In [39]:
# exclude samples with fewer than 2000 cells
md_cell = pd.read_csv(data_path + 'raw/GSE158055_cell_annotation.csv.gz')
n_samples = md_cell.sampleID.value_counts()
samples_to_keep = n_samples[n_samples > 2000].index.tolist()




In [40]:
md = pd.read_excel(data_path + 'raw/GSE158055_sample_metadata.xlsx', sheet_name = 0, skiprows=20)
md = md.iloc[range(304 - 20), range(25)]
md = md[md['Sample name'].isin(samples_to_keep)]


contexts = md['characteristics: CoVID-19 severity'].unique()
n_contexts = contexts.shape[0]

context_counts = md['characteristics: CoVID-19 severity'].value_counts() 
min_context_type = context_counts[context_counts == context_counts.min()].index.tolist()[0]
min_context_count = len(md[md['characteristics: CoVID-19 severity'] == min_context_type]['Patients'].unique())

max_samples = min_context_count*n_contexts

context_map = {context: md[md['characteristics: CoVID-19 severity'] == \
                           context][['Sample name', 'Patients']].reset_index(drop = True) for context in contexts}

In [41]:
# randomly select samples subsetted from the entire dataset
# make sure to choose an even number of each context
# make sure not to repeat patients within a context

n_iter = 1 # number of times to run subsetting
seed = 0

Samples = pd.DataFrame(columns = ['iteration', 'n_samples', 'sample_names'])
idx = 0

sample_iters = [3, 6, 12, 24, 36, 48, 60, 75]#list(range(n_contexts, max_samples + 1, n_contexts))

for iteration in range(n_iter):
    for n_samples in sample_iters:
        cmap_temp = context_map.copy()
        n_sample_per_context = int(n_samples/n_contexts)
        samples = list()
        for context in contexts:
            df = context_map[context].sample(frac=1, random_state = seed).drop_duplicates(subset = 'Patients') # shuffle rows to randomly drop duplicates
            samples += df.sample(n = n_sample_per_context, random_state = seed)['Sample name'].tolist()
            seed += 1
        Samples.loc[idx,:] = [iteration, n_samples, samples]
        idx += 1

Samples['sample_names'] = Samples.sample_names.apply(lambda x: '; '.join(x))            
Samples.to_csv(data_path + 'interim/timing_inputs/samples_for_timing.csv')            

  arr_value = np.array(value)


In [42]:
#cell_ids = pd.read_csv(data_path + 'raw/GSE158055_covid19_barcodes.tsv.gz', header = None)
gene_ids = pd.read_csv(data_path + 'raw/GSE158055_covid19_features.tsv.gz', header = None)
md_cell.set_index('cellName', drop = True, inplace = True)

pbmc_covid = pbmc_covid.transpose() 

In [None]:
if load_h5:
    pbmc_covid.obs = pch5.obs
    pbmc_covid.var = pch5.var
else:
    pbmc_covid.obs = md_cell
    pbmc_covid.var = gene_ids.set_index(0, drop = True)

In [46]:
# split by sample id 

def flatten_list(t):
    return [item for sublist in t for item in sublist]

def create_raw_counts(sample_id):
    df = pbmc_covid[pbmc_covid.obs.sampleID == sample_id]
    sc.pp.filter_cells(df, min_genes=50) 
#     sc.pp.filter_genes(df, min_cells = 3) # avoid filtering genes, will need intersection of remaining genes, which filters to many out when subsequently filtering for LR pairs
    return df

sample_ids = list(set(flatten_list([sn.split('; ') for sn in Samples.sample_names.tolist()])))
sample_counts = {sample_id: create_raw_counts(sample_id) for sample_id in tqdm(sample_ids)}


min_cells_to_keep = min([df.n_obs for sample_id, df in sample_counts.items()])


  0%|          | 0/140 [00:00<?, ?it/s][ATrying to set attribute `.obs` of view, copying.

  1%|          | 1/140 [00:06<14:38,  6.32s/it][ATrying to set attribute `.obs` of view, copying.

  1%|▏         | 2/140 [00:06<06:49,  2.97s/it][ATrying to set attribute `.obs` of view, copying.

  2%|▏         | 3/140 [00:07<04:15,  1.86s/it][ATrying to set attribute `.obs` of view, copying.

  3%|▎         | 4/140 [00:08<03:10,  1.40s/it][ATrying to set attribute `.obs` of view, copying.

  4%|▎         | 5/140 [00:08<02:09,  1.04it/s][ATrying to set attribute `.obs` of view, copying.

  4%|▍         | 6/140 [00:08<01:48,  1.24it/s][ATrying to set attribute `.obs` of view, copying.

  5%|▌         | 7/140 [00:09<01:41,  1.31it/s][ATrying to set attribute `.obs` of view, copying.

  6%|▌         | 8/140 [00:10<01:32,  1.43it/s][ATrying to set attribute `.obs` of view, copying.

  6%|▋         | 9/140 [00:10<01:28,  1.48it/s][ATrying to set attribute `.obs` of view, copying.

  7%|▋ 

 58%|█████▊    | 81/140 [01:41<05:06,  5.20s/it][ATrying to set attribute `.obs` of view, copying.

 59%|█████▊    | 82/140 [01:43<04:20,  4.49s/it][ATrying to set attribute `.obs` of view, copying.

 59%|█████▉    | 83/140 [01:46<03:38,  3.84s/it][ATrying to set attribute `.obs` of view, copying.

 60%|██████    | 84/140 [01:46<02:35,  2.78s/it][ATrying to set attribute `.obs` of view, copying.

 61%|██████    | 85/140 [01:46<01:53,  2.07s/it][ATrying to set attribute `.obs` of view, copying.

 61%|██████▏   | 86/140 [01:47<01:22,  1.52s/it][ATrying to set attribute `.obs` of view, copying.

 62%|██████▏   | 87/140 [02:03<05:11,  5.87s/it][ATrying to set attribute `.obs` of view, copying.

 63%|██████▎   | 88/140 [02:07<04:35,  5.29s/it][ATrying to set attribute `.obs` of view, copying.

 64%|██████▎   | 89/140 [02:11<04:08,  4.87s/it][ATrying to set attribute `.obs` of view, copying.

 64%|██████▍   | 90/140 [02:18<04:36,  5.52s/it][ATrying to set attribute `.obs` of view, 

In [47]:
# seed = 24
seed += 1

# subset to min cells_to_keep and write to csv
cells_to_keep = list()
for sample_id in tqdm(sample_counts):
    random.seed(seed)
    df = sample_counts[sample_id]
    df = df[df.obs.index.isin(random.sample(df.obs.index.tolist(), min_cells_to_keep))] # subset
#     df.to_df().to_csv(data_path + 'interim/umi_for_timing/' + sample_id + '.csv') # write
    df.to_df().to_hdf(data_path + 'interim/timing_inputs/umi_per_sample.h5', key = sample_id)
    cells_to_keep += df.obs.index.tolist()
    seed += 1

pbmc_covid.obs[pbmc_covid.obs.index.isin(cells_to_keep)].to_csv(data_path + 'interim/timing_inputs/metadata_for_timing.csv')


  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)

  check_attribute_name(name)



100%|██████████| 140/140 [05:54<00:00,  2.53s/it][A


# Classification

In [17]:
md = pd.read_excel(data_path + 'raw/GSE158055_sample_metadata.xlsx', sheet_name = 0, skiprows=20)
md = md.iloc[range(304 - 20), range(25)]
md = md[md['Sample name'].isin(samples_to_keep)] # still filter for datasets with > 2000 cells

In [27]:
sample_ids = md['Sample name'].unique().tolist()
pbmc_covid.obs[pbmc_covid.obs.sampleID.isin(sample_ids)].to_csv(data_path + 'interim/classification_inputs/metadata.csv')
sample_counts = {sample_id: create_raw_counts(sample_id) for sample_id in tqdm(sample_ids)}

In [37]:
for sample_id in tqdm(sample_counts):
    df = sample_counts[sample_id]
#     df.to_df().to_csv(data_path + 'interim/umi_for_classification/' + sample_id + '.csv') # write
    df.to_df().to_hdf(data_path + 'interim/classification_inputs/umi_per_sample.h5', key = sample_id)

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attrib

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attrib

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attrib

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
100%|██████████| 227/227 [27:18<00:00,  7.22s/it]
