In [9]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
import seaborn as sb
from matplotlib import colors
from matplotlib import rcParams

warnings.filterwarnings('ignore')

In [10]:
# Data-loading configuration: which samples to read, how to label them,
# and how each sample's three input files are named.

# File-name prefix of every sample, in loading order.
sample_strings = [
    'GSM4848442_cv72',
    'GSM4848443_cv73',
    'GSM4848444_cv75',
    'GSM4848445_cv76',
    'GSM4848446_flu74',
    'GSM4848447_s01_01',
    'GSM4848448_s03_03',
    'GSM4848449_s03_09',
    'GSM4848450_s04_09',
    'GSM4848451_c72',
    'GSM4848452_c73',
    'GSM4848453_c75',
    'GSM4848454_c76',
    'GSM4848455_c74',
    'GSM4848456_c15_12',
    'GSM4848457_c15_17',
    'GSM4848458_c16_18',
    'GSM4848459_c16_23',
    'GSM4848460_c16_24',
    'GSM4848461_c16_25',
    'GSM5171528_CT_01_07',
    'GSM5171529_CT_02_01',
    'GSM5171530_CT_02_11',
    'GSM5171531_CTX_71',
    'GSM5171532_CTX_78',
    'GSM5171533_CTX_90',
    'GSM5171534_CTX_91',
    'GSM5171535_CP_78',
    'GSM5171536_CP_90',
    'GSM5171537_CP_91',
]

# Accession (the part of a prefix before its first '_') -> label written to obs['sample'].
sample_mapping = {
    'GSM4848442': 'COVID-19_1',
    'GSM4848443': 'COVID-19_2',
    'GSM4848444': 'COVID-19_3',
    'GSM4848445': 'COVID-19_4',
    'GSM4848446': 'Influenza_1',
    'GSM4848447': 'Control_1',
    'GSM4848448': 'Control_2',
    'GSM4848449': 'Control_3',
    'GSM4848450': 'Control_4',
    'GSM4848451': 'COVID-19_5',
    'GSM4848452': 'COVID-19_6',
    'GSM4848453': 'COVID-19_7',
    'GSM4848454': 'COVID-19_8',
    'GSM4848455': 'Influenza_2',
    'GSM4848456': 'Control_5',
    'GSM4848457': 'Control_6',
    'GSM4848458': 'Control_7',
    'GSM4848459': 'Control_8',
    'GSM4848460': 'Control_9',
    'GSM4848461': 'Control_10',
    'GSM5171528': 'Control_11',
    'GSM5171529': 'Control_12',
    'GSM5171530': 'Control_13',
    'GSM5171531': 'COVID-19_9',
    'GSM5171532': 'COVID-19_10',
    'GSM5171533': 'COVID-19_11',
    'GSM5171534': 'COVID-19_12',
    'GSM5171535': 'COVID-19_13',
    'GSM5171536': 'COVID-19_14',
    'GSM5171537': 'COVID-19_15',
}

# A sample's files are addressed as file_base + <prefix> + <suffix>.
file_base = 'data/GSE159812_RAW_covid/'
data_file_end = '_matrix.mtx'
barcode_file_end = '_barcodes.tsv'
gene_file_end = '_features.tsv'

print('Number of samples:', len(sample_strings))

Number of samples: 30


In [11]:
# Load and annotate the first sample; the resulting `adata` is the object
# that the loading loop later appends the other samples to.
#
# NOTE: `pop(0)` removes this sample from `sample_strings`, so the loading loop
# only iterates over the samples that are left. Re-running this cell without
# re-running the configuration cell first would consume a second sample.
sample_id = 0
sample = sample_strings.pop(0)

# The three input files that belong to this sample
data_file = f'{file_base}{sample}{data_file_end}'
barcode_file = f'{file_base}{sample}{barcode_file_end}'
gene_file = f'{file_base}{sample}{gene_file_end}'
print(f'Extracting data from {data_file}')
print(f'Extracting barcode from {barcode_file}')
print(f'Extracting gene feature from {gene_file}')

# Count matrix, transposed right after reading so that the barcode table
# lines up with obs and the gene table with var
adata = sc.read(data_file, cache=True).transpose()

barcodes = pd.read_csv(barcode_file, header=None, sep='\t')
genes = pd.read_csv(gene_file, header=None, sep='\t')

# Cell annotations: the single barcode column becomes the obs index
barcodes = barcodes.rename(columns={0: 'barcode'}).set_index('barcode')
adata.obs = barcodes

# Translate the accession (text before the first '_') into the sample label
sample_code = sample.split('_')[0]
sample = sample_mapping[sample_code]
adata.obs['sample'] = sample
adata.obs['sample_id'] = sample_id
# Second copy of the sample number under a different column name
adata.obs['batch_'] = sample_id

# Gene annotations: drop the third column, index by gene symbol, keep the gene id
genes = (
    genes
    .drop(columns=[2])
    .rename(columns={0: 'gene_id', 1: 'gene_symbol'})
    .set_index('gene_symbol')
)
adata.var = genes

adata.obs_names_make_unique()
adata.var_names_make_unique()


Extracting data from data/GSE159812_RAW_covid/GSM4848442_cv72_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848442_cv72_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848442_cv72_features.tsv


In [12]:
# Sanity check: barcode (obs) and gene (var) indices must both be unique
print(adata.obs_names.is_unique, adata.var_names.is_unique, sep='\n')

True
True


In [13]:
# Inspect the cell annotations of the first loaded sample (indexed by barcode)
adata.obs

Unnamed: 0_level_0,sample,sample_id,batch_
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAACCCAAGGCTTTCA-1,COVID-19_1,0,0
AAACCCACAGGTATGG-1,COVID-19_1,0,0
AAACCCATCGAGTACT-1,COVID-19_1,0,0
AAACCCATCGTGGTAT-1,COVID-19_1,0,0
AAACGAAAGAAGAACG-1,COVID-19_1,0,0
...,...,...,...
TTTGTTGCACCAAATC-1,COVID-19_1,0,0
TTTGTTGTCACGGACC-1,COVID-19_1,0,0
TTTGTTGTCCACGTGG-1,COVID-19_1,0,0
TTTGTTGTCTGTTGGA-1,COVID-19_1,0,0


In [14]:

# Load the remaining samples and merge them into `adata` in a single step.
#
# The per-sample objects are collected in a list and concatenated once after
# the loop. Concatenating inside the loop rebuilt (and re-copied) the whole
# accumulated matrix for every additional sample.
#
# Assumes the first-sample cell has already run: `adata` holds sample 0 and
# `sample_strings` only contains the samples that still have to be loaded.
# Running this cell twice appends every sample a second time.
remaining_adatas = []

# Sample numbers continue after the first sample (which is number 0)
for sample_id, sample in enumerate(sample_strings, start=1):
    # Parse filenames
    data_file = file_base+sample+data_file_end
    barcode_file = file_base+sample+barcode_file_end
    gene_file = file_base+sample+gene_file_end
    print(f'Extracting data from {data_file}')
    print(f'Extracting barcode from {barcode_file}')
    print(f'Extracting gene feature from {gene_file}')

    # Load data; transposed so the barcode table lines up with obs and the gene table with var
    adata_tmp = sc.read(data_file, cache=True)
    adata_tmp = adata_tmp.transpose()

    barcodes_tmp = pd.read_csv(barcode_file, header=None, sep='\t')
    genes_tmp = pd.read_csv(gene_file, header=None, sep='\t')

    # Annotate cells: barcodes become the obs index
    barcodes_tmp.rename(columns={0:'barcode'}, inplace=True)
    barcodes_tmp.set_index('barcode', inplace=True)
    adata_tmp.obs = barcodes_tmp
    sample_code = sample.split('_')[0]
    sample = sample_mapping[sample_code]
    adata_tmp.obs['sample'] = [sample]*adata_tmp.n_obs
    adata_tmp.obs['sample_id'] = [sample_id]*adata_tmp.n_obs
    # 'batch_' carries the sample number through the merge: 'sample_id' is passed
    # as batch_key below, so that column is replaced by concatenation labels.
    adata_tmp.obs['batch_'] = [sample_id]*adata_tmp.n_obs
    adata_tmp.obs_names_make_unique()

    # Annotate genes: drop the third column, index by gene symbol, keep the gene id
    genes_tmp.drop([2], axis=1, inplace=True)
    genes_tmp.rename(columns={0:'gene_id', 1:'gene_symbol'}, inplace=True)
    genes_tmp.set_index('gene_symbol', inplace=True)
    adata_tmp.var = genes_tmp
    adata_tmp.var_names_make_unique()

    remaining_adatas.append(adata_tmp)
    print("Done!!!")

# Concatenate everything to the main adata object in one call
# NOTE(review): newer anndata releases recommend `anndata.concat` over
# `AnnData.concatenate`; the method is kept because its var-merging behaviour
# is what produces the single 'gene_id' column -- confirm before migrating.
if remaining_adatas:
    adata = adata.concatenate(*remaining_adatas, batch_key='sample_id')
    # Keep only the text before the first '-' of every cell name (this drops the
    # '-1' barcode suffix and anything appended during concatenation), then
    # disambiguate barcodes that occur in several samples with '_<n>'.
    adata.obs_names = [c.split("-")[0] for c in adata.obs_names]
    adata.obs_names_make_unique(join='_')

# Release the extra references to the per-sample objects (`adata_tmp` stays defined)
del remaining_adatas

Extracting data from data/GSE159812_RAW_covid/GSM4848443_cv73_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848443_cv73_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848443_cv73_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848444_cv75_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848444_cv75_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848444_cv75_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848445_cv76_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848445_cv76_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848445_cv76_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848446_flu74_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848446_flu74_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848446_flu74_features.tsv
Done!!!
Extracting data from data/GSE

In [15]:
# Tidy the cell annotations after merging:
#   * 'sample_id' was passed as `batch_key` to `concatenate`, so it only holds
#     concatenation labels and is dropped;
#   * 'batch_' (the per-sample number) gets its final name 'batch'.
# Written as a chained assignment with errors='ignore' so the cell can be
# re-run safely; the in-place `drop` raised KeyError on a second run.
adata.obs = (
    adata.obs
    .drop(columns=['sample_id'], errors='ignore')
    .rename(columns={'batch_': 'batch'})
)


In [16]:
# Convert the count matrix to a dense array.
# Guarded with `issparse` so that re-running the cell is a no-op: once X is a
# NumPy array it has no `.toarray()` and the unguarded call would raise.
# NOTE(review): the merged object is 121622 x 36601, i.e. ~4.45e9 values
# (roughly 18 GB at 4 bytes per value -- dtype not verified). Confirm that a
# dense matrix is really required downstream before keeping this step.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray()

In [17]:
# Show the summary of the merged object (shape and the obs/var column names)
adata

AnnData object with n_obs × n_vars = 121622 × 36601
    obs: 'sample', 'batch'
    var: 'gene_id'

In [18]:
# Inspect the cell annotations after merging (columns: 'sample' and 'batch')
adata.obs

Unnamed: 0,sample,batch
AAACCCAAGGCTTTCA,COVID-19_1,0
AAACCCACAGGTATGG,COVID-19_1,0
AAACCCATCGAGTACT,COVID-19_1,0
AAACCCATCGTGGTAT,COVID-19_1,0
AAACGAAAGAAGAACG,COVID-19_1,0
...,...,...
TTTGGTTCACAACGCC,COVID-19_15,29
TTTGGTTGTACAAGCG,COVID-19_15,29
TTTGTTGAGCCGTAAG,COVID-19_15,29
TTTGTTGCAAGCGGAT,COVID-19_15,29


In [19]:
# List the distinct sample labels, in order of first appearance
pd.unique(adata.obs['sample'])

array(['COVID-19_1', 'COVID-19_2', 'COVID-19_3', 'COVID-19_4',
       'Influenza_1', 'Control_1', 'Control_2', 'Control_3', 'Control_4',
       'COVID-19_5', 'COVID-19_6', 'COVID-19_7', 'COVID-19_8',
       'Influenza_2', 'Control_5', 'Control_6', 'Control_7', 'Control_8',
       'Control_9', 'Control_10', 'Control_11', 'Control_12',
       'Control_13', 'COVID-19_9', 'COVID-19_10', 'COVID-19_11',
       'COVID-19_12', 'COVID-19_13', 'COVID-19_14', 'COVID-19_15'],
      dtype=object)

In [20]:
# Inspect the gene annotations (indexed by gene symbol, with a 'gene_id' column)
adata.var

Unnamed: 0_level_0,gene_id
gene_symbol,Unnamed: 1_level_1
MIR1302-2HG,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
AL627309.1,ENSG00000238009
AL627309.3,ENSG00000239945
...,...
AC141272.1,ENSG00000277836
AC023491.2,ENSG00000278633
AC007325.1,ENSG00000276017
AC007325.4,ENSG00000278817


In [21]:
# Save the merged AnnData object to disk in .h5ad format
save_file = './data/h5ab_folder/gse159812_raw_covid.h5ad'
# Create the output folder first: a missing directory would otherwise make the
# write fail only after all the loading and merging work has been done.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
adata.write_h5ad(save_file)

In [23]:
# Drop the references to the merged object and the last per-sample object
# so their memory can be reclaimed by the kernel
del adata, adata_tmp