In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set up data loading
# Sample name prefixes, split into four groups. Each group is loaded by its
# own cell further down; every file path is built as
#   file_base + <sample> + <one of the *_file_end suffixes>
sample_strings1 = ['GSM4848442_cv72', 'GSM4848443_cv73', 'GSM4848444_cv75', 'GSM4848445_cv76', 'GSM4848446_flu74', 'GSM4848447_s01_01', 'GSM4848448_s03_03', 'GSM4848449_s03_09', 'GSM4848450_s04_09', 'GSM4848451_c72', 'GSM4848452_c73']
sample_strings2 = ['GSM4848453_c75', 'GSM4848454_c76', 'GSM4848455_c74', 'GSM4848456_c15_12']
sample_strings3 = ['GSM4848457_c15_17', 'GSM4848458_c16_18', 'GSM4848459_c16_23', 'GSM4848460_c16_24', 'GSM4848461_c16_25', 'GSM5171528_CT_01_07']
sample_strings4 = ['GSM5171529_CT_02_01', 'GSM5171530_CT_02_11', 'GSM5171531_CTX_71', 'GSM5171532_CTX_78', 'GSM5171533_CTX_90', 'GSM5171534_CTX_91', 'GSM5171535_CP_78', 'GSM5171536_CP_90', 'GSM5171537_CP_91']

# Directory holding the per-sample files, and the suffix of each file type.
file_base = 'data/GSE159812_RAW_covid/'
data_file_end = '_matrix.mtx'
barcode_file_end = '_barcodes.tsv'
gene_file_end = '_features.tsv'

# Report the size of every group so a missing or mistyped entry is easy to spot.
print('Number of samples 1:', len(sample_strings1))
print('Number of samples 2:', len(sample_strings2))
print('Number of samples 3:', len(sample_strings3))
print('Number of samples 4:', len(sample_strings4))

Number of samples 1: 11
Number of samples 2: 4
Number of samples 3: 6


In [3]:
# First data set load & annotation
# Take the first sample off sample_strings1; the entries left in the list are
# handled by the loop cell that follows.
# NOTE: pop() mutates the list, so re-running this cell without re-running the
# setup cell consumes the next sample instead of the first one.
sample = sample_strings1.pop(0)

# Paths of the three input files for this sample
data_file = f'{file_base}{sample}{data_file_end}'
barcode_file = f'{file_base}{sample}{barcode_file_end}'
gene_file = f'{file_base}{sample}{gene_file_end}'
print(f'Extracting data from {data_file}')
print(f'Extracting barcode from {barcode_file}')
print(f'Extracting gene feature from {gene_file}')

# Read the matrix and transpose it, so that rows are annotated by the barcode
# table and columns by the feature table (see the obs/var assignments below).
# X is then converted to a dense array.
adata = sc.read(data_file, cache=True).transpose()
adata.X = adata.X.toarray()

barcodes = pd.read_csv(barcode_file, header=None, sep='\t')
genes = pd.read_csv(gene_file, header=None, sep='\t')

# Cell annotation: barcodes become the obs index; sample and donor labels are
# the same for every cell and derived from the sample name.
barcodes = barcodes.rename(columns={0: 'barcode'}).set_index('barcode')
adata.obs = barcodes
adata.obs['sample'] = [sample] * adata.n_obs
adata.obs['donor'] = [sample.split("_")[1]] * adata.n_obs

# Gene annotation: drop the third feature column, index by gene symbol and
# keep the gene id as a regular column.
genes = (
    genes
    .drop(columns=[2])
    .rename(columns={0: 'gene_id', 1: 'gene_symbol'})
    .set_index('gene_symbol')
)
adata.var = genes

# De-duplicate cell and gene names
adata.obs_names_make_unique()
adata.var_names_make_unique()


Extracting data from data/GSE159812_RAW_covid/GSM4848442_cv72_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848442_cv72_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848442_cv72_features.tsv


In [4]:
print(adata.obs.index.is_unique)
print(adata.var.index.is_unique)

True
True


In [5]:
# Load every remaining sample of group 1 and append it to `adata`
for i, sample in enumerate(sample_strings1):
    # Paths of the three input files for this sample
    data_file = f'{file_base}{sample}{data_file_end}'
    barcode_file = f'{file_base}{sample}{barcode_file_end}'
    gene_file = f'{file_base}{sample}{gene_file_end}'
    print(f'Extracting data from {data_file}')
    print(f'Extracting barcode from {barcode_file}')
    print(f'Extracting gene feature from {gene_file}')

    # Read the matrix, transpose it and store X as a dense array
    adata_tmp = sc.read(data_file, cache=True).transpose()
    adata_tmp.X = adata_tmp.X.toarray()

    barcodes_tmp = pd.read_csv(barcode_file, header=None, sep='\t')
    genes_tmp = pd.read_csv(gene_file, header=None, sep='\t')

    # Cell annotation: barcode index plus constant sample/donor labels.
    # NOTE(review): split("_")[1] keeps only the token after the first
    # underscore, so 'GSM4848447_s01_01' yields 's01' — confirm that this is
    # the intended donor label.
    barcodes_tmp = barcodes_tmp.rename(columns={0: 'barcode'}).set_index('barcode')
    adata_tmp.obs = barcodes_tmp
    adata_tmp.obs['sample'] = [sample] * adata_tmp.n_obs
    adata_tmp.obs['donor'] = [sample.split("_")[1]] * adata_tmp.n_obs
    adata_tmp.obs_names_make_unique()

    # Gene annotation: drop the third feature column and index by gene symbol
    genes_tmp = (
        genes_tmp
        .drop(columns=[2])
        .rename(columns={0: 'gene_id', 1: 'gene_symbol'})
        .set_index('gene_symbol')
    )
    adata_tmp.var = genes_tmp
    adata_tmp.var_names_make_unique()

    # Append to the accumulated object, then keep only the part of each cell
    # name before the first "-" and de-duplicate the result with "_" as joiner.
    adata = adata.concatenate(adata_tmp)
    adata.obs_names = [name.partition("-")[0] for name in adata.obs_names]
    adata.obs_names_make_unique(join='_')
    print("Done!!!")

Extracting data from data/GSE159812_RAW_covid/GSM4848443_cv73_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848443_cv73_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848443_cv73_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848444_cv75_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848444_cv75_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848444_cv75_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848445_cv76_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848445_cv76_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848445_cv76_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848446_flu74_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848446_flu74_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848446_flu74_features.tsv
Done!!!
Extracting data from data/GSE

In [6]:
# Load every sample of group 2 and append it to `adata`
for i, sample in enumerate(sample_strings2):
    # Paths of the three input files for this sample
    data_file = f'{file_base}{sample}{data_file_end}'
    barcode_file = f'{file_base}{sample}{barcode_file_end}'
    gene_file = f'{file_base}{sample}{gene_file_end}'
    print(f'Extracting data from {data_file}')
    print(f'Extracting barcode from {barcode_file}')
    print(f'Extracting gene feature from {gene_file}')

    # Read the matrix, transpose it and store X as a dense array
    adata_tmp = sc.read(data_file, cache=True).transpose()
    adata_tmp.X = adata_tmp.X.toarray()

    barcodes_tmp = pd.read_csv(barcode_file, header=None, sep='\t')
    genes_tmp = pd.read_csv(gene_file, header=None, sep='\t')

    # Cell annotation: barcode index plus constant sample/donor labels.
    # NOTE(review): split("_")[1] keeps only the token after the first
    # underscore, so 'GSM4848456_c15_12' yields 'c15' — confirm that this is
    # the intended donor label.
    barcodes_tmp = barcodes_tmp.rename(columns={0: 'barcode'}).set_index('barcode')
    adata_tmp.obs = barcodes_tmp
    adata_tmp.obs['sample'] = [sample] * adata_tmp.n_obs
    adata_tmp.obs['donor'] = [sample.split("_")[1]] * adata_tmp.n_obs
    adata_tmp.obs_names_make_unique()

    # Gene annotation: drop the third feature column and index by gene symbol
    genes_tmp = (
        genes_tmp
        .drop(columns=[2])
        .rename(columns={0: 'gene_id', 1: 'gene_symbol'})
        .set_index('gene_symbol')
    )
    adata_tmp.var = genes_tmp
    adata_tmp.var_names_make_unique()

    # Append to the accumulated object, then keep only the part of each cell
    # name before the first "-" and de-duplicate the result with "_" as joiner.
    adata = adata.concatenate(adata_tmp)
    adata.obs_names = [name.partition("-")[0] for name in adata.obs_names]
    adata.obs_names_make_unique(join='_')
    print("Done!!!")

Extracting data from data/GSE159812_RAW_covid/GSM4848453_c75_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848453_c75_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848453_c75_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848454_c76_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848454_c76_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848454_c76_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848455_c74_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848455_c74_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848455_c74_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848456_c15_12_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848456_c15_12_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848456_c15_12_features.tsv
Done!!!


In [7]:
# Load every sample of group 3 and append it to `adata`
for i, sample in enumerate(sample_strings3):
    # Paths of the three input files for this sample
    data_file = f'{file_base}{sample}{data_file_end}'
    barcode_file = f'{file_base}{sample}{barcode_file_end}'
    gene_file = f'{file_base}{sample}{gene_file_end}'
    print(f'Extracting data from {data_file}')
    print(f'Extracting barcode from {barcode_file}')
    print(f'Extracting gene feature from {gene_file}')

    # Read the matrix, transpose it and store X as a dense array
    adata_tmp = sc.read(data_file, cache=True).transpose()
    adata_tmp.X = adata_tmp.X.toarray()

    barcodes_tmp = pd.read_csv(barcode_file, header=None, sep='\t')
    genes_tmp = pd.read_csv(gene_file, header=None, sep='\t')

    # Cell annotation: barcode index plus constant sample/donor labels.
    # NOTE(review): split("_")[1] keeps only the token after the first
    # underscore, so 'GSM5171528_CT_01_07' yields 'CT' — confirm that this is
    # the intended donor label.
    barcodes_tmp = barcodes_tmp.rename(columns={0: 'barcode'}).set_index('barcode')
    adata_tmp.obs = barcodes_tmp
    adata_tmp.obs['sample'] = [sample] * adata_tmp.n_obs
    adata_tmp.obs['donor'] = [sample.split("_")[1]] * adata_tmp.n_obs
    adata_tmp.obs_names_make_unique()

    # Gene annotation: drop the third feature column and index by gene symbol
    genes_tmp = (
        genes_tmp
        .drop(columns=[2])
        .rename(columns={0: 'gene_id', 1: 'gene_symbol'})
        .set_index('gene_symbol')
    )
    adata_tmp.var = genes_tmp
    adata_tmp.var_names_make_unique()

    # Append to the accumulated object, then keep only the part of each cell
    # name before the first "-" and de-duplicate the result with "_" as joiner.
    adata = adata.concatenate(adata_tmp)
    adata.obs_names = [name.partition("-")[0] for name in adata.obs_names]
    adata.obs_names_make_unique(join='_')
    print("Done!!!")

Extracting data from data/GSE159812_RAW_covid/GSM4848457_c15_17_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848457_c15_17_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848457_c15_17_features.tsv
Done!!!
Extracting data from data/GSE159812_RAW_covid/GSM4848458_c16_18_matrix.mtx
Extracting barcode from data/GSE159812_RAW_covid/GSM4848458_c16_18_barcodes.tsv
Extracting gene feature from data/GSE159812_RAW_covid/GSM4848458_c16_18_features.tsv


: 

: 

In [1]:
# Load every sample of group 4 and append it to `adata`.
# The sample name is taken from sample_strings4 itself: indexing
# sample_strings3 with positions counted over sample_strings4 would reload
# group-3 samples and then run past the end of the shorter list.
for i, sample in enumerate(sample_strings4):
    # Paths of the three input files for this sample
    data_file = file_base + sample + data_file_end
    barcode_file = file_base + sample + barcode_file_end
    gene_file = file_base + sample + gene_file_end
    print(f'Extracting data from {data_file}')
    print(f'Extracting barcode from {barcode_file}')
    print(f'Extracting gene feature from {gene_file}')

    # Read the matrix and transpose it, so that rows are annotated by the
    # barcode table and columns by the feature table (see the obs/var
    # assignments below). X is then converted to a dense array.
    adata_tmp = sc.read(data_file, cache=True)
    adata_tmp = adata_tmp.transpose()
    adata_tmp.X = adata_tmp.X.toarray()

    barcodes_tmp = pd.read_csv(barcode_file, header=None, sep='\t')
    genes_tmp = pd.read_csv(gene_file, header=None, sep='\t')

    # Cell annotation: barcodes become the obs index; sample and donor labels
    # are the same for every cell of this sample.
    # NOTE(review): split("_")[1] keeps only the token after the first
    # underscore, so 'GSM5171531_CTX_71' yields 'CTX' and 'GSM5171535_CP_78'
    # yields 'CP' — confirm that this is the intended donor label.
    barcodes_tmp.rename(columns={0: 'barcode'}, inplace=True)
    barcodes_tmp.set_index('barcode', inplace=True)
    adata_tmp.obs = barcodes_tmp
    adata_tmp.obs['sample'] = [sample] * adata_tmp.n_obs
    adata_tmp.obs['donor'] = [sample.split("_")[1]] * adata_tmp.n_obs
    adata_tmp.obs_names_make_unique()

    # Gene annotation: drop the third feature column, index by gene symbol and
    # keep the gene id as a regular column.
    genes_tmp.drop([2], axis=1, inplace=True)
    genes_tmp.rename(columns={0: 'gene_id', 1: 'gene_symbol'}, inplace=True)
    genes_tmp.set_index('gene_symbol', inplace=True)
    adata_tmp.var = genes_tmp
    adata_tmp.var_names_make_unique()

    # Append to the accumulated object, then keep only the part of each cell
    # name before the first "-" and de-duplicate the result with "_" as joiner.
    adata = adata.concatenate(adata_tmp)
    adata.obs_names = [c.split("-")[0] for c in adata.obs_names]
    adata.obs_names_make_unique(join='_')
    print("Done!!!")

NameError: name 'sample_strings' is not defined

In [26]:
# Show the cell annotation table (obs) of `adata_tmp`, i.e. of whichever
# sample a loading loop assigned to it last.
adata_tmp.obs

Unnamed: 0_level_0,sample,donor
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGCTGACCC-1,GSM4848443_cv73,cv73
AAACCCACAGTCACGC-1,GSM4848443_cv73,cv73
AAACCCAGTCCCTGTT-1,GSM4848443_cv73,cv73
AAACCCATCACCTCGT-1,GSM4848443_cv73,cv73
AAACCCATCAGGGATG-1,GSM4848443_cv73,cv73
...,...,...
TTTGGAGTCTATTGTC-1,GSM4848443_cv73,cv73
TTTGGAGTCTCCATAT-1,GSM4848443_cv73,cv73
TTTGGTTTCTACACAG-1,GSM4848443_cv73,cv73
TTTGTTGGTGACATCT-1,GSM4848443_cv73,cv73


In [28]:
# Show the cell annotation table (obs) of the accumulated `adata` object.
adata.obs

Unnamed: 0_level_0,sample,donor
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGGCTTTCA-1,GSM4848442_cv72,cv72
AAACCCACAGGTATGG-1,GSM4848442_cv72,cv72
AAACCCATCGAGTACT-1,GSM4848442_cv72,cv72
AAACCCATCGTGGTAT-1,GSM4848442_cv72,cv72
AAACGAAAGAAGAACG-1,GSM4848442_cv72,cv72
...,...,...
TTTGTTGCACCAAATC-1,GSM4848442_cv72,cv72
TTTGTTGTCACGGACC-1,GSM4848442_cv72,cv72
TTTGTTGTCCACGTGG-1,GSM4848442_cv72,cv72
TTTGTTGTCTGTTGGA-1,GSM4848442_cv72,cv72


In [19]:
# Show the cell annotation table (obs) of the accumulated `adata` object.
# NOTE(review): the saved output of this cell lists a 'region' column, but the
# loading code visible in this notebook has the 'region' assignment commented
# out — the output looks stale; re-run after a kernel restart to confirm.
adata.obs

Unnamed: 0_level_0,sample,region,donor
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAACCCAAGGCTTTCA-1,GSM4848442_cv72,GSM4848442,cv72
AAACCCACAGGTATGG-1,GSM4848442_cv72,GSM4848442,cv72
AAACCCATCGAGTACT-1,GSM4848442_cv72,GSM4848442,cv72
AAACCCATCGTGGTAT-1,GSM4848442_cv72,GSM4848442,cv72
AAACGAAAGAAGAACG-1,GSM4848442_cv72,GSM4848442,cv72
...,...,...,...
TTTGTTGCACCAAATC-1,GSM4848442_cv72,GSM4848442,cv72
TTTGTTGTCACGGACC-1,GSM4848442_cv72,GSM4848442,cv72
TTTGTTGTCCACGTGG-1,GSM4848442_cv72,GSM4848442,cv72
TTTGTTGTCTGTTGGA-1,GSM4848442_cv72,GSM4848442,cv72
