# Preprocess the Marjanovic, Hofree, Chan et al. (Cancer Cell, 2020) data

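The data were originally obtained from GEO accession GSE154989: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE154989

Place the downloaded files in the `./data` subdirectory before running this notebook.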

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd

import scipy

In [2]:
from pathlib import Path

In [3]:
# Make sure the local data directory exists; the GSE154989 files go in here.
Path("./data").mkdir(exist_ok=True, parents=True)

In [2]:
# Locations of the GSE154989 downloads, all inside the ./data directory.
fileLoc = "./data/"

# Raw count matrix (HDF5). The matching "normTPM" and "rawCountOrig" matrices
# (GSE154989_mmLungPlate_fQC_dSp_normTPM.h5 / ..._rawCountOrig.h5) are not
# used in this notebook.
lungTimeRaw = f"{fileLoc}GSE154989_mmLungPlate_fQC_dSp_rawCount.h5"

# Gene table, sample table, and annotated sample table (CSV).
luadGeneNames = f"{fileLoc}GSE154989_mmLungPlate_fQC_geneTable.csv"
luadCell = f"{fileLoc}GSE154989_mmLungPlate_fQC_smpTable.csv"
luadObs = f"{fileLoc}GSE154989_mmLungPlate_fQC_dZ_annot_smpTable.csv"

In [3]:
# Load the gene table and label its rows by geneID. The geneID column itself
# is kept, so every field of the CSV stays available as a column.
luadGenes = pd.read_csv(luadGeneNames)
luadGenes.index = luadGenes["geneID"].tolist()
luadGenes

Unnamed: 0,geneID,ensgID,geneSymbol,length
ENSMUSG00000000001.4_Gnai3,ENSMUSG00000000001.4_Gnai3,ENSMUSG00000000001.4,Gnai3,3262.000
ENSMUSG00000000003.15_Pbsn,ENSMUSG00000000003.15_Pbsn,ENSMUSG00000000003.15,Pbsn,799.500
ENSMUSG00000000028.14_Cdc45,ENSMUSG00000000028.14_Cdc45,ENSMUSG00000000028.14,Cdc45,1574.000
ENSMUSG00000000031.15_H19,ENSMUSG00000000031.15_H19,ENSMUSG00000000031.15,H19,1268.600
ENSMUSG00000000037.16_Scml2,ENSMUSG00000000037.16_Scml2,ENSMUSG00000000037.16,Scml2,3297.140
...,...,...,...,...
ENSMUSG00000114966.1_AC154305.4,ENSMUSG00000114966.1_AC154305.4,ENSMUSG00000114966.1,AC154305.4,614.000
ENSMUSG00000114967.1_AC161884.2,ENSMUSG00000114967.1_AC161884.2,ENSMUSG00000114967.1,AC161884.2,227.000
ENSMUSG00000114968.1_AC130217.2,ENSMUSG00000114968.1_AC130217.2,ENSMUSG00000114968.1,AC130217.2,2990.000
kallistoKRASG12D_KRASG12D,kallistoKRASG12D_KRASG12D,kallistoKRASG12D,KRASG12D,422.403


In [4]:
# Read the sample table and the annotated sample table.
cellIds = pd.read_csv(luadCell)
cellObs = pd.read_csv(luadObs)

# With no `on=` / `how=` given, pandas inner-joins on every column name the
# two tables share, so only rows present in both tables survive.
totalObs = pd.merge(cellIds, cellObs)

# Label rows by sampleID while keeping the column itself.
totalObs.index = totalObs["sampleID"].tolist()
totalObs

Unnamed: 0,sampleID,plateID,mouseID,timesimple,typeID,clusterK12,tSNE_1,tSNE_2,phate_1,phate_2
KP_30w_ND_m3_T7_P1_S302,KP_30w_ND_m3_T7_P1_S302,KP_30w_ND_m3_T7_P1,KP_30w_ND_m3_T7,08_KP_30w_ND,08_KP_30w_ND,11,53.618231,-16.560435,0.768624,0.482972
KP_30w_ND_m2_T4_P5_S331,KP_30w_ND_m2_T4_P5_S331,KP_30w_ND_m2_T4_P5,KP_30w_ND_m2_T4,08_KP_30w_ND,08_KP_30w_ND,12,-18.802622,39.078073,-0.194094,0.498761
K_12w_ND_m3_T0_P2_S272,K_12w_ND_m3_T0_P2_S272,K_12w_ND_m3_T0_P2,K_12w_ND_m3,04_K_12w_ND,04_K_12w_ND,5,-43.260232,-2.067955,0.168767,0.218259
KP_30w_ND_m2_T1_P4_S219,KP_30w_ND_m2_T1_P4_S219,KP_30w_ND_m2_T1_P4,KP_30w_ND_m2_T1,08_KP_30w_ND,08_KP_30w_ND,10,6.216744,24.974503,-0.181736,0.172537
K_2w_ND_m3_T0_P1_S245,K_2w_ND_m3_T0_P1_S245,K_2w_ND_m3_T0_P1,K_2w_ND_m3,02_KorKP_early_ND,02_K_2w_ND,1,-5.762314,-26.705774,0.118866,-0.565761
...,...,...,...,...,...,...,...,...,...,...
KP_20w_STb2_mAdCre1_T1_P2_S377,KP_20w_STb2_mAdCre1_T1_P2_S377,KP_20w_STb2_mAdCre1_T1_P2,KP_20w_ND_m4,07_KP_20w_ND,07_KP_20w_STb2,8,30.446693,10.351931,-0.362628,0.065400
KP_20w_STb2_mAdCre1_T1_P2_S378,KP_20w_STb2_mAdCre1_T1_P2_S378,KP_20w_STb2_mAdCre1_T1_P2,KP_20w_ND_m4,07_KP_20w_ND,07_KP_20w_STb2,9,23.323272,36.534600,0.424339,0.003703
KP_20w_STb2_mAdCre1_T1_P2_S379,KP_20w_STb2_mAdCre1_T1_P2_S379,KP_20w_STb2_mAdCre1_T1_P2,KP_20w_ND_m4,07_KP_20w_ND,07_KP_20w_STb2,8,22.255118,7.601478,-0.364510,0.047828
KP_20w_STb2_mAdCre1_T1_P2_S380,KP_20w_STb2_mAdCre1_T1_P2_S380,KP_20w_STb2_mAdCre1_T1_P2,KP_20w_ND_m4,07_KP_20w_ND,07_KP_20w_STb2,8,-1.844047,12.294332,-0.186080,0.168476


In [5]:
import h5py

filename = lungTimeRaw

# The HDF5 file holds three datasets named "i", "j" and "v" (see the key
# listing printed below). Each one is looked up by name rather than by its
# position in the key listing, so an extra or reordered dataset cannot
# silently swap the arrays.
with h5py.File(filename, "r") as f:
    # Show which datasets the file contains.
    print("Keys: %s" % f.keys())
    # Keep only the first row of each dataset (assumes each one is stored as
    # a 1 x N array -- TODO confirm against the file).
    i = np.array(f["i"])[0]
    j = np.array(f["j"])[0]
    v = np.array(f["v"])[0]

Keys: <KeysViewHDF5 ['i', 'j', 'v']>


In [6]:
# Turn the three triplet arrays into the text columns of a MatrixMarket
# coordinate file: a banner line, an otherwise empty comment line ("%"), a
# size line "rows cols nnz", then one "i j v" line per stored entry. The
# matrix dimensions are taken from the largest indices present in i and j.
#
# NOTE: this cell rebinds i, j and v to string arrays, so it must be run
# exactly once after the cell that loads them from the HDF5 file.
# NOTE(review): astype(int) truncates the values although the banner declares
# the matrix "real" -- confirm the raw counts are integral.
#
# ndarray.max() is used instead of the builtin max(), which walks the array
# one Python object at a time, and the numbers are converted to text
# explicitly rather than relying on implicit number-to-string promotion
# inside np.concatenate.
i = np.concatenate((
    ["%%MatrixMarket matrix coordinate real general", "%", str(int(i.max()))],
    i.astype(int).astype(str),
))
j = np.concatenate((["", "", str(int(j.max()))], j.astype(int).astype(str)))
v = np.concatenate((["", "", str(len(v))], v.astype(int).astype(str)))

In [7]:
# Assemble the three text columns into one table; each row becomes one line
# of the MatrixMarket file written afterwards.
exp = pd.DataFrame(dict(i=i, j=j, v=v))
exp

Unnamed: 0,i,j,v
0,%%MatrixMarket matrix coordinate real general,,
1,%,,
2,52638,3891,21203238
3,8,1,292
4,9,1,234
...,...,...,...
21203236,52503,3891,6
21203237,52554,3891,3
21203238,52608,3891,1
21203239,52619,3891,0


In [8]:
# Write the table as tab-separated text with neither a header row nor an
# index column.
exp.to_csv(f"{fileLoc}matrix.mtx", header=False, index=False, sep="\t")

In [9]:
# Load the MatrixMarket file as float data and transpose it in one step
# (presumably so that samples become observations and genes become variables,
# matching the obs/var tables assigned afterwards).
adata = sc.read_mtx(fileLoc + "matrix.mtx", dtype="float").T

In [10]:
# Attach an independent copy of the merged sample table as the observation
# annotations, then de-duplicate the observation names.
adata.obs = totalObs.copy(deep=True)
adata.obs_names_make_unique()

In [11]:
# Attach a copy of the gene table as the variable annotations.
adata.var = luadGenes.copy()

# geneID values have the form "<ensgID>_<geneSymbol>". Split on the FIRST
# underscore only, so a symbol that itself contains an underscore is kept
# whole instead of being cut at its own first underscore.
adata.var_names = [gene.split("_", 1)[1] for gene in adata.var_names]

# Keep just the identifier columns, then de-duplicate repeated symbols.
adata.var = adata.var[["geneID", "geneSymbol"]]
adata.var_names_make_unique()

In [12]:
# Save the assembled AnnData object as an .h5ad file in the current working
# directory.
adata.write_h5ad("./luadAdata.h5ad")