## Adds metadata columns to AnnData object

The metadata columns added are 1) `pattern.predict.type`, which records whether a sample is bulk (`rna-seq`) or single-cell (`scrna-seq`), and 2) `study`, which records the project the sample was originally sourced from.

In [4]:
import pandas as pd
import h5py
import scanpy as sc
from scipy import sparse

In [None]:
# Load the AnnData object (samples as obs, genes as var) that will receive the extra obs columns.
data = sc.read_h5ad('recount3/mouse_FULL_sparse_with_geneIDs.h5ad')

In [None]:
# Read the sample metadata and keep only the join key (`external_id`)
# plus the predicted sequencing type for each sample.
metadata = pd.read_csv("metadata/recount3_mouse_metadata.csv").loc[
    :, ["external_id", "pattern.predict.type"]
]
metadata

In [None]:
# Attach the predicted sequencing type (`pattern.predict.type`) to every
# observation by matching obs "Sample ID" against metadata "external_id".
# A left join keeps every observation, even those absent from the metadata.
new_obs = data.obs.merge(
    metadata, how="left", left_on="Sample ID", right_on="external_id"
)[["Sample ID", "project", "pattern.predict.type"]]

# Duplicate `external_id` rows in the metadata would multiply observations in
# a left join; fail here with a clear message rather than on the obs assignment.
if len(new_obs) != len(data.obs):
    raise ValueError(
        f"Merge changed the number of observations ({len(data.obs)} -> {len(new_obs)}); "
        "check the metadata for duplicate 'external_id' values."
    )

# Merging on columns yields a fresh RangeIndex, so index directly on
# "Sample ID" (a bare reset_index() would add a stray "index" column to obs).
# drop=False keeps "Sample ID" available as a regular column as well.
new_obs = new_obs.set_index("Sample ID", drop=False)
data.obs = new_obs

In [36]:
# Display the AnnData summary to check the obs columns after the merge.
data

AnnData object with n_obs × n_vars = 416803 × 55422
    obs: 'Sample ID', 'project', 'pattern.predict.type'
    var: 'Ensemble ID'

In [None]:
# Reload the full metadata table: `metadata` was narrowed to `external_id` and
# `pattern.predict.type` above, and the next step needs the `study` column.
metadata = pd.read_csv("metadata/recount3_mouse_metadata.csv")

In [39]:
# Keep only the join key and the study accession for each sample.
metadata = metadata.loc[:, ["external_id", "study"]]
metadata

Unnamed: 0,external_id,study
0,DRR091074,DRP003800
1,DRR091075,DRP003800
2,DRR091076,DRP003800
3,DRR091077,DRP003800
4,DRR091078,DRP003800
...,...,...
416836,SRR10292306,SRP225899
416837,SRR10292307,SRP225899
416838,SRR10292308,SRP225899
416839,SRR10292309,SRP225899


In [42]:
# Join the study accession onto obs, matching the obs index against
# metadata "external_id", then make "external_id" the index of the result.
new_obs2 = (
    data.obs
    .merge(metadata, how="left", left_index=True, right_on="external_id")
    .set_index("external_id")
)
new_obs2

Unnamed: 0_level_0,Sample ID,project,pattern.predict.type,study
external_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SRR8249400,SRR8249400,0,scrna-seq,SRP170963
SRR8249500,SRR8249500,0,scrna-seq,SRP170963
SRR8249300,SRR8249300,0,scrna-seq,SRP170963
SRR8249200,SRR8249200,0,scrna-seq,SRP170963
SRR8249401,SRR8249401,0,scrna-seq,SRP170963
...,...,...,...,...
SRR5445513,SRR5445513,10087,rna-seq,SRP103859
SRR5445514,SRR5445514,10087,rna-seq,SRP103859
SRR5445515,SRR5445515,10087,rna-seq,SRP103859
SRR5445516,SRR5445516,10087,rna-seq,SRP103859


In [43]:
# Replace obs with the merged table, then display the AnnData summary
# to confirm that the `study` column is now present.
data.obs = new_obs2
data

AnnData object with n_obs × n_vars = 416803 × 55422
    obs: 'Sample ID', 'project', 'pattern.predict.type', 'study'
    var: 'Ensemble ID'

In [44]:
# Save the annotated object under a new filename so the input .h5ad file is not overwritten.
data.write_h5ad("recount3/mouse_FULL_sparse_with_geneIDs_with_metadata.h5ad")