## Metadata Transformation for Submissions to Single Cell Portal (SCP)

#### Author: Hannah Kang

##### Study Information: 
The aim of this study is to understand the many types of neurons that carry out information processing in the brain evolved recently by focusing on the retina, the thin film of neurons in the eye that initiates vision. Our approach entails high-throughput single-cell RNA-seq where gene expression is quantified in hundreds of thousands of single retinal neurons, and computational methods are used to process and integrate these datasets. We have recently completed a project that involves generating and integrating single-cell RNA-seq data across an unprecedented 17 species.

Date Last Modified: 2/13/2024 (Before 11/18/2023)


In [1]:
# Import Statements
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Reference data from Professor Shekhar

# data = pd.read_csv("rgc_metadata_eg.txt")
# data.head(10)

### Metadata Transformation for Marmoset Dataset

In [3]:
file_path = "/Users/hannahkang/Desktop/Shek Lab/Original Files/MarmosetFovea_BC_metadata.csv"

metadata_marmoset = pd.read_csv(file_path)
metadata_marmoset

Unnamed: 0.1,Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,orig.file,animal,RNA_snn_res.0.5,seurat_clusters,dendro_order,cell_class,integrated_snn_res.1,integrated_snn_res.0.8,all_BC_labels,labels,type,Mammal_OT,Nonmammal_OT
0,possorted_genome_bam_EXKMG:AAAGATGAGCCAGGATx,possorted_genome_bam_EXKMG,1730,1194,Marmoset1FoveaS1,1,0,2,0,BC,1,1,1,2,f2,BC1A,absent
1,possorted_genome_bam_EXKMG:AAAGATGAGGCCCTCAx,possorted_genome_bam_EXKMG,3341,2034,Marmoset1FoveaS1,1,2,1,1,BC,0,0,2,1,f1,absent,absent
2,possorted_genome_bam_EXKMG:AAACGGGTCTTGTATCx,possorted_genome_bam_EXKMG,2549,1519,Marmoset1FoveaS1,1,2,1,1,BC,0,0,2,1,f1,absent,absent
3,possorted_genome_bam_EXKMG:AAATGCCCACTAGTACx,possorted_genome_bam_EXKMG,2145,1471,Marmoset1FoveaS1,1,0,2,0,BC,1,1,1,2,f2,absent,absent
4,possorted_genome_bam_EXKMG:AAACGGGGTCCTAGCGx,possorted_genome_bam_EXKMG,1445,997,Marmoset1FoveaS1,1,8,4,3,BC,3,3,4,4,f4,BC4,absent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12551,possorted_genome_bam_K564P:TTTGGTTGTTGAGTTCx,possorted_genome_bam_K564P,1709,1169,AdultmarmosetFovea2S2,2,9,5,4,BC,4,4,5,5,f5,absent,absent
12552,possorted_genome_bam_K564P:TTTATGCCATGTTCCCx,possorted_genome_bam_K564P,1910,1351,AdultmarmosetFovea2S2,2,0,2,0,BC,9,6,1,2,f2,BC1A,absent
12553,possorted_genome_bam_K564P:TTTATGCCAATCACACx,possorted_genome_bam_K564P,2292,1548,AdultmarmosetFovea2S2,2,0,2,0,BC,1,1,1,2,f2,absent,absent
12554,possorted_genome_bam_K564P:TTTGTCATCTGTTTGTx,possorted_genome_bam_K564P,2963,1794,AdultmarmosetFovea2S2,2,2,1,1,BC,0,0,2,1,f1,absent,absent


In [4]:
# Remove the prefix from the "Unnamed: 0" column
metadata_marmoset['Unnamed: 0'] = metadata_marmoset['Unnamed: 0'].str.replace('possorted_genome_bam_', '') 

metadata_marmoset = metadata_marmoset.rename(columns={'Unnamed: 0': 'NAME'})
metadata_marmoset['NAME'] = metadata_marmoset['NAME'].str.replace(':', '_')

# Define column values
column_values = {
    'biosample_id': metadata_marmoset['orig.file'],
    'donor_id': metadata_marmoset['animal'],
    'species': 'NCBITAXON_9483',
    'species__ontology_label': 'Callithrix jacchus',
    'disease': 'PATO_0000461',
    'disease__ontology_label': 'normal',
    'organ': 'UBERON_0000966',
    'organ__ontology_label': 'retina',
    'library_preparation_protocol': 'EFO_0009899',
    'library_preparation_protocol__ontology_label': "10X 3' v2",
    'sex': 'unknown'
}

# Insert columns into DataFrame
for i, (col_name, col_values) in enumerate(column_values.items(), start=1):
    metadata_marmoset.insert(i, col_name, col_values)

# Add 'TYPE' row
group_row = pd.Series(['TYPE'] + ['group'] * (len(metadata_marmoset.columns) - 1), index=metadata_marmoset.columns)
metadata_marmoset = pd.concat([group_row.to_frame().T, metadata_marmoset], ignore_index=True)

# Limit DataFrame to the first 12 columns
metadata_marmoset = metadata_marmoset.iloc[:, :12]
metadata_marmoset


Unnamed: 0,NAME,biosample_id,donor_id,species,species__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,sex
0,TYPE,group,group,group,group,group,group,group,group,group,group,group
1,EXKMG_AAAGATGAGCCAGGATx,Marmoset1FoveaS1,1,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
2,EXKMG_AAAGATGAGGCCCTCAx,Marmoset1FoveaS1,1,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
3,EXKMG_AAACGGGTCTTGTATCx,Marmoset1FoveaS1,1,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
4,EXKMG_AAATGCCCACTAGTACx,Marmoset1FoveaS1,1,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
12552,K564P_TTTGGTTGTTGAGTTCx,AdultmarmosetFovea2S2,2,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
12553,K564P_TTTATGCCATGTTCCCx,AdultmarmosetFovea2S2,2,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
12554,K564P_TTTATGCCAATCACACx,AdultmarmosetFovea2S2,2,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
12555,K564P_TTTGTCATCTGTTTGTx,AdultmarmosetFovea2S2,2,NCBITAXON_9483,Callithrix jacchus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown


### Metadata Transformation for Mouse Dataset

In [5]:
file_path = "/Users/hannahkang/Desktop/Shek Lab/Original Files/Mouse_BC_metadata.csv"

metadata_mouse = pd.read_csv(file_path)
metadata_mouse

Unnamed: 0.1,Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,orig.file,animal,RNA_snn_res.0.5,seurat_clusters,dendro_order,integrated_snn_res.0.8,integrated_snn_res.0.5,barcode,annotated,type,Mammal_OT,Nonmammal_OT
0,possorted_genome_bam_Z0OYT:AAACGGGCAACACGCCx,possorted_genome_bam_Z0OYT,4424.0,2223.0,CTRLC57AllOther1,1.0,8.0,10.0,8.0,14.0,11.0,oncBC_CtC57AllOtherR1_AAACGGGCAACACGCC-1,BC5A,BC5A,absent,BC8/9
1,possorted_genome_bam_Z0OYT:AAACCTGAGTGTACGGx,possorted_genome_bam_Z0OYT,5860.0,2708.0,CTRLC57AllOther1,1.0,10.0,8.0,10.0,11.0,9.0,oncBC_CtC57AllOtherR1_AAACCTGAGTGTACGG-1,BC4,BC4,BC4,BC4
2,possorted_genome_bam_Z0OYT:AAAGTAGGTTGTTTGGx,possorted_genome_bam_Z0OYT,4458.0,2149.0,CTRLC57AllOther1,1.0,13.0,1.0,13.0,0.0,0.0,oncBC_CtC57AllOtherR1_AAAGTAGGTTGTTTGG-1,RBC,RBC,absent,absent
3,possorted_genome_bam_Z0OYT:AAAGTAGCATTCCTCGx,possorted_genome_bam_Z0OYT,1841.0,1025.0,CTRLC57AllOther1,1.0,11.0,1.0,11.0,1.0,2.0,oncBC_CtC57AllOtherR1_AAAGTAGCATTCCTCG-1,RBC,RBC,absent,RBC
4,possorted_genome_bam_Z0OYT:AAATGCCAGCCACTATx,possorted_genome_bam_Z0OYT,1447.0,990.0,CTRLC57AllOther1,1.0,40.0,15.0,40.0,20.0,18.0,oncBC_CtC57AllOtherR1_AAATGCCAGCCACTAT-1,BC1B,BC1B,BC2,BC1B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8146,Mouse;possorted_genome_bam_H5JEB:TTTACTGTCGGCA...,,,,,,,,,,,,,,12,BC2
8147,Mouse;possorted_genome_bam_H5JEB:TTTGGTTGTTAAA...,,,,,,,,,,,,,,7,BC3B
8148,Mouse;possorted_genome_bam_H5JEB:TTTATGCGTCTGA...,,,,,,,,,,,,,,8,BC5A
8149,Mouse;possorted_genome_bam_H5JEB:TTTCCTCGTTCAG...,,,,,,,,,,,,,,10,BC3B


In [6]:
# Remove the prefix from the "Unnamed: 0" column
metadata_mouse.loc[:, 'Unnamed: 0'] = metadata_mouse['Unnamed: 0'].str.replace('Mouse;possorted_genome_bam_', '')
metadata_mouse.loc[:, 'Unnamed: 0'] = metadata_mouse['Unnamed: 0'].str.replace('possorted_genome_bam_', '')

metadata_mouse = metadata_mouse.rename(columns={'Unnamed: 0': 'NAME'})
metadata_mouse['NAME'] = metadata_mouse['NAME'].str.replace(':', '_')

# Define column values
column_values = {
    'biosample_id': metadata_mouse['orig.file'],
    'donor_id': metadata_mouse['animal'],
    'species': 'NCBITaxon_10090',
    'species__ontology_label': 'Mus musculus',
    'disease': 'PATO_0000461',
    'disease__ontology_label': 'normal',
    'organ': 'UBERON_0000966',
    'organ__ontology_label': 'retina',
    'library_preparation_protocol': 'EFO_0009899',
    'library_preparation_protocol__ontology_label': "10X 3' v2",
    'sex': 'unknown'
}

# Insert columns into DataFrame
for i, (col_name, col_values) in enumerate(column_values.items(), start=1):
    metadata_mouse.insert(i, col_name, col_values)

# Add 'TYPE' row
group_row = pd.Series(['TYPE'] + ['group'] * (len(metadata_mouse.columns) - 1), index=metadata_mouse.columns)
metadata_mouse = pd.concat([group_row.to_frame().T, metadata_mouse], ignore_index=True)

# Limit DataFrame to the first 12 columns
metadata_mouse = metadata_mouse.iloc[:, :12]
metadata_mouse

Unnamed: 0,NAME,biosample_id,donor_id,species,species__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,sex
0,TYPE,group,group,group,group,group,group,group,group,group,group,group
1,Z0OYT_AAACGGGCAACACGCCx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
2,Z0OYT_AAACCTGAGTGTACGGx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
3,Z0OYT_AAAGTAGGTTGTTTGGx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
4,Z0OYT_AAAGTAGCATTCCTCGx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
8147,H5JEB_TTTACTGTCGGCATCGx,,,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
8148,H5JEB_TTTGGTTGTTAAAGACx,,,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
8149,H5JEB_TTTATGCGTCTGATTGx,,,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
8150,H5JEB_TTTCCTCGTTCAGCGCx,,,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown


In [7]:
metadata_mouse = metadata_mouse.drop_duplicates(subset='NAME', keep='first')
metadata_mouse

Unnamed: 0,NAME,biosample_id,donor_id,species,species__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,sex
0,TYPE,group,group,group,group,group,group,group,group,group,group,group
1,Z0OYT_AAACGGGCAACACGCCx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
2,Z0OYT_AAACCTGAGTGTACGGx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
3,Z0OYT_AAAGTAGGTTGTTTGGx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
4,Z0OYT_AAAGTAGCATTCCTCGx,CTRLC57AllOther1,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
5551,H5JEB_TTTACTGTCCGCTGTTx,CTRLC57AllOther2,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
5552,H5JEB_TTTATGCGTCTGATTGx,CTRLC57AllOther2,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
5553,H5JEB_TTTCCTCTCTTGCAAGx,CTRLC57AllOther2,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown
5554,H5JEB_TTTCCTCGTTCAGCGCx,CTRLC57AllOther2,1.0,NCBITaxon_10090,Mus musculus,PATO_0000461,normal,UBERON_0000966,retina,EFO_0009899,10X 3' v2,unknown


### Error Checking for Transformation for Species Metadata and UMAP (Clustering) Files

In [8]:
file_path = "/Users/hannahkang/Desktop/Shek Lab/Single Cell Portal/Output Files/Marmoset/SCP-marmoset-clustering-submission.csv"
clustering_marmoset = pd.read_csv(file_path)

file_path = "/Users/hannahkang/Desktop/Shek Lab/Single Cell Portal/Output Files/Mouse/SCP-mouse-clustering-submission.csv"
clustering_mouse = pd.read_csv(file_path)

# Example of Clustering File Format
clustering_mouse.head()

Unnamed: 0,NAME,X,Y
0,TYPE,numeric,numeric
1,Z0OYT_AAACGGGCAACACGCCx,12.2872521763422,10.8380569849402
2,Z0OYT_AAACCTGAGTGTACGGx,6.87598775678653,1.99284834007327
3,Z0OYT_AAAGTAGGTTGTTTGGx,-9.62521959489804,3.70287269691531
4,Z0OYT_AAAGTAGCATTCCTCGx,-7.14581848329525,5.00803083519045


In [9]:
# OBJECTIVE: Check for any cells present within either a cluster or metadata file for a species that do not exist in the other file. 
# In order to be correct, the set() for all test cases should be empty. 

# FOR MARMOSETS:
cell_names_metadata = set(metadata_marmoset['NAME'])
cell_names_cluster = set(clustering_marmoset['NAME'])

cells_not_in_cluster = cell_names_metadata - cell_names_cluster
cells_not_in_metadata = cell_names_cluster - cell_names_metadata

# print("Cells present in metadata_marmoset but not in cluster_file:", cells_not_in_cluster)
# print("Cells present in cluster_file but not in metadata_marmoset:", cells_not_in_metadata)

# FOR MOUSE:
cell_names_metadata = set(metadata_mouse['NAME'])
cell_names_cluster = set(clustering_mouse['NAME'])

cells_not_in_cluster = cell_names_metadata - cell_names_cluster
cells_not_in_metadata = cell_names_cluster - cell_names_metadata

# print("Cells present in metadata_mouse but not in cluster_file:", cells_not_in_cluster)
# print("Cells present in cluster_file but not in metadata_mouse:", cells_not_in_metadata)

if (cells_not_in_cluster == set()) and (cells_not_in_metadata == set()):
    print("ALL PASSED!")

ALL PASSED!


### CellxGene h5ad File Conversion

In [10]:
import scanpy as sc

# ann_data_2 = sc.read_h5ad("/Users/hannahkang/Desktop/Shek Lab/CellxGene/MouseBC.h5ad", backed = 'r+')
# ann_data_2.write_h5ad('mouseBC_modified2.h5ad')

In [11]:
mouseBC = sc.read_h5ad("/Users/hannahkang/Desktop/Shek Lab/Original Files/MouseBC.h5ad", backed = 'r+')
mouseBC.__dict__['_raw'].__dict__['_var'] = mouseBC.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
# mouseBC.write_h5ad("MouseBC_v2.h5ad")

In [12]:
print(mouseBC.obsm)
print(mouseBC.obsm["X_umap"])

print(' ')
#print(ann_data_marmoset.obsm)
#print(ann_data_marmoset.obsm["X_umap"])

AxisArrays with keys: X_tsne, X_umap
[[ 12.28725218  10.83805698]
 [  6.87598776   1.99284834]
 [ -9.62521959   3.7028727 ]
 ...
 [-10.01020552   3.79835219]
 [ 11.4269874   -2.27397066]
 [ 16.8380134    5.4951424 ]]
 


In [13]:
mouseBC.uns = {"Title": "Mouse Bipolar"}


In [14]:
print(mouseBC.obs_keys)

mouseBC.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
mouseBC.obs["tissue_ontology_term_id"] = "UBERON_0000966"
mouseBC.obs["assay_ontology_term_id"] = "EFO:0009809"
mouseBC.obs["disease_ontology_term_id"] = "PATO_0000461"
mouseBC.obs["cell_type_ontology_term_id"] = "UBERON_0000966"
mouseBC.obs["self_reported_ethnicity_ontology_term_id"] = "NA"
mouseBC.obs["development_stage_ontology_term_id"] = "MmusDv"
mouseBC.obs["sex_ontology_term_id"] = "Unknown"
mouseBC.obs["donor_id"] = "P56_Batch1"
mouseBC.obs["suspension_type"] = "cell"

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 5555 × 31053 backed at '/Users/hannahkang/Desktop/Shek Lab/Original Files/MouseBC.h5ad'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated', 'type'
    var: 'features'
    uns: 'Title'
    obsm: 'X_tsne', 'X_umap'>


In [15]:
mouseBC.write_h5ad('/Users/hannahkang/Desktop/Shek Lab/CellxGene/Output Files/Mouse/mouseBC_modified_real.h5ad')

# ann_data_3 = sc.read_h5ad("mouseBC_modified_real.h5ad", backed = 'r+')

In [16]:
#returns a boolean for each gene if it starts with the letters 'GM' or not
mouseBC.var['gm'] = mouseBC.var_names.str.startswith('GM')
mouseBC.var['gm']

XKR4       False
GM37381     True
RP1        False
SOX17      False
GM37323     True
           ...  
GM28672     True
GM28670     True
GM29504     True
GM20837     True
GM47283     True
Name: gm, Length: 31053, dtype: bool

In [17]:
# print(mouseBC)
# sc.pp.calculate_qc_metrics(mouseBC, qc_vars=['gm'], inplace=True)

In [18]:
#print(ann_data.X[:10])
#print(ann_data.raw.X[:10])
print(mouseBC.obsm)
print(mouseBC.obsm["X_umap"])

# print(' ')
# print(ann_data_marmoset.obsm)
# print(ann_data_marmoset.obsm["X_umap"])

AxisArrays with keys: X_tsne, X_umap
[[ 12.28725218  10.83805698]
 [  6.87598776   1.99284834]
 [ -9.62521959   3.7028727 ]
 ...
 [-10.01020552   3.79835219]
 [ 11.4269874   -2.27397066]
 [ 16.8380134    5.4951424 ]]


In [19]:
#Fetching the variable names of the h5ad data
mouseBC.var_names
# ann_data_marmoset.var_names

Index(['XKR4', 'GM37381', 'RP1', 'SOX17', 'GM37323', 'MRPL15', 'RGS20',
       'NPBWR1', '4732440D04RIK', 'GM26901',
       ...
       'GM28406', 'GM29436', 'GM28407', 'GM29393', 'GM21294', 'GM28672',
       'GM28670', 'GM29504', 'GM20837', 'GM47283'],
      dtype='object', length=31053)

In [20]:
mouseBC.var_names_make_unique()
# ann_data_marmoset.var_names_make_unique()

In [21]:
mouseBC.obs_names

Index(['possorted_genome_bam_Z0OYT:AAACGGGCAACACGCCx',
       'possorted_genome_bam_Z0OYT:AAACCTGAGTGTACGGx',
       'possorted_genome_bam_Z0OYT:AAAGTAGGTTGTTTGGx',
       'possorted_genome_bam_Z0OYT:AAAGTAGCATTCCTCGx',
       'possorted_genome_bam_Z0OYT:AAATGCCAGCCACTATx',
       'possorted_genome_bam_Z0OYT:AAACGGGTCTTGCCGTx',
       'possorted_genome_bam_Z0OYT:AAAGCAATCAGTACGTx',
       'possorted_genome_bam_Z0OYT:AAACGGGTCAGCAACTx',
       'possorted_genome_bam_Z0OYT:AAACGGGGTAGGCATGx',
       'possorted_genome_bam_Z0OYT:AAAGATGTCTCGATGAx',
       ...
       'possorted_genome_bam_H5JEB:TTTCCTCAGGCTATCTx',
       'possorted_genome_bam_H5JEB:TTTGGTTTCTCCGGTTx',
       'possorted_genome_bam_H5JEB:TTTCCTCAGGTAAACTx',
       'possorted_genome_bam_H5JEB:TTTACTGTCGGCATCGx',
       'possorted_genome_bam_H5JEB:TTTGGTTGTTAAAGACx',
       'possorted_genome_bam_H5JEB:TTTACTGTCCGCTGTTx',
       'possorted_genome_bam_H5JEB:TTTATGCGTCTGATTGx',
       'possorted_genome_bam_H5JEB:TTTCCTCTCTTGCAAGx',

In [22]:
print(mouseBC.X)

# print(ann_data_marmoset.X)

CSRDataset: backend hdf5, shape (5555, 31053), data_dtype float64


In [23]:
#returns a boolean for each gene if it starts with the letters 'GM' or not
# mouseBC.var['gm'] = mouseBC.var_names.str.startswith('GM')
# mouseBC.var['gm']

In [24]:
# print(mouseBC)
# sc.pp.calculate_qc_metrics(mouseBC, qc_vars=['gm'], inplace=True)

print(mouseBC.obs_keys)

# mouseBC.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
# mouseBC.obs["tissue_ontology_term_id"] = "UBERON_0000966"
# mouseBC.obs["assay_ontology_term_id"] = "EFO:0009809"
# mouseBC.obs["disease_ontology_term_id"] = "PATO_0000461"
# mouseBC.obs["cell_type_ontology_term_id"] = "UBERON_0000966"
# mouseBC.obs["self_reported_ethnicity_ontology_term_id"] = "NA"
# mouseBC.obs["development_stage_ontology_term_id"] = "MmusDv"
# mouseBC.obs["sex_ontology_term_id"] = "Unknown"
# mouseBC.obs["donor_id"] = "P56_Batch1"
# mouseBC.obs["suspension_type"] = "cell"

print(mouseBC.obs["self_reported_ethnicity_ontology_term_id"])

#Save the modified ann_data file
# mouseBC.write_h5ad('mouseBC_modified.h5ad')

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 5555 × 31053 backed at '/Users/hannahkang/Desktop/Shek Lab/CellxGene/Output Files/Mouse/mouseBC_modified_real.h5ad'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated', 'type', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type'
    var: 'features', 'gm'
    uns: 'Title'
    obsm: 'X_tsne', 'X_umap'>
possorted_genome_bam_Z0OYT:AAACGGGCAACACGCCx    NA
possorted_genome_bam_Z0OYT:AAACCTGAGTGTACGGx    NA
possorted_genome_bam_Z0OYT:AAAGTAGGTTGTTTGGx    NA
possorted_genome_bam_Z0OYT:AAAGTAGCATTCCTCGx    NA
possorted_genome_bam_Z0OYT:AAATGCCAGCCACTATx  

In [25]:
# print(ann_data_marmoset.obsm["X_umap"])

In [27]:
# Dario's reading the file

mouseBC = sc.read_h5ad("/Users/hannahkang/Desktop/Shek Lab/CellxGene/Output Files/Mouse/mouseBC_modified_real.h5ad")
mouseBC

# Example of output: 
# AnnData object with n_obs × n_vars = 5555 × 31053
#     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated', 'type', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type'
#     var: 'features'
#     uns: 'Title'
#     obsm: 'X_tsne', 'X_umap'

AnnData object with n_obs × n_vars = 5555 × 31053
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated', 'type', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type'
    var: 'features'
    uns: 'Title'
    obsm: 'X_tsne', 'X_umap'

In [28]:
# Marmoset

ann_data_marmoset = sc.read_h5ad("/Users/hannahkang/Desktop/Shek Lab/Original Files/MarmosetFoveaBC.h5ad", backed = 'r+')
ann_data_marmoset.__dict__['_raw'].__dict__['_var'] = ann_data_marmoset.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
# ann_data_marmoset.write_h5ad("MarmosetBC_v2.h5ad")

print(ann_data_marmoset.obsm)
print(ann_data_marmoset.obsm["X_umap"])

ann_data_marmoset.uns = {"Title": "Marmoset Bipolar"}

print(ann_data_marmoset.obs_keys)

# MUST CHECK IF THE DATA IS ACTUALLY CORRECT !!!
ann_data_marmoset.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
ann_data_marmoset.obs["tissue_ontology_term_id"] = "UBERON_0000966"
ann_data_marmoset.obs["assay_ontology_term_id"] = "EFO:0009809"
ann_data_marmoset.obs["disease_ontology_term_id"] = "PATO_0000461"
ann_data_marmoset.obs["cell_type_ontology_term_id"] = "UBERON_0000966"
ann_data_marmoset.obs["self_reported_ethnicity_ontology_term_id"] = "NA"
ann_data_marmoset.obs["development_stage_ontology_term_id"] = "MmusDv"
ann_data_marmoset.obs["sex_ontology_term_id"] = "Unknown"
ann_data_marmoset.obs["donor_id"] = "P56_Batch1"
ann_data_marmoset.obs["suspension_type"] = "cell"

ann_data_marmoset.write_h5ad('/Users/hannahkang/Desktop/Shek Lab/CellxGene/Output Files/Marmoset/ann_data_marmoset_modified_real.h5ad')

# returns a boolean for each gene if it starts with the letters 'GM' or not
ann_data_marmoset.var['gm'] = ann_data_marmoset.var_names.str.startswith('GM')
ann_data_marmoset.var['gm']

print(ann_data_marmoset.obsm)
print(ann_data_marmoset.obsm["X_umap"])

#Fetching the variable names of the h5ad data
ann_data_marmoset.var_names

ann_data_marmoset.var_names_make_unique()

ann_data_marmoset.obs_names

# print(ann_data_marmoset.X)

# Dario's reading the file
ann_data_marmoset = sc.read_h5ad("/Users/hannahkang/Desktop/Shek Lab/CellxGene/Output Files/Marmoset/ann_data_marmoset_modified_real.h5ad")
ann_data_marmoset
# Example of output: 
# AnnData object with n_obs × n_vars = 5555 × 31053
#     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated', 'type', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type'
#     var: 'features'
#     uns: 'Title'
#     obsm: 'X_tsne', 'X_umap'


AxisArrays with keys: X_tsne, X_umap
[[  0.603162     9.92028538]
 [-10.56769328   0.07132263]
 [-10.37739902   1.68855516]
 ...
 [  0.98721249   9.85885254]
 [-10.08541541   1.2832434 ]
 [-10.49356132   1.53761617]]
<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 12556 × 27665 backed at '/Users/hannahkang/Desktop/Shek Lab/Original Files/MarmosetFoveaBC.h5ad'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'cell_class', 'integrated_snn_res.1', 'integrated_snn_res.0.8', 'all_BC_labels', 'labels', 'type'
    var: 'features'
    uns: 'Title'
    obsm: 'X_tsne', 'X_umap'>
AxisArrays with keys: X_tsne, X_umap
[[  0.603162     9.92028538]
 [-10.56769328   0.07132263]
 [-10.37739902   1.68855516]
 ...
 [  0.98721249   9.85885254]
 [-10.08541541   1.2832434 ]
 [-10.49356132   1.53761617]]


AnnData object with n_obs × n_vars = 12556 × 27665
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'cell_class', 'integrated_snn_res.1', 'integrated_snn_res.0.8', 'all_BC_labels', 'labels', 'type', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type'
    var: 'features'
    uns: 'Title'
    obsm: 'X_tsne', 'X_umap'

### Translate Dataframes to CSV Files for Submission to SCP

In [29]:
file_path = "/Users/hannahkang/Desktop/Shek Lab/Single Cell Portal/Output Files/Marmoset/SCP-metadata-submission-marmoset.csv"
metadata_marmoset.to_csv(file_path, index=False)

file_path = "/Users/hannahkang/Desktop/Shek Lab/Single Cell Portal/Output Files/Mouse/SCP-metadata-submission-mouse.csv"
metadata_mouse.to_csv(file_path, index=False)

### Scratch Work Code (Ignore)

In [30]:
# metadata_marmoset.insert(1, 'biosample_id', metadata_marmoset['orig.file'])
# metadata_marmoset.insert(2, 'donor_id', metadata_marmoset['animal'])
# metadata_marmoset.insert(3, 'species', 'NCBITAXON_9483')
# metadata_marmoset.insert(4, 'species__ontology_label', 'Callithrix jacchus')
# metadata_marmoset.insert(5, 'disease', 'PATO_0000461')
# metadata_marmoset.insert(6, 'disease__ontology_label', 'normal')
# metadata_marmoset.insert(7, 'organ', 'UBERON_0000966')
# metadata_marmoset.insert(8, 'organ__ontology_label', 'retina')
# metadata_marmoset.insert(9, 'library_preparation_protocol', 'EFO_0009899')
# metadata_marmoset.insert(10, 'library_preparation_protocol__ontology_label', "10X 3' v2")
# metadata_marmoset.insert(11, 'sex', 'unknown')

# group_row = pd.Series(['TYPE'] + ['group'] * (len(metadata_marmoset.columns) - 1), index=metadata_marmoset.columns)
# # Concatenate the 'group' row with the existing DataFrame
# metadata_marmoset = pd.concat([group_row.to_frame().T, metadata_marmoset], ignore_index=True)

# metadata_marmoset = metadata_marmoset.iloc[:, :12]



# metadata_mouse.insert(1, 'biosample_id', 'P56_Batch1')
# metadata_mouse.insert(2, 'donor_id', 'P56_Batch1')


# metadata_mouse.insert(1, 'biosample_id', metadata_mouse['orig.file'])
# metadata_mouse.insert(2, 'donor_id', metadata_mouse['animal'])

# metadata_mouse.insert(3, 'species', 'NCBITaxon_10090')
# metadata_mouse.insert(4, 'species__ontology_label', 'Mus musculus')
# metadata_mouse.insert(5, 'disease', 'PATO_0000461')
# metadata_mouse.insert(6, 'disease__ontology_label', 'normal')
# metadata_mouse.insert(7, 'organ', 'UBERON_0000966')
# metadata_mouse.insert(8, 'organ__ontology_label', 'retina')
# metadata_mouse.insert(9, 'library_preparation_protocol', 'EFO_0009899')
# metadata_mouse.insert(10, 'library_preparation_protocol__ontology_label', "10X 3' v2")
# metadata_mouse.insert(11, 'sex', 'unknown')

# group_row = pd.Series(['TYPE'] + ['group'] * (len(metadata_mouse.columns) - 1), index=metadata_mouse.columns)
# # Concatenate the 'group' row with the existing DataFrame
# metadata_mouse = pd.concat([group_row.to_frame().T, metadata_mouse], ignore_index=True)

# metadata_mouse = metadata_mouse.iloc[:, :12]