## Clustering Transformation for Submissions to Single Cell Portal (SCP)

#### Author: Hannah Kang
##### Date Last Modified: 11/18/2023

##### Study Information: 
The aim of this study is to understand how the many types of neurons that carry out information processing in the brain evolved, by focusing on the retina, the thin film of neurons in the eye that initiates vision. Our approach uses high-throughput single-cell RNA-seq, in which gene expression is quantified in hundreds of thousands of single retinal neurons, together with computational methods that process and integrate these datasets. We have recently completed a project that involved generating and integrating single-cell RNA-seq data across an unprecedented 17 species.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the marmoset fovea BC UMAP table from its CSV export.
file_path = "/Users/hannahkang/Desktop/Shek Lab/Original Files/MarmosetFovea_BC_UMAP.csv"
marmoset_umap_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (columns: rn, UMAP_1, UMAP_2).
marmoset_umap_data.head(n=5)

Unnamed: 0,rn,UMAP_1,UMAP_2
0,possorted_genome_bam_EXKMG:AAAGATGAGCCAGGATx,0.603162,9.920285
1,possorted_genome_bam_EXKMG:AAAGATGAGGCCCTCAx,-10.567693,0.071323
2,possorted_genome_bam_EXKMG:AAACGGGTCTTGTATCx,-10.377399,1.688555
3,possorted_genome_bam_EXKMG:AAATGCCCACTAGTACx,-1.433023,9.61167
4,possorted_genome_bam_EXKMG:AAACGGGGTCCTAGCGx,5.751928,0.07063


In [3]:
# Reformat the marmoset UMAP table into the Single Cell Portal cluster-file
# layout: a NAME column of cell IDs plus X / Y coordinate columns.
# errors="raise" makes the cell fail loudly if an expected input column is
# missing (for example when it is re-run on an already-converted frame)
# instead of silently carrying on.
marmoset_umap_data = marmoset_umap_data.rename(
    columns={"rn": "NAME", "UMAP_1": "X", "UMAP_2": "Y"},
    errors="raise",
)

# Strip "possorted_genome_bam_" from the cell IDs and turn ":" into "_".
# regex=False pins literal matching, so the result does not depend on the
# installed pandas version's default for `regex`.
marmoset_umap_data["NAME"] = (
    marmoset_umap_data["NAME"]
    .str.replace("possorted_genome_bam_", "", regex=False)
    .str.replace(":", "_", regex=False)
)

# Build the TYPE row that sits directly under the header: "TYPE" in the
# first (NAME) column and "numeric" in every remaining column.
type_row = pd.DataFrame(
    [["TYPE"] + ["numeric"] * (len(marmoset_umap_data.columns) - 1)],
    columns=marmoset_umap_data.columns,
)

# Put the TYPE row first, then the data, renumbering the index from 0.
marmoset_umap_data = pd.concat([type_row, marmoset_umap_data], ignore_index=True)

marmoset_umap_data

Unnamed: 0,NAME,X,Y
0,TYPE,numeric,numeric
1,EXKMG_AAAGATGAGCCAGGATx,0.603162,9.920285
2,EXKMG_AAAGATGAGGCCCTCAx,-10.567693,0.071323
3,EXKMG_AAACGGGTCTTGTATCx,-10.377399,1.688555
4,EXKMG_AAATGCCCACTAGTACx,-1.433023,9.61167
...,...,...,...
12552,K564P_TTTGGTTGTTGAGTTCx,4.5215,-9.880826
12553,K564P_TTTATGCCATGTTCCCx,-1.635579,12.19411
12554,K564P_TTTATGCCAATCACACx,0.987212,9.858853
12555,K564P_TTTGTCATCTGTTTGTx,-10.085415,1.283243


In [4]:
# Read the mouse BC UMAP table from its CSV export.
file_path = "/Users/hannahkang/Desktop/Shek Lab/Original Files/Mouse_BC_UMAP.csv"
mouse_umap_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first ten rows (columns: rn, UMAP_1, UMAP_2).
mouse_umap_data.head(n=10)

Unnamed: 0,rn,UMAP_1,UMAP_2
0,possorted_genome_bam_Z0OYT:AAACGGGCAACACGCCx,12.287252,10.838057
1,possorted_genome_bam_Z0OYT:AAACCTGAGTGTACGGx,6.875988,1.992848
2,possorted_genome_bam_Z0OYT:AAAGTAGGTTGTTTGGx,-9.62522,3.702873
3,possorted_genome_bam_Z0OYT:AAAGTAGCATTCCTCGx,-7.145818,5.008031
4,possorted_genome_bam_Z0OYT:AAATGCCAGCCACTATx,15.272835,5.652173
5,possorted_genome_bam_Z0OYT:AAACGGGTCTTGCCGTx,-7.448705,3.298023
6,possorted_genome_bam_Z0OYT:AAAGCAATCAGTACGTx,11.657537,0.121542
7,possorted_genome_bam_Z0OYT:AAACGGGTCAGCAACTx,-9.952392,3.860907
8,possorted_genome_bam_Z0OYT:AAACGGGGTAGGCATGx,16.578685,5.499982
9,possorted_genome_bam_Z0OYT:AAAGATGTCTCGATGAx,-0.212624,9.738052


In [5]:
# Reformat the mouse UMAP table into the Single Cell Portal cluster-file
# layout: a NAME column of cell IDs plus X / Y coordinate columns.
# errors="raise" makes the cell fail loudly if an expected input column is
# missing (for example when it is re-run on an already-converted frame)
# instead of silently carrying on.
mouse_umap_data = mouse_umap_data.rename(
    columns={"rn": "NAME", "UMAP_1": "X", "UMAP_2": "Y"},
    errors="raise",
)

# Strip "possorted_genome_bam_" from the cell IDs (originally the "rn"
# column) and turn ":" into "_". regex=False pins literal matching, so the
# result does not depend on the installed pandas version's default for
# `regex`.
mouse_umap_data["NAME"] = (
    mouse_umap_data["NAME"]
    .str.replace("possorted_genome_bam_", "", regex=False)
    .str.replace(":", "_", regex=False)
)

# Build the TYPE row that sits directly under the header: "TYPE" in the
# first (NAME) column and "numeric" in every remaining column.
type_row = pd.DataFrame(
    [["TYPE"] + ["numeric"] * (len(mouse_umap_data.columns) - 1)],
    columns=mouse_umap_data.columns,
)

# Put the TYPE row first, then the data, renumbering the index from 0.
mouse_umap_data = pd.concat([type_row, mouse_umap_data], ignore_index=True)

mouse_umap_data

Unnamed: 0,NAME,X,Y
0,TYPE,numeric,numeric
1,Z0OYT_AAACGGGCAACACGCCx,12.287252,10.838057
2,Z0OYT_AAACCTGAGTGTACGGx,6.875988,1.992848
3,Z0OYT_AAAGTAGGTTGTTTGGx,-9.62522,3.702873
4,Z0OYT_AAAGTAGCATTCCTCGx,-7.145818,5.008031
...,...,...,...
5551,H5JEB_TTTACTGTCCGCTGTTx,11.375189,-0.060606
5552,H5JEB_TTTATGCGTCTGATTGx,-4.768785,-14.713917
5553,H5JEB_TTTCCTCTCTTGCAAGx,-10.010206,3.798352
5554,H5JEB_TTTCCTCGTTCAGCGCx,11.426987,-2.273971


In [6]:
# Save the reformatted marmoset clustering table as CSV, omitting the
# DataFrame index so the file starts with the NAME column.
file_path = "/Users/hannahkang/Desktop/Shek Lab/Single Cell Portal/Output Files/Marmoset/SCP-marmoset-clustering-submission.csv"
marmoset_umap_data.to_csv(path_or_buf=file_path, index=False)

# Save the reformatted mouse clustering table the same way.
file_path = "/Users/hannahkang/Desktop/Shek Lab/Single Cell Portal/Output Files/Mouse/SCP-mouse-clustering-submission.csv"
mouse_umap_data.to_csv(path_or_buf=file_path, index=False)