## Preprocesses Downstream Datasets

E.g. OSD-99 (the dataset loaded below) or OSD-105.

**Note:** This notebook alone does not make the dataset scBERT-ready. AFTER running the code in this file, you must also run `preprocessing.py` (located in the base RNALearner directory), using the output dataset of this file as the input dataset of `preprocessing.py`.

### To AnnData Object

In [3]:
import pandas as pd
import numpy as np
import scanpy as sc

In [4]:
# Load the unnormalized count table: one row per Ensembl gene ID, one column per sample.
# To process OSD-105 instead, swap in the commented-out path below.
#data = pd.read_csv("osd105/GLDS-105_rna_seq_Unnormalized_Counts.csv")
data = (
    pd.read_csv("osd99/GLDS-99_rna_seq_Unnormalized_Counts.csv")
    .set_index("Unnamed: 0")  # the CSV's unnamed first column holds the gene IDs
)
data

Unnamed: 0_level_0,Mmus_C57-6J_EDL_FLT_Rep1_M23,Mmus_C57-6J_EDL_FLT_Rep2_M26,Mmus_C57-6J_EDL_FLT_Rep3_M27,Mmus_C57-6J_EDL_FLT_Rep4_M28,Mmus_C57-6J_EDL_FLT_Rep5_M29,Mmus_C57-6J_EDL_FLT_Rep6_M30,Mmus_C57-6J_EDL_GC_Rep1_M31,Mmus_C57-6J_EDL_GC_Rep2_M32,Mmus_C57-6J_EDL_GC_Rep3_M37,Mmus_C57-6J_EDL_GC_Rep4_M38,Mmus_C57-6J_EDL_GC_Rep5_M39,Mmus_C57-6J_EDL_GC_Rep6_M40
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000000001,909.0,1408.0,1421.0,1444.0,1508.0,1413.0,1339.0,1026.0,1058.0,1207.0,1160.0,1133.0
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,90.0,180.0,232.0,126.0,154.0,149.0,161.0,159.0,150.0,173.0,146.0,177.0
ENSMUSG00000000031,26779.0,42757.0,24304.0,31095.0,30104.0,31753.0,38976.0,21904.0,34635.0,29431.0,32538.0,31259.0
ENSMUSG00000000037,4.0,9.0,8.0,8.0,14.0,8.0,14.0,6.0,5.0,9.0,7.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000118389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000118390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000118391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000118392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# The count table is genes x samples; it is transposed so that samples become
# observations (rows) and genes become variables (columns).
# obs and var start as one-column frames built from the sample and gene indexes.
adata = sc.AnnData(
    data.T,
    obs=data.columns.to_frame(),
    var=data.index.to_frame(),
)

# Give the single annotation column on each axis a descriptive name.
adata.obs.columns = ['Sample ID']
adata.var.columns = ['Gene ID']

adata.var

  adata = sc.AnnData(data.T, data.columns.to_frame(), data.index.to_frame())


### Ensembl to Gene ID

In [7]:
import pybiomart
import numpy as np

In [8]:
# Fetch the Ensembl-ID -> gene-name table for the mouse gene dataset from BioMart.
# This cell needs network access. The returned frame is indexed below by the
# column names 'Gene stable ID' and 'Gene name'.
dataset = pybiomart.Dataset(
    name='mmusculus_gene_ensembl',
    host='http://www.ensembl.org',
)
mappings = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name'],
)

In [9]:
# Build the Ensembl-ID -> gene-name lookup once, instead of scanning the whole
# `mappings` frame for every gene. keep='first' preserves the original behaviour
# of returning the first matching row when an ID appears more than once.
gene_name_by_id = (
    mappings
    .drop_duplicates(subset='Gene stable ID', keep='first')
    .set_index('Gene stable ID')['Gene name']
    .to_dict()
)


def convert(e):
    """Return the gene name mapped to Ensembl gene ID `e`.

    Parameters
    ----------
    e : str
        Ensembl gene stable ID (e.g. "ENSMUSG00000000001").

    Returns
    -------
    str or float
        The 'Gene name' recorded for `e` in `mappings`, or NaN when `e` has no
        row in `mappings`. The result can also be NaN when the matching row has
        no gene name.
    """
    # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
    return gene_name_by_id.get(e, np.nan)


# Only the first 18 characters of each variable name are used for the lookup.
gene_ids = adata.var_names.str.slice(stop=18).map(convert)
gene_ids

Index([        'Gnai3',          'Pbsn',         'Cdc45',           'H19',
               'Scml2',          'Apoh',          'Narf',          'Cav2',
                'Klf6',         'Scmh1',
       ...
       '9030625G05Rik',       'Gm50311',       'Gm50149',       'Gm50415',
             'Vmn1r64',       'Gm50246',       'Gm50102',       'Gm19519',
       '4930524O05Rik',       'Gm50426'],
      dtype='object', name='Unnamed: 0', length=55536)

In [10]:
# Replace the Ensembl IDs in var_names with the mapped gene names. Genes that
# could not be mapped (NaN) receive the placeholder "No GeneID".
# NOTE(review): the placeholder (and possibly some gene names) can repeat, so
# var_names are presumably not unique after this step - confirm that
# downstream code tolerates duplicates.
adata.var_names = gene_ids.fillna("No GeneID")

In [None]:
# Assign labels for the downstream dataset.
# This part depends on what your downstream dataset is and whether or not it includes labels
# in the data table already.
#
# Here each sample ID carries its condition token (FLT or GC), e.g.
# "Mmus_C57-6J_EDL_FLT_Rep1_M23". The label is parsed from the ID instead of
# being assigned by position, so it stays correct if the column order or the
# number of replicates per group changes.
condition_labels = adata.obs["Sample ID"].str.extract(r"_(FLT|GC)_", expand=False)

# Fail loudly rather than writing missing labels into the dataset.
if condition_labels.isna().any():
    unlabeled = adata.obs.loc[condition_labels.isna(), "Sample ID"].tolist()
    raise ValueError(
        "Could not parse a FLT/GC label from these sample IDs; "
        f"assign adata.obs['y'] manually for this dataset: {unlabeled}"
    )

adata.obs["y"] = condition_labels

In [13]:
# Inspect var after the renaming: the index now holds gene names, while the
# 'Gene ID' column still holds the original Ensembl IDs.
adata.var

Unnamed: 0_level_0,Gene ID
Unnamed: 0,Unnamed: 1_level_1
Gnai3,ENSMUSG00000000001
Pbsn,ENSMUSG00000000003
Cdc45,ENSMUSG00000000028
H19,ENSMUSG00000000031
Scml2,ENSMUSG00000000037
...,...
Gm50246,ENSMUSG00000118389
Gm50102,ENSMUSG00000118390
Gm19519,ENSMUSG00000118391
4930524O05Rik,ENSMUSG00000118392


In [14]:
# Inspect obs: one row per sample, with its 'Sample ID' and its label 'y'.
adata.obs

Unnamed: 0,Sample ID,y
Mmus_C57-6J_EDL_FLT_Rep1_M23,Mmus_C57-6J_EDL_FLT_Rep1_M23,FLT
Mmus_C57-6J_EDL_FLT_Rep2_M26,Mmus_C57-6J_EDL_FLT_Rep2_M26,FLT
Mmus_C57-6J_EDL_FLT_Rep3_M27,Mmus_C57-6J_EDL_FLT_Rep3_M27,FLT
Mmus_C57-6J_EDL_FLT_Rep4_M28,Mmus_C57-6J_EDL_FLT_Rep4_M28,FLT
Mmus_C57-6J_EDL_FLT_Rep5_M29,Mmus_C57-6J_EDL_FLT_Rep5_M29,FLT
Mmus_C57-6J_EDL_FLT_Rep6_M30,Mmus_C57-6J_EDL_FLT_Rep6_M30,FLT
Mmus_C57-6J_EDL_GC_Rep1_M31,Mmus_C57-6J_EDL_GC_Rep1_M31,GC
Mmus_C57-6J_EDL_GC_Rep2_M32,Mmus_C57-6J_EDL_GC_Rep2_M32,GC
Mmus_C57-6J_EDL_GC_Rep3_M37,Mmus_C57-6J_EDL_GC_Rep3_M37,GC
Mmus_C57-6J_EDL_GC_Rep4_M38,Mmus_C57-6J_EDL_GC_Rep4_M38,GC


In [15]:
# Shorten each observation name to its last underscore-separated token
# (e.g. "Mmus_C57-6J_EDL_FLT_Rep1_M23" -> "M23"). The full name remains
# available in the 'Sample ID' column.
adata.obs_names = adata.obs_names.str.split("_").str[-1]

In [16]:
# Lower-case every variable (gene) name. This also lower-cases the
# "No GeneID" placeholder used for unmapped genes.
adata.var_names = adata.var_names.str.lower()

In [17]:
# Confirm that the var index is now lower-cased; the 'Gene ID' column is unchanged.
adata.var

Unnamed: 0_level_0,Gene ID
Unnamed: 0,Unnamed: 1_level_1
gnai3,ENSMUSG00000000001
pbsn,ENSMUSG00000000003
cdc45,ENSMUSG00000000028
h19,ENSMUSG00000000031
scml2,ENSMUSG00000000037
...,...
gm50246,ENSMUSG00000118389
gm50102,ENSMUSG00000118390
gm19519,ENSMUSG00000118391
4930524o05rik,ENSMUSG00000118392


In [18]:
# Save the AnnData object as .h5ad. Per the note at the top of this notebook,
# this file is the input for `preprocessing.py`.
# Use the commented-out path when processing OSD-105 instead of OSD-99.
#adata.write_h5ad("osd105/osd105_raw_ensembl_all_genes.h5ad")
adata.write_h5ad("osd99/osd99_raw_ensembl_all_genes.h5ad")