<a href="https://colab.research.google.com/github/jgalazka/genelab-tools/blob/main/add_biotype_to_raw_counts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Associate Biotype to Gene ID in raw counts table
Adds a gene biotype annotation to a raw counts table, then exports the table with and without rRNA genes.
Requires the `id2biotype.json` file, which maps each gene ID to its biotype. Build it with the `build_biotype_dictionary.ipynb` notebook, available at https://github.com/jgalazka/genelab-tools.

In [16]:
import pandas as pd
import numpy as np
import json

In [2]:
# Build the gene-ID -> biotype lookup from the JSON dictionary file.
with open('id2biotype.json') as biotype_handle:
    id_biotype_dict = json.loads(biotype_handle.read())

In [3]:
!wget https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-289_rna_seq_Unnormalized_Counts.csv?version=3

--2021-08-02 21:20:58--  https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-289_rna_seq_Unnormalized_Counts.csv?version=3
Resolving genelab-data.ndc.nasa.gov (genelab-data.ndc.nasa.gov)... 161.40.160.150, 161.40.160.171
Connecting to genelab-data.ndc.nasa.gov (genelab-data.ndc.nasa.gov)|161.40.160.150|:443... connected.
HTTP request sent, awaiting response... 307 307
Location: https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/GLDS-289/download?file=GLDS-289_rna_seq_Unnormalized_Counts.csv&version=3 [following]
--2021-08-02 21:20:58--  https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/GLDS-289/download?file=GLDS-289_rna_seq_Unnormalized_Counts.csv&version=3
Reusing existing connection to genelab-data.ndc.nasa.gov:443.
HTTP request sent, awaiting response... 302 FOUND
Location: https://genelab-repo-prod.s3.amazonaws.com/genelab-data/GLDS-289/rna_seq/GLDS-289_rna_seq_Unnormalized_Counts.csv?versionId=8nsQeup6.mJQJxGoapZCnVBmAd5blKjv&AWSAccessKeyId=ASIASEHL3NLH4EAH

In [4]:
# Read the downloaded counts table and give the unnamed first column
# (the gene identifiers) an explicit name.
counts = (
    pd.read_csv('GLDS-289_rna_seq_Unnormalized_Counts.csv?version=3', sep=',')
    .rename(columns={'Unnamed: 0': 'gene_id'})
)
counts.head()

Unnamed: 0,gene_id,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep1,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep2,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep3,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep1,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep2,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep3,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep1,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep2,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep3,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep1,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep2,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep3,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep1,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep2,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep3,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep1,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep2,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep3
0,ENSMUSG00000000001,601.0,529.0,666.0,564.0,435.0,453.0,531.0,633.0,733.0,1078.0,1065.0,1103.0,1286.0,1086.0,1194.0,1075.0,1084.0,1115.0
1,ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSMUSG00000000028,136.0,123.0,151.0,72.0,85.0,76.0,140.0,121.0,177.0,66.0,58.0,70.0,185.0,181.0,191.0,266.0,210.0,231.0
3,ENSMUSG00000000031,10.0,11.0,13.0,8.0,6.0,7.0,17.0,20.0,20.0,20.0,14.0,23.0,26.0,22.0,42.0,34.0,20.0,33.0
4,ENSMUSG00000000037,12.0,11.0,13.0,8.0,11.0,4.0,7.0,20.0,15.0,15.0,6.0,9.0,27.0,21.0,24.0,19.0,22.0,20.0


In [5]:
# Reshape the wide counts table to long format: one row per (gene, sample).
# Sample columns are selected by name (everything except the identifier and
# annotation columns) so this cell still works if it is re-run after
# `gene_biotype` has been added to `counts`.
sample_cols = [col for col in counts.columns if col not in ('gene_id', 'gene_biotype')]
meltcounts = pd.melt(
    counts,
    id_vars=['gene_id'],
    value_vars=sample_cols,
    var_name='sample',
    value_name='counts',
)
# log2(count + 1): the +1 pseudocount keeps zero counts finite.
meltcounts['log2_counts'] = np.log2(meltcounts['counts'] + 1.0)
# Gene IDs missing from the lookup dictionary map to NaN.
meltcounts['gene_biotype'] = meltcounts['gene_id'].map(id_biotype_dict)
meltcounts.head()
# Optional: uncomment to save the long-format table.
#meltcounts.to_csv('GLDS-289_meltcounts_biotype.csv')

In [6]:
# Annotate the wide counts table: look up each gene ID in the biotype
# dictionary (IDs absent from the dictionary become NaN) and attach the
# result as the `gene_biotype` column.
biotype_by_row = counts['gene_id'].map(id_biotype_dict)
counts = counts.assign(gene_biotype=biotype_by_row)
counts.head()

Unnamed: 0,gene_id,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep1,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep2,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep3,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep1,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep2,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep3,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep1,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep2,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep3,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep1,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep2,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep3,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep1,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep2,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep3,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep1,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep2,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep3,gene_biotype
0,ENSMUSG00000000001,601.0,529.0,666.0,564.0,435.0,453.0,531.0,633.0,733.0,1078.0,1065.0,1103.0,1286.0,1086.0,1194.0,1075.0,1084.0,1115.0,protein_coding
1,ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,protein_coding
2,ENSMUSG00000000028,136.0,123.0,151.0,72.0,85.0,76.0,140.0,121.0,177.0,66.0,58.0,70.0,185.0,181.0,191.0,266.0,210.0,231.0,protein_coding
3,ENSMUSG00000000031,10.0,11.0,13.0,8.0,6.0,7.0,17.0,20.0,20.0,20.0,14.0,23.0,26.0,22.0,42.0,34.0,20.0,33.0,lincRNA
4,ENSMUSG00000000037,12.0,11.0,13.0,8.0,11.0,4.0,7.0,20.0,15.0,15.0,6.0,9.0,27.0,21.0,24.0,19.0,22.0,20.0,protein_coding


In [7]:
# Export the annotated counts table (all genes, including `gene_biotype`).
# index=False omits the pandas row index so no unnamed integer column is
# written, matching the other CSV exports in this notebook.
counts.to_csv('GLDS-289_biotype_counts.csv', index=False)

In [8]:
# Keep every gene whose biotype is not 'rRNA'. Genes with no biotype in the
# lookup are NaN, compare unequal to 'rRNA', and are therefore kept.
# The annotation column is dropped by name rather than by position, so a
# sample column can never be removed by accident if the column order changes.
counts_no_rRNA = counts.loc[counts['gene_biotype'] != 'rRNA'].drop(columns='gene_biotype')

In [9]:
# Dimensions (rows, columns) of the rRNA-filtered table, for comparison with the full `counts` table.
counts_no_rRNA.shape

(55182, 19)

In [10]:
# Dimensions of the full annotated table; the row difference from `counts_no_rRNA`
# is the number of genes labelled 'rRNA' that were filtered out.
counts.shape

(55536, 20)

In [11]:
# Preview the filtered table to confirm the trailing `gene_biotype` column was dropped.
counts_no_rRNA.head()

Unnamed: 0,gene_id,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep1,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep2,Mmus_C57-6J_TMS_MHU1_FLT_1G_Rep3,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep1,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep2,Mmus_C57-6J_TMS_MHU1_FLT_uG_Rep3,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep1,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep2,Mmus_C57-6CR_TMS_MHU1_GC_1G_Rep3,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep1,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep2,Mmus_C57-6J_TMS_MHU2_FLT_1G_Rep3,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep1,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep2,Mmus_C57-6J_TMS_MHU2_FLT_uG_Rep3,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep1,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep2,Mmus_C57-6J_TMS_MHU2_GC_1G_Rep3
0,ENSMUSG00000000001,601.0,529.0,666.0,564.0,435.0,453.0,531.0,633.0,733.0,1078.0,1065.0,1103.0,1286.0,1086.0,1194.0,1075.0,1084.0,1115.0
1,ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSMUSG00000000028,136.0,123.0,151.0,72.0,85.0,76.0,140.0,121.0,177.0,66.0,58.0,70.0,185.0,181.0,191.0,266.0,210.0,231.0
3,ENSMUSG00000000031,10.0,11.0,13.0,8.0,6.0,7.0,17.0,20.0,20.0,20.0,14.0,23.0,26.0,22.0,42.0,34.0,20.0,33.0
4,ENSMUSG00000000037,12.0,11.0,13.0,8.0,11.0,4.0,7.0,20.0,15.0,15.0,6.0,9.0,27.0,21.0,24.0,19.0,22.0,20.0


In [14]:
# Export the counts table with rRNA genes removed; index=False keeps the
# pandas row index out of the file.
counts_no_rRNA.to_csv('counts_no_rRNA.csv', index=False)

In [15]:
# Export the full counts table (rRNA genes included) without the annotation.
# `gene_biotype` is dropped by name rather than by position, so the export
# fails loudly if the column is missing instead of silently dropping a
# sample column.
counts.drop(columns='gene_biotype').to_csv('counts.csv', index=False)