In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the differential-expression results; the first column becomes the index
# and any row containing a missing value is discarded.
df = pd.read_csv('res_deseq2_R.tsv', sep='\t', index_col=0).dropna()

# The gene-type annotation is a tab-separated table with the columns
# 'Gene type' and 'Gene name', so parse it as such. The previous read_fwf call
# inferred a single fixed-width column from the first 100 rows only, which
# truncates any longer line further down the file and silently loses that gene
# in the merge below.
types = pd.read_csv('mart_export.txt', sep='\t', dtype=str,
                    usecols=['Gene type', 'Gene name'])

# Expose the row labels as a regular column so they can serve as the join key.
df['Gene name'] = df.index

# Keep only genes present in both tables (an inner join yields the same rows,
# in the same order, as the former left join followed by dropna), put the
# statistics first and the gene type last, then drop incomplete and fully
# duplicated rows.
table = (
    pd.merge(types, df, how='inner', on='Gene name')
    .reindex(columns=[
        'Gene name', 'baseMean', 'log2FoldChange',
        'lfcSE', 'pvalue', 'padj', 'Gene type'])
    .dropna()
    .drop_duplicates()
    .set_index('Gene name')
)
table.to_csv('genes_DESeq2.tsv', sep='\t', encoding='utf-8')

table.head()

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,pvalue,padj,Gene type
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MT-TF,1.804471,0.061123,0.758122,0.884268,0.917391,Mt_tRNA
MT-RNR1,10399.048428,1.000508,0.041827,4.108651e-127,1.4191130000000001e-125,Mt_rRNA
MT-RNR2,185709.228607,1.095647,0.036093,7.850145e-203,4.940794e-201,Mt_rRNA
MT-TL1,7.041,1.900482,0.836892,0.003696429,0.007749948,Mt_tRNA
MT-ND1,32042.822879,1.005064,0.110177,4.3762759999999996e-20,2.41848e-19,protein_coding


### Differentially expressed genes

In [3]:
# Selection thresholds for differentially expressed genes.
DE_MIN_BASE_MEAN = 200   # minimum baseMean
DE_MAX_PADJ = 0.05       # maximum adjusted p-value
DE_MIN_ABS_LFC = 1       # minimum |log2FoldChange|

# Filter on the magnitude of the fold change so that down-regulated genes
# (log2FoldChange < -1) are kept as well; the ranking below is by absolute
# fold change, which only makes sense when both directions are present.
is_diff = (
    (table['baseMean'] > DE_MIN_BASE_MEAN)
    & (table['padj'] < DE_MAX_PADJ)
    & (table['log2FoldChange'].abs() > DE_MIN_ABS_LFC)
)

# Strongest changes (in either direction) first.
diff_expr = table[is_diff].sort_values(by='log2FoldChange', key=abs, ascending=False)
diff_expr.to_csv('diff_genes.tsv', sep='\t', encoding='utf-8')

diff_expr.head()

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,pvalue,padj,Gene type
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HMOX1,16880.36056,8.747973,0.09566,0.0,0.0,protein_coding
RBPMS2,317.415626,7.838965,0.461918,1.024589e-63,1.553891e-62,protein_coding
MAP1A,3009.589442,7.665786,0.146527,0.0,0.0,protein_coding
MMP13,265.317244,7.441984,0.44846,5.07956e-61,7.368289e-60,protein_coding
IGFL1P1,235.584,6.574985,0.358426,1.494997e-74,2.667215e-73,unprocessed_pseudogene


### Reference genes

In [4]:
# Selection thresholds for candidate reference genes.
REF_MIN_BASE_MEAN = 200   # minimum baseMean
REF_MAX_ABS_LFC = 1       # maximum |log2FoldChange|

# Bound the magnitude of the fold change: a plain `log2FoldChange < 1` test
# would also accept strongly down-regulated genes (e.g. log2FoldChange = -5).
is_stable = (
    (table['baseMean'] > REF_MIN_BASE_MEAN)
    & (table['log2FoldChange'].abs() < REF_MAX_ABS_LFC)
)

# Smallest absolute fold change first.
refer = table[is_stable].sort_values(by='log2FoldChange', key=abs)
refer.to_csv('reference_genes.tsv', sep='\t', encoding='utf-8')

refer.head()

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,pvalue,padj,Gene type
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATF1,591.181157,-0.000102,0.089792,0.995313,0.996985,protein_coding
FER1L4,2679.794456,-0.000284,0.053589,0.993168,0.995793,transcribed_unitary_pseudogene
IPO8,1873.463463,-0.000471,0.056956,0.982253,0.988053,protein_coding
SLC45A3,599.113445,-0.000785,0.089469,0.988884,0.99298,protein_coding
PHF23,505.178041,-0.000955,0.092646,0.989615,0.993342,protein_coding
