In [1]:
import pandas as pd
import numpy as np
import time
import math

from Bio.KEGG.REST import kegg_info
from Bio.KEGG.REST import kegg_list
from Bio.KEGG.REST import kegg_link

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

In [2]:
request = ''  # not read anywhere in this cell; kept in case other cells use it


def _fetch_kegg_table(handle, cache_path="Data/request"):
    """Save a KEGG REST response to ``cache_path`` and load it as a table.

    Parameters
    ----------
    handle : file-like
        Open response handle as returned by ``kegg_list`` / ``kegg_link``.
    cache_path : str
        File the raw response text is written to before being parsed.

    Returns
    -------
    pandas.DataFrame
        The response parsed with ``pd.read_table(..., header=None)``,
        i.e. with integer column labels.
    """
    try:
        text = handle.read()
    finally:
        # Release the network handle even if reading fails.
        handle.close()
    # The context manager guarantees the file is flushed and closed before
    # pandas reads it back (a bare open(...).write(...) relies on garbage
    # collection for that).
    with open(cache_path, 'w', encoding='utf-8') as cache:
        cache.write(text)
    return pd.read_table(cache_path, header=None)


# KO: one row per KEGG orthology entry.
df = _fetch_kegg_table(kegg_list("ko"))
df1 = df[0].str.replace('ko:', '')
# Keep only the text before the first ';' of the description column.
df2 = df[1].str.replace(';', '\t').str.split('\t', expand=True)

df_ko = pd.concat([df1, df2.loc[:, :0]], axis=1)
df_ko.columns = ['ko', 'Gene']

# Module-KO: link table between KO entries and modules.
df = _fetch_kegg_table(kegg_link("module", "ko"))
df[0] = df[0].str.replace('ko:', '')
df[1] = df[1].str.replace('md:', '')
df.columns = ['ko', 'Module']

# Left join: KO entries without any module are kept with a missing Module.
df_link = pd.merge(df_ko, df, on='ko', how='left')

# Module: module ID -> module name.
# NOTE: the download is now cached in "Data/request" like the two above,
# instead of a stray "request" file in the working directory.
df = _fetch_kegg_table(kegg_list("module"))
df[0] = df[0].str.replace('md:', '')
df.columns = ['Module', 'Module_name']

df_module_ko = pd.merge(df_link, df, on='Module', how='left')

df_module_ko.head()

Unnamed: 0,ko,Gene,Module,Module_name
0,K00001,"E1.1.1.1, adh",,
1,K00002,"AKR1A1, adh",M00014,Glucuronate pathway (uronate pathway)
2,K00002,"AKR1A1, adh",M00129,"Ascorbate biosynthesis, animals, glucose-1P =>..."
3,K00003,hom,M00017,"Methionine biosynthesis, apartate => homoserin..."
4,K00003,hom,M00018,"Threonine biosynthesis, aspartate => homoserin..."


## Input data

In [3]:
# Individual taxa, and the full list of groups (all transcripts first,
# then each taxon).
taxa_list = ['Thiotrichales', 'Methylococcales', 'Sulfurovum']
graph_list = ['All transcripts'] + taxa_list

# Four colours; presumably one per entry of graph_list, in the same order
# (verify against the plotting cells).
col = [
    'black',
    '#7b68ee',
    '#ff7f24',
    '#ffc125',
]

In [4]:
# Load the per-KO table (tab-separated, header on the first row), keeping
# only the listed columns; the first of the kept columns becomes the index.
dffig_ko = pd.read_csv(
    'Data/dffig_ko',
    sep='\t',
    header=0,
    usecols=['ko', 'insitu1', 'insitu2', 'onboard1', 'onboard2', 'taxa'],
    index_col=0,
)

In [5]:
# Turn the named index back into a regular column so it can be used as the
# merge key. The guard makes the cell safe to re-run: once the index has been
# reset it is unnamed, so a second run no longer adds a spurious 'index'
# column.
if dffig_ko.index.name is not None:
    dffig_ko = dffig_ko.reset_index()

# Right join: keep every row of dffig_ko and attach the Gene / Module /
# Module_name annotation where the 'ko' value is known to df_module_ko.
dffig_komodule = pd.merge(df_module_ko, dffig_ko, on='ko', how='right')

In [6]:
# Chemolithotrophic metabolisms: module IDs to extract.
# The list is assembled row by row, in the order the rows were written.
modulelist = (
    ['M00165', 'M00173', 'M00579', 'M00376', 'M00375', 'M00374', 'M00377', 'M00620']
    + ['M00175', 'M00531', 'M00530', 'M00529', 'M00804']
    + ['M00174', 'M00346', 'M00345', 'M00344', 'M00140']
    + ['M00176', 'M00596', 'M00595']
    + ['M00144', 'M00149', 'M00150', 'M00151', 'M00155', 'M00153',
       'M00417', 'M00416', 'M00156', 'M00157', 'M00159']
)

# Individual KO IDs to extract in addition to the modules above.
kolist = [
    'K00122',
    'K10713',
    'K10714',
    'K01499',
    'K17229',
    'K17230',
    'K17218',
]

In [7]:
def extract_data(selectlist, colname):
    """Select rows of the global ``dffig_komodule`` for every group in ``graph_list``.

    For each value of the global ``graph_list`` (matched against the 'taxa'
    column) and, within it, for each value of ``selectlist`` (matched against
    ``colname``), the matching rows are collected. Rows are returned grouped
    by taxon, then in ``selectlist`` order.

    Parameters
    ----------
    selectlist : sequence
        Values to look for in column ``colname``.
    colname : str
        Name of the column of ``dffig_komodule`` to filter on.

    Returns
    -------
    pandas.DataFrame
        The matching rows, each appearing once per (taxon, value) match.
        If there is nothing to iterate over, an empty frame with the columns
        of ``dffig_komodule`` is returned.

    Notes
    -----
    The earlier implementation kept one accumulator across all taxa and
    appended that whole accumulator to the result after every taxon, so rows
    of earlier taxa were repeated in the output. Each match is now emitted
    exactly once.
    """
    pieces = []
    for taxon in graph_list:
        dftaxa = dffig_komodule[dffig_komodule['taxa'] == taxon]
        for key in selectlist:
            pieces.append(dftaxa[dftaxa[colname] == key])

    if not pieces:
        # pd.concat refuses an empty list; return no rows but keep the schema.
        return dffig_komodule.iloc[0:0].copy()

    # Single concat instead of growing a frame inside the loops.
    return pd.concat(pieces, sort=False)

# Rows selected by module ID, then rows selected by individual KO ID.
df_selectM = extract_data(selectlist=modulelist, colname="Module")
df_selectK = extract_data(selectlist=kolist, colname="ko")

# Stack the two selections, module-based rows first.
df_selected = pd.concat(objs=[df_selectM, df_selectK])

In [8]:
# Relabel the nine columns positionally for presentation.
# Raw strings are used for the labels containing '\it': in a normal string
# literal '\i' is an invalid escape sequence, which newer Python versions warn
# about. The resulting label text is unchanged.
# The '$\it{...}$' markup is presumably rendered as italics by matplotlib in
# later figures (verify against the plotting cells).
df_selected.columns = [
    'KO', 'Gene', 'Module', 'Module_name',
    r'$\it{In}$ $\it{situ}$ mRNA 1', r'$\it{In}$ $\it{situ}$ mRNA 2',
    'Onboard mRNA 1', 'Onboard mRNA 2', 'taxa',
]

In [9]:
# Output: write the selected rows as a tab-separated table.
df_selected.to_csv(path_or_buf="Figures/DataSet1", sep="\t")