In [4]:
import pandas as pd
import numpy as np
import math

from Bio.KEGG.REST import kegg_info
from Bio.KEGG.REST import kegg_list
from Bio.KEGG.REST import kegg_link

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

# Input file

In [5]:
# Taxa of interest and, position by position, the column of df_ALL in which
# each one is looked up (ranks[i] is the column matched against taxa_list[i]).
taxa_list = ['Thiotrichales', 'Methylococcales', 'Sulfurovum']
ranks = ['order','order', 'genus']

# Tab-separated table; 'gi' and 'taxid' are read as strings, and the
# 'KO' column is renamed to the lower-case 'ko' used by the rest of the notebook.
df_ALL = pd.read_csv(
    "Data/NRKG_tpm",
    sep='\t',
    header=[0],
    index_col=0,
    dtype={'gi': str, 'taxid': str},
).rename(columns={'KO':'ko'})

# KEGG KO-Pathway list

In [6]:
# Download the KO -> pathway link table from the KEGG REST API and cache it on
# disk. The context manager guarantees the file is flushed and closed before
# pandas reads it back.
with open("Data/request", 'w') as handle:
    handle.write(kegg_link("pathway", "ko").read())

KG_kopath = pd.read_csv('Data/request', header=None, sep='\t')

# Strip the database prefixes as literal text (no regex interpretation).
KG_kopath[0] = KG_kopath[0].str.replace('ko:', '', regex=False)
KG_kopath[1] = KG_kopath[1].str.replace('path:', '', regex=False)

KG_kopath.columns = ['ko', 'pathway']
KG_kopath.head()

Unnamed: 0,ko,pathway
0,K00001,map00010
1,K00001,ko00010
2,K00002,map00010
3,K00002,ko00010
4,K00016,map00010


In [7]:
def TPM_statistics(df_data):
    """Summarise the TPM values of the four samples for every row of ``df_data``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain the numeric columns ``insitu1``, ``insitu2``,
        ``onboard1`` and ``onboard2``. Any other column is ignored and the
        input frame is never modified.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df_data`` with, in this order:

        * ``<name>-log10`` for the four samples and the two averages
          (a value of exactly 0 stays 0 instead of becoming -inf),
        * ``ratio``: log2(insitu_ave / onboard_ave),
        * ``insitu_range`` / ``onboard_range``: |replicate 1 - average|,
        * the untransformed sample columns, ``insitu_ave`` and ``onboard_ave``.

    Notes
    -----
    Negative inputs yield NaN in the log columns (numpy semantics).
    """
    sample_cols = ['insitu1', 'insitu2', 'onboard1', 'onboard2']

    # Explicit copy: the columns added below must never touch the caller's frame.
    df = df_data.loc[:, sample_cols].copy()

    # Replicate averages (sum / 2, so a missing replicate counts as 0).
    df['insitu_ave'] = df[['insitu1', 'insitu2']].sum(axis=1) / 2
    df['onboard_ave'] = df[['onboard1', 'onboard2']].sum(axis=1) / 2

    # log10 transform; zeros are replaced by 1 first so that they map to 0.
    df2 = np.log10(df.where(df != 0, 1))
    df2.columns = [name + '-log10' for name in df.columns]

    # log2 fold change in situ / onboard.
    # NOTE(review): a ratio of exactly 0 (no in situ signal) is reported as 0,
    # i.e. indistinguishable from "no change"; kept as in the original analysis.
    ratio = df['insitu_ave'] / df['onboard_ave']
    df2['ratio'] = np.log2(ratio.where(ratio != 0, 1))

    # Distance of the first replicate from the replicate average.
    df2['insitu_range'] = (df['insitu1'] - df['insitu_ave']).abs()
    df2['onboard_range'] = (df['onboard1'] - df['onboard_ave']).abs()

    df_stats = pd.concat([df2, df], axis=1, sort=False, join='inner')

    return df_stats

## Gene-level expression profile

In [8]:
def taxa_ko_stats(ranks, taxa):
    """Gene-level (per-KO) TPM statistics for a single taxon.

    Parameters
    ----------
    ranks : str
        Name of the column of the global ``df_ALL`` that is compared with ``taxa``.
    taxa : str
        Value selected in that column; also written to the ``taxa`` column
        of the result.

    Returns
    -------
    pandas.DataFrame
        Output of ``TPM_statistics`` for the per-KO TPM sums of the selected
        rows, indexed by ``ko``, plus a constant ``taxa`` column.
    """
    sample_cols = ['insitu1', 'insitu2', 'onboard1', 'onboard2']

    df_taxon = df_ALL[df_ALL[ranks] == taxa]

    # Keep only rows whose 'ko' value contains 'K'; missing values are dropped.
    df_taxon_ko = df_taxon[df_taxon['ko'].str.contains('K', na=False)]

    # Sum only the four sample columns: they are the only ones TPM_statistics
    # reads, and summing the remaining (string) columns is wasted or invalid work.
    df_taxon_ko_groupby = df_taxon_ko.groupby('ko')[sample_cols].sum()

    df_taxon_ko_stats = TPM_statistics(df_taxon_ko_groupby)
    df_taxon_ko_stats['taxa'] = taxa
    return df_taxon_ko_stats

# each taxon: build one statistics table per (rank, taxon) pair and stack them
# in a single concat instead of growing a DataFrame inside the loop
assert len(ranks) == len(taxa_list), "ranks and taxa_list must be parallel lists"
taxon_ko_frames = [
    taxa_ko_stats(rank, taxon) for rank, taxon in zip(ranks, taxa_list)
]
df_taxa = pd.concat(taxon_ko_frames, sort=False) if taxon_ko_frames else pd.DataFrame()

# whole data set
df_ALL_ko = df_ALL[df_ALL['ko'].str.contains('K', na=False)]
df_ALL_ko_groupby = df_ALL_ko.groupby(by=df_ALL_ko['ko']).sum()
df_ALL_ko_stats = TPM_statistics(df_ALL_ko_groupby)
df_ALL_ko_stats['taxa'] = 'All transcripts'

# Long-format table: whole data set first, then the individual taxa
dffig_ko = pd.concat([df_ALL_ko_stats, df_taxa], sort=False)

In [9]:
# Save the gene-level (per-KO) statistics table as a tab-separated file.
dffig_ko.to_csv("Data/dffig_ko", sep="\t")

In [10]:
# Per-taxon overview: sequence counts, number of KOs and summed TPM per sample.
for position, taxon_name in enumerate(taxa_list):
    print(taxon_name)

    # Sequences assigned to this taxon, and those among them with a KO annotation
    taxon_seqs = df_ALL[df_ALL[ranks[position]] == taxon_name]
    print('taxon_annotated_seqs:', len(taxon_seqs))
    ko_mask = taxon_seqs['ko'].str.contains('K', na=False)
    print('ko_annotated_seqs:', len(taxon_seqs[ko_mask]))

    # Rows of the gene-level table that belong to this taxon (one row per KO)
    taxon_kos = dffig_ko[dffig_ko['taxa'] == taxon_name]
    print('annotated_KOs:', len(taxon_kos), 'KOs')
    print('ko_annotated_TPMs')
    sample_totals = dffig_ko[dffig_ko['taxa'] == taxon_name].loc[
        :, ['insitu1', 'insitu2', 'onboard1', 'onboard2']
    ].sum()
    print(sample_totals)

Thiotrichales
taxon_annotated_seqs: 49014
ko_annotated_seqs: 30581
annotated_KOs: 1921 KOs
ko_annotated_TPMs
insitu1     149516.10
insitu2     112940.17
onboard1    238128.60
onboard2    228067.47
dtype: float64
Methylococcales
taxon_annotated_seqs: 27321
ko_annotated_seqs: 17281
annotated_KOs: 1604 KOs
ko_annotated_TPMs
insitu1     186422.95
insitu2     225602.36
onboard1    113291.54
onboard2    159281.74
dtype: float64
Sulfurovum
taxon_annotated_seqs: 26141
ko_annotated_seqs: 17046
annotated_KOs: 1109 KOs
ko_annotated_TPMs
insitu1     154797.45
insitu2     133111.22
onboard1     40983.63
onboard2     82426.42
dtype: float64


## Pathway-level expression profile

In [11]:
# Pathway list: one text column, read without a header row
KG_path1234 = pd.read_table('Data/1234_kegg_pathway_list', header=None)

# The pathway entry is the part of column 0 that precedes the first '_' (or tab);
# it is stored as a second column and also used as the index.
entry_pieces = KG_path1234[0].str.replace('_', '\t').str.split('\t', expand=True)
KG_path1234[1] = entry_pieces[0]
KG_path1234 = KG_path1234.set_index(KG_path1234.iloc[:, 1])
KG_path1234.columns = ['pathway_name', 'pathway_entry']

# Restrict the KO -> pathway links to the pathways present in that list
listed_pathways = set(KG_path1234.index)
KG_path1234_ko = KG_kopath[KG_kopath.pathway.isin(listed_pathways)]
KG_path1234_ko.head()

Unnamed: 0,ko,pathway
0,K00001,map00010
2,K00002,map00010
4,K00016,map00010
6,K00114,map00010
8,K00121,map00010


In [12]:
def pathway1234(df_ko):
    """Aggregate per-KO TPM values to pathway level and compute statistics.

    Parameters
    ----------
    df_ko : pandas.DataFrame
        Per-KO table that can be merged on ``ko`` and that contains the
        columns ``insitu1``, ``insitu2``, ``onboard1`` and ``onboard2``.

    Returns
    -------
    pandas.DataFrame
        Output of ``TPM_statistics`` for the per-pathway TPM sums, indexed
        by ``pathway``.
    """
    sample_cols = ['insitu1', 'insitu2', 'onboard1', 'onboard2']

    # Left merge keeps every KO -> pathway link of the global KG_path1234_ko,
    # so pathways without any matching KO still appear, with a sum of 0.
    df_merged = pd.merge(KG_path1234_ko, df_ko, on='ko', how='left')

    # Sum only the sample columns; the other merged columns (log values,
    # ratios, string labels) are not meaningful to add up and are not used.
    df_path1234 = df_merged.groupby("pathway")[sample_cols].sum()

    df_path1234_stats = TPM_statistics(df_path1234)

    return df_path1234_stats

In [13]:
# pathways in kegg category 1~4
df_ALL_path_stats = pathway1234(df_ALL_ko_groupby)

# keep pathways with >= MIN_TPM in at least one sample
MIN_TPM = 1000
sample_cols = ['insitu1', 'insitu2', 'onboard1', 'onboard2']
df_ALL_path_stats1000 = df_ALL_path_stats[
    df_ALL_path_stats[sample_cols].ge(MIN_TPM).any(axis=1)
]

print(len(df_ALL_path_stats1000))

# bacterial pathways
KG_path1000bac = pd.read_csv('Data/stats_pathway_all_1000_bacteria_list', header=None, index_col=0)

# Align the statistics to the bacterial pathway list (its rows, its order).
# `join_axes` no longer exists in pd.concat, so the statistics are reindexed
# to the list's index before the column-wise concat.
df_ALL_path_stats1000bac = pd.concat(
    [KG_path1000bac, df_ALL_path_stats1000.reindex(KG_path1000bac.index)],
    axis=1,
)
print(len(df_ALL_path_stats1000bac))

df_ALL_path_stats1000bac['taxa'] = 'All transcripts'

# Display the first rows of the final table of this cell
df_ALL_path_stats1000bac.head()

77
55


Unnamed: 0_level_0,insitu1-log10,insitu2-log10,onboard1-log10,onboard2-log10,insitu_ave-log10,onboard_ave-log10,ratio,insitu_range,onboard_range,insitu1,insitu2,onboard1,onboard2,insitu_ave,onboard_ave,taxa
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
map00010,4.299262,4.209038,4.143329,4.223218,4.256489,4.185108,0.237121,1868.255,1404.62,19918.74,16182.23,13910.07,16719.31,18050.485,15314.69,All transcripts
map00020,4.385498,4.315425,4.016778,4.253923,4.351873,4.15134,0.666156,1809.95,3775.14,24293.93,20674.03,10393.89,17944.17,22483.98,14169.03,All transcripts
map00030,4.259202,4.334273,4.1194,4.19769,4.298357,4.160307,0.458594,1713.7,1300.235,18163.59,21590.99,13164.37,15764.84,19877.29,14464.605,All transcripts
map00051,3.704898,3.755423,3.756107,3.749279,3.730895,3.752707,-0.072457,312.675,44.48,5068.72,5694.07,5703.05,5614.09,5381.395,5658.57,All transcripts
map00061,3.492343,3.34587,3.459781,3.420494,3.425252,3.440582,-0.050924,444.74,124.66,3107.01,2217.53,2882.58,2633.26,2662.27,2757.92,All transcripts


## Add data for the major taxa and sort for visualization

In [14]:
# Order pathways by the whole-data-set log2 ratio; this order is reused for every taxon
df_ALL_path_stats1000bac_sorted = df_ALL_path_stats1000bac.sort_values('ratio')

# each taxon
taxon_path_frames = []
for taxon in taxa_list:
    df_taxon_ko_groupby = dffig_ko[dffig_ko['taxa'] == taxon]
    df_taxon_path_stats = pathway1234(df_taxon_ko_groupby)

    # sort for visualization: same pathways in the same order as the sorted
    # whole-data-set table; pathways absent for this taxon become NaN rows.
    # (Replaces pd.concat(..., join_axes=...).iloc[:, 15:], which relied on a
    # removed argument and on a fixed column position.)
    dfreindex = df_taxon_path_stats.reindex(df_ALL_path_stats1000bac_sorted.index)

    # 'taxa' goes first, matching the column order the positional slice produced
    dfreindex.insert(0, 'taxa', taxon)

    taxon_path_frames.append(dfreindex)

# Stack all taxa once instead of growing a DataFrame inside the loop
df_taxa = pd.concat(taxon_path_frames, sort=False) if taxon_path_frames else pd.DataFrame()


In [15]:
# Stack the whole-data-set pathway table on top of the per-taxon tables
# (long format; the 'taxa' column tells the rows apart).
dffig_pathway = pd.concat([df_ALL_path_stats1000bac_sorted, df_taxa], sort=False)

In [16]:
# Attach the pathway_name / pathway_entry columns to every row.
# `join_axes` no longer exists in pd.concat. The row index of dffig_pathway
# repeats each pathway (once per 'taxa' block), so KG_path1234 is first
# reindexed to that exact index and then concatenated column-wise.
dffig_pathway = pd.concat(
    [dffig_pathway, KG_path1234.reindex(dffig_pathway.index)],
    axis=1,
    sort=False,
)

In [17]:
# Save the pathway-level table as a tab-separated file.
dffig_pathway.to_csv("Data/dffig_pathway", sep="\t")