## *nosZ*の機能に必要な遺伝子の同定

In [8]:
from Bio import Phylo
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt

In [9]:
# データ前処理
def create_dictionaries(tree, df):
  ## 親ノードの辞書(child node -> parent node)
  parent_dict = {}
  for clade in tree.find_clades(order='level'): # BFS
    if len(parent_dict) == 0: # root
      parent_dict[clade.name] = None

    for child in clade: # 2 elements
      if child.name in parent_dict:
        raise ValueError("Duplicate key: %s" % child.name)
      parent_dict[child.name] = clade.name

  print("# of keys in parent_dict is {}.".format(len(parent_dict))) # archaea: 1801, bacteria: 51753

  ## Possesionの辞書(OG, Node -> Possesion)
  possesion_dict = {}
  for row in df.itertuples():
    possesion_dict[row.OG, row.Node] = row.Possesion

  print("# of keys in possesion_dict is {}.".format(len(possesion_dict))) # this should be equal to # of row in df

  return parent_dict, possesion_dict

In [10]:
# nosZの獲得/損失回数
## gain
def count_nosZ_gain(node_list, parent_dict, possesion_dict, threshold=0.5):
  nosZ_gain_count = 0
  for node in node_list:
    src = possesion_dict.get(("K00376", parent_dict[node]), 0.0)
    dst = possesion_dict.get(("K00376", node), 0.0)
    if src < threshold and dst > threshold:
      # print("parent_node: {}, child_node: {}".format(parent_dict[node], node))
      # print("{}->{}".format(src, dst))
      nosZ_gain_count += 1

  print("nosZ is gained {} times.".format(nosZ_gain_count))

## loss
def count_nosZ_loss(node_list, parent_dict, possesion_dict, threshold=0.5):
  nosZ_loss_count = 0
  for node in node_list:
    src = possesion_dict.get(("K00376", parent_dict[node]), 0.0)
    dst = possesion_dict.get(("K00376", node), 0.0)
    if src > threshold and dst < threshold:
      # print("parent_node: {}, child_node: {}".format(parent_dict[node], node))
      # print("{}->{}".format(src, dst))
      nosZ_loss_count += 1

  print("nosZ is lost {} times.".format(nosZ_loss_count))

In [11]:
# 分割表を作成
def generate_table(target_OG, candidate_OG, parent_dict, possesion_dict, mode='gain', threshold=0.5):
  # print("\n<OG: {}>".format(candidate_OG))  
  table = np.zeros((2,2))
  
  for node in node_list:
    ## 0のノードはpossesion_dictに登録されていないのでget()で0として取得
    parent_has_target = possesion_dict.get((target_OG, parent_dict[node]), 0.0)
    
    # loss
    if mode == 'loss' and parent_has_target > threshold: # parentがnosZ獲得済
      has_target           = possesion_dict.get((target_OG, node), 0.0)
      parent_has_candidate = possesion_dict.get((candidate_OG, parent_dict[node]), 0.0)

      if has_target < threshold and parent_has_candidate > threshold:
        table[0,0] += 1
      elif has_target < threshold and parent_has_candidate < threshold:
        table[0,1] += 1
      elif has_target > threshold and parent_has_candidate > threshold:
        table[1,0] += 1
      elif has_target > threshold and parent_has_candidate < threshold:
        table[1,1] += 1
    
    # gain
    elif parent_has_target < threshold:  # nosZについて, parentが未獲得
      has_target           = possesion_dict.get((target_OG, node), 0.0)
      parent_has_candidate = possesion_dict.get((candidate_OG, parent_dict[node]), 0.0)

      if has_target > threshold and parent_has_candidate > threshold:
        table[0,0] += 1
      elif has_target > threshold and parent_has_candidate < threshold:
        table[0,1] += 1
      elif has_target < threshold and parent_has_candidate > threshold:
        table[1,0] += 1
      elif has_target < threshold and parent_has_candidate < threshold:
        table[1,1] += 1

  # print("# of Non-excluded nodes: {}".format(np.sum(table)))
  return table

In [12]:
### Archaea ###

# データ読み込み
## 系統樹
tree = Phylo.read("../../data/raw/ar122_r202.selected.internal_renamed.tree", "newick")
node_list = [node.name for node in tree.find_clades()]
print(len(node_list)) ## 1801 nodes, which includes 901 terminals.

## 祖先状態推定データ
## Possesion=0の行は削除されている
df = pd.read_table("../../data/raw/ko_gn_possession.archaea.txt", names=["OG", "Node", "Possesion"])
OG_list = df["OG"].unique().tolist()
print(len(OG_list)) ## 4656 OGs

1801
4656


In [7]:
### Bacteria ###
 
# データ読み込み
## 系統樹
tree = Phylo.read("../../data/raw/bac120_r202.selected.internal_renamed.tree", "newick") 
# tree = Phylo.read("../../data/itol/bacteria_tree_random.txt", "newick") ## 5999 nodes
node_list = [node.name for node in tree.find_clades()]
print(len(node_list)) ## 51753 nodes, which includes 25877 terminals.

## 祖先状態推定データ
## Possesion=0の行は削除されている
# df = pd.read_table("../../data/raw/ko_gn_possession.bacteria.txt", names=["OG", "Node", "Possesion"])
# OG_list = df["OG"].unique().tolist()
# print(len(OG_list)) ## 9566 OGs

51753
25877


In [13]:
threshold = 0.5
parent_dict, possesion_dict = create_dictionaries(tree, df)
del df, tree
count_nosZ_gain(node_list, parent_dict, possesion_dict, threshold)
## (Archaea) 0→1: 4 times, 0→(0.5 or 1): 7 times, (0 or 0.5)→1: 9 times
## (Bacteria) 0→1: 208 times, 0→(0.5 or 1):  times, (0 or 0.5)→1:  times
count_nosZ_loss(node_list, parent_dict, possesion_dict, threshold)
## (Archaea) 1→0: 58 times, (0.5 or 1)→0: 103 times, 1→(0 or 0.5): 74 times

# of keys in parent_dict is 1801.
# of keys in possesion_dict is 1690463.
nosZ is gained 4 times.
nosZ is lost 58 times.


In [14]:
# 検定: nosZを獲得 vs 遺伝子Xを親ノードが保持
target_OG = "K00376" # nosZ

df_result = pd.DataFrame(index=[], columns=["oddsratio", "p-value"])
for candidate_OG in OG_list:
  if candidate_OG != target_OG:
    contingency_table = generate_table(target_OG, candidate_OG, parent_dict, possesion_dict, mode='loss', threshold=threshold)
    # print(contingency_table)
    df_result.loc[candidate_OG] = fisher_exact(contingency_table, alternative='two-sided')

print(df_result) 

        oddsratio       p-value
K00003        inf  7.989333e-04
K00004   6.379967  1.299528e-09
K00005   3.925746  4.550485e-03
K00008   3.467593  2.385604e-04
K00010   2.669492  1.930380e-01
...           ...           ...
K24217        NaN  1.000000e+00
K24258        NaN  1.000000e+00
K24393   0.110114  5.012972e-03
K24409   0.000000  1.105287e-11
K24410   0.000000  1.111238e-10

[4655 rows x 2 columns]


In [15]:
# Multiple test correction
alpha = 0.05
p_value_list = df_result["p-value"].to_numpy(copy=True)
df_result["rejected"] = (p_value_list < alpha) # no correction
Bonferroni_rejected, Bf_p_list, _, _ = multipletests(p_value_list, alpha=alpha, method='bonferroni')
BH_rejected, BH_p_list, _, _         = multipletests(p_value_list, alpha=alpha, method='fdr_bh')
BY_rejected, BY_p_list, _, _         = multipletests(p_value_list, alpha=alpha, method='fdr_by')
df_result["Bonferroni adjusted p-value"] = Bf_p_list
df_result["Bonferroni rejected"] = Bonferroni_rejected
df_result["BH adjusted p-value"] = BH_p_list
df_result["BH rejected"] = BH_rejected
df_result["BY adjusted p-value"] = BY_p_list
df_result["BY rejected"] = BY_rejected

print(sum(df_result["rejected"]), sum(df_result["Bonferroni rejected"]), sum(df_result["BH rejected"]), sum(df_result["BY rejected"]))


# Merge with ko table
df_result["ko"] = ["ko:" + index for index in df_result.index]
ko_table = pd.read_table("../../data/raw/ko.txt", names=["ko", "description"])
df_merged = pd.merge(ko_table, df_result, on="ko", copy=False).sort_values("p-value")
# df_merged.to_csv("../../data/result/archaea_gain_candidates_sorted_low_threshold.txt", columns=["p-value", "ko", "description"], index=False, sep='\t', float_format="%.4e")
# df_merged.to_csv("../../data/result/bacteria_gain_candidates_sorted.txt", columns=["p-value", "ko", "description"], index=False, sep='\t', float_format="%.4f")
df_merged.head(10)

2009 1151 1865 1576


Unnamed: 0,ko,description,oddsratio,p-value,rejected,Bonferroni adjusted p-value,Bonferroni rejected,BH adjusted p-value,BH rejected,BY adjusted p-value,BY rejected
4190,ko:K19342,nosL; copper chaperone NosL,196.91906,1.789456e-35,True,8.329917e-32,True,8.329917e-32,True,7.516101000000001e-31,True
2842,ko:K08976,K08976; putative membrane protein,167.819005,8.107264e-33,True,3.773932e-29,True,1.886966e-29,True,1.702613e-28,True
158,ko:K00299,"ssuE, msuE; FMN reductase [EC:1.5.1.38]",65.383944,1.352215e-32,True,6.294559e-29,True,2.098186e-29,True,1.893198e-28,True
21,ko:K00033,"PGD, gnd, gntZ; 6-phosphogluconate dehydrogena...",162.756637,2.886483e-32,True,1.343658e-28,True,3.359145e-29,True,3.0309630000000003e-28,True
4242,ko:K20110,JAMM1; desampylase [EC:3.4.19.15],43.128902,1.0836080000000002e-31,True,5.044193e-28,True,8.777894e-29,True,7.920312e-28,True
4189,ko:K19341,nosY; Cu-processing system permease protein,62.168798,1.131415e-31,True,5.266736e-28,True,8.777894e-29,True,7.920312e-28,True
3667,ko:K15408,coxAC; cytochrome c oxidase subunit I+III [EC:...,42.77464,1.53238e-31,True,7.133228e-28,True,1.0190330000000002e-28,True,9.194752e-28,True
4082,ko:K18929,lldF; L-lactate dehydrogenase complex protein ...,84.068966,2.859591e-31,True,1.3311400000000001e-27,True,1.494356e-28,True,1.3483610000000001e-27,True
773,ko:K01679,"E4.2.1.2B, fumC, FH; fumarate hydratase, class...",153.803419,2.9881380000000002e-31,True,1.3909780000000001e-27,True,1.494356e-28,True,1.3483610000000001e-27,True
3014,ko:K09786,K09786; uncharacterized protein,60.356322,3.6298740000000002e-31,True,1.689706e-27,True,1.494356e-28,True,1.3483610000000001e-27,True


In [None]:
import gc
del parent_dict, possesion_dict
gc.collect()
gc.get_count()

In [17]:
df_merged[df_merged["oddsratio"] < 0]

Unnamed: 0,ko,description,oddsratio,p-value,rejected,Bonferroni adjusted p-value,Bonferroni rejected,BH adjusted p-value,BH rejected,BY adjusted p-value,BY rejected
