## *nosZ*の機能に必要な遺伝子の同定

- ターゲットの遺伝子(*nosZ*)の獲得の有無を行、調べている遺伝子を直接の祖先が既に獲得しているかどうかを列とする
- Fisher's exact testで有意な遺伝子を抽出

In [1]:
from Bio import Phylo
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact

# データ読み込み
# 系統樹
tree = Phylo.read("../../data/raw/ar122_r202.selected.internal_renamed.tree", "newick") ## 1801 nodes, which includes 901 terminals.
node_list = [node.name for node in tree.find_clades()]
print(len(node_list))

# 祖先状態推定データ
# Possesion=0の行は削除されている
df = pd.read_table("../../data/raw/ko_gn_possession.archaea.txt", names=["OG", "Node", "Possesion"])
print(df)
OG_list = df["OG"].unique().tolist()
print(len(OG_list))

1801
             OG                Node  Possesion
0        K00003              node0_        1.0
1        K00003              node1_        1.0
2        K00003              node2_        1.0
3        K00003  GB_GCA_001873845.1        1.0
4        K00003              node3_        1.0
...         ...                 ...        ...
1690458  K24410            node550_        1.0
1690459  K24410  RS_GCF_003111625.1        1.0
1690460  K24410  GB_GCA_902774605.1        1.0
1690461  K24410  RS_GCF_003814835.1        1.0
1690462  K24410  RS_GCF_900769095.1        1.0

[1690463 rows x 3 columns]
4656


In [2]:
# データ前処理

# 親ノードの辞書
parent_dict = {}
for clade in tree.find_clades(order='level'): # BFS
  if len(parent_dict) == 0: # root
    parent_dict[clade.name] = None

  for child in clade: # 2 elements
    if child.name in parent_dict:
      raise ValueError("Duplicate key: %s" % child.name)
    parent_dict[child.name] = clade.name

print(len(parent_dict)) # 1801

# Possesionの辞書(OG, Node -> Possesion)
possesion_dict = {}
for row in df.itertuples():
  possesion_dict[row.OG, row.Node] = row.Possesion

print(len(possesion_dict)) # 1690463 (should be equal to # of row in df)


1801
1690463


In [5]:
def generate_table(target_OG, candidate_OG):
  # print("\n<OG: {}>".format(candidate_OG))  
  table = np.zeros((2,2))

  ## 0のノードはpossesion_dictに登録されていないのでget()で0として取得
  for node in node_list: # 1801 elements
    if (target_OG, parent_dict[node]) not in parent_dict:  # nosZ, parentが0 (1なら獲得済みなので無視)
      target_obtained = possesion_dict.get((target_OG, node), 0.0)
      candidate_possess = possesion_dict.get((candidate_OG, parent_dict[node]), 0.0)

      if target_obtained == 1.0 and candidate_possess == 1.0:
        table[0,0] += 1
      elif target_obtained == 1.0 and candidate_possess == 0.0:
        table[0,1] += 1
      elif target_obtained == 0.0 and candidate_possess == 1.0:
        table[1,0] += 1
      elif target_obtained == 0.0 and candidate_possess == 0.0:
        table[1,1] += 1

  # print("# of Non-excluded nodes: {}".format(np.sum(table)))
  return table

In [6]:
# nosZを獲得した内部ノード vs 直接の祖先が遺伝子Xを持っている
target_OG = "K00376" # nosZ (今後変える)
# candidate_OG = "K00003" # 調べたいOG
p_value_table = pd.DataFrame(index=[], columns=["oddsratio", "p-value"])

for candidate_OG in OG_list:
  contingency_table = generate_table(target_OG, candidate_OG)
  p_value_table.loc[candidate_OG] = fisher_exact(contingency_table, alternative='two-sided')
  
#   if p < 0.05:
#     print("The imbalance between {} and {} is statistically significant. p-value is {}".format(target_OG, candidate_OG, p))

print(p_value_table)
  

        oddsratio       p-value
K00003  19.800000  1.965396e-12
K00004   8.312500  6.997635e-36
K00005   9.204514  2.553506e-19
K00008   6.503117  3.533414e-25
K00010   0.735693  7.863834e-01
...           ...           ...
K24217        NaN  1.000000e+00
K24258        NaN  1.000000e+00
K24393   0.023347  8.757099e-15
K24409   0.000000  7.417418e-49
K24410   0.000000  1.260748e-44

[4656 rows x 2 columns]
