In [1]:
import pandas as pd
import sqlite3

from collections import Counter

In [2]:
# In-memory SQLite database used as a scratch store for the tables loaded below.
con = sqlite3.connect(":memory:")

In [3]:
# Read the merged node table and mirror it into SQLite as the "nodes" table
# (any existing table of that name is replaced).
nodes = pd.read_csv(
    filepath_or_buffer="../aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv",
)
nodes.to_sql(name="nodes", con=con, if_exists="replace")

In [4]:
def _load_cluster_table(table_name, tsv_path):
    """Read a tab-separated cluster file and mirror it into SQLite as ``table_name``.

    Returns the loaded DataFrame; an existing table of the same name is replaced.
    """
    table = pd.read_csv(tsv_path, sep="\t")
    table.to_sql(table_name, con, if_exists="replace")
    return table


i2 = _load_cluster_table("i2", "out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv")
i4 = _load_cluster_table("i4", "out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv")
i6 = _load_cluster_table("i6", "out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv")
# Row count of each cluster table.
len(i2), len(i4), len(i6)

(3102, 3693, 3634)

In [5]:
# One comma-separated node-ID string per row, pooled across the i2, i4 and i6 tables.
nodes_in_cluster = list(pd.read_sql("""
SELECT i2.node_ids
FROM i2
UNION ALL
SELECT i4.node_ids
FROM i4
UNION ALL
SELECT i6.node_ids
FROM i6
""", con)["node_ids"])

# Number of distinct node IDs across all rows. A set comprehension is linear,
# whereas sum(list_of_lists, []) copies the accumulated list on every step.
len({node_id for node_ids in nodes_in_cluster for node_id in node_ids.split(",")})

10236

In [6]:
# Total number of rows across the three cluster tables, computed from the loaded
# frames so the figure cannot drift from the data if an input file changes.
sum(len(table) for table in (i2, i4, i6))

10429

In [7]:
# (cluster_id, node_ids) pairs from the i2 table. Passing the rows through a
# dict keeps a single entry per cluster_id (the last one encountered).
i2_cluster_rows = pd.read_sql("""
SELECT cluster_id, node_ids
FROM i2
""", con)
clusters = [*dict(i2_cluster_rows.values).items()]

# Show the last pair.
clusters[-1]

('Cluster3102', 'AT1G54870,AT5G54740')

In [8]:
# Look up the GO terms of two example node IDs. The comma-separated list is
# bound as a parameter, and both the list and node_id are wrapped in commas so
# that INSTR only matches whole IDs, never a substring of a longer ID.
pd.read_sql("""
SELECT node_id, "GO_terms"
FROM nodes
WHERE INSTR(',' || ? || ',', ',' || node_id || ',') > 0
""", con, params=("AT1G54870,AT5G54740",))

Unnamed: 0,node_id,GO_terms
0,AT1G54870,"GO:0016491, GO:0008106"
1,AT5G54740,"GO:0045735, GO:0008289"


In [9]:
# Parameterized lookup: the single "?" receives a comma-separated list of node
# IDs. Wrapping both the list and node_id in commas makes INSTR match whole IDs
# only, so a node_id that is merely a substring of a listed ID is not returned.
go_term_query = """
SELECT "GO_terms"
FROM nodes
WHERE INSTR(',' || ? || ',', ',' || node_id || ',') > 0
"""

def get_go_terms(node_ids, connection=None):
    """Fetch the GO_terms value of every row in ``nodes`` whose node_id is listed.

    Parameters
    ----------
    node_ids : str
        Comma-separated node IDs without spaces, e.g. ``"AT1G54870,AT5G54740"``.
    connection : DB-API connection, optional
        Database to query; defaults to the notebook-level ``con``.

    Returns
    -------
    pandas.DataFrame
        A single ``GO_terms`` column with one row per matching node.
    """
    if connection is None:
        connection = con
    # Bind node_ids as a parameter instead of formatting it into the SQL text,
    # so quotes or other SQL syntax inside the value cannot alter the query.
    return pd.read_sql(go_term_query, connection, params=(node_ids,))

# Try the helper on the node IDs of the last entry in `clusters`.
get_go_terms(clusters[-1][1])

Unnamed: 0,GO_terms
0,"GO:0016491, GO:0008106"
1,"GO:0045735, GO:0008289"


In [13]:
# GO-term frequencies for the first entry in `clusters`.
cluster1_go_terms = get_go_terms(clusters[0][1])
# Flatten with a nested comprehension (linear) rather than sum(lists, []),
# which copies the accumulated list on every step. Falsy entries (None or an
# empty string) are skipped.
cluster1_go_terms_flat = [
    term
    for terms in cluster1_go_terms["GO_terms"]
    if terms
    for term in terms.split(", ")
]
counts = Counter(cluster1_go_terms_flat)
# Terms ordered from most to least frequent. Building the tuple directly gives
# an empty tuple when nothing was counted, where next(zip(*pairs)) would raise
# StopIteration.
cluster1_go_terms_desc = tuple(term for term, _count in counts.most_common())
counts.most_common(5), cluster1_go_terms_desc[:5]

([('GO:0003735', 183),
  ('GO:0003729', 64),
  ('GO:0003723', 35),
  ('GO:0003674', 28),
  ('GO:0005515', 27)],
 ('GO:0003735', 'GO:0003729', 'GO:0003723', 'GO:0003674', 'GO:0005515'))

In [19]:
def go_terms_stats(node_ids):
    """Count how often each GO term occurs among the given nodes.

    ``node_ids`` is passed unchanged to ``get_go_terms``. Every truthy
    ``GO_terms`` value in the result is split on ``", "`` and the pieces are
    tallied.

    Returns a list of ``(go_term, count)`` tuples, most frequent first.
    """
    go_terms_raw = get_go_terms(node_ids)
    # Feed the Counter from a generator: linear in the number of terms, whereas
    # sum(list_of_lists, []) re-copies the accumulated list at every step.
    go_term_counts = Counter(
        term
        for terms in go_terms_raw["GO_terms"]
        if terms
        for term in terms.split(", ")
    )
    return go_term_counts.most_common()
    
def go_terms_by_cluster(clusters):
    """Map each cluster ID to its GO-term statistics.

    ``clusters`` is an iterable of ``(cluster_id, node_ids)`` pairs; the value
    stored for each ID is whatever ``go_terms_stats(node_ids)`` returns.
    """
    stats_by_cluster = {}
    for cluster_id, node_ids in clusters:
        stats_by_cluster[cluster_id] = go_terms_stats(node_ids)
    return stats_by_cluster

In [22]:
# Spot-check the per-cluster statistics on a single-element slice of `clusters`.
go_terms_by_cluster(clusters[100:101])

{'Cluster101': [('GO:0005524', 5),
  ('GO:0051082', 5),
  ('GO:0005515', 2),
  ('GO:0042803', 1),
  ('GO:0008270', 1),
  ('GO:0008483', 1),
  ('GO:0010285', 1),
  ('GO:0030170', 1),
  ('GO:0005507', 1),
  ('GO:0042623', 1),
  ('GO:0044183', 1),
  ('GO:0051787', 1),
  ('GO:0031072', 1),
  ('GO:0016887', 1)]}

In [18]:
# Peek at the final (cluster_id, node_ids) entry, kept as a one-element list.
clusters[-1:]

[('Cluster3102', 'AT1G54870,AT5G54740')]