In [1]:
import json
import os

import graphviz as gv
import networkx as nx
import numpy as np
import pandas as pd
from Bio import SeqIO

In [2]:
# Both GEXF exports live in the same directory; name it once and derive the
# two file paths from it.
graphs_dir = "/sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/genera/Mycobacterium/graphs"

# Graph read from the "no_pfams" export (presumably the variant without Pfam
# nodes, going by the file name -- confirm).
no_pfams = nx.readwrite.read_gexf(graphs_dir + "/anti_gids_no_pfams.gexf")

# Graph read from the "with_pfams" export; the path is kept in its own name.
pfams_path = graphs_dir + "/anti_gids_with_pfams.gexf"
pfams = nx.readwrite.read_gexf(pfams_path)

In [2]:
# Load the antiSMASH results table (tab-separated, despite the .csv extension).
smash = pd.read_csv("antismash_db_results.csv", sep="\t")

# Keep only accessions that appear in more than 10 rows of the table.
counts = smash["NCBI accession"].value_counts()
above_10_index = counts[counts > 10].index
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the full table (SettingWithCopyWarning).
smash = smash[smash["NCBI accession"].isin(above_10_index)].copy()

# Drop everything from the first "." onwards, e.g. "NC_010612.1" -> "NC_010612".
smash["NCBI accession"] = smash["NCBI accession"].str.split(".").str[0]

# Map each accession to a 2-column array of its ("From", "To") values,
# one row per table entry.
smash_dict = {
    accession: group[["From", "To"]].to_numpy()
    for accession, group in smash.groupby("NCBI accession")
}

In [5]:
# Number of accessions kept in smash_dict.
len(smash_dict)

90

In [87]:
# The three accessions with the most rows in smash_dict, largest first
# (ties keep dictionary order because the sort is stable).
sorted(smash_dict, key=lambda accession: len(smash_dict[accession]), reverse=True)[:3]

['NC_010612', 'NZ_CP024190', 'NZ_AP018410']

In [37]:
# Inspect the ("From", "To") rows stored for a single accession.
smash_dict["NC_008705"]

array([[ 260663,  316802],
       [1140141, 1179943],
       [2127726, 2167061],
       [2404961, 2447356],
       [2476639, 2517560],
       [2980520, 3038252],
       [3296046, 3358234],
       [3683063, 3747542],
       [4479777, 4490167],
       [5080399, 5124335],
       [5321913, 5367367],
       [5403153, 5424104]])

In [42]:
def is_feature_in_bgc(contig_name, start, stop):
    """Report whether a feature lies entirely inside one region of its contig.

    The regions come from the module-level ``smash_dict``, which is indexed by
    ``contig_name`` and yields ``(b_start, b_stop)`` pairs. A feature counts
    only when it is fully contained in a single region; both ends are
    inclusive and a partial overlap does not count.

    Args:
        contig_name: Key to look up in ``smash_dict``.
        start: Feature start; any value accepted by ``int()``.
        stop: Feature end; any value accepted by ``int()``.

    Returns:
        bool: ``True`` if some region contains ``[start, stop]``; ``False`` if
        none does or if ``contig_name`` has no entry in ``smash_dict``.

    Raises:
        ValueError, TypeError: If ``start`` or ``stop`` cannot be converted
            with ``int()``.
    """
    start = int(start)
    stop = int(stop)
    try:
        bgcs = smash_dict[contig_name]
    except KeyError:
        # Only "contig has no regions" means False. Any other failure (for
        # example smash_dict not being defined yet in this kernel) must
        # surface instead of silently marking every feature as outside.
        return False
    return any(b_start <= start and stop <= b_stop for b_start, b_stop in bgcs)

In [94]:
def annotate_nodes(pfams):
    """Tag every node of ``pfams`` in place with resolution and BGC flags.

    For each node this:
      * parses ``node["features"]`` with ``json.loads`` when it is a string
        (after collapsing doubled quotes), otherwise uses it as-is;
      * sets ``node["resolution"]`` to ``"Pfam"`` when ``node["family"]``
        starts with ``"PF"``, else ``"PGFam"``;
      * walks ``features["info"][genome][contig]`` and stores an ``"in_bgc"``
        flag on each record, computed by ``is_feature_in_bgc`` from the
        contig name and the record's ``"start"`` / ``"end"``;
      * stores the parsed features back on the node and sets ``node["BGC"]``
        to ``True`` if any record was flagged.

    Args:
        pfams: Graph-like object whose ``nodes`` can be iterated and indexed
            by node id to obtain a mutable attribute mapping.

    Returns:
        None. The node attribute mappings are modified in place.
    """
    for node_id in pfams.nodes:
        attrs = pfams.nodes[node_id]

        raw_features = attrs["features"]
        if type(raw_features) == str:
            # Stored text uses doubled quotes; undo that before parsing.
            parsed = json.loads(raw_features.replace('""', '"'))
        else:
            parsed = raw_features

        attrs["resolution"] = "Pfam" if attrs["family"][:2] == "PF" else "PGFam"

        any_record_in_bgc = False
        for contigs in parsed["info"].values():
            for contig_name, records in contigs.items():
                for record in records:
                    hit = is_feature_in_bgc(contig_name, record["start"], record["end"])
                    record["in_bgc"] = hit
                    if hit:
                        any_record_in_bgc = True

        attrs["features"] = parsed
        attrs["BGC"] = any_record_in_bgc

In [77]:
# Count the nodes whose family id starts with "PF" and whose "BGC" flag is set.
# NOTE(review): reads the "BGC" attribute, which annotate_nodes writes --
# presumably annotate_nodes(pfams) must have been run first; confirm.
tot = sum(
    1
    for attrs in (pfams.nodes[node_id] for node_id in pfams.nodes)
    if attrs["BGC"] and attrs["family"][:2] == "PF"
)
print(tot)

3253


In [92]:
def annotate_edges(pfams):
    """Tag every edge of ``pfams`` in place with ``"BGC"`` and ``"LinkType"``.

    ``"BGC"`` is ``source["BGC"] and target["BGC"]`` for the two endpoint
    nodes. ``"LinkType"`` depends on how many endpoints have a ``"family"``
    starting with ``"PF"``: both -> ``"pfam"``, exactly one -> ``"mixed"``,
    none -> ``"pgfam"``.

    Args:
        pfams: Graph-like object providing ``edges(data=True)`` and a
            ``nodes`` mapping whose entries carry ``"BGC"`` and ``"family"``.

    Returns:
        None. The edge attribute dicts are modified in place.
    """
    # Indexed by the number of endpoints that are Pfam families (0, 1 or 2).
    link_type_by_pfam_count = ("pgfam", "mixed", "pfam")

    for source_id, target_id, edge_attrs in pfams.edges(data=True):
        source = pfams.nodes[source_id]
        target = pfams.nodes[target_id]

        both_in_bgc = source["BGC"] and target["BGC"]

        source_is_pfam = source["family"][:2] == "PF"
        target_is_pfam = target["family"][:2] == "PF"
        link_type = link_type_by_pfam_count[source_is_pfam + target_is_pfam]

        edge_attrs["BGC"] = both_in_bgc
        edge_attrs["LinkType"] = link_type


In [72]:
# Write the in-memory `pfams` graph (with whatever attributes it currently
# carries) to a GEXF file in the working directory.
nx.readwrite.write_gexf(pfams, "anti_gids_with_pfams_processed.gexf")

In [90]:
def process_graph(graph_path, save=True):
    """Read a GEXF graph, annotate its nodes and edges, and optionally save it.

    Args:
        graph_path: Path of the GEXF file to read.
        save: When true, write the annotated graph to
            ``<graph_path minus its final extension>_processed.gexf``.

    Returns:
        The annotated graph, so the result is still usable when ``save`` is
        false.
    """
    graph = nx.readwrite.read_gexf(graph_path)
    annotate_nodes(graph)
    annotate_edges(graph)
    if save:
        # Strip only the final extension. Splitting on the first "." mangled
        # any path with an earlier dot ("../x.gexf" -> "_processed.gexf",
        # "v1.2/x.gexf" -> "v1_processed.gexf").
        stem, _extension = os.path.splitext(graph_path)
        nx.readwrite.write_gexf(graph, stem + "_processed.gexf")
    return graph

In [95]:
# Annotate the graph stored in this GEXF file; with the default save=True the
# result is also written out under a "_processed.gexf" name.
process_graph("top_3_anti_gids_with_pfams.gexf")