In [1]:
import networkx as nx
from copy import deepcopy
import json
import random
import numpy as np
import pandas as pd

In [2]:
# with open("../test_data/toy-ppi-G.json") as src:
#     data = json.load(src)
#     j = nx.readwrite.json_graph.node_link_graph(data)

In [3]:
# with open("../scripts/test-G.json") as src:
#     data = json.load(src)
#     G = nx.readwrite.json_graph.node_link_graph(data)

In [4]:
# Results table read from disk; the file is tab-delimited despite its .csv name.
smash = pd.read_csv("../misc_data/antismash_db_results.csv", sep="\t")
# Rows per accession, counted before the accession column is rewritten below.
# Only the disabled filter underneath uses it.
counts = smash["NCBI accession"].value_counts()
# above_10_index = counts[counts > 10].index
# smash = smash[smash["NCBI accession"].isin(above_10_index)]

# Keep only the text before the first "." of each accession
# (presumably this strips a version suffix -- confirm against the data).
smash["NCBI accession"] = smash["NCBI accession"].map(
    lambda accession_id: accession_id.split(".")[0]
)

# accession -> 2-column array of the [From, To] values of its rows.
smash_dict = {
    accession: rows[["From", "To"]].to_numpy()
    for accession, rows in smash.groupby("NCBI accession")
}

In [85]:
# Source graph for the rest of the notebook, loaded from a GEXF file.
g = nx.read_gexf("../test_data/first_pass_split_0_train_pgfams.gexf")

In [6]:
# Independent copy of g. The split cell further down deletes node attributes
# from no_atts, while other cells keep reading the full attributes from g.
no_atts = deepcopy(g)

In [7]:
# class_map is created empty and not filled in this cell.
class_map = {}
# Node id -> position of that id in sorted order.
id_map = {
    node_id: position
    for position, node_id in enumerate(sorted(no_atts.nodes))
}

# Persist the mapping as JSON next to the notebook.
with open("test-id_map.json", "w") as dest:
    json.dump(id_map, dest)

In [8]:
# One "family" value per node, read from g and ordered by sorted node id
# (the ids are taken from no_atts, the attribute values from g).
feat_arr = [g.nodes[node_id]["family"] for node_id in sorted(no_atts.nodes)]
np.save("test-feats.npy", np.array(feat_arr))

In [95]:
def is_feature_in_bgc(contig_name, start, stop):
    """Tell whether the span [start, stop] lies fully inside a known region.

    Regions are looked up in the module-level ``smash_dict``, which maps a
    contig/accession name to an iterable of ``(from, to)`` pairs.

    Parameters
    ----------
    contig_name : hashable
        Key to look up in ``smash_dict``.
    start, stop : int or anything ``int()`` accepts
        Bounds of the feature.

    Returns
    -------
    bool
        ``True`` if some region satisfies ``from <= start`` and ``stop <= to``;
        ``False`` if none does or if ``contig_name`` has no entry.

    Raises
    ------
    NameError
        If ``smash_dict`` has not been defined yet (its cell was not run).
    """
    start = int(start)
    stop = int(stop)
    try:
        bgcs = smash_dict[contig_name]
    except KeyError:
        # Only "no regions recorded for this contig" means False. Anything
        # else (for example smash_dict not loaded) must surface, otherwise
        # every feature is silently reported as outside a region.
        return False
    return any(
        start >= b_start and stop <= b_stop
        for b_start, b_stop in bgcs
    )

def annotate_edges(pfams):
    """Add ``BGC`` and ``LinkType`` attributes to every edge, in place.

    ``BGC`` is ``u["BGC"] and v["BGC"]`` for the edge's two end nodes.
    ``LinkType`` depends on how many end nodes have a ``family`` value whose
    first two characters are ``"PF"``: both -> ``"pfam"``, exactly one ->
    ``"mixed"``, none -> ``"pgfam"``.

    Parameters
    ----------
    pfams : graph-like
        Must provide ``edges(data=True)`` and ``nodes[...]``; every node needs
        ``BGC`` and ``family`` attributes.

    Returns
    -------
    None
    """
    for src_id, dst_id, edge_attrs in pfams.edges(data=True):
        src = pfams.nodes[src_id]
        dst = pfams.nodes[dst_id]

        both_in_bgc = src["BGC"] and dst["BGC"]
        pfam_ends = (src["family"][:2] == "PF", dst["family"][:2] == "PF")

        if all(pfam_ends):
            link_kind = "pfam"
        elif any(pfam_ends):
            link_kind = "mixed"
        else:
            link_kind = "pgfam"

        edge_attrs["BGC"] = both_in_bgc
        edge_attrs["LinkType"] = link_kind

def _load_features(node):
    """Return ``node["features"]`` as a dict, decoding it from JSON when it is text."""
    raw = node["features"]
    if isinstance(raw, str):
        # The text form carries doubled double-quotes (""); collapse them
        # so the string parses as JSON.
        return json.loads(raw.replace('""', '"'))
    return raw


def _genome_spans(info):
    """Map each genome in ``info`` to ``(lowest start, highest end)`` over all its segments."""
    spans = {}
    for genome, contigs in info.items():
        segments = [seg for contig in contigs.keys() for seg in contigs[contig]]
        spans[genome] = (
            min(seg["start"] for seg in segments),
            max(seg["end"] for seg in segments),
        )
    return spans


def _flag_bgc_segments(info):
    """Set ``in_bgc`` on segments until the first hit; return whether a hit occurred."""
    for contigs in info.values():
        for contig, segments in contigs.items():
            for segment in segments:
                hit = is_feature_in_bgc(contig, segment["start"], segment["end"])
                segment["in_bgc"] = hit
                if hit:
                    # Stop at the first hit: segments after it are left
                    # without an "in_bgc" key.
                    return True
    return False


def add_min_dist_and_bgc_features(orig_graph):
    """Return a deep copy of ``orig_graph`` with BGC and distance attributes per node.

    Each node must carry a ``features`` attribute: either a dict or its JSON
    text, of the form ``{"info": {genome: {contig: [{"start": .., "end": ..},
    ...]}}}``. On the copy, every node gets:

    * ``features`` - the decoded dict; segments examined by the BGC scan gain
      an ``in_bgc`` flag (the scan stops at the first hit).
    * ``BGC`` - ``True`` if any examined segment is inside a region according
      to ``is_feature_in_bgc``.
    * ``dist_to_last`` / ``dist_to_next`` - the smallest gap, over all graph
      neighbors and all genomes shared with them, to a neighbor that starts
      at or before / after this node. A node's position in a genome is the
      span from its lowest ``start`` to its highest ``end`` in that genome.
      Overlaps clamp to 0, and 0 is also used when no neighbor qualifies.

    Parameters
    ----------
    orig_graph : graph-like
        Needs ``nodes`` (iterable, indexable by node id) and ``neighbors(id)``.
        It is not modified.

    Returns
    -------
    The annotated deep copy.

    Raises
    ------
    ValueError
        If a genome has no segments (``min``/``max`` of an empty sequence).
    """
    g = deepcopy(orig_graph)
    not_found = float("inf")

    for node_id in list(g.nodes):
        node = g.nodes[node_id]
        features = _load_features(node)
        info = features["info"]
        # Spans are computed before the BGC scan so malformed segment data
        # fails before any annotation is written.
        spans = _genome_spans(info)

        node_bgc = _flag_bgc_segments(info)
        node["features"] = features
        node["BGC"] = node_bgc

        nearest_previous = not_found
        nearest_next = not_found

        for neighbor_id in g.neighbors(node_id):
            # A neighbor visited earlier in this loop already holds a decoded
            # dict; one not yet visited may still hold JSON text.
            neighbor_spans = _genome_spans(_load_features(g.nodes[neighbor_id])["info"])

            for genome, (ng_start, ng_end) in neighbor_spans.items():
                if genome not in spans:
                    continue
                g_start, g_end = spans[genome]
                dist = g_start - ng_start
                if dist < 0:  # The neighbor is after the main node
                    dist = max(0, ng_start - g_end)
                    if dist < nearest_next:
                        nearest_next = dist
                else:  # The main node is after the neighbor
                    dist = max(0, g_start - ng_end)
                    if dist < nearest_previous:
                        nearest_previous = dist

        node["dist_to_last"] = 0 if nearest_previous == not_found else nearest_previous
        node["dist_to_next"] = 0 if nearest_next == not_found else nearest_next
    return g
    

def bare_graph(orig_graph, retain_features=()):
    """Return a deep copy of ``orig_graph`` whose nodes keep only selected attributes.

    Parameters
    ----------
    orig_graph : graph-like
        Needs ``nodes`` (iterable, indexable by node id, yielding mutable
        attribute dicts). It is not modified.
    retain_features : container of str, optional
        Attribute names to keep on every node. The default (empty) removes
        all node attributes.

    Returns
    -------
    The stripped deep copy. Edge attributes are left as they are.
    """
    graph = deepcopy(orig_graph)
    for node_id in list(graph.nodes):
        attrs = graph.nodes[node_id]
        # Snapshot the keys first: entries are deleted from the dict while looping.
        for key in list(attrs.keys()):
            if key not in retain_features:
                del attrs[key]
    return graph

In [92]:
# Scratch run of the annotation on g; `t` is not referenced by any other
# cell visible in this notebook.
t = add_min_dist_and_bgc_features(g)

> <ipython-input-90-c7ec7eef29e2>(72)add_min_dist_and_bgc_features()
-> min_previous_gene = ("", float("inf"))


(Pdb)  c


In [94]:
# Annotate a copy of g, keep only the four listed node attributes, then
# export the result as GEXF.
family_dists_graph = bare_graph(
    add_min_dist_and_bgc_features(g),
    retain_features=["BGC", "family", "dist_to_last", "dist_to_next"],
)
nx.write_gexf(
    family_dists_graph,
    "../test_data/test-families-and-dists-only-pgfams.gexf",
)

> <ipython-input-90-c7ec7eef29e2>(72)add_min_dist_and_bgc_features()
-> min_previous_gene = ("", float("inf"))


(Pdb)  c


In [10]:
# Copy of g whose nodes keep only their "family" attribute.
g_family = bare_graph(g, retain_features=["family"])

In [11]:
# Export the family-only graph as GEXF; write_gexf is given the path directly.
nx.write_gexf(g_family, "../test_data/test-families-only.gexf")

In [12]:
# Spot-check one node: after bare_graph only "family" should remain.
g_family.nodes["0"]

{'family': 'PF16450'}

In [9]:
# Assign every node of no_atts to train / validation / test and drop all
# other node attributes. The graph is modified in place.
nodes = list(no_atts.nodes)
random.seed(42)  # fixed seed so the shuffle, and therefore the split, is repeatable
random.shuffle(nodes)

val_percent = 10
test_percent = 20

num_records = len(nodes)

# First shuffled index of the validation and test ranges. Integer arithmetic
# avoids float rounding such as 0.7 * n landing just below a whole number.
val_start = num_records * (100 - val_percent - test_percent) // 100
test_start = num_records * (100 - test_percent) // 100

for ix, node_id in enumerate(nodes):
    attrs = no_atts.nodes[node_id]
    # Only the two split flags are kept. Clearing first removes everything
    # else (viz, position, size, ...) and makes the cell safe to re-run.
    attrs.clear()
    # ">=" rather than ">": the node sitting exactly on a boundary index
    # belongs to the range that starts there. With ">" the test set came up
    # one node short and the training set one node long.
    attrs["test"] = ix >= test_start
    attrs["val"] = val_start <= ix < test_start

In [10]:
# Export the split-annotated graph as GEXF. The node-link JSON export is kept
# below but disabled.
# with open("test-G.json", "w") as dest:
#     json.dump(nx.readwrite.json_graph.node_link_data(no_atts), dest)
nx.write_gexf(no_atts, "test-G.gexf")

In [99]:

# with open("test-G.json", "w") as dest:
#     json.dump(nx.readwrite.json_graph.node_link_data(no_atts), dest)

In [103]:
# Rebuild a graph from the node-link JSON file (presumably a round-trip check).
# NOTE(review): every line in this notebook that writes test-G.json is
# commented out, so on a fresh kernel the file must already exist on disk.
# The context manager closes the handle, which the bare open() call left open.
with open("test-G.json") as src:
    test = nx.readwrite.json_graph.node_link_graph(json.load(src))

In [None]:
# Shows the node-link (JSON-style) dict for the whole of g as the cell output.
# NOTE(review): for a large graph this is a very large output; consider
# inspecting a slice instead before sharing the notebook.
nx.readwrite.json_graph.node_link_data(g)