# Make JSON of link info for each node in GFA

In [29]:
import re
import json

# Input GFA file to summarise.
GFA_F = "copan_0.gfa"
# Output JSON path derived from the input name: ".gfa" is replaced by "_links.json".
NODE_INFO_DICT = GFA_F.replace(".gfa", "_links.json")

def main():
    """Summarise every node of the GFA file and write the result as JSON.

    Reads ``GFA_F`` line by line. Each line is split on tabs and colons and
    the fields are then picked by position:

    * ``S`` lines append the sequence, contig, sample and orientation to the
      entry of the node they name, creating the entry the first time the node
      is seen.
    * ``L`` lines store the link on the first node and the reverse-complement
      link on the second node.

    The dictionary written to ``NODE_INFO_DICT`` has the shape::

        {node: {"seqs": [...], "contigs": [...], "samples": [...],
                "orientations": [...],
                "links": {other_node: {"target_orientation": ...,
                                       "source_orientation": ...}}}}

    Notes:
        * A link is only stored on a node that already has an entry, i.e.
          whose ``S`` line was read before the ``L`` line.
        * Links are keyed by the neighbouring node, so a later link between
          the same two nodes replaces the earlier one.
    """
    node_dict = {}
    # Compile once instead of re-parsing the pattern for every line.
    field_sep = re.compile("\t|:")

    with open(GFA_F, 'r') as f:
        for raw_line in f:
            # Drop the newline first so it can never end up inside the last
            # field (e.g. the orientation of an "L" line that has no further
            # columns after it).
            fields = field_sep.split(raw_line.rstrip("\n"))
            record_type = fields[0]

            if record_type == "S":
                node = fields[1]
                seq = fields[2]
                contig = fields[5]
                sample = fields[6]
                orientation = fields[10].strip()

                if node not in node_dict:
                    node_dict[node] = {
                        "seqs": [seq],
                        "contigs": [contig],
                        "samples": [sample],
                        "orientations": [orientation],
                        "links": {},
                    }
                else:
                    # possible issue: this allows for redundant info in the nested lists if there are multiple seqs
                    # belonging to the same node that also come from the same sample or contig
                    entry = node_dict[node]
                    entry["seqs"].append(seq)
                    entry["contigs"].append(contig)
                    entry["samples"].append(sample)
                    entry["orientations"].append(orientation)

            elif record_type == "L":
                node_1 = fields[1]
                node_1_orientation = fields[2]
                node_2 = fields[3]
                node_2_orientation = fields[4]

                if node_1 in node_dict:
                    node_dict[node_1]["links"][node_2] = {
                        "target_orientation": node_2_orientation,
                        "source_orientation": node_1_orientation,
                    }
                if node_2 in node_dict:
                    # Reverse-complement link: it leaves node_2 and arrives at
                    # node_1, so source and target swap roles and both
                    # orientations are flipped.
                    node_dict[node_2]["links"][node_1] = {
                        "target_orientation": reverse_complement(node_1_orientation),
                        "source_orientation": reverse_complement(node_2_orientation),
                    }

    with open(NODE_INFO_DICT, 'w') as f:
        json.dump(node_dict, f, indent=4)


def reverse_complement(forward_orientation):
    """Return the opposite orientation sign.

    "-" becomes "+"; every other value (normally "+") becomes "-".
    """
    return "+" if forward_orientation == "-" else "-"


# Run the GFA-to-JSON conversion only when executed directly, not when imported.
if __name__ == '__main__':
    main()


# Perform random walks through the graph

In [8]:
import random
import json

# JSON file holding the per-node link info to walk over.
NODE_INFO_DICT_F = "dummy_graph_links.json"
# Output path for the walks: the input name with "links" replaced by "walks".
WALKS_DICT_F = NODE_INFO_DICT_F.replace("links", "walks")
    
# set walk params
walk_length = 5 # maximum number of nodes in a walk, start node included (walks end early at dead ends)
n_walks = 2 # number of walks to perform per node


# Iter through nodes in node_dict
def main():
    """Run ``n_walks`` random walks from every node and save them as JSON.

    Loads the link dictionary from ``NODE_INFO_DICT_F``, delegates each walk
    to ``take_walk`` and writes the oriented walks, grouped by start node, to
    ``WALKS_DICT_F``. The orientation-free walks are only collected in memory.
    """
    with open(NODE_INFO_DICT_F, 'r') as links_file:
        node_dict = json.load(links_file)

    walks_dict = {}
    walks_noOrientation = []

    for start_node in node_dict:
        print(start_node)
        walks_dict[start_node] = []

        # take_walk hands back the updated counter, so it drives the loop.
        walks_done = 0
        while walks_done < n_walks:
            print("walk number: ", walks_done)
            walks_done = take_walk(node_dict, walks_dict, start_node, walks_done, walks_noOrientation)

    with open(WALKS_DICT_F, 'w') as out_file:
        json.dump(walks_dict, out_file, indent=4)


def take_walk(node_dict, walks_dict, start_node, walk_counter, walks_noOrientation):
    """Perform one random walk from ``start_node`` and record it.

    The first hop is drawn uniformly from all links of the start node; the
    orientation of the start node is whatever that link declares as its
    source orientation. The remaining hops are delegated to ``take_step``.
    A start node without links yields an empty walk.

    The oriented walk (``[node, orientation]`` pairs) is appended to
    ``walks_dict[start_node]`` and the plain node list to
    ``walks_noOrientation``; both containers are modified in place.

    Returns:
        ``walk_counter + 1``.
    """
    links = node_dict[start_node]["links"]
    candidates = list(links.keys())

    if candidates:
        first_hop = random.choice(candidates)
        first_link = links[first_hop]
        hop_orientation = first_link["target_orientation"]
        origin_orientation = first_link["source_orientation"]

        oriented_path = [[start_node, origin_orientation], [first_hop, hop_orientation]]
        plain_path = [start_node, first_hop]

        # One step has been taken already, hence the node counter of 1.
        oriented_path, plain_path = take_step(
            node_dict, walks_dict, first_hop, hop_orientation, 1, walk_counter,
            start_node, oriented_path, plain_path,
        )
    else:
        oriented_path = []
        plain_path = []

    walks_dict[start_node].append(oriented_path)
    walks_noOrientation.append(plain_path)

    print("walk with orientation: ", walks_dict)
    print("walk without orientation: ", walks_noOrientation)

    return walk_counter + 1
    

def take_step(node_dict, walks_dict, source_node, source_orientation, node_counter, walk_counter, start_node, path_wOrientation, path_noOrientation):
    """Extend a walk one node at a time until it is long enough or stuck.

    Starting at ``source_node``, repeatedly picks a uniformly random link
    whose source orientation equals the orientation the walk currently has,
    appends the linked node to both paths (in place) and moves on. Stops when
    ``node_counter`` reaches ``walk_length - 1`` or no link matches.

    ``walks_dict``, ``walk_counter`` and ``start_node`` are accepted but not
    used here.

    Returns:
        ``[path_wOrientation, path_noOrientation]``.
    """
    steps_taken = node_counter
    here, here_orientation = source_node, source_orientation

    while steps_taken < walk_length - 1:
        links = node_dict[here]["links"]
        # Only links that leave the current node in the orientation we hold.
        options = [
            neighbor
            for neighbor, link in links.items()
            if link["source_orientation"] == here_orientation
        ]
        if not options:
            break

        chosen = random.choice(options)
        chosen_orientation = links[chosen]["target_orientation"]

        path_wOrientation.append([chosen, chosen_orientation])
        path_noOrientation.append(chosen)

        steps_taken += 1
        here, here_orientation = chosen, chosen_orientation

    return [path_wOrientation, path_noOrientation]


# Generate the walks only when executed directly, not when imported.
if __name__ == '__main__':
    main()
    


A
walk number:  0
walk with orientation:  {'A': [[['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']]]}
walk without orientation:  [['A', 'D', 'E', 'C']]
walk number:  1
walk with orientation:  {'A': [[['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']], [['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']]]}
walk without orientation:  [['A', 'D', 'E', 'C'], ['A', 'D', 'E', 'C']]
B
walk number:  0
walk with orientation:  {'A': [[['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']], [['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']]], 'B': [[['B', '-'], ['C', '+']]]}
walk without orientation:  [['A', 'D', 'E', 'C'], ['A', 'D', 'E', 'C'], ['B', 'C']]
walk number:  1
walk with orientation:  {'A': [[['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']], [['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']]], 'B': [[['B', '-'], ['C', '+']], [['B', '-'], ['C', '+']]]}
walk without orientation:  [['A', 'D', 'E', 'C'], ['A', 'D', 'E', 'C'], ['B', 'C'], ['B', 'C']]
C
walk number:  0
walk with orientation:  {'A': [[['A', '-']

# Random walks with transition probabilities

In [None]:
import random
import json
import numpy as np

# JSON file holding the per-node link info to walk over.
NODE_INFO_DICT_F = "dummy_graph_links.json"

# set walk params
walk_length = 5  # maximum number of nodes in a walk, start node included (walks end early at dead ends)
n_walks = 2  # number of walks to perform per node
p = 1  # stepping back to the previous node is weighted 1/p
q = 1  # moving to a node that is not linked to the previous node is weighted 1/q
seed = 1  # base seed for the random number generators, for reproducibility

# name output file for walks_dict to include the walk length ("Lw"), the number of walks ("Nw")
# and the p and q parameters
path_addon = str(walk_length) + "Lw" + str(n_walks) + "Nw" + str(p) + "p" + str(q) + "q" + "_walks"

WALKS_DICT_F = NODE_INFO_DICT_F.replace("links", path_addon)


# Iter through nodes in node_dict
def main():
    """Generate the biased random walks for every node and save them as JSON.

    Loads the link dictionary from ``NODE_INFO_DICT_F``, lets ``take_walk``
    produce all walks of one start node at a time and writes the walks,
    grouped by start node, to ``WALKS_DICT_F``.
    """
    with open(NODE_INFO_DICT_F, 'r') as links_file:
        node_dict = json.load(links_file)

    walks_dict = {}

    for start_node in node_dict:
        print(start_node)
        walks_dict[start_node] = []
        # Every node starts counting its walks from zero.
        walks_dict = take_walk(node_dict, walks_dict, start_node, 0)

    with open(WALKS_DICT_F, 'w') as out_file:
        json.dump(walks_dict, out_file, indent=4)


def take_walk(node_dict, walks_dict, start_node, walk_counter):
    """Perform the walks for ``start_node`` and append them to ``walks_dict``.

    Walks are generated until ``walk_counter`` reaches ``n_walks``. For each
    walk the first hop is drawn uniformly from all links of the start node
    (whose orientation is taken from that link's source orientation); the
    remaining hops are delegated to ``take_step``, which also returns the
    updated walk counter. A start node without links gets no walks.

    The module-level ``seed`` is advanced by one before every walk and used to
    seed both ``random`` (first hop) and ``numpy.random`` (later hops, drawn
    in ``take_step``), so runs are reproducible while walks still differ.

    Returns:
        ``walks_dict`` (the same object that was passed in).
    """
    # ``seed`` lives at module level; without this declaration the
    # ``seed += 1`` below raises UnboundLocalError.
    global seed

    links = node_dict[start_node]["links"]
    neighbors = list(links.keys())

    if not neighbors:
        # Dead end: nothing to walk to from this node.
        return walks_dict

    while walk_counter < n_walks:
        seed += 1  # don't want to replicate walks, so changing seed for each walk
        random.seed(seed)
        # take_step draws from numpy's generator, so it has to be seeded too.
        np.random.seed(seed)

        next_node = random.choice(neighbors)
        next_node_orientation = links[next_node]["target_orientation"]
        start_node_orientation = links[next_node]["source_orientation"]

        path = [[start_node, start_node_orientation], [next_node, next_node_orientation]]

        # One step has been taken already, hence the node counter of 1.
        path, walk_counter = take_step(
            node_dict, walks_dict, next_node, next_node_orientation, 1, walk_counter,
            start_node, start_node_orientation, path, p, q,
        )

        walks_dict[start_node].append(path)

    return walks_dict
    

def take_step(node_dict, walks_dict, curr_node, curr_orientation, node_counter, walk_counter, prev_node, prev_orientation, path, p, q):
    """Extend ``path`` with biased random steps until it is long enough or stuck.

    At every step the candidates are the links of the current node whose
    source orientation equals the current orientation. Each candidate gets an
    unnormalised weight:

    * ``1/p`` if it is the previous node,
    * ``1.0`` if it is also a candidate link of the previous node,
    * ``1/q`` otherwise.

    The weights are normalised and the next node is drawn with
    ``np.random.choice``. ``path`` is extended in place with
    ``[node, orientation]`` pairs. Stepping stops when ``node_counter``
    reaches ``walk_length - 1`` or there are no candidates.

    ``walks_dict`` is accepted but not used here.

    Returns:
        ``[path, walk_counter + 1]``.
    """
    while node_counter < walk_length - 1:
        curr_links = node_dict[curr_node]["links"]
        candidates = [
            name for name, link in curr_links.items()
            if link["source_orientation"] == curr_orientation
        ]
        prev_links = node_dict[prev_node]["links"]
        prev_candidates = [
            name for name, link in prev_links.items()
            if link["source_orientation"] == prev_orientation
        ]

        if not candidates:
            break

        weights = {}
        for candidate in candidates:
            if candidate == prev_node:
                weight = 1 / p  # return to previous node
            elif candidate in prev_candidates:
                weight = 1.0  # shared neighbor of current and previous node
            else:
                weight = 1 / q  # moves away from the previous node
            weights[candidate] = weight

        # Normalise so the weights sum to one (the "Z" value in the literature).
        z = sum(weights.values())
        probabilities = {name: weight / z for name, weight in weights.items()}

        next_node = np.random.choice(list(probabilities.keys()), p=list(probabilities.values()))
        next_orientation = curr_links[next_node]["target_orientation"]
        path.append([next_node, next_orientation])

        node_counter += 1
        prev_node, prev_orientation = curr_node, curr_orientation
        curr_node, curr_orientation = next_node, next_orientation

    walk_counter += 1
    return [path, walk_counter]


# Generate the biased walks only when executed directly, not when imported.
if __name__ == '__main__':
    main()
    


# random walks -- adding feature to write walks for Word2Vec input

In [15]:
import random
import json
import numpy as np

# Input link info and the directories the two outputs are written to.
NODE_INFO_DICT_F = "copan_0_links.json"
DICT_OUTPUT_DIR = "walk_dicts_wOrientation"
LIST_OUTPUT_DIR = "walk_lists_noOrientation"

# Walk parameters.
walk_length = 80  # maximum number of nodes in a walk, start node included (walks end early at dead ends)
n_walks = 10  # number of walks to perform per node
p = 1  # stepping back to the previous node is weighted 1/p
q = 1  # moving to a node that is not linked to the previous node is weighted 1/q
SEED = 1

# Output names carry the walk length ("Lw"), number of walks ("Nw"), p and q.
dict_path_addon = "{}Lw{}Nw{}p{}q_walks_wOrientation".format(walk_length, n_walks, p, q)
list_path_addon = "{}Lw{}Nw{}p{}q_walks_noOrientation.txt".format(walk_length, n_walks, p, q)

WALKS_DICT_F = "/".join([DICT_OUTPUT_DIR, NODE_INFO_DICT_F.replace("links", dict_path_addon)])
WALKS_LIST_F = "/".join([LIST_OUTPUT_DIR, NODE_INFO_DICT_F.replace("links.json", list_path_addon)])

# Iter through nodes in node_dict
def main():
    """Generate the biased random walks and write them in two formats.

    Loads the link dictionary from ``NODE_INFO_DICT_F`` and lets ``take_walk``
    produce all walks of one start node at a time. Two files are written:

    * ``WALKS_DICT_F``: JSON mapping each start node to its walks as lists of
      ``[node, orientation]`` pairs.
    * ``WALKS_LIST_F``: plain text, one walk per line, node names joined by
      ``", "`` and without orientations.

    The directories of both output files are created if they do not exist.
    """
    import os  # local import: only needed to prepare the output directories

    with open(NODE_INFO_DICT_F, 'r') as f:
        node_dict = json.load(f)

    # Make sure the output directories exist before the (potentially long)
    # walk generation, so a missing directory cannot discard the results.
    for out_path in (WALKS_DICT_F, WALKS_LIST_F):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    walks_dict_wOrientation = {}
    walks_list_noOrientation = []

    for start_node in node_dict:
        print(start_node)
        walks_dict_wOrientation[start_node] = []

        # Every start node begins with walk counter 0 and the base seed.
        walks_dict_wOrientation, walks_list_noOrientation = take_walk(
            node_dict, walks_dict_wOrientation, start_node, 0, SEED, walks_list_noOrientation
        )

    with open(WALKS_DICT_F, 'w') as f:
        json.dump(walks_dict_wOrientation, f, indent=4)

    with open(WALKS_LIST_F, 'w') as f:
        # One walk per line, elements converted to strings and comma-separated.
        f.writelines(', '.join(map(str, walk)) + '\n' for walk in walks_list_noOrientation)
        

def take_walk(node_dict, walks_dict_wOrientation, start_node, walk_counter, seed, walks_list_noOrientation):
    """Perform the walks for ``start_node`` and record them in both formats.

    Walks are generated until ``walk_counter`` reaches ``n_walks``; the loop
    stops immediately if the start node has no links. The first hop is drawn
    uniformly with ``np.random.choice`` from all links of the start node
    (whose orientation is taken from that link's source orientation); the
    remaining hops, and the updated walk counter, come from ``take_step``.

    Each finished walk is appended to ``walks_dict_wOrientation[start_node]``
    as ``[node, orientation]`` pairs and to ``walks_list_noOrientation`` as a
    plain node list, and both forms are printed.

    ``seed`` is incremented per walk and forwarded to ``take_step``, but no
    random generator is seeded with it here (the seeding call is disabled).

    Returns:
        ``[walks_dict_wOrientation, walks_list_noOrientation]``.
    """
    while walk_counter < n_walks:
        seed += 1  # a different seed value for every walk

        links = node_dict[start_node]["links"]
        neighbors = list(links.keys())
        if not neighbors:
            break

        # np.random.seed(seed)
        first_hop = np.random.choice(neighbors)
        first_link = links[first_hop]
        hop_orientation = first_link["target_orientation"]
        origin_orientation = first_link["source_orientation"]

        oriented = [[start_node, origin_orientation], [first_hop, hop_orientation]]
        plain = [start_node, first_hop]

        # One step has been taken already, hence the node counter of 1.
        result = take_step(
            node_dict, first_hop, hop_orientation, 1, walk_counter,
            start_node, origin_orientation, oriented, plain, p, q, seed,
        )
        oriented, plain, walk_counter = result[0], result[1], result[2]

        walks_dict_wOrientation[start_node].append(oriented)
        walks_list_noOrientation.append(plain)

        print("path with orientaiton:", oriented)
        print("path no orientaiton:", plain)

    return [walks_dict_wOrientation, walks_list_noOrientation]
    

def take_step(node_dict, curr_node, curr_orientation, node_counter, walk_counter, prev_node, prev_orientation, path_wOrientation, path_noOrientation, p, q, seed):
    """Extend both paths with biased random steps until long enough or stuck.

    At every step the candidates are the links of the current node whose
    source orientation equals the current orientation. Each candidate gets an
    unnormalised weight:

    * ``1/p`` if it is the previous node,
    * ``1.0`` if it is also a candidate link of the previous node,
    * ``1/q`` otherwise.

    The weights are normalised and the next node is drawn with
    ``np.random.choice``. ``path_wOrientation`` receives
    ``[node, orientation]`` pairs and ``path_noOrientation`` the bare node;
    both are extended in place. Stepping stops when ``node_counter`` reaches
    ``walk_length - 1`` or there are no candidates.

    ``seed`` is accepted but currently unused (the seeding call is disabled).

    Returns:
        ``[path_wOrientation, path_noOrientation, walk_counter + 1]``.
    """
    while node_counter < walk_length - 1:
        curr_links = node_dict[curr_node]["links"]
        reachable = [
            name for name, link in curr_links.items()
            if link["source_orientation"] == curr_orientation
        ]
        prev_links = node_dict[prev_node]["links"]
        reachable_from_prev = [
            name for name, link in prev_links.items()
            if link["source_orientation"] == prev_orientation
        ]

        if not reachable:
            break

        weights = {}
        for name in reachable:
            if name == prev_node:
                weights[name] = 1 / p  # return to previous node
            elif name in reachable_from_prev:
                weights[name] = 1.0  # shared neighbor of current and previous node
            else:
                weights[name] = 1 / q  # moves away from the previous node

        # Normalise so the weights sum to one (the "Z" value in the literature).
        z = sum(weights.values())
        probabilities = {name: weight / z for name, weight in weights.items()}

        # np.random.seed(seed)
        next_node = np.random.choice(list(probabilities.keys()), p=list(probabilities.values()))
        next_orientation = curr_links[next_node]["target_orientation"]

        path_wOrientation.append([next_node, next_orientation])
        path_noOrientation.append(next_node)

        node_counter += 1
        prev_node, prev_orientation = curr_node, curr_orientation
        curr_node, curr_orientation = next_node, next_orientation

    walk_counter += 1

    return [path_wOrientation, path_noOrientation, walk_counter]


# Generate and write the walks only when executed directly, not when imported.
if __name__ == '__main__':
    main()
    


A
path with orientaiton: [['A', '+'], ['B', '+'], ['D', '-'], ['E', '+'], ['C', '+']]
path no orientaiton: ['A', 'B', 'D', 'E', 'C']
path with orientaiton: [['A', '-'], ['D', '-'], ['E', '+'], ['C', '+']]
path no orientaiton: ['A', 'D', 'E', 'C']
B
path with orientaiton: [['B', '+'], ['D', '-'], ['E', '+'], ['C', '+']]
path no orientaiton: ['B', 'D', 'E', 'C']
path with orientaiton: [['B', '-'], ['C', '+']]
path no orientaiton: ['B', 'C']
C
path with orientaiton: [['C', '-'], ['B', '+'], ['D', '-'], ['E', '+'], ['C', '+']]
path no orientaiton: ['C', 'B', 'D', 'E', 'C']
path with orientaiton: [['C', '-'], ['E', '-'], ['D', '+'], ['A', '+'], ['A', '+']]
path no orientaiton: ['C', 'E', 'D', 'A', 'A']
D
path with orientaiton: [['D', '+'], ['B', '-'], ['A', '-'], ['D', '-'], ['E', '+']]
path no orientaiton: ['D', 'B', 'A', 'D', 'E']
path with orientaiton: [['D', '+'], ['B', '-'], ['C', '+']]
path no orientaiton: ['D', 'B', 'C']
E
path with orientaiton: [['E', '-'], ['D', '+'], ['A', '+'], [

# Visualize Embeddings

In [4]:


import numpy as np
import pandas as pd
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from stellargraph import datasets
from IPython.display import display, HTML

# Instantiate stellargraph's Cora dataset and render its description as HTML.
dataset = datasets.Cora()
display(HTML(dataset.description))
# NOTE(review): presumably returns the graph plus the per-node subject labels,
# restricted to the largest connected component — confirm against the
# stellargraph documentation.
G, node_subjects = dataset.load(largest_connected_component_only=True)

ModuleNotFoundError: No module named 'stellargraph'