# Back to the basics: Single Path discovery

In [None]:
%load_ext autoreload
%autoreload 2

import networkx as nx
# import scipy
import matplotlib.pyplot as plt


# import clique_discovery 
# from helpers import *

My specific goal is to isolate all single paths in the traces, i.e. to find the longest sequences of symbols that appear consecutively in a set of traces. For that, I need to identify the loops and remove them. One significant fact is that loops must be identified within a single trace.

In this notebook and all the following ones, the functions are defined inline.

In [None]:
def graph(G, color="#cccccc", with_weigths=True):
    """
    Draw the graph ``G`` on a circular layout and show the figure.

    Parameters
    ----------
    G : networkx graph
        Graph to draw.
    color : str
        Node colour passed to ``nx.draw_networkx``.
    with_weigths : bool
        When True, label every edge that carries a ``'weight'`` attribute
        with that weight. Edges without a weight are simply left unlabelled.

    Side effects: sets ``plt.rcParams['figure.figsize']`` to ``[10, 6]`` and
    calls ``plt.show()``.
    """
    pos = nx.circular_layout(G)
    plt.rcParams['figure.figsize'] = [10, 6]
    plt.subplot(111)

    nx.draw_networkx(G, pos, width=1, node_color=color, with_labels=True,
                     connectionstyle='arc3, rad=0.03')
    if with_weigths:
        # Keep the real node objects as keys (no str() coercion) so the labels
        # can be matched against `pos`, and skip edges that have no 'weight'
        # instead of failing with a KeyError.
        weights = nx.get_edge_attributes(G, 'weight')
        if weights:
            nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=weights)
    plt.show()

In [None]:
def naive_graph( T, with_weigths=False ):
    """
    Draw the "naive" directly-follows graph of the traces in ``T``.

    Every pair of consecutive symbols (a, b) of every trace becomes a directed
    edge a -> b. The resulting graph is handed to ``graph`` for drawing.
    """
    transitions = []
    for trace in T:
        # Pair each symbol with the one that immediately follows it
        transitions.extend(zip(trace[:-1], trace[1:]))

    direct_follows = nx.DiGraph()
    direct_follows.add_edges_from(transitions)
    graph(direct_follows, with_weigths=with_weigths)

## Simple example

In [None]:
def get_successor_by_freq( traces ):
    """
    Group the successor pairs of all traces by their frequency of appearance.

    Each trace is expanded with ``get_successor_pairs``. The number of times a
    pair shows up over all traces is divided by the number of traces and
    rounded to 2 decimals. A pair can occur several times within one trace, so
    this "frequency" may be greater than 1.

    Parameters
    ----------
    traces : sized iterable of traces (each trace is a sequence of symbols)

    Returns
    -------
    dict
        ``{frequency: [(u, v), ...]}``; pairs keep their first-seen order.
        An empty ``traces`` yields ``{}``.

    >>> T = [ list("ABC"), list("ABCABC") ]
    >>> get_successor_by_freq(T)
    {1.5: [('A', 'B'), ('A', 'C'), ('B', 'C')], 0.5: [('B', 'A'), ('C', 'A'), ('C', 'B')]}
    """
    # Count occurrences as integers and divide once at the end: summing 1.0/L
    # over and over accumulates floating point error before the rounding step.
    occurrences = {}
    for trace in traces:
        for pair in get_successor_pairs(trace):
            occurrences[pair] = occurrences.get(pair, 0) + 1

    n_traces = float(len(traces))

    By_freq = {}
    for pair, count in occurrences.items():
        By_freq.setdefault(round(count / n_traces, 2), []).append(pair)
    return By_freq

In [None]:
def get_successor_pairs( T_prime ):
    r"""
    Get the near successor pairs of a single trace.

    Given the trace $T' = s_1 ... s_L$, for every $1 \le i < L$ take the
    maximal subtrace that follows position $i$,
    $T_{i,j} = s_{i+1} ... s_j$, such that $s_i \ne s_k$ for all $i < k \le j$
    (the window stops right before the next occurrence of $s_i$, or at the end
    of the trace when $s_i$ does not appear again).

    Return the concatenation, for increasing $i$, of the pairs $(s_i, s_k)$
    for every $s_k \in T_{i,j}$.

    ADDED 2020-01-23: pairs are not repeated within one window (no duplicates).

    Parameters
    ----------
    T_prime : sequence of hashable symbols (list, tuple or str)

    Returns
    -------
    list of (s_i, s_k) tuples; empty for traces with fewer than 2 symbols.

    >>> get_successor_pairs(list("ABCD"))
    [('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'), ('C', 'D')]
    >>> get_successor_pairs(list("ABBA"))
    [('A', 'B'), ('B', 'A')]
    """
    pairs = []
    for i in range(len(T_prime) - 1):
        s_i = T_prime[i]
        following = T_prime[i + 1:]

        # Cut the window right before the first j such that s_i == s_j
        if s_i in following:
            following = following[:following.index(s_i)]

        # dict.fromkeys drops repeated symbols while keeping first-seen order,
        # so every (s_i, s_k) is emitted once per window (2020-01-23 behavior).
        pairs.extend((s_i, s_k) for s_k in dict.fromkeys(following))

    return pairs

In [None]:
# A single path is shown as a complete graph 
get_successor_by_freq([list("ABC")])

In [None]:
# Note that in a loop the path appears inverted
get_successor_by_freq([list("ABCABC")])

In [None]:
# Note what happens when noise is introduced
get_successor_by_freq([list(".ABAB")]), \
get_successor_by_freq([list("AB.AB")]), \
get_successor_by_freq([list("ABAB.")])

In [None]:
# Here the symbols are merged simulating parallel execution, when no order is guaranteed between processes
get_successor_by_freq([list("1A2BA1B2")])

## Simple Loop

In [None]:
T = [
    list("ABCDABCD"),
]

In [None]:
# This graph is a simple loop.
naive_graph(T)

In [None]:
# Build an auxiliary graph for one frequency
def successors_in_one_freq( successor_pairs_f ):
    """
    Build one directed graph per frequency.

    ``successor_pairs_f`` maps a frequency to a list of pairs. The result maps
    each frequency to a ``nx.DiGraph`` holding exactly those pairs as edges,
    each edge weighted with the frequency.
    """
    graphs_by_freq = {}
    for freq, pairs in successor_pairs_f.items():
        layer = nx.DiGraph()
        graphs_by_freq[freq] = layer
        for pair in pairs:
            layer.add_edge(pair[0], pair[1], weight=freq)
    return graphs_by_freq

In [None]:
# Build the graph of successors for one frequency
for f, G in successors_in_one_freq(get_successor_by_freq( T )).items():
    graph(G)

In [None]:
# The list T has only one item, so the pairs below represent that single path.
succ_of_T = get_successor_by_freq(T)

In [None]:
def successorsGraph(successor_by_freq):
    """
    Merge all successor pairs into a single weighted directed graph.

    ``successor_by_freq`` maps a frequency to a list of (u, v) pairs; every
    pair becomes the edge u -> v whose ``weight`` is that frequency.
    """
    merged = nx.DiGraph()
    weighted_edges = (
        (src, dst, freq)
        for freq, pairs in successor_by_freq.items()
        for src, dst in pairs
    )
    for src, dst, freq in weighted_edges:
        merged.add_edge(src, dst, weight=freq)
    return merged

In [None]:
# Let's build the graph of all (u,v) in successor_by_freq

bigG = successorsGraph(succ_of_T)
graph(bigG)

Now back to an idea I had weeks ago: search for cliques within the same frequency, but now extracting the loops in each trace. Let's build a dict of graphs `G_freq[f]`, one per frequency.

In [None]:
# Now with the complex T
G_freq = successors_in_one_freq(succ_of_T) # { f: successors_graph(pairs, f) for f, pairs in succ_of_T.items() }

But the cliques appear duplicated across different frequencies.

Cliques are sets of vertices:

In [None]:
{ f: list(nx.algorithms.clique.find_cliques( G_freq[f].to_undirected() )) for f in G_freq.keys() }

In [None]:
# Sort the cliques and apply the rules:
# 1) The nodes of a single path, in their equivalent pair graph, have in_degree=0,1,2,...
# 2) A path is composed of at least 2 nodes

def infer_paths(G_freq, min_clique_size=2):
    """
    Infer single paths from the cliques of every per-frequency graph.

    For each frequency ``f`` the maximal cliques of the undirected version of
    ``G_freq[f]`` are enumerated. A clique is reported as a path when

    1. it has at least ``min_clique_size`` nodes, and
    2. inside the directed subgraph induced by the clique, the in-degrees of
       its nodes, sorted increasingly, are exactly 0, 1, 2, ..., len - 1.

    Parameters
    ----------
    G_freq : dict
        ``{frequency: nx.DiGraph}``.
    min_clique_size : int
        Minimum number of nodes for a clique to be considered a path.

    Returns
    -------
    dict
        ``{frequency: [path, ...]}`` where each path is the list of clique
        nodes ordered by in-degree. Frequencies without any path are omitted.
    """
    paths_f = {}
    for f, G_f in G_freq.items():
        paths = []
        for clique in nx.algorithms.clique.find_cliques(G_f.to_undirected()):
            # CRITERIA 2: too small to be a path, skip before doing any work
            if len(clique) < min_clique_size:
                continue

            # Induced directed view on the clique nodes; this avoids copying
            # the whole graph for every clique just to delete the other nodes.
            by_in_degree = sorted(G_f.subgraph(clique).in_degree(),
                                  key=lambda p: p[1])

            # CRITERIA 1 (strict): in_degree(n) in [0, ..., len(N)-1]
            if all(deg == rank for rank, (_, deg) in enumerate(by_in_degree)):
                paths.append([node for node, _ in by_in_degree])
        if paths:
            paths_f[f] = paths
    return paths_f

Sort the cliques and apply the rule:

**Claim (1)**: The nodes of a single path in their equivalent pair graph can be ordered by in_degree, starting at 0. $in\_degree=0,1,2,...$ 

In [None]:
infer_paths(G_freq)

In [None]:
def split_in_freqGraph( successorsGraph ):
    """
    Split a weighted successors graph into one graph per edge weight.

    For every distinct ``weight`` found on the edges, the result holds a copy
    of the input graph from which all edges with a different weight were
    removed. Nodes are never removed, so each copy keeps the full node set.

    Returns a dict ``{weight: graph_copy}``.
    """
    weighted_edges = []
    for u, v in successorsGraph.edges:
        weighted_edges.append((u, v, successorsGraph[u][v]["weight"]))

    layers = {}
    for freq in set(w for _, _, w in weighted_edges):
        layer = successorsGraph.copy()
        # Drop every edge that belongs to another frequency
        for u, v, w in weighted_edges:
            if w != freq:
                layer.remove_edge(u, v)
        layers[freq] = layer
    return layers

In [None]:
# Now all together: build the successors graph of T, split it into one graph
# per frequency, draw the merged graph and infer the paths from the split.
T = [
    list("ABCDABCD"),
]
bigG = successorsGraph(  get_successor_by_freq(T)  )
G_freq = split_in_freqGraph( bigG )

# Uncomment to draw every per-frequency graph separately:
# for f, G in G_freq.items():
#     graph(G)
graph(bigG)

infer_paths(G_freq)

In [None]:
# Single path again, using the clique method
T = [
    list("abcde")
]
bigG = successorsGraph(  get_successor_by_freq(T)  )

naive_graph(T)
graph(bigG)
infer_paths(  split_in_freqGraph( bigG ) )

## Multiple traces , no loop

In [None]:
# Expected paths: ABC 123
T = [
    list("ABC123"),
    list("123ABC"),    
]
bigG = successorsGraph(  get_successor_by_freq(T)  )
infer_paths( split_in_freqGraph( bigG ) )

In [None]:
# Expected paths: ABC 123
T = [
    list("ABC123"), list("ABC123"),
    list("123ABC"), list("123ABC"),
    list("123"), list("123"), list("123"), list("123"), list("123"), 
]
bigG = successorsGraph(  get_successor_by_freq(T)  )
infer_paths( split_in_freqGraph( bigG ) )

## Simple Loops

In [None]:
# Expected Loops: ABC
T = [
    list("ABCABC")
]
bigG = successorsGraph(  get_successor_by_freq(T)  )
naive_graph(T)
paths = infer_paths( split_in_freqGraph( bigG ) )
paths

**Claim (2)**: If the nodes $V$ form a loop in a pairs graph $G$, then $| freq(AB) - freq(BA) | = 1$, $\forall A,B \in V(G)$ (2)
```
    ABC ABC ..  ABC ABC = N
    
    AB  AB  ..  AB  AB  = N
     \  / \     / \ / 
     B--A  B..--A B-A   = N-1

    A-C A-C ..  A-C A-C = N
      \ / \     / \ /   
      C-A C-..--A C-A   = N-1
```

In [None]:
# Nodes part of a loop
# Search all n in bigG such that
# | freq(AB) - freq(BA) | = 1 \forall A,B \in V(G)

def is_a_loop_criteria(u, v, bigG):
    """
    Tell whether the nodes ``u`` and ``v`` satisfy the loop criteria.

    The criteria holds when both edges u -> v and v -> u exist in ``bigG`` and
    their weights differ by 1: | freq(uv) - freq(vu) | = 1.

    The weights are floats (frequencies rounded to 2 decimals), so the
    difference is compared with a small tolerance: for instance
    1.67 - 0.67 evaluates to 0.9999999999999999, not 1.0.

    Returns a bool.
    """
    edges = bigG.edges
    if (u, v) not in edges or (v, u) not in edges:
        return False
    gap = abs(edges[u, v]['weight'] - edges[v, u]['weight'])
    return abs(gap - 1) < 1e-9


def loops_from_G( bigG, verbose=False ):
    """
    Collect every node of ``bigG`` that takes part in a loop.

    A node is part of a loop when it fulfils ``is_a_loop_criteria`` with at
    least one other node. Only node pairs joined by an edge can fulfil the
    criteria, so it is enough to visit the edges of the graph. This also works
    for any kind of node label (multi-character strings, ints, ...).

    Parameters
    ----------
    bigG : weighted directed graph (edges carry a ``'weight'`` attribute)
    verbose : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    set of nodes
    """
    loops = set()
    for u, v in bigG.edges:
        if is_a_loop_criteria(u, v, bigG):
            loops.update((u, v))
    return loops

In [None]:
# Let's see. 
loops_from_G(bigG, verbose=True)

In [None]:
# Expected Loops: ABC 123
T = [
    list("ABCABC123123")
]
bigG = successorsGraph(  get_successor_by_freq(T)  )
naive_graph(T)
paths = infer_paths( split_in_freqGraph( bigG ) )
paths

In [None]:
loops_from_G(bigG, verbose=True)

## Annoying Loop Example

In [None]:
# More interesting series. Expected Loops: ABC 123
T = [
#     list("A1B23C1A2B3C1ABC23zxcv"),
    list("ABCABCABC123123123")
#     list("aAb1B2cd3Cefgh1iAjk2Blmnop3qCrst1AuBC2vwxyz3"),
#     list(".1A.B2.C3..ABC...A1..2B..3C.")
]
bigG = successorsGraph(  get_successor_by_freq(T)  )
# graph( bigG, with_weigths=False)
paths = infer_paths( split_in_freqGraph( bigG ) )
# paths

In [None]:
# for f, pairs in get_successor_by_freq(T).items():
#     print(f, [(a,b) for a, b in pairs \
#         if a in list("ABC123") and b in list("ABC123") ] )

In [None]:
# Let's see. 
nodes_in_loops = loops_from_G(bigG, verbose=True)

In [None]:
nodes_in_loops

In [None]:
bigG.out_degree()

In [None]:
# Every subclique is a clique, hence a path in T. Let's search the ordered
# intersection with the nodes in loops: print, with its frequency f, every
# inferred path whose nodes all belong to nodes_in_loops.
for f, cliques in paths.items():
    for path in cliques:
        if all([ p in nodes_in_loops for p in path ]):
            print (f, path)

In [None]:
paths

Claim: all paths (cliques) forming a base are loops. Starting from max(f) -> min(f).