# Test Battery for Clique Detection

In [10]:
# Parameters
# Identifier of the detection pipeline that evaluate_against() dispatches on.
METHOD_ID = "2020-01-24 all pairs"

In [11]:
%load_ext autoreload
%autoreload 2

import networkx as nx
# import scipy
import matplotlib.pyplot as plt

In [35]:
def evaluate_against(T, expected_paths):
    """
    Run the detection pipeline selected by ``METHOD_ID`` on the traces ``T``
    and print how the detected paths compare with ``expected_paths``.

    Parameters
    ----------
    T : list of traces, each trace being a list of symbols.
    expected_paths : sized collection of sequences; each one is turned into
        a list and compared with the detected paths (lists of nodes).

    Returns
    -------
    (good, bad) : two lists of strings. ``good`` holds the detected paths that
        are in ``expected_paths``; ``bad`` holds all other detected paths.
        Each path is joined into one string, so nodes must be strings.

    Raises
    ------
    ValueError : when ``METHOD_ID`` is not a known method.
    """
    if METHOD_ID != "2020-01-24 all pairs":
        raise ValueError("Not valid METHOD_ID")
    detected_by_freq = infer_paths(
        split_in_freqGraph(successorsGraph(get_successor_by_freq(T)))
    )

    expected = [list(path) for path in expected_paths]
    good, bad = [], []
    for candidates in detected_by_freq.values():
        for candidate in candidates:
            bucket = good if candidate in expected else bad
            bucket.append("".join(candidate))

    undetected = len(expected_paths) - len(good)
    if undetected != 0:
        # Surround the warning with two blank lines on each side so it stands out.
        print()
        print()
        print("WARNING HERE! OJO AQUI!")
        print()
        print()
    print("These %s paths were correctly detected (%s undetected):" % (len(good), undetected))
    print(good)
    print()

    print("These %s paths are spurious:" % len(bad))
    print(bad)
    return good, bad

In [13]:
def graph(G, color="#cccccc", with_weigths=True):
    """
    Draw ``G`` on a circular layout and show the figure.

    Parameters
    ----------
    G : networkx graph to draw. When ``with_weigths`` is true every edge must
        carry a ``'weight'`` attribute.
    color : node colour handed to ``nx.draw_networkx``.
    with_weigths : when true, label each edge with its ``'weight'``.
        (The misspelled name is kept so existing keyword callers keep working.)

    Side effects: sets ``plt.rcParams['figure.figsize']`` to ``[10, 6]`` and
    calls ``plt.show()``.
    """
    pos = nx.circular_layout(G)
    plt.rcParams['figure.figsize'] = [10, 6]
    plt.subplot(111)

    nx.draw_networkx(G, pos, width=1, node_color=color, with_labels=True, connectionstyle='arc3, rad=0.03')
    if with_weigths:
        # Key the labels by the real node objects. Converting them with str()
        # only matches the graph's edges (and the keys of ``pos``) when every
        # node already is a string.
        weights = {(u, v): G[u][v]['weight'] for u, v in G.edges()}
        nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=weights)
    plt.show()

In [14]:
def get_successor_by_freq( traces ):
    """
    Group the successor pairs of all traces by their per-trace frequency.

    Every pair returned by ``get_successor_pairs`` for a trace adds
    ``1 / len(traces)`` to that pair's score. A pair that occurs several
    times inside one trace is counted each time, so a score can exceed 1.
    Scores are rounded to two decimals and used as the keys of the result.

    Returns a dict mapping each rounded score to the list of ``(u, v)`` pairs
    that have it. An empty ``traces`` yields an empty dict.

    >>> get_successor_by_freq([ list("AB") ])
    {1.0: [('A', 'B')]}
    """
    n_traces = float(len(traces))

    score_of = {}
    for trace in traces:
        for pair in get_successor_pairs(trace):
            score_of[pair] = score_of.get(pair, 0.0) + 1.0/n_traces

    grouped = {}
    for (u, v), score in score_of.items():
        grouped.setdefault(round(score, 2), []).append( (u, v) )
    return grouped

In [15]:
def successorsGraph(successor_by_freq):
    """
    Build a directed graph from a ``{frequency: [(u, v), ...]}`` mapping.

    Each pair becomes the edge ``u -> v`` with the frequency stored in its
    ``weight`` attribute. If a pair is listed under several frequencies, the
    last one seen wins.
    """
    digraph = nx.DiGraph()
    digraph.add_weighted_edges_from(
        (u, v, freq)
        for freq, pairs in successor_by_freq.items()
        for u, v in pairs
    )
    return digraph

In [16]:
# Sort the nodes of each clique by in-degree and apply these rules:
# 1) The nodes of a single path, restricted to their clique, have in_degree = 0, 1, 2, ...
# 2) A path is composed of at least 2 nodes

def infer_paths(G_freq, min_clique_size=2):
    """
    Extract ordered paths from the maximal cliques of each frequency graph.

    Parameters
    ----------
    G_freq : dict mapping a frequency to a directed graph.
    min_clique_size : smallest number of nodes a clique needs to be reported.

    For every frequency, the maximal cliques of the undirected version of the
    graph are enumerated. Each clique is reduced to the directed subgraph on
    its nodes, and the nodes are sorted by in-degree. The clique is accepted
    as a path only when the sorted in-degrees are exactly 0, 1, ..., n-1.

    Returns a dict mapping frequency to a list of paths (lists of nodes in
    increasing in-degree). Frequencies with no accepted path are omitted.
    """
    paths_f = {}
    for f, graph_f in G_freq.items():
        accepted = []
        for clique in nx.algorithms.clique.find_cliques(graph_f.to_undirected()):
            # Directed subgraph induced by the clique: copy, then drop the rest.
            induced = graph_f.copy()
            induced.remove_nodes_from(set(induced.nodes) - set(clique))

            ranked = sorted(induced.in_degree(), key=lambda item: item[1])
            if len(ranked) < min_clique_size:
                continue
            # Strict check: the i-th node must have in-degree i.
            if all(degree == rank for rank, (_, degree) in enumerate(ranked)):
                accepted.append([node for node, _ in ranked])
        if accepted:
            paths_f[f] = accepted
    return paths_f

In [17]:
# Including Modifications by Andres (20200124)
def get_successor_pairs( T_prime ):
    """
    Return every ordered pair ``(s_i, s_k)`` with ``i < k`` from a trace.

    Given the trace ``T' = s_1 ... s_L``, each symbol ``s_i`` is paired with
    every symbol that comes after it, including later repetitions of ``s_i``
    itself ("all friends with all, including loops", 2020-01-24). Pairs are
    emitted in position order and repeated pairs are kept.

    An earlier variant (the commented-out 2020-01-20 behaviour) stopped
    pairing at the next occurrence of ``s_i``; that truncation is no longer
    applied.

    Parameters
    ----------
    T_prime : any iterable of symbols (list, tuple, str, ...). It is not
        modified.

    Returns
    -------
    list of ``(s_i, s_k)`` tuples; empty for traces with fewer than 2 symbols.

    >>> get_successor_pairs(list("ABCD"))
    [('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'), ('C', 'D')]
    >>> get_successor_pairs(list("ABA"))
    [('A', 'B'), ('A', 'A'), ('B', 'A')]
    """
    # Materialise once so that strings, tuples and generators work as well as lists.
    symbols = list(T_prime)
    return [
        (s_i, s_k)
        for i, s_i in enumerate(symbols)
        for s_k in symbols[i + 1:]
    ]

In [18]:
def split_in_freqGraph( successorsGraph ):
    """
    Split a weighted directed graph into one graph per distinct edge weight.

    For every weight ``f`` found on the edges, the result holds a copy of the
    input graph from which every edge whose weight differs from ``f`` has been
    removed. All nodes are kept in every copy, even if they end up isolated.

    Returns a dict mapping each weight to its graph.
    """
    weight_of = {
        (u, v): successorsGraph[u][v]["weight"] for u, v in successorsGraph.edges
    }
    layers = {}
    for freq in set(weight_of.values()):
        layer = successorsGraph.copy()
        for (u, v), weight in weight_of.items():
            if weight != freq:
                layer.remove_edge(u, v)
        layers[freq] = layer
    return layers

## Tested examples

### Simple Loop Examples

In [41]:
# One trace in which the loop ABCD runs twice.
T = [list(trace) for trace in ("ABCDABCD",)]
g, b = evaluate_against(T, expected_paths=["ABCD"])

These 1 paths were correctly detected (0 undetected):
['ABCD']

These 0 paths are spurious:
[]


In [42]:
# Two independent loops, each in its own trace(s).
T = [list(trace) for trace in ("ABCDABCD", "1234512345", "12345")]
g, b = evaluate_against(T, expected_paths=["ABCD", "12345"])

These 2 paths were correctly detected (0 undetected):
['ABCD', '12345']

These 0 paths are spurious:
[]


### Complex Loop Examples

In [43]:
# Type 75: ABC and 123 interleaved in one trace, followed by zxcv.
T = [list(trace) for trace in ("A1B23C1A2B3C1ABC23zxcv",)]
g, b = evaluate_against(T, expected_paths=("ABC", "123", "zxcv"))

These 3 paths were correctly detected (0 undetected):
['zxcv', 'ABC', '123']

These 12 paths are spurious:
['A1', '2B', 'C3', 'C2', '1A', 'B2', '3C', '2C', 'AB3', 'A23', '1BC', '1B3']


In [44]:
# Type 76: loop ABC followed by loop 123 within the same trace.
T = [list(trace) for trace in ("ABCABCABC123123123123",)]
g, b = evaluate_against(T, expected_paths=("ABC", "123"))

These 2 paths were correctly detected (0 undetected):
['123', 'ABC']

These 9 paths are spurious:
['B1', 'B3', 'B2', 'C1', 'C3', 'C2', 'A1', 'A3', 'A2']


In [45]:
# Type 76-b: the same two loops as Type 76, but each in its own trace.
T = [list(trace) for trace in ("ABCABCABC", "123123123123")]
g, b = evaluate_against(T, expected_paths=("ABC", "123"))

These 2 paths were correctly detected (0 undetected):
['ABC', '123']

These 0 paths are spurious:
[]


In [46]:
# Type 77: ABC and 123 scattered through a single pass over the alphabet.
T = [list(trace) for trace in ("aAb1B2cd3Cefgh1iAjk2Blmnop3qCrst1AuBC2vwxyz3",)]
g, b = evaluate_against(T, expected_paths=("ABC", "123", "abcdefghijklmnopqrstuvwxyz"))

These 3 paths were correctly detected (0 undetected):
['abcdefghijklmnopqrstuvwxyz', '123', 'ABC']

These 126 paths are spurious:
['An', 'Ao', 'fA', 'cA', 'Al', 'iA', 'hA', 'Ap', 'Aj', 'Am', 'Ak', 'gA', 'eA', 'Ar', 'bA', 'Aq', 'As', 'dA', 'At', '1n', 'g1', '1i', '1r', 'h1', '1o', '1q', '1p', '1j', '1s', 'd1', 'e1', '1t', '1m', 'f1', '1k', 'c1', '1l', 'Bn', 'Bo', 'fB', 'cB', 'Bl', 'iB', 'hB', 'Bp', 'jB', 'Bm', 'kB', 'Bu', 'gB', 'eB', 'Br', 'Bq', 'Bs', 'dB', 'Bt', 'nC', 'gC', 'iC', 'Cr', 'hC', 'oC', 'qC', 'kC', 'pC', 'jC', 'Cs', 'eC', 'Ct', 'mC', 'fC', 'Cu', 'lC', 'd2', '2n', 'g2', 'i2', '2r', 'h2', '2o', '2q', 'k2', '2p', 'j2', '2s', 'e2', '2t', '2m', 'f2', '2u', 'c2', '2l', '3v', 'n3', 'g3', 'o3', 'e3', 'f3', '3x', 'k3', 'l3', '3z', '3y', 'i3', '3r', 'h3', '3q', 'p3', 'j3', '3s', '3t', 'm3', '3u', '3w', 'A1', '2B', 'C2', 'C3', '1A', 'B2', '2C', '3C', '1BC', '1B3', 'AB3', 'A23']


In [47]:
# Type 78: ABC and 123 mixed with a repeated filler symbol ".".
T = [list(trace) for trace in (".1A.B2.C3..ABC...A1..2B..3C.",)]
g, b = evaluate_against(T, expected_paths=("ABC", "123"))

These 2 paths were correctly detected (0 undetected):
['123', 'ABC']

These 24 paths are spurious:
['3A', '2A', 'B1', 'C1', '3B', 'C2', 'A3', 'A2', '1B', '1C', 'B3', '2C', '.1', '3.', '.2', '.A', '2.', '.3', '1.', '.B', 'C.', '.C', 'B.', 'A.']


Claim: all paths (cliques) forming a base are loops. Starting from max(f) -> min(f).

### Examples from previous notebooks

In [48]:
# combined_loops: lk, 12 and ABCDE, repeated a different number of times per trace.
T = [
    list(trace)
    for trace in (
        "lklk1212ABCDEABCDEABCDE1212",
        "lk12ABCDEABCDE",
        "lk12ABCDE",
    )
]
g, b = evaluate_against(T, expected_paths=("ABCDE", "lk", "12"))

These 3 paths were correctly detected (0 undetected):
['lk', 'ABCDE', '12']

These 24 paths are spurious:
['k1', 'l1', 'k2', 'l2', '1A', '1B', '1E', '1C', '1D', 'kA', 'kB', 'kE', 'kC', 'kD', 'lA', 'lB', 'lE', 'lC', 'lD', '2A', '2B', '2E', '2C', '2D']


In [49]:
"""
head_with_loop = 12 ABCD* ef*

12 @ABCD @ef
"""
# Single trace: head 12, then the ABCD and ef loops alternating.
T = [list(trace) for trace in ("12ABCDefABCDABCDef",)]
g, b = evaluate_against(T, expected_paths=("ABCD", "12", "ef"))

These 3 paths were correctly detected (0 undetected):
['12', 'ef', 'ABCD']

These 20 paths are spurious:
['fA', '1f', 'fB', 'fC', '2f', 'fD', 'eA', '1e', 'eB', 'eC', '2e', 'eD', 'Bf', 'Cf', 'Af', 'Df', 'Be', 'Ce', 'Ae', 'De']


In [50]:
"""
  @lk - @12 - @ABCDE
"""
# Three traces chaining the lk, 12 and ABCDE loops.
T = [
    list(trace)
    for trace in (
        "lklk1212ABCDEABCDEABCDE1212",
        "lk12ABCDEABCDE",
        "lk12ABCDE",
    )
]
g, b = evaluate_against(T, expected_paths=["ABCDE", "lk", "12"])

These 3 paths were correctly detected (0 undetected):
['lk', 'ABCDE', '12']

These 24 paths are spurious:
['k1', 'l1', 'k2', 'l2', '1A', '1B', '1E', '1C', '1D', 'kA', 'kB', 'kE', 'kC', 'kD', 'lA', 'lB', 'lE', 'lC', 'lD', '2A', '2B', '2E', '2C', '2D']


In [51]:
"""
   ABC * 123        
"""
# The 20 interleavings of ABC and 123 that keep each sequence's own order.
T = list(map(list, (
    'ABC123', 'AB1C23', 'AB12C3', 'AB123C', 'A1BC23', 'A1B2C3', 'A1B23C',
    'A12BC3', 'A12B3C', 'A123BC', '1ABC23', '1AB2C3', '1AB23C', '1A2BC3',
    '1A2B3C', '1A23BC', '12ABC3', '12AB3C', '12A3BC', '123ABC',
)))
g, b = evaluate_against(T, expected_paths=("ABC", "123"))

These 2 paths were correctly detected (0 undetected):
['123', 'ABC']

These 12 paths are spurious:
['1B', 'B3', '2C', 'A2', 'B1', '3B', 'C2', '2A', 'A3', '1C', '3A', 'C1']


## Still Failing. Still...

In [52]:
"""
  @def 
       > FG 
  @hjk 
"""        
# Two traces: a repeated loop (def or hjk) followed by the shared tail FG.
T = [list(trace) for trace in ("defdefFG", "hjkhjkFG")]
g, b = evaluate_against(T, expected_paths=("hjk", "def", "FG"))





These 2 paths were correctly detected (1 undetected):
['hjk', 'def']

These 6 paths are spurious:
['hFG', 'jFG', 'dFG', 'eFG', 'fFG', 'kFG']


In [53]:
"""
          hjk
  12345 <     > FG
          def 
"""
# Two traces sharing the head 12345 and the tail FG, with a different middle each.
T = [list(trace) for trace in ("12345defFG", "12345hjklFG")]
g, b = evaluate_against(T, expected_paths=("12345FG", "hjkl", "def"))





These 1 paths were correctly detected (2 undetected):
['12345FG']

These 14 paths are spurious:
['4hjkl', '1hjkl', 'hjklG', '2hjkl', '5hjkl', 'hjklF', '3hjkl', '4def', '1def', 'defG', '2def', '5def', 'defF', '3def']


In [54]:
"""
          @hjk
  12345 <      > FG
          @def 
"""
# Shared head 12345 and tail FG, with a repeated middle loop (def or hjk).
T = [list(trace) for trace in ("12345defdefFG", "12345hjkhjkFG")]
g, b = evaluate_against(T, expected_paths=["12345", "hjk", "def", "FG"])





These 2 paths were correctly detected (2 undetected):
['hjk', 'def']

These 6 paths are spurious:
['12345hFG', '12345jFG', '12345dFG', '12345eFG', '12345fFG', '12345kFG']


In [55]:
# loop_inside = @hjk | ( (12 | 34) - ABC - @def - FG) )
"""
                     
  @hjk ----------- 
                  \
   12               > FG
       > ABC - @def 
   34 
        
"""
# Four traces: 12 or 34, then ABC and the def loop, or the hjk loop alone; all end in FG.
T = [
    list(trace)
    for trace in (
        "12ABCdefdefFG",
        "12ABCdefdefdefFG",
        "34ABCdefFG",
        "hjkhjkFG",
    )
]
g, b = evaluate_against(T, expected_paths=("ABC", "34", "12", "hjk", "def", "FG"))





These 3 paths were correctly detected (3 undetected):
['FG', 'hjk', 'def']

These 42 paths are spurious:
['hF', 'hG', '12A', '12B', '12C', '12G', '12F', 'jF', 'jG', 'kF', 'kG', '1f', '1d', '1e', '2f', '2d', '2e', 'ABCF', 'ABCG', 'Ad', 'Bd', 'Cd', 'dG', 'dF', 'Ae', 'Be', 'Ce', 'eG', 'eF', 'Af', 'Bf', 'Cf', 'fG', 'fF', '34A', '34B', '34C', '34e', '34d', '34G', '34f', '34F']


In [56]:
"""
          @hjk
  12345 <      > FG
          @def 
          @xyz
"""
# Shared head 12345 and tail FG, with combinations of the def, xyz and hjk loops between.
T = [
    list(trace)
    for trace in (
        "12345defdefxyzxyzFG",
        "12345xyzxyzhjkhjkFG",
        "12345hjkxyzxyzhjkFG",
    )
]
g, b = evaluate_against(T, expected_paths=["12345", "hjk", "def", "xyz", "FG"])





These 2 paths were correctly detected (3 undetected):
['def', 'xyz']

These 76 paths are spurious:
['4d', '1d', 'dG', '2d', '5d', 'dF', '3d', '4e', '1e', 'eG', '2e', '5e', 'eF', '3e', '4f', '1f', 'fG', '2f', '5f', 'fF', '3f', '12345FG', '4x', 'xhjk', '1x', 'xG', '2x', '5x', 'xF', '3x', '4z', 'zhjk', '1z', 'zG', '2z', '5z', 'zF', '3z', '4y', 'yhjk', '1y', 'yG', '2y', '5y', 'yF', '3y', '4h', '1h', 'hG', '2h', '5h', 'hF', '3h', '4j', '1j', 'jG', '2j', '5j', 'jF', '3j', 'dx', 'dz', 'dy', 'ex', 'ez', 'ey', 'fx', 'fz', 'fy', '4k', '1k', 'kG', '2k', '5k', 'kF', '3k']
