In [134]:
import struct
import copy
from operator import attrgetter

In [212]:

leafEnd = -1
class Node:
    """The Suffix-tree's node."""

    def __init__(self, leaf):
        # self.__identifier = identifier
        self.children = {}
        self.fileNames = {}
        self.content = []
        self.leaf = leaf 
        self.suffixIndex = None 
        self.start = {} 
        self.end = {} 
        self.suffixLink = None 

    def __eq__(self, node):
        atg = attrgetter('start', 'end', 'suffixIndex', 'fileName')
        s_start, s_end, s_suffixIndex, s_fileName = atg(self)
        n_start, n_end, n_suffixIndex, n_fileName = atg(node)
        return s_start[s_fileName] == n_start[n_fileName] and s_end[s_fileName] == n_start[n_fileName] and s_suffixIndex == n_suffixIndex

    def __ne__(self, node):
        atg = attrgetter('start', 'end', 'suffixIndex', 'fileName')
        s_start, s_end, s_suffixIndex, s_fileName = atg(self)
        n_start, n_end, n_suffixIndex, n_fileName = atg(node)
        return s_start[s_fileName] == n_start[n_fileName] and s_end[s_fileName] == n_start[n_fileName] and s_suffixIndex == n_suffixIndex

    def __getattribute__(self, name):
        if name == 'end':
            if self.leaf:
                return leafEnd
        return super(Node, self).__getattribute__(name)


class SuffixTree:
    """The Suffix-Tree."""

    def __init__(self, data, fileName):
        """Initiate the tree."""
        self.content = data # fileContent (array of bytes represented by integers)
        self.lastNewNode = None #
        self.activeNode = None 
        """activeEdge is represeted as input string character
          index (not the character itself)"""
        self.activeEdge = -1 # number of the current edge that we are checking
        self.activeLength = 0 # active length
        # remainingSuffixCount tells how many suffixes yet to
        # be added in tree
        self.remainingSuffixCount = 0
        self.rootEnd = -1
        self.splitEnd = None
        self.size = -1  # Length of input string
        self.fileName = fileName
        self.root = self.new_node(-1, self.rootEnd)
        
        
    def addFile(self, data, fileName):
        self.content = data # fileContent (array of bytes represented by integers)
        self.lastNewNode = None # used when split
        self.activeNode = None 
        self.activeEdge = -1 # number of the current edge that we are checking
        self.activeLength = 0 # active length
        self.remainingSuffixCount = 0
        self.rootEnd = -1
        self.splitEnd = None
        self.size = -1  # Length of input string
        self.fileName = fileName
#         self.root = self.new_node(-1, self.rootEnd)
        
    
    
    def extend_suffix_tree(self, pos):
        global leafEnd # takes care of all the leaves created so far
        leafEnd = pos # pos incrementing to size
        self.remainingSuffixCount += 1
        self.lastNewNode = None
        while self.remainingSuffixCount > 0:
            if self.activeLength == 0:
                self.activeEdge = pos
            
            print(self.activeEdge, self.content[self.activeEdge], self.activeNode.children.get(self.content[self.activeEdge]))
            if self.activeNode.children.get(self.content[self.activeEdge]) is None: 
                # didn't find a edge with that character on that node
                # we must therefore create a node
                self.activeNode.children[self.content[self.activeEdge]] = self.new_node(pos, leaf=True)
                #
                if (self.lastNewNode is not None):
                    self.lastNewNode.suffixLink = self.activeNode
                    self.lastNewNode = None
            else:
                nextNode = self.activeNode.children.get(self.content[self.activeEdge])
                if self.fileName not in nextNode.start:
                    nextNode.start[self.fileName] = pos
                if self.fileName not in nextNode.end:
                    nextNode.end[self.fileName] = None
                print(nextNode.content)
                # checks whether the active length
                if self.walk_down(nextNode):
                    print("Walking")
                    # sets the active node, active edge and active length to the appropriate values
                    continue
                
                # self.activeLength < length of current node
                if self.content[nextNode.start[self.fileName] + self.activeLength] == self.content[pos]:
                    # Indicates that the content 
                    if (self.lastNewNode is not None) and (self.activeNode != self.root):
                        self.lastNewNode.suffixLink = self.activeNode
                        self.lastNewNode = None
                    self.activeLength += 1
                    break
                
                self.splitEnd = nextNode.start[self.fileName] + self.activeLength - 1
                split = self.new_node(nextNode.start[self.fileName], self.splitEnd, splitNode=nextNode)
                self.activeNode.children[self.content[self.activeEdge]] = split
                
                split.children[self.content[pos]] = self.new_node(pos, leaf=True, splitNode=nextNode)
                nextNode.start[self.fileName] += self.activeLength
                split.children[self.content[nextNode.start[self.fileName]]] = nextNode
                print("hello")
#                 split.content = split.content[:splitEnd]
                
                if self.lastNewNode is not None:
                    self.lastNewNode.suffixLink = split
                self.lastNewNode = split
            self.remainingSuffixCount -= 1
            if self.activeNode == self.root and self.activeLength > 0:
                self.activeLength -= 1
                self.activeEdge = pos
            elif (self.activeNode != self.root):
                self.activeNode = self.activeNode.suffixLink
            

    def walk_down(self, current_node):
        length = current_node.end[self.fileName] - current_node.start[self.fileName] + 1
        print(length, current_node.start, current_node.end)
        if (self.activeLength >= length):
            self.activeEdge += length
            self.activeLength -= length
            self.activeNode = current_node
            return True
        return False

    def new_node(self, start, end=None, leaf=False, splitNode=None):
        node = Node(leaf)
        if splitNode is not None:
            node.fileNames = copy.deepcopy(splitNode.fileNames)
        if start >= 0 and self.fileName not in node.fileNames:
            node.fileNames[self.fileName] = [start]
        elif start >= 0 and start not in node.fileNames[self.fileName]:
            node.fileNames[self.fileName].append(start)
        if start >= 0:
            node.suffixLink = self.root
        else: 
            node.suffixLink = None
        node.start[self.fileName] = start
        node.end[self.fileName] = 0
        if end is not None:
            node.content = self.content[start:end]
        else:
            node.content = self.content[start:]
        node.suffixIndex = -1
        return node
    
    def count_longest_subsequence(self):
        active_pos = {}
        
    
    def walk_dfs(self, current):
        start, end = current.start, current.end
        yield self.content[start: end + 1]

        for node in current.children.values():
            if node:
                yield from self.walk_dfs(node)
                
    def walk_dfs_print(self, current):
        start, end = current.start, current.end
        yield current.content

        for node in current.children.values():
            if node:
                yield from self.walk_dfs_print(node)

    def build_suffix_tree(self):
        self.size = len(self.content)
        self.activeNode = self.root  # First activeNode will be root
        for i in range(self.size):
            self.extend_suffix_tree(i)

    def __str__(self):
        return "\n".join(map(str, self.edges.values()))

    def print_dfs(self):
        for sub in self.walk_dfs_print(self.root):
            print(sub)

In [213]:
longest_subsequence(["small/0.bin", "small/1.bin"])

0 197 None


TypeError: 'int' object does not support item assignment

In [161]:
def longest_subsequence(fileNames):
    x0_bytes = open_binary_file(fileNames[0])
    x0 = convert_bytes_to_int(x0_bytes)
    x0.append('$')
    suffix_tree = SuffixTree(x0, fileNames[0])
    suffix_tree.build_suffix_tree()
#     suffix_tree.print_dfs()
    longest = 0
    files = {}
    for fileName in fileNames[1:]:
        x_bytes = open_binary_file(fileName)
        x = convert_bytes_to_int(x_bytes)
        x.append('$')
        suffix_tree.addFile(x, fileName)
        suffix_tree.build_suffix_tree()
        suffix_tree.print_dfs()
#         if temp_longest > longest:
#             longest, files = temp_longest, files_and_offset
#     return longest, files

def open_binary_file(fileName):
    with open(fileName, mode='rb') as file:
        fileContent = file.read()
    file.close()
    return fileContent

def convert_bytes_to_int(x_bytes):
    lst = []
    for i in range(len(x_bytes)):
        lst.append(x_bytes[i])
    return lst

In [155]:
x = open_binary_file("small/0.bin")
lst = []
for i in range(len(x)):
    lst.append(x[i])
lst.append('$')
print(lst)
x = open_binary_file("small/1.bin")
lst = []
for i in range(len(x)):
    lst.append(x[i])
lst.append('$')
print(lst)

[197, 41, 97, 55, 212, 192, 12, 121, 206, 168, 176, 44, '$']
[231, 10, 12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']


In [158]:
longest_subsequence(["small/0.bin", "small/1.bin"])

#    1101 0010

[]
[231, 10, 12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[10, 12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[206, 168, 176, 79, 242, 83, 101, 64, '$']
[168, 176, 79, 242, 83, 101, 64, '$']
[176, 79, 242, 83, 101, 64, '$']
[12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[79, 242, 83, 101, 64, '$']
[79, 242, 83, 101, 64, '$']
[242, 83, 101, 64, '$']
[206, 168, 176, 79, 242, 83, 101, 64, '$']
[242, 83, 101, 64, '$']
[83, 101, 64, '$']
[83, 101, 64, '$']
[101, 64, '$']
[176, 79, 242, 83, 101, 64, '$']
[101, 64, '$']
[64, '$']
['$']
[121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[231, 10, 12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[10, 12, 121, 206, 168, 176, 79, 242, 83, 101, 64, '$']
[168, 176, 79, 242, 83, 101, 64, '$']
[64, '$']
['$']


In [None]:
class SuffixTree(object):
    class Node(object):
        def __init__(self, label, fileName=None, offset=None):
            self.label = label
            self.fileName = {}
            self.fileName[fileName] = []
            self.fileName[fileName].append(offset)
            self.out = {} 
        def addFile(self, fileName, offset):
            if fileName not in self.fileName:
                self.fileName[fileName] = []
            if offset not in self.fileName[fileName]:
                self.fileName[fileName].append(offset)
#         def copyDictionary(self, dic):
#             for i in dic.keys():
#                 self.addFile(i, )
            
            
    def __init__(self, content, fileName):
        content.append('$')
        self.root = self.Node(None, fileName, -1)
        self.root.out[content[0]] = self.Node(content, fileName, 0)
        for i in range(1, len(content)):
            cur = self.root
            j = i
            while j < len(content):
                if content[j] in cur.out:
                    child = cur.out[content[j]]
                    label = child.label
                    k = j + 1
                    while k - j < len(label) and label[k - j] == content[k]:
                        k += 1
                    if k - j == len(label):
                        cur = child
                        j = k
                    else:
                        currLabel, newLabel = label[k-j], content[k]
                        mid = self.Node(label[:k-j], fileName, 0)
                        mid.fileName = child.fileName
                        mid.addFile(fileName, j)
                        mid.out[newLabel] = self.Node(content[k:], fileName, k)
                        mid.out[currLabel] = child
                        child.label = label[k - j:]
                        
                else:
                    cur.out[content[j]] = self.Node(content[j:], fileName, j)
                    
    def addFile(self, content, fileName):
        content.append('$')
        longestSequence, fileOffset = 0, {}
        if content[0] in self.root.out:
            self.root.out[content[0]].addFile(fileName, 0)
        start = 0
        for i in range(len(content)):
#             print("INCREASE")
            cur, j = self.root, i
            start = j
            while j < len(content):
                print (start)
                if content[j] in cur.out:
                    child = cur.out[content[j]]
#                     print("label", child.label, "content", content[j:])
                    label = child.label
                    k = j + 1
                    while k - j < len(label) and label[k - j] == content[k]:
                        k += 1
                    if k - j == len(label):
                        child.addFile(fileName, j)
#                         print("vals", child.fileName.values())
                        cur = child
                        j += 1
                    else:
                        currLabel, newLabel = label[k - j], content[k]
                        mid = self.Node(label[:k - j], fileName, 0)
                        mid.fileName = child.fileName
                        mid.addFile(fileName, i)
#                         print(content[i], mid.fileName.values())
#                         print(k, start)
                        mid.out[newLabel] = self.Node(content[k:], fileName, k)
                        mid.out[currLabel] = child
                        child.label = label[k - j:]
                        print(start, k - start)
                        if longestSequence < k - start:
                            fileOffset = child.fileName
                            longestSequence = k - start
                        cur.out[content[j]] = mid
                else:
                    print("bleh", start)
                    cur.out[content[j]] = self.Node(content[j:], fileName, j)
        return longestSequence, fileOffset
            