# Directed acyclic graph

https://en.wikipedia.org/wiki/Directed_acyclic_graph

https://www.youtube.com/watch?v=LOr_abIZL04

### Used In
* https://github.com/fxsjy/jieba

<img src="./images/dag1.png" width="70%"/>

## Implementation on a Chinese sentence

### TODO
* Explain what the DAG is used for (e.g. how jieba uses it for word segmentation).

In [4]:
import json

def print_pretty_json(dic):
    """Print ``dic`` as indented JSON with non-ASCII characters left readable.

    The object is serialized with a two-space indent, keys are not sorted and
    ``ensure_ascii`` is disabled; the resulting text is UTF-8 encoded before
    being printed.
    """
    rendered = json.dumps(dic, indent=2, sort_keys=False, ensure_ascii=False)
    # A parenthesized single argument prints exactly like the Python 2
    # ``print`` statement used elsewhere in this notebook.
    print(rendered.encode('utf8'))
    
def does_dag_has(DAG, rng):
    """Find the node of ``DAG`` whose ``'range'`` equals ``rng``.

    Returns the key of the first matching node in iteration order, or ``-1``
    when no node has that range.
    """
    matching_ids = (node_id for node_id in DAG if DAG[node_id]['range'] == rng)
    return next(matching_ids, -1)

def add_dag_2(DAG, sentence, rng, req, FREQ):
    """Register the word ``sentence[rng[0]:rng[1]]`` as a node of ``DAG``.

    If no node with the same ``rng`` exists yet, a new node is stored under
    the key ``len(DAG)`` with these fields:

    * ``'range'`` -- ``rng`` as passed in,
    * ``'word'``  -- the slice of ``sentence`` covered by ``rng``,
    * ``'req'``   -- ``[req]`` when ``req`` is truthy, else ``[]``,
    * ``'rel'``   -- ``FREQ`` value of the word extended by the one character
      that precedes it in ``sentence``, or ``-1`` when that longer string is
      not in ``FREQ`` or the word starts at position 0.

    If a node with that range already exists, ``req`` is added to its
    ``'req'`` list unless it is falsy or already present.

    ``DAG`` is modified in place and also returned.
    """
    start, end = rng[0], rng[1]
    word = sentence[start:end]

    rel = -1
    # Only look one character back when there is one. With start == 0 the
    # index start - 1 is -1, which Python counts from the END of the sentence,
    # so the slice would pick up an unrelated trailing character instead of
    # "nothing before the word".
    if start > 0:
        word_p = sentence[start - 1:end]
        if word_p in FREQ:
            rel = FREQ[word_p]

    index = does_dag_has(DAG, rng)
    if index == -1:
        DAG[len(DAG)] = {
            'range': rng,
            'word': word,
            'req': [req] if req else [],
            'rel': rel,
        }
        return DAG

    # Same rule as for a new node: an empty/None predecessor is not recorded,
    # so a 'req' list never ends up holding None.
    if req and req not in DAG[index]['req']:
        DAG[index]['req'].append(req)

    return DAG

def generate_dag_rec(DAG, sentence, FREQ, depth=0, padding=0, prev=None):
    """Recursively add every ``FREQ`` word found in ``sentence`` to ``DAG``.

    Candidate words all start at ``depth + padding``; their last character
    ranges over the indexes ``depth .. len(sentence) - 1``. Each candidate
    that is in ``FREQ`` is recorded through ``add_dag_2`` with ``prev`` as
    its predecessor, and the search then continues right after that word,
    with the word itself passed on as the next ``prev``.

    ``DAG`` is filled in place; nothing is returned.
    """
    start = depth + padding
    for last in range(depth, len(sentence)):
        end = last + 1
        if start >= end:
            # The slice would be empty: nothing to look up yet.
            continue

        candidate = sentence[start:end]
        if candidate not in FREQ:
            continue

        DAG = add_dag_2(DAG, sentence, [start, end], prev, FREQ)
        # (depth + 1) + (last - depth) == end, so the nested call starts
        # scanning at the first character after ``candidate``.
        generate_dag_rec(DAG, sentence, FREQ, depth + 1, last - depth, candidate)

def generate_dag(sentence, FREQ):
    """Build and return the word graph of ``sentence``.

    Starts from an empty dict and lets ``generate_dag_rec`` populate it with
    the words of ``sentence`` that are present in ``FREQ``.
    """
    graph = {}
    generate_dag_rec(graph, sentence, FREQ)
    return graph

def remove_key(d, key):
    """Return a shallow ``dict`` copy of ``d`` that lacks ``key``.

    ``d`` itself is left untouched. Raises ``KeyError`` when ``key`` is not
    present.
    """
    trimmed = dict(d)
    trimmed.pop(key)
    return trimmed

def get_removable_keys(DAG):
    """Return the keys of all nodes whose ``'req'`` list is empty.

    Keys are reported in the iteration order of ``DAG``.
    """
    return [node_id for node_id, node in DAG.items() if len(node['req']) == 0]

def remove_node_from_dag(DAG):
    """Remove the first node without requirements and release its dependents.

    The first key reported by ``get_removable_keys`` is dropped, and every
    reference to that node is taken out of the remaining nodes' ``'req'``
    lists.

    Returns ``(new_dag, removed_key)``. ``DAG`` and its node dicts are not
    modified; nodes whose ``'req'`` changes are replaced by copies in the
    returned dict.

    Raises ``IndexError`` when no node has an empty ``'req'`` list.
    """
    removable = get_removable_keys(DAG)
    if not removable:
        raise IndexError('DAG has no node with an empty req list to remove')
    key = removable[0]

    # 'req' lists filled by add_dag_2 hold the predecessor's *word*, not its
    # node key, so filtering by key alone never matches anything. Match on
    # the word as well; the key is still accepted for graphs that store ids.
    # NOTE(review): matching by word cannot tell apart two nodes that carry
    # the same word at different positions -- confirm that is acceptable.
    word = DAG[key].get('word', key)

    remaining = remove_key(DAG, key)
    for d in remaining:
        node = remaining[d]
        kept = [r for r in node['req'] if r != key and r != word]
        if len(kept) != len(node['req']):
            # remove_key() copies only the outer dict, so replace the node
            # instead of editing it to keep the caller's DAG intact.
            remaining[d] = dict(node, req=kept)
    return remaining, key

def get_possible_node(DAG, key=None):
    """List the nodes that can follow the node stored under ``key``.

    With no ``key`` the result is ``get_removable_keys(DAG)``, i.e. the nodes
    that require nothing. Otherwise it is every node key whose ``'req'`` list
    contains the word of node ``key``, in the iteration order of ``DAG``.
    """
    if key is None:
        return get_removable_keys(DAG)

    wanted_word = DAG[key]['word']
    return [node_id for node_id in DAG if wanted_word in DAG[node_id]['req']]

def generate_path_stack(DAG):
    """Flatten a depth-first walk of ``DAG`` into a single list.

    The walk starts from ``get_possible_node(DAG, None)`` and follows
    ``get_possible_node`` from each visited node. The returned list contains
    node keys interleaved with two markers: ``'BRANCH'`` is emitted before a
    group of more than one alternative, and ``'END'`` after a node that has
    no follower.
    """
    stack = []

    def walk(candidates):
        # Announce a fork only when there really is a choice to make.
        if len(candidates) > 1:
            stack.append('BRANCH')

        for node_id in candidates:
            stack.append(node_id)
            followers = get_possible_node(DAG, node_id)
            if len(followers) > 0:
                walk(followers)
            else:
                stack.append('END')

    walk(get_possible_node(DAG, None))
    return stack


def get_possible_paths(stack, dag=None):
    """Split a path stack into individual paths.

    ``stack`` is a list of node keys interleaved with ``'BRANCH'`` and
    ``'END'`` markers, as produced by ``generate_path_stack``. Every ``'END'``
    closes one path made of the prefix remembered at the latest ``'BRANCH'``
    followed by the keys seen since the previous marker.

    Args:
        stack: the marker/key list to split.
        dag: graph used to translate node keys into words. When omitted, the
            module-level ``DAG`` is used, which is what calls passing only
            ``stack`` have always relied on.

    Returns:
        ``(sents, readable_sents)``: the paths as lists of node keys, and the
        same paths as lists of the nodes' ``'word'`` values.
    """
    sents = []
    # -1 so that the first segment starts at stack[0]; a stack that does not
    # open with 'BRANCH' would otherwise lose its first node.
    prev_branch = -1
    # A list, not a string: it is concatenated with list slices below.
    prev_sent = []
    for i, c in enumerate(stack):
        if c == 'BRANCH':
            prev_sent = stack[prev_branch + 1:i]
            prev_branch = i
        if c == 'END':
            sents.append(prev_sent + stack[prev_branch + 1:i])
            prev_branch = i

    readable_sents = []
    if sents:
        # Resolve the graph only when there is something to translate, so an
        # empty stack never needs a graph at all.
        graph = DAG if dag is None else dag
        readable_sents = [[graph[s]['word'] for s in ss] for ss in sents]

    return sents, readable_sents

In [17]:
# Demo on a tiny hand-written dictionary.
# Earlier list form of the dictionary, kept for reference:
# FREQ = [u'你', u'你好', u'好', u'吗', u'我', u'名', u'字', u'名字', u'叫']

# Word -> frequency table; generate_dag() only records words found here, and
# add_dag_2() reads these values to fill each node's 'rel' field.
FREQ = {
    u'你':   12,
    u'你好': 32,
    u'好':   1,
    u'吗':   4,
    u'我':   6,
    u'名':   9,
    u'字':   3,
    u'名字': 5,
    u'叫':   9,
}


# Sentence to decompose into dictionary words.
sentence = u'你好吗我名字叫'

# Build the graph, flatten it into a BRANCH/END stack, then split that stack
# into paths. get_possible_paths() looks node words up in the module-level
# DAG, so DAG must be assigned before it is called.
DAG = generate_dag(sentence, FREQ)
stack = generate_path_stack(DAG)
path_nodes, path_words = get_possible_paths(stack)

# Dump every intermediate result for inspection.
print_pretty_json(DAG)
print_pretty_json(stack)
print_pretty_json(path_nodes)
print_pretty_json(path_words)


{
  "0": {
    "range": [
      0, 
      1
    ], 
    "req": [], 
    "word": "你", 
    "rel": -1
  }, 
  "1": {
    "range": [
      1, 
      2
    ], 
    "req": [
      "你"
    ], 
    "word": "好", 
    "rel": 32
  }, 
  "2": {
    "range": [
      2, 
      3
    ], 
    "req": [
      "好", 
      "你好"
    ], 
    "word": "吗", 
    "rel": -1
  }, 
  "3": {
    "range": [
      3, 
      4
    ], 
    "req": [
      "吗"
    ], 
    "word": "我", 
    "rel": -1
  }, 
  "4": {
    "range": [
      4, 
      5
    ], 
    "req": [
      "我"
    ], 
    "word": "名", 
    "rel": -1
  }, 
  "5": {
    "range": [
      5, 
      6
    ], 
    "req": [
      "名"
    ], 
    "word": "字", 
    "rel": 5
  }, 
  "6": {
    "range": [
      6, 
      7
    ], 
    "req": [
      "字", 
      "名字"
    ], 
    "word": "叫", 
    "rel": -1
  }, 
  "7": {
    "range": [
      4, 
      6
    ], 
    "req": [
      "我"
    ], 
    "word": "名字", 
    "rel": -1
  }, 
  "8": {
    "range": [
      0, 
 

In [26]:

def get_words_from_dag(DAG, FREQ=None):
    """Collect the distinct ``'word'`` values of ``DAG`` in iteration order.

    When ``FREQ`` is given, only words contained in ``FREQ`` are kept; with
    ``FREQ=None`` every word is kept. Each word appears once in the result,
    at the position of its first occurrence.
    """
    keep_all = FREQ is None
    words = []
    for node in DAG.values():
        word = node['word']
        if not keep_all and word not in FREQ:
            continue
        if word in words:
            continue
        words.append(word)
    return words

# Example of a restricting word list that could be passed as FREQ:
# freeq = [u'你', u'名字']

# Print the distinct words of whichever DAG is currently bound at module
# level; no FREQ filter is applied here.
print_pretty_json(get_words_from_dag(DAG))


[
  "到", 
  "底", 
  "认", 
  "识", 
  "不", 
  "认识", 
  "到底"
]


In [32]:
import io
from itertools import islice

dict_file = './dict/dict.txt.small'
max_entries = 100000  # upper bound on the number of dictionary lines read

# Build FREQ from a space-separated dictionary file: the first field of each
# line is the word, the second field is stored as its value.
# NOTE(review): the value is kept as the raw text token, whereas the
# hand-written FREQ earlier in this notebook uses ints -- confirm the file
# format before converting with int().
FREQ = {}
# Read-only and decoded as UTF-8 so the keys are unicode text; the ``with``
# block closes the file even if a line fails to parse.
with io.open(dict_file, 'r', encoding='utf-8') as f:
    # islice stops quietly at end-of-file, so a dictionary shorter than
    # max_entries loads without raising StopIteration.
    for line in islice(f, max_entries):
        fields = line.split(' ')
        if len(fields) < 2:
            # Blank or malformed line: there is no value to store.
            continue
        FREQ[fields[0]] = fields[1]
    
# print_pretty_json(FREQ)


In [33]:
# Demo on the dictionary loaded from file: build the graph for a longer
# sentence and list the multi-character words that were recognised.
sentence = u'到底认识不认识这个地方天安门口'
# Longer alternative input:
# sentence = u'到底认识不认识我啊我住在这里住了很久了我最喜欢的是上海这里的食堂天安门也不错不过哪个在北京'
DAG = generate_dag(sentence, FREQ)
# Path enumeration is switched off for this sentence:
# stack = generate_path_stack(DAG)
# path_nodes, path_words = get_possible_paths(stack)

# print_pretty_json(DAG)
# Keep only the words longer than one character.
xx = [i for i in get_words_from_dag(DAG, FREQ) if len(i) > 1]
print_pretty_json(xx)

# print_pretty_json(DAG)
# print_pretty_json(stack)
# print_pretty_json(path_nodes)
# print_pretty_json(path_words)

[
  "门口", 
  "天安", 
  "天安门", 
  "地方", 
  "这个", 
  "认识", 
  "到底"
]
