In [1]:
import configparser
import networkx as nx
import itertools
import math
import random
import json
from tqdm import tqdm
import sys
import time
import timeit
import nltk
import json
# Fetch the NLTK English stop-word list and extend it with a few extra tokens
# that should also be treated as stop words when filtering concepts.
nltk.download('stopwords')
nltk_stopwords = nltk.corpus.stopwords.words('english')
nltk_stopwords.extend([
    "like", "gone", "did", "going", "would", "could",
    "get", "in", "up", "may", "wanter",
])


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the file locations used by the rest of the notebook from paths.cfg.
config = configparser.ConfigParser()
config.read("paths.cfg")

# Module-level state; these stay None until the loader functions fill them in.
cpnet = concept2id = relation2id = id2relation = id2concept = None

# Concepts that must never be added to the graph.
blacklist = {"uk", "us", "take", "make", "object", "person", "people"}

In [3]:
# NOTE(review): this cell repeats the previous configuration cell line for line.
# Re-running it after the loaders have been called resets cpnet and the four
# vocabulary mappings back to None, so they would have to be loaded again.
config = configparser.ConfigParser()
config.read("paths.cfg")

cpnet = None
concept2id = None
relation2id = None
id2relation = None
id2concept = None
blacklist = set(["uk", "us", "take", "make", "object", "person", "people"])

In [4]:
def load_resources():
    """Fill the global concept and relation vocabularies from disk.

    Reads ``config["paths"]["concept_vocab"]`` and
    ``config["paths"]["relation_vocab"]`` (one entry per line, surrounding
    whitespace stripped) and rebuilds the four module-level mappings
    ``concept2id``, ``id2concept``, ``relation2id`` and ``id2relation``.
    Each entry receives the current size of its mapping as its id.
    """
    global concept2id, relation2id, id2relation, id2concept

    concept2id, id2concept = {}, {}
    with open(config["paths"]["concept_vocab"], "r", encoding="utf8") as vocab_file:
        for raw_line in vocab_file.readlines():
            concept = raw_line.strip()
            concept2id[concept] = len(concept2id)
            id2concept[len(id2concept)] = concept
    print("concept2id done")

    id2relation, relation2id = {}, {}
    with open(config["paths"]["relation_vocab"], "r", encoding="utf8") as vocab_file:
        for raw_line in vocab_file.readlines():
            relation = raw_line.strip()
            id2relation[len(id2relation)] = relation
            relation2id[relation] = len(relation2id)
    print("relation2id done")

In [5]:
load_resources()

concept2id done
relation2id done


In [6]:
def save_cpnet():
    """Build the ConceptNet multigraph from the triple file and pickle it.

    Reloads the vocabularies, then reads ``config["paths"]["conceptnet_en"]``,
    a tab-separated file with ``relation, subject, object, weight`` per line.
    For every kept triple two directed edges are added: ``subj -> obj`` with
    the relation id, and ``obj -> subj`` with the relation id shifted by the
    number of relations (marking the inverse direction).

    Triples are dropped when the relation is ``hascontext``, when either
    concept is blacklisted or contains a stop word, or when subject and
    object are the same node. Blank lines are skipped.

    The graph is written to ``config["paths"]["conceptnet_en_graph"]``.

    Raises:
        KeyError: if a relation or concept is missing from the vocabularies.
    """
    global concept2id, relation2id, id2relation, id2concept, blacklist
    load_resources()

    # The stop-word list is consulted for every token of every concept;
    # a set makes each membership test constant time.
    stopword_set = set(nltk_stopwords)
    n_relations = len(relation2id)

    def not_save(cpt):
        """Return True if the concept is blacklisted or contains a stop word."""
        if cpt in blacklist:
            return True
        return any(tok in stopword_set for tok in cpt.split("_"))

    graph = nx.MultiDiGraph()
    with open(config["paths"]["conceptnet_en"], "r", encoding="utf8") as f:
        lines = f.readlines()

    for line in tqdm(lines, desc="saving to graph"):
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line carries no triple.
            continue
        ls = stripped.split('\t')
        rel = relation2id[ls[0]]
        subj = concept2id[ls[1]]
        obj = concept2id[ls[2]]
        weight = float(ls[3])
        rel_name = id2relation[rel]
        if rel_name == "hascontext":
            continue
        if not_save(ls[1]) or not_save(ls[2]):
            continue
        if rel_name == "relatedto" or rel_name == "antonym":
            # Lower the raw weight, which raises the edge cost computed below.
            weight -= 0.3
        if subj == obj:  # delete loops
            continue
        # Higher raw weights give a lower edge cost; the cost is always > 1.
        weight = 1 + float(math.exp(1 - weight))
        graph.add_edge(subj, obj, rel=rel, weight=weight)
        graph.add_edge(obj, subj, rel=rel + n_relations, weight=weight)

    graph_path = config["paths"]["conceptnet_en_graph"]
    if hasattr(nx, "write_gpickle"):
        nx.write_gpickle(graph, graph_path)
    else:
        # NetworkX 3.x no longer ships write_gpickle; plain pickle is the
        # documented replacement.
        import pickle
        with open(graph_path, "wb") as out_file:
            pickle.dump(graph, out_file, pickle.HIGHEST_PROTOCOL)

In [7]:
#save_cpnet()

In [8]:
def load_cpnet():
    """Load the pickled ConceptNet multigraph and build its simple-graph view.

    Sets two module-level globals:

    * ``cpnet`` - the graph unpickled from
      ``config["paths"]["conceptnet_en_graph"]``.
    * ``cpnet_simple`` - an undirected ``nx.Graph`` with one edge per
      connected node pair, whose ``weight`` is the sum of the weights of all
      edges between that pair in ``cpnet`` (edges without a weight count 1.0).
    """
    global cpnet, concept2id, relation2id, id2relation, id2concept, cpnet_simple
    print("loading cpnet....")
    graph_path = config["paths"]["conceptnet_en_graph"]
    if hasattr(nx, "read_gpickle"):
        cpnet = nx.read_gpickle(graph_path)
    else:
        # NetworkX 3.x no longer ships read_gpickle; plain pickle is the
        # documented replacement.
        # NOTE: unpickling can execute arbitrary code - only load graph files
        # that were produced locally or come from a trusted source.
        import pickle
        with open(graph_path, "rb") as graph_file:
            cpnet = pickle.load(graph_file)
    print("Done")

    cpnet_simple = nx.Graph()
    for u, v, data in cpnet.edges(data=True):
        w = data.get('weight', 1.0)
        if cpnet_simple.has_edge(u, v):
            # Fold parallel / reverse edges into one undirected edge.
            cpnet_simple[u][v]['weight'] += w
        else:
            cpnet_simple.add_edge(u, v, weight=w)

In [9]:
def get_edge(src_concept, tgt_concept):
    """Return the distinct relation ids on the edges between two nodes.

    Looks up every parallel edge stored in the global ``cpnet`` from
    ``src_concept`` to ``tgt_concept`` and collects the ``"rel"`` attribute of
    each one, without duplicates. The order of the returned list is not
    specified.

    Raises:
        KeyError: if ``cpnet`` holds no edge from source to target.
    """
    global cpnet
    parallel_edges = cpnet[src_concept][tgt_concept]
    distinct_rels = {parallel_edges[key]["rel"] for key in parallel_edges}
    return list(distinct_rels)

In [10]:
def find_paths(source, target, ifprint=False, max_hops=2, max_paths=3):
    """Find short paths between two concepts in the knowledge graph.

    Paths are enumerated on the undirected ``cpnet_simple`` graph with an
    increasing hop limit (1, 2, ..., ``max_hops``), so shorter paths are
    collected first. Enumeration stops once ``max_paths`` distinct paths have
    been found. The relation ids of each hop are then read from ``cpnet``
    through ``get_edge``.

    Args:
        source: Concept string of the start node (a key of ``concept2id``).
        target: Concept string of the end node (a key of ``concept2id``).
        ifprint: When true, print every path in a readable form; relation
            ids at or above ``len(id2relation)`` are shown with a ``*`` suffix.
        max_hops: Largest number of edges allowed in a path (default 2).
        max_paths: Positive number of paths after which the search stops
            (default 3).

    Returns:
        A list of ``{"path": [node ids], "rel": [[relation ids], ...]}``
        dictionaries sorted by path length; ``None`` when either concept has
        no node in ``cpnet_simple``; ``[]`` when any error occurs (for example
        a concept that is missing from ``concept2id``).
    """
    global cpnet, concept2id, relation2id, id2relation, id2concept, cpnet_simple
    try:
        s = concept2id[source]
        t = concept2id[target]

        if s not in cpnet_simple or t not in cpnet_simple:
            return None

        all_path = []
        seen_paths = set()
        for max_len in range(1, max_hops + 1):
            for p in nx.all_simple_paths(cpnet_simple, source=s, target=t, cutoff=max_len):
                # A larger cutoff yields the shorter paths again; keep one copy.
                path_key = tuple(p)
                if path_key not in seen_paths:
                    seen_paths.add(path_key)
                    all_path.append(p)
                if len(all_path) >= max_paths:
                    break
            if len(all_path) >= max_paths:
                break

        all_path.sort(key=len)
        pf_res = []
        for p in all_path:
            rl = []
            last_hop = len(p) - 2
            for hop, (src_concept, tgt_concept) in enumerate(zip(p, p[1:])):
                rel_list = get_edge(src_concept, tgt_concept)
                rl.append(rel_list)
                if ifprint:
                    n_relations = len(id2relation)
                    rel_list_str = []
                    for rel in rel_list:
                        if rel < n_relations:
                            rel_list_str.append(id2relation[rel])
                        else:
                            # Ids shifted by the relation count are inverse edges.
                            rel_list_str.append(id2relation[rel - n_relations] + "*")
                    print(id2concept[src_concept], "-[%s]-> " % ("/".join(rel_list_str)), end="")
                    if hop == last_hop:
                        print(id2concept[tgt_concept], end="")
            if ifprint:
                print()

            pf_res.append({"path": p, "rel": rl})
        return pf_res
    except Exception:
        # Best effort: a failed lookup yields "no paths". Unlike a bare
        # ``except``, this lets KeyboardInterrupt / SystemExit propagate so a
        # long-running loop over many concept pairs can still be stopped.
        return []

In [11]:
def _batch_slice(items, n_batches, batch_id):
    """Return part ``batch_id`` of ``items`` split into ``n_batches`` near-equal parts.

    Uses the same partitioning as ``numpy.array_split``: the first
    ``len(items) % n_batches`` parts hold one extra element. Negative
    ``batch_id`` values count from the end; out-of-range values raise
    ``IndexError``.
    """
    index = range(n_batches)[batch_id]
    base, extra = divmod(len(items), n_batches)
    start = index * base + min(index, extra)
    stop = start + base + (1 if index < extra else 0)
    return items[start:stop]


def process(example, batch_id=-1):
    """Run path finding for one of 100 batches of a concept file.

    NOTE(review): the previous body read the undefined names ``filename``,
    ``output_path`` and ``np`` and never used ``example``. This version
    assumes ``example`` is the path of the input JSON file and derives the
    output path from it, following the commented-out line that was here
    before - confirm that this is the intended meaning.

    Args:
        example: Path to a JSON file holding a list of items, each with an
            ``"ac"`` and a ``"qc"`` list of concept strings.
        batch_id: Which of the 100 near-equal batches to process; negative
            values count from the end.

    Writes ``<example>.<batch_id>.pf``: for every item, a list of
    ``{"ac", "qc", "pf_res"}`` records, one per (ac, qc) pair. Nothing is
    done when that file already exists.
    """
    import os

    filename = example
    output_path = filename + ".%d" % (batch_id) + ".pf"
    if os.path.exists(output_path):
        print(output_path + " exists. Skip!")
        return

    load_resources()
    load_cpnet()
    with open(filename, 'r') as fp:
        mcp_data = json.load(fp)
    mcp_data = _batch_slice(mcp_data, 100, batch_id)

    pf = []
    for item in tqdm(mcp_data, desc="batch_id: %d " % batch_id):
        acs = item["ac"]
        qcs = item["qc"]
        pfr_qa = []  # path finding results
        for ac in acs:
            for qc in qcs:
                pf_res = find_paths(qc, ac)
                pfr_qa.append({"ac": ac, "qc": qc, "pf_res": pf_res})
        pf.append(pfr_qa)

    with open(output_path, 'w') as fi:
        json.dump(pf, fi)

In [12]:
load_cpnet()

loading cpnet....
Done


In [13]:
find_paths("palm", "dry", ifprint=True)
print("--------")

palm -[isa]-> area -[relatedto*]-> dry
--------


In [14]:
# Load the training examples; the .mcp file is parsed as JSON.
with open("../datasets/ob_data/train/ob_concepts_train_final.mcp",'r') as f:
    data = json.load(f)

In [63]:
#c2id

In [15]:
ex = data[0:10]
len(ex)

10

In [35]:
len(ex[0])

4

In [28]:
a = ex[0][1]

In [29]:
a

[{'sent': ' ',
  'option': 'The sun is responsible for puppies learning new tricks',
  'premise': ['youngs',
   'of_love',
   'difference',
   'seem',
   'stench',
   'enough',
   'wanter',
   'heart',
   'tell',
   'waye',
   'young_and',
   'way',
   'long',
   'old_plants',
   'independence',
   'signing',
   'children_at',
   'childs',
   'real',
   'point',
   'great',
   'imagine',
   'storied',
   'tell_me',
   'planting',
   'childe',
   'child',
   'plant',
   'hear',
   'getting_enough',
   'may',
   'shade',
   'showest',
   'grows',
   'young',
   'get',
   'plants_and',
   'may_be',
   'and_children',
   'hears',
   'size',
   'hearting',
   'low',
   'wantest',
   'get_old',
   'old',
   'death',
   'enjoyest',
   'show',
   'want_to',
   'inspire',
   'giant',
   'expert',
   'consider',
   'exist',
   'story',
   'time',
   'sunflow',
   'age',
   'loud',
   'much',
   'food',
   'signest',
   'shaded',
   'grow',
   'sign',
   'existest',
   'grown',
   'love',
   'too

In [30]:
a[0]['premise']

['youngs',
 'of_love',
 'difference',
 'seem',
 'stench',
 'enough',
 'wanter',
 'heart',
 'tell',
 'waye',
 'young_and',
 'way',
 'long',
 'old_plants',
 'independence',
 'signing',
 'children_at',
 'childs',
 'real',
 'point',
 'great',
 'imagine',
 'storied',
 'tell_me',
 'planting',
 'childe',
 'child',
 'plant',
 'hear',
 'getting_enough',
 'may',
 'shade',
 'showest',
 'grows',
 'young',
 'get',
 'plants_and',
 'may_be',
 'and_children',
 'hears',
 'size',
 'hearting',
 'low',
 'wantest',
 'get_old',
 'old',
 'death',
 'enjoyest',
 'show',
 'want_to',
 'inspire',
 'giant',
 'expert',
 'consider',
 'exist',
 'story',
 'time',
 'sunflow',
 'age',
 'loud',
 'much',
 'food',
 'signest',
 'shaded',
 'grow',
 'sign',
 'existest',
 'grown',
 'love',
 'too_much',
 'start',
 'grow_up',
 'rightest',
 'younge',
 'actively',
 'olde',
 'storye',
 'right',
 'get_child',
 'all_children',
 'too_long',
 'enjoy']

In [40]:
for e in ex:
    for item in e:
        print(item[0])
        print(len(item[0]))
    break

{'sent': ' ', 'option': 'D', 'premise': ['sun', 'special', 'new', 'performs', 'learning', 'suns', 'deal', 'trick', 'student', 'find', 'learns', 'parroting', 'skille', 'great_deal_of', 'world', 'cockatoo', 'finding', 'front_of', 'responsible', 'respond', 'sure', 'evens', 'conure', 'great', 'maked', 'alert', 'ostrich', 'learn_new', 'turn', 'skill', 'and_love', 'in_front', 'perform_in', 'adept', 'live', 'end_of', 'grows', 'nearly', 'telescope', 'throughout', 'artisan_who', 'capable', 'bring', 'old', 'always', 'puppy', 'training', 'koto', 'technique', 'new_family', 'dog', 'beak', 'even', 'learn_how_to', 'mirror', 'great_deal', 'parrot', 'know', 'well', 'learn_new_tricks', 'probleme', 'family', 'dealest', 'adepts', 'willing', 'every', 'perform', 'old_dog', 'object', 'front', 'performing_in_front_of', 'in_front_of', 'knowing_yourself', 'grow', 'learn', 'sunned', 'grown', 'love', 'mirrored', 'arise', 'artisan', 'breeder', 'kotos', 'make_sure', 'clawing', 'for_learning', 'ferret', 'learn_how',

In [17]:
find_paths('sun','puppy')

[{'path': [3461, 2210, 9841], 'rel': [[15], [17]]},
 {'path': [3461, 28366, 9841], 'rel': [[18], [32]]}]

In [41]:
# For every item of every example in `ex`, look up knowledge-graph paths from
# each premise concept and each hypothesis concept to each answer concept.
# NOTE(review): pfr_gpre and pfr_hyp are created once per example `e`, not once
# per item, so they keep growing across the items of `e`, and every `combine`
# appended for that example holds references to the same two lists. Confirm
# whether one pair of lists per item was intended.
pf = []
for e in tqdm(ex):
    pfr_gpre = []  # path finding results
    pfr_hyp = []
    for item in e:
        # Only the first element of each item is used.
        gpre = item[0]["premise"]
        ghyp = item[0]["hypothesis"]
        ans = item[0]["ans"]
        for pre in gpre:
            for ac in ans:
                gpf_res = find_paths(pre, ac)
                #print(pf_res)
                pfr_gpre.append({"pre":pre, "ac":ac, "path":gpf_res})
        for hyp in ghyp:
            for answer in ans:
                hpf_res = find_paths(hyp,answer)
                pfr_hyp.append({"hyp":hyp,"hac":answer, "path":hpf_res})
        combine =[pfr_gpre,pfr_hyp]
        pf.append(combine)

100%|██████████| 10/10 [00:26<00:00,  2.63s/it]


In [42]:
len(pf[0][0])

4540

In [46]:
len(pf)

40

In [48]:
#pf[0]

In [49]:
len(pf[0][0])

4540

In [None]:
def process_arc(data, start, end):
    """Find knowledge-graph paths for the items ``data[start:end]``.

    Each item must provide ``"premise"``, ``"hypothesis"`` and ``"ans"``
    lists of concept strings. For every item, ``find_paths`` is called for
    each (premise concept, answer concept) pair and each (hypothesis concept,
    answer concept) pair.

    Args:
        data: Sequence of items.
        start: Index of the first item to process.
        end: Index one past the last item to process.

    Returns:
        A list with one ``[premise_results, hypothesis_results]`` pair per
        processed item. Premise records have the keys ``"pre"``, ``"ac"`` and
        ``"path"``; hypothesis records have ``"hyp"``, ``"hac"`` and
        ``"path"``.
    """
    pf = []
    # Iterate over the requested slice of the argument (the earlier version
    # looped over the notebook-global `ex` and returned nothing).
    for item in tqdm(data[start:end]):
        gpre = item["premise"]
        ghyp = item["hypothesis"]
        ans = item["ans"]
        pfr_gpre = []  # path finding results for premise concepts
        pfr_hyp = []   # path finding results for hypothesis concepts
        for pre in gpre:
            for ac in ans:
                gpf_res = find_paths(pre, ac)
                pfr_gpre.append({"pre": pre, "ac": ac, "path": gpf_res})
        for hyp in ghyp:
            for answer in ans:
                hpf_res = find_paths(hyp, answer)
                pfr_hyp.append({"hyp": hyp, "hac": answer, "path": hpf_res})
        pf.append([pfr_gpre, pfr_hyp])
    return pf

In [None]:
# Write the collected path-finding results `pf` to example.mcp as JSON text
# that has been re-formatted by jsbeautifier with a two-space indent.
import jsbeautifier
opts = jsbeautifier.default_options()
opts.indent_size = 2

# The file is opened (and truncated) before `pf` is serialised.
with open("example.mcp","w") as fp:
    fp.write(jsbeautifier.beautify(json.dumps(pf), opts))

In [None]:
a