In [800]:
import glob
import pandas as pd
import re
import itertools
import numpy as np
import networkx as nx
import json

#pattern = re.compile(',\s*(?=([^\"]*"[^\"]*\")*[^\"]*$)')

In [801]:
const1 = "ENT10812879a"
const2 = '","'

def feature_to_list_postgres(s, key):
    if k not in {"sentid", "docid"}:
        s = s.replace("{","").replace("}","")
        s = s.replace(const2, const1)
        s = s.split(",")
        res = ["," if x == const1 else x for x in s]
    else:
        res = s
        
    return(res)    

def feature_to_list(s, key):
    if k not in {"sentid", "docid"}:
        s = s.replace("{","").replace("}","")
        s = "[" + s + "]"
        res = eval(s)
    else:
        res = s
        
    return(res)

def index_to_tokens(x, row):
    start = x[0]
    end = start + len(x)
    return(row["word"][start:end])

def consecutive(data, stepsize=1):
    res = np.split(data, np.where(np.diff(data) != stepsize)[0]+1)
    res = np.array(res)
    return(res)

def tokens_nonconsecutive_ner(word, ners, label = "TAXA"):  
    tagged = [i for i,(w, ner) in enumerate(zip(word, ners)) if ner == label]

    ner_label_indices = consecutive(tagged)
    
    tagged_offset = tagged[1:]
    tagged_offset.append(None)
    
    w = np.array(word)
    targets = [w[i] for i in ner_label_indices]
    
    entities = []
    
    for i, (idx, target) in enumerate(zip(ner_label_indices, targets)):
        res = dict()
        res["idx"] = idx.tolist()
        res[label] = w[idx].tolist()
        entities.append(res)

    return(entities)

pattern_genus = re.compile("[A-Z][.]")

class smart_dict(dict):
    def __missing__(self, key):
        return(key)

def obtain_candidates(df):
    candidates = []
    
    for i, row in df.iterrows():
        if len({"TAXA", "INTERVALNAME"}.intersection(set(row["ners"]))) > 1:
            taxa = tokens_nonconsecutive_ner(row["word"], row["ners"], "TAXA")
            intervals = tokens_nonconsecutive_ner(row["word"], row["ners"], "INTERVALNAME")

            #Deabbreviate genus names
            is_taxa = np.array(row["ners"]) == "TAXA"
            is_abbrev = np.array([True if pattern_genus.match(x) else False for x in row["word"]])

            abbrevs = set(np.array(row["word"])[is_taxa & is_abbrev])

            if abbrevs:
                d = smart_dict()
                for abbrev in abbrevs:
                    d[abbrev] = replace_abbrev(abbrev, df, i, 8)
                
                for entity in taxa:
                    for k, v in d.items():
                        entity["TAXA"] = [v if x == k else x for x in entity["TAXA"]]
            
            ## set up dependency tree in networkx
            G = nx.Graph()

            dep = row["dep_parents"]
            nodes = [x+1 for x in range(len(dep))]
            parents = [int(x) for x in dep]
            edges = zip(parents, nodes)

            G.add_edges_from(edges)

            ## All combinations of the spans (product)
            for p in itertools.product(taxa, intervals):
                sdp = dict()
                a, b = sorted([p[0], p[1]], key = lambda x: x["idx"])
                
                try:
                    sdp["idx"] = nx.shortest_path(G, a["idx"][-1], b["idx"][0])
                except:
                    print(row["docid"])
                    print(row["sentid"])
                    #break
                    raise Exception("Could not compute SPD")
                    
                sdp["words"] = [row["word"][i] for i in sdp["idx"]]

                ## Compute SPD for each
                candidate = dict()

                candidate["sdp"] = sdp

                candidate["TAXA"] = p[0]
                candidate["INTERVALNAME"] = p[1]
                candidate["sentid"] = int(i)
                candidate["sentence"] = row["word"]
                candidate["gddid"] = row["docid"]

                candidates.append(candidate)

    return(candidates)

def replace_abbrev(abbrev, df, index, count):
    previous_words = df.iloc[index]["word"]
    
    genus_toks = [x for x in previous_words if x.startswith(abbrev[0]) and len(x) > 2]
    
    if genus_toks:
        replacement = genus_toks[-1]
    else:
        if int(count) > 0:
            replacement = replace_abbrev(abbrev, df, index, count -1)
        else:
            replacement = abbrev
    return(replacement)

In [750]:
header = ["docid", "sentid", "wordidx", "word", "poses", "ners", "lemmas", "dep_paths", "dep_parents"]

In [799]:
fpaths = glob.glob("data/nlp390_bryozoa/*")
fpaths = glob.glob("data/*.tsv")

import tqdm

## Read the csv files

for fpath in tqdm.tqdm_notebook(fpaths):

    df = pd.read_csv(fpath, header=None, names = header, sep ="\t")

    for k, v in df.iteritems():
        df[k] = [feature_to_list(x, k) for x in v]
    candidates = obtain_candidates(df)

    if candidates:
        gddid = candidates[0]["gddid"]


        with open("output/{}.json".format(gddid), "w") as f:
            json.dump(candidates, f, indent=4, sort_keys=True)




gddid: 5ab3d0c3cf58f10e4a1e0ea5

2 	 1 	 Crisia
0 	 2 	 tenuis
2 	 3 	 MacGillivray
2 	 4 	 ,
2 	 5 	 1879
5 	 6 	 recorded
11 	 7 	 from
11 	 8 	 the
11 	 9 	 Port
11 	 10 	 Phillip
6 	 11 	 area
11 	 12 	 ,
11 	 13 	 and
17 	 14 	 also
17 	 15 	 from
17 	 16 	 the
11 	 17 	 Miocene
20 	 18 	 of
20 	 19 	 Muddy
17 	 20 	 Creek
2 	 21 	 ,
25 	 22 	 Victoria
25 	 23 	 ,
25 	 24 	 Australia
2 	 25 	 exhibits
25 	 26 	 circular
29 	 27 	 to
29 	 28 	 oval
25 	 29 	 pseudopores
32 	 30 	 on
32 	 31 	 the
29 	 32 	 zooids
25 	 33 	 and
35 	 34 	 distinctly
25 	 35 	 slit
37 	 36 	 like
35 	 37 	 pseudopores
40 	 38 	 on
40 	 39 	 the
37 	 40 	 ovicell
2 	 41 	 .


In [707]:
#df.iloc[182]
for c in candidates:
    print(c["sdp"]["words"])
    print(c["TAXA"])
    
candidates[0]

['Holocene', '71', 'from', '-LRB-', 'Exidmonea']
{'idx': [10], 'TAXA': []}
['atlantica', 'auct.is', 'a', 'that', 'by', '-LRB-', 'Miocene']
{'idx': [6, 7], 'TAXA': []}
['atlantica', 'auct.is', 'a', 'that', 'by', '-LRB-', '-RRB-', 'Burdigalian']
{'idx': [6, 7], 'TAXA': []}
['Miocene', '-LRB-', 'by', '.', 'Exidmonea']
{'idx': [42, 43], 'TAXA': []}
['Burdigalian', '-RRB-', '-LRB-', 'by', '.', 'Exidmonea']
{'idx': [42, 43], 'TAXA': []}
['proboscidea', 'ranges', 'in', 'to', 'Cretaceous']
{'idx': [0, 1], 'TAXA': ['66']}
['proboscidea', 'ranges', 'in', 'to', ',', 'Recent']
{'idx': [0, 1], 'TAXA': ['66']}
['proboscidea', 'ranges', 'in', 'from', 'Cenozoic']
{'idx': [0, 1], 'TAXA': ['66']}
['elongata', 'from', 'Crisia', 'of', 'Neogene']
{'idx': [6, 7], 'TAXA': []}
['sp.', 'by', 'under', 'circular', 'Pliocene']
{'idx': [5, 6], 'TAXA': []}
['tenuis', 'MacGillivray', 'recorded', 'from', ',', 'of', 'Miocene']
{'idx': [0, 1], 'TAXA': ['66']}
['acropora', ',', 'is', 'from', 'and', 'Recent']
{'idx': [0,

{'INTERVALNAME': {'INTERVALNAME': [], 'idx': [7]},
 'TAXA': {'TAXA': [], 'idx': [10]},
 'gddid': '5ab3d0c3cf58f10e4a1e0ea5',
 'sdp': {'idx': [7, 9, 5, 11, 10],
  'words': ['Holocene', '71', 'from', '-LRB-', 'Exidmonea']},
 'sentence': ['Location',
  'map',
  'Fossil',
  'Cyclostome',
  'Bryozoans',
  'from',
  'the',
  'Holocene',
  'Rocks',
  '71',
  'Exidmonea',
  '-LRB-',
  '?',
  '-RRB-'],
 'sentid': 38}

In [708]:
df

Unnamed: 0,docid,sentid,wordidx,word,poses,ners,lemmas,dep_paths,dep_parents
0,5ab3d0c3cf58f10e4a1e0ea5,1,"[1, 2]","[Gond, .]","[NNP, .]","[O, O]","[Gond, .]","[, punct]","[0, 1]"
1,5ab3d0c3cf58f10e4a1e0ea5,2,"[1, 2]","[Geol, .]","[NNP, .]","[O, O]","[Geol, .]","[, punct]","[0, 1]"
2,5ab3d0c3cf58f10e4a1e0ea5,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[Magz., ,, V., 23, -LRB-, 1, -RRB-, ,, June, ,...","[NNP, ,, NNP, CD, -LRB-, CD, -RRB-, ,, NNP, ,,...","[O, O, O, NUMBER, O, NUMBER, O, O, DATE, DATE,...","[Magz., ,, V., 23, -lrb-, 1, -rrb-, ,, June, ,...","[, punct, appos, nummod, punct, appos, punct, ...","[0, 1, 1, 3, 6, 3, 6, 3, 3, 9, 9, 1]"
3,5ab3d0c3cf58f10e4a1e0ea5,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[pp., 69-76, Fossil, Cyclostome, Bryozoans, fr...","[NN, CD, NNP, NNP, NNPS, IN, DT, NNP, NNP, IN,...","[O, NUMBER, MISC, MISC, MISC, O, O, INTERVALNA...","[pp., 69-76, Fossil, Cyclostome, Bryozoans, fr...","[, nummod, compound, compound, dep, case, det,...","[0, 1, 5, 5, 1, 9, 9, 9, 5, 12, 12, 9, 14, 12,..."
4,5ab3d0c3cf58f10e4a1e0ea5,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[Institute, of, Science, ,, Auragabad, ,, Maha...","[NNP, IN, NNP, ,, NNP, ,, NNP, ,, NNP, .]","[ORGANIZATION, ORGANIZATION, ORGANIZATION, O, ...","[Institute, of, Science, ,, Auragabad, ,, Maha...","[, case, nmod, punct, compound, punct, appos, ...","[0, 3, 1, 3, 9, 9, 9, 9, 3, 1]"
5,5ab3d0c3cf58f10e4a1e0ea5,6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[*, Address, for, correspondence, :, 425/75, ,...","[SYM, NN, IN, NN, :, CD, ,, NNP, NNP, ,, NNP, ...","[O, O, O, O, O, NUMBER, O, O, O, O, PERSON, PE...","[*, address, for, correspondence, :, 425/75, ,...","[dep, dep, case, nmod, punct, , punct, compoun...","[2, 6, 4, 2, 6, 0, 6, 9, 6, 6, 14, 14, 14, 6, ..."
6,5ab3d0c3cf58f10e4a1e0ea5,7,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Email, -, drmohansonar@yahoo.com, Email, -, d...","[VB, :, NNP, VB, :, NNP, NN, NN, VBZ, CD, NNS,...","[O, O, EMAIL, O, O, EMAIL, O, O, O, NUMBER, O,...","[email, -, drmohansonar@yahoo.com, email, -, d...","[, punct, nsubj, xcomp, punct, compound, compo...","[0, 4, 4, 1, 4, 8, 8, 9, 4, 11, 9, 15, 15, 15,..."
7,5ab3d0c3cf58f10e4a1e0ea5,8,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Taxa, belong, to, the, famililies, Tubulopori...","[NN, VBP, TO, DT, NNS, NNP, NNP, ,, CD, ,, NNP...","[O, O, O, O, O, O, PERSON, O, DATE, O, PERSON,...","[taxa, belong, to, the, familily, Tubuloporida...","[nsubj, , case, det, nmod, compound, dobj, pun...","[2, 0, 5, 5, 2, 7, 2, 7, 7, 7, 12, 7, 7, 7, 7,..."
8,5ab3d0c3cf58f10e4a1e0ea5,9,"[1, 2, 3, 4, 5, 6, 7]","[These, species, are, Exidmonea, -LRB-, ?, -RRB-]","[DT, NNS, VBP, NNP, -LRB-, ., -RRB-]","[O, O, O, TAXA, O, O, O]","[these, species, be, Exidmonea, -lrb-, ?, -rrb-]","[det, nsubj, cop, , punct, punct, punct]","[2, 4, 4, 0, 4, 4, 4]"
9,5ab3d0c3cf58f10e4a1e0ea5,10,"[1, 2, 3, 4, 5, 6, 7, 8]","[atlantica, David, ,, Mongereau, and, Pouyet, ...","[NN, NNP, ,, NNP, CC, NNP, ,, .]","[TAXA, PERSON, O, PERSON, O, LOCATION, O, O]","[atlantica, David, ,, Mongereau, and, Pouyet, ...","[compound, , punct, appos, cc, conj, punct, pu...","[2, 0, 2, 2, 4, 4, 2, 2]"


In [469]:
[' '.join(x["sentence"]) for x in candidates]

['Canu and Bassler -LRB- 1919 -RRB- , reporting on two Dominican Republic Miocene localities , described a new genus Corynostylusa , nd two new species , C. labiatus , the type species , and C. ellipticus.The authors placed Corynostyluisn the family Calpensiidae .',
 "Discussion : In lacking heterozooecia and in having articulated fusiform internodes , Fusicanna bears a resemblance to Fusicellaria d'Orbigny , 1851 , from the Senonian of France .",
 "Discussion : In lacking heterozooecia and in having articulated fusiform internodes , Fusicanna bears a resemblance to Fusicellaria d'Orbigny , 1851 , from the Senonian of France ."]