In [1]:
def merge_entities(doc):
    """Collapse every entity span of a spaCy doc into a single token.

    Intended to be registered as a pipeline component, e.g.
    nlp.add_pipe(merge_entities).

    doc (spacy.tokens.Doc): The Doc object to process.
    RETURNS (Doc): The same Doc object, after each entity has been merged.
    """
    # Record the character offsets and root attributes of all entities first;
    # merging only starts once the complete list has been built.
    pending = []
    for entity in doc.ents:
        pending.append((entity.start_char, entity.end_char,
                        entity.root.tag, entity.root.dep, entity.label))

    for span_start, span_end, root_tag, root_dep, label in pending:
        doc.merge(span_start, span_end,
                  tag=root_tag, dep=root_dep, ent_type=label)
    return doc

In [2]:
import dateparser as dp
import string
# Punctuation characters that tokenize() strips out of every word.
exclude = set(string.punctuation)

# Support for maths
import numpy as np
# Plotting tools
from matplotlib import pyplot as plt
# we use the following for plotting figures in jupyter
%matplotlib inline


import spacy
from spacy_hunspell import spaCyHunSpell

# Load the spaCy English model and build the hunspell spellchecker component
# from the dictionary (.dic) and affix (.aff) files under ./src/hunspell.
nlp = spacy.load('en')
hunspell = spaCyHunSpell(nlp, ('./src/hunspell/en_US.dic', './src/hunspell/en_US.aff'))

# Register both components on the pipeline: the spellchecker first, then
# merge_entities (defined above) under the name 'merge_entities'.
nlp.add_pipe(hunspell)
nlp.add_pipe(merge_entities, name='merge_entities')

In [18]:
from nltk.tree import Tree
import networkx as nx

In [4]:
def tokenize(sentence, excluded=None):
    """Normalise a sentence: strip punctuation, lowercase, single-space words.

    Despite its name, this returns a string rather than a list of tokens.

    sentence (str): Raw input text.
    excluded (iterable of str, optional): Characters removed from every word.
        Defaults to the module-level ``exclude`` set.
    RETURNS (str): The non-empty cleaned words, lowercased and joined by
        single spaces.
    """
    if excluded is None:
        excluded = exclude
    tokenized_sentence = []
    # split() without an argument splits on any run of whitespace, so tabs and
    # newlines separate words instead of staying embedded inside them (which
    # is what split(' ') allowed).
    for token in sentence.split():
        token = ''.join(ch for ch in token if ch not in excluded)
        # A word made only of excluded characters becomes empty: drop it.
        if token != '':
            tokenized_sentence.append(token.lower())
    return ' '.join(tokenized_sentence)

In [5]:
def correction_spacy(parsed):
    """Spell-correct a parsed text and re-parse the corrected sentence.

    Every token of ``parsed.doc`` whose ``_.hunspell_spell`` flag is falsy is
    replaced by the first entry of its ``_.hunspell_suggest`` list. A flagged
    token with no suggestions is kept unchanged.

    parsed: A parsed spaCy object exposing ``.doc``, whose tokens carry the
        ``hunspell_spell`` and ``hunspell_suggest`` extension attributes.
    RETURNS: The result of running ``nlp`` on the corrected words joined by
        single spaces.
    """
    corrected = []
    for w in parsed.doc:
        replacement = str(w)
        if not w._.hunspell_spell:
            suggestions = w._.hunspell_suggest
            # With an empty suggestion list, indexing [0] would raise
            # IndexError; fall back to the original word in that case.
            if suggestions:
                replacement = str(suggestions[0])
        corrected.append(replacement)
    return nlp(' '.join(corrected))

In [6]:
def get_date_sentence(sentence):
    """Return the text of every entity in ``sentence`` labelled "DATE".

    sentence: Parsed object exposing ``ents``; each entity provides
        ``label_`` and ``text``.
    RETURNS (list): The texts of the DATE entities, in the order they
        appear in ``sentence.ents``.
    """
    return [ent.text for ent in sentence.ents if ent.label_ == "DATE"]

In [7]:
# Date conversion
def time_convert(list_str_date):
    """Parse each date string with ``dp.parse``.

    list_str_date (iterable of str): Date expressions to convert.
    RETURNS (list): One ``dp.parse`` result per input string, in input order.
    """
    converted = []
    for date_text in list_str_date:
        converted.append(dp.parse(date_text))
    return converted


In [8]:
def get_nsubj(sentence):
    """Print every token's attributes and collect the nominal subjects.

    For each token, one line is printed with its text, lemma, POS, tag,
    dependency label, shape and sentiment.

    sentence: Iterable of tokens exposing ``text``, ``lemma_``, ``pos_``,
        ``tag_``, ``dep_``, ``shape_`` and ``sentiment``.
    RETURNS (list): The tokens whose ``dep_`` equals 'nsubj', in order.
    """
    subjects = []
    for tok in sentence:
        attributes = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
                      tok.shape_, tok.sentiment)
        print(*attributes)
        if tok.dep_ != 'nsubj':
            continue
        subjects.append(tok)
    return subjects

In [98]:
# Sample input. It contains misspellings ("avalable", "febraury") that the
# spellcheck step further down corrects.
input_text = "This is a test text, which mean i am avalable  the 24th of febraury, and tomorrow. But John will be here tuesday"

In [99]:
## First task: clean the data (strip punctuation, lowercase, single-space)

# NOTE: this rebinds input_text to its cleaned form.
input_text = tokenize(input_text)

print(input_text)

this is a test text which mean i am avalable the 24th of febraury and tomorrow but john will be here tuesday


In [100]:
## Next, the cleaned text is parsed with the spaCy pipeline (nlp)
parsed = nlp(input_text)

In [101]:
# Spellcheck: replace the words hunspell flags as misspelled, then re-parse
corrected = correction_spacy(parsed)
print(corrected)

this is a test text which mean i am available the 24th of February and tomorrow but john will be here Tuesday


In [102]:
# Date extraction: collect the text of every entity labelled DATE
dates_str = get_date_sentence(corrected)
print(dates_str)

['the 24th of February', 'tomorrow', 'Tuesday']


In [103]:
# Date conversion: parse each extracted date string with dateparser
dates = time_convert(dates_str)
print(dates)

[datetime.datetime(2019, 2, 24, 0, 0), datetime.datetime(2019, 1, 27, 22, 33, 56, 18574), datetime.datetime(2019, 1, 22, 0, 0)]


In [104]:
# nsubj extraction: tokens whose dependency label is 'nsubj'
# (get_nsubj also prints one line of attributes per token)
nsubj = get_nsubj(corrected)
print(nsubj)

this this DET DT nsubj xxxx 0.0
is be VERB VBZ ROOT xx 0.0
a a DET DT det x 0.0
test test NOUN NN compound xxxx 0.0
text text NOUN NN attr xxxx 0.0
which which ADJ WDT nsubj xxxx 0.0
mean mean VERB VBP relcl xxxx 0.0
i i PRON PRP nsubj x 0.0
am be VERB VBP ccomp xx 0.0
available available ADJ JJ acomp xxxx 0.0
the 24th of February the NOUN NN npadvmod xxx ddxx xx Xxxxx 0.0
and and CCONJ CC cc xxx 0.0
tomorrow tomorrow NOUN NN conj xxxx 0.0
but but CCONJ CC cc xxx 0.0
john john PROPN NNP nsubj xxxx 0.0
will will VERB MD aux xxxx 0.0
be be VERB VB conj xx 0.0
here here ADV RB advmod xxxx 0.0
Tuesday tuesday PROPN NNP npadvmod Xxxxx 0.0
[this, which, i, john]


In [105]:
def get_dep_graph(document):
    """Build an undirected networkx graph from a dependency parse.

    One edge is added per (token, child) pair. Nodes are the string form of
    each token, so tokens with identical text map to the same node.

    document: Iterable of tokens exposing ``children``.
    RETURNS (networkx.Graph): The graph built from those edges.
    """
    # Token API reference: https://spacy.io/api/token
    edges = [
        ('{0}'.format(head), '{0}'.format(dependent))
        for head in document
        for dependent in head.children
    ]
    return nx.Graph(edges)


In [112]:
def get_subj_date(document, dates_str, subjects):
    """Pair every date with the subject closest to it in the dependency graph.

    Distance is the shortest-path length between the subject's text and the
    date string in the graph returned by ``get_dep_graph(document)``. On a
    tie, the earliest subject in ``subjects`` wins.

    document: Parsed document handed to ``get_dep_graph``.
    dates_str (list of str): Date strings; each is used as a graph node name.
    subjects (sequence): Subject tokens exposing ``text``.
    RETURNS (list of tuple): One ``(subject text, date)`` pair per entry of
        ``dates_str``, in the same order. Empty when ``subjects`` is empty.
    """
    # With no subject there is nothing to pair dates with, and np.argmin
    # raises ValueError on an empty sequence.
    if len(subjects) == 0:
        return []

    graph = get_dep_graph(document)

    subject_date = []
    for date in dates_str:
        # Measure against the `subjects` argument, not the notebook-level
        # `nsubj` variable, so the function does not depend on kernel state.
        distances = [
            nx.shortest_path_length(graph, source=subj.text, target=date)
            for subj in subjects
        ]
        closest = subjects[int(np.argmin(distances))]
        subject_date.append((closest.text, date))

    return subject_date

[('i', 'the 24th of February'), ('i', 'tomorrow'), ('john', 'Tuesday')]
