In [1]:
import dateparser as dp
import string
# Set of ASCII punctuation characters; tokenize() drops these from every token.
exclude = set(string.punctuation)

import spacy
from spacy_hunspell import spaCyHunSpell

# Load the spaCy English pipeline and build a Hunspell spell-check component
# from the en_US dictionary/affix files (paths are relative to the working
# directory, so the notebook must be run from the project root).
nlp = spacy.load('en')
hunspell = spaCyHunSpell(nlp, ('./src/hunspell/en_US.dic', './src/hunspell/en_US.aff'))

# Register the component so parsed tokens expose `._.hunspell_spell` and
# `._.hunspell_suggest`, which correction_spacy() relies on.
nlp.add_pipe(hunspell)

In [3]:
def tokenize(sentence, punctuation=None):
    """Normalize a raw sentence: strip punctuation, lowercase, collapse whitespace.

    Args:
        sentence: Raw input string.
        punctuation: Optional container of characters to remove from each
            token. Defaults to the module-level ``exclude`` set.

    Returns:
        The surviving tokens, lowercased and joined by single spaces.
        Tokens that consist only of punctuation are dropped, so the result
        may be an empty string.
    """
    if punctuation is None:
        punctuation = exclude

    cleaned_tokens = []
    # split() without an argument splits on any run of whitespace (spaces,
    # tabs, newlines). The previous split(' ') only recognised the space
    # character, leaving tabs/newlines embedded inside tokens.
    for raw_token in sentence.split():
        cleaned = ''.join(ch for ch in raw_token if ch not in punctuation)
        if cleaned:
            cleaned_tokens.append(cleaned.lower())
    return ' '.join(cleaned_tokens)

In [32]:
def correction_spacy(parsed):
    """Spell-correct a parsed text and re-parse the corrected sentence.

    Each token that the Hunspell extension flags as misspelled is replaced
    by Hunspell's first suggestion; all other tokens are kept unchanged.

    Args:
        parsed: Object returned by ``nlp(...)`` (anything exposing ``.doc``
            whose tokens carry the ``._.hunspell_spell`` and
            ``._.hunspell_suggest`` extensions).

    Returns:
        The result of running ``nlp`` on the corrected tokens joined by
        single spaces.
    """
    corrected = []
    for token in parsed.doc:
        if token._.hunspell_spell:
            corrected.append(str(token))
            continue
        suggestions = token._.hunspell_suggest
        # Hunspell can flag a word without offering any replacement; keep the
        # original token in that case instead of raising IndexError on [0].
        if suggestions:
            corrected.append(str(suggestions[0]))
        else:
            corrected.append(str(token))
    return nlp(' '.join(corrected))

In [37]:
def get_date_sentence(sentence):
    """Return the text of every entity labelled ``DATE`` in a parsed sentence.

    Args:
        sentence: Parsed object exposing ``.ents``, an iterable of entities
            with ``label_`` and ``text`` attributes.

    Returns:
        A list of entity texts, in the order the entities appear in
        ``sentence.ents``; empty if there is no ``DATE`` entity.
    """
    return [ent.text for ent in sentence.ents if ent.label_ == "DATE"]

In [47]:
# Date conversion
def time_convert(list_str_date):
    """Parse each date string with ``dateparser`` and return the results.

    Args:
        list_str_date: Iterable of date expressions as strings
            (e.g. ``'the 24th of February'``, ``'tomorrow'``).

    Returns:
        A list with one ``dp.parse`` result per input string, in input
        order. NOTE(review): dateparser presumably yields ``None`` for
        strings it cannot parse -- callers should confirm and filter.
    """
    parsed_dates = []
    for date_text in list_str_date:
        parsed_dates.append(dp.parse(date_text))
    return parsed_dates


In [26]:
# Demo sentence: contains the misspellings "avalable" and "febraury" plus two
# date expressions, so it exercises the spellcheck and date-extraction steps.
input_text = "This is a test text, which mean i am avalable  the 24th of febraury, and tomorrow"

In [30]:
## First task: clean the data (strip punctuation, lowercase, collapse spaces)

# NOTE: this rebinds input_text, so the raw sentence is no longer available
# to later cells once this cell has run.
input_text = tokenize(input_text)

print(input_text)

this is a test text which mean i am avalable the 24th of febraury and tomorrow


In [31]:
## Next, the cleaned text is parsed with the spaCy pipeline (`nlp`)
parsed = nlp(input_text)

In [39]:
# Spellcheck: replace tokens Hunspell flags as misspelled with its first
# suggestion, then re-parse the corrected sentence.
corrected = correction_spacy(parsed)
print(corrected)

this is a test text which mean i am available the 24th of February and tomorrow
['the 24th of February', 'tomorrow']


In [46]:
# Date extraction: collect the text of every entity labelled DATE in the
# spell-corrected parse.
dates_str = get_date_sentence(corrected)
print(dates_str)

['the 24th of February', 'tomorrow']


In [48]:
# Date conversion: run each extracted date string through dateparser.
# NOTE(review): relative expressions such as 'tomorrow' appear to resolve
# against the current time, so this output will differ between runs.
dates = time_convert(dates_str)
print(dates)

[datetime.datetime(2019, 2, 24, 0, 0), datetime.datetime(2019, 1, 27, 20, 9, 0, 563482)]
