In [1]:
import nltk
import re
import numpy as np
import functools

import spacy
import en_core_web_lg

from rules import *

In [2]:
def visual(doc, _type):
    """Print a two-column, tab-separated summary of a parsed document.

    Args:
        doc: Object exposing ``ents`` (items with ``label_`` and ``text``)
            and ``noun_chunks`` (items with ``root.text`` and ``text``).
        _type: ``'entity'`` prints "label <tab> text" for every entity;
            ``'noun_chunks'`` prints "root <tab> text" for every noun chunk.
            Any other value prints nothing.
    """
    if _type == 'entity':
        rows = ((ent.label_, ent.text) for ent in doc.ents)
    elif _type == 'noun_chunks':
        rows = ((chunk.root.text, chunk.text) for chunk in doc.noun_chunks)
    else:
        rows = ()
    for head, body in rows:
        print(head, '\t', body)

def precook(sent, vocab):
    """Lowercase a sentence and keep only the tokens found in `vocab`.

    Similar to basic_cleaner.py.

    Args:
        sent: Raw sentence string.
        vocab: Container of allowed tokens; only membership (`in`) is used.

    Returns:
        The surviving tokens joined by single spaces, stripped of
        leading/trailing whitespace.
    """
    words = nltk.word_tokenize(sent.lower())
    kept = (w for w in words if w in vocab)
    return ' '.join(kept).strip()

def read_txt(path):
    """Read a UTF-8 text file into a list of whitespace-stripped lines.

    Args:
        path: Location of the file to read.

    Returns:
        One string per line of the file, in file order, each with leading
        and trailing whitespace removed (blank lines become empty strings).
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

# Stored as a set: precook() tests every token with `in`, which is a linear
# scan on a list but a hash lookup on a set. Membership results are unchanged.
wiki_vocab = set(read_txt('cache/vocab_clean.txt'))

In [3]:
def _build_useless_pattern():
        # Month
        month = [x.lower() for x in MONTH]
        in_month = ['in ' + x for x in month]
        month += in_month
        month.remove('may')
        # Maybe Others...

        # Combined
        useless = month
        r = r'\b(%s)\b' % '|'.join(useless)
        return r
    
# Build the month pattern once at notebook level; the month-removal cell
# below passes it to re.sub.
useless_pattern = _build_useless_pattern()
display(useless_pattern)

'\\b(january|february|march|april|june|july|august|september|october|november|december|in january|in february|in march|in april|in may|in june|in july|in august|in september|in october|in november|in december)\\b'

In [4]:
# __init__
# Load the spaCy English model once; later cells reuse `nlp` for entities,
# noun chunks and token tags.
print("Load language model...")
nlp = en_core_web_lg.load()
print("Done!")

Load language model...
Done!


In [5]:
# Maps fine-grained token tags onto the coarse labels (IN, DT, JJ, NN) that
# the chunk grammars match on; tags missing from the mapping are passed
# through unchanged by pos_tag().
pos_dict = {
    fine_tag: coarse_tag
    for coarse_tag, fine_tags in (
        ('IN', ('IN',)),
        ('DT', ('DT', 'PDT')),
        ('JJ', ('JJ', 'JJR', 'JJS')),
        ('NN', ('NN', 'NNS', 'NNP', 'NNPS')),
    )
    for fine_tag in fine_tags
}

# Grammar 1: a CHUNK is one or more IN tags, then an optional DT or PRP$,
# then any number of JJ, then one or more NN (tags as normalised by pos_dict).
grammar1 = r"""
            CHUNK: {<IN>+<DT|PRP\$>?<JJ>*<NN>+}
        """
chunk_parser1 = nltk.RegexpParser(grammar1)

# Grammar 2: an NP is an NN immediately followed by at least one more NN.
grammar2 = r"""
            NP: {<NN><NN>+}
        """
chunk_parser2 = nltk.RegexpParser(grammar2)

In [30]:
# Sample alt-text captions; the cells below walk the first one through the
# pipeline step by step.
alt_texts = [
    "Harrison Ford and Calista Flockhart attend the premiere of Hollywood Homicide at the 29th American Film Festival September 5, 2003 in Deauville, France.",
    "Side view of a British Airways Airbus A319 aircraft on approach to land with landing gear down",
    "Two sculptures by artist Duncan McKellar adorn trees outside the derelict Norwich Union offices in Bristol, UK",
    "A Pakistani worker helps to clear the debris from the Taj Mahal Hotel November 7, 2005 in Balakot, Pakistan.",
    "Musician Justin Timberlake performs at the 2017 Pilgrimage Music & Cultural Festival on September 23, 2017 in Franklin, Tennessee."
]
# NOTE(review): precook() already lowercases its input, so the extra
# .lower() here is redundant (harmless, but can be dropped).
alt_text = precook(alt_texts[0].lower(), wiki_vocab)
alt_text

'harrison ford and calista flockhart attend the premiere of hollywood homicide at the american film festival september in deauville france'

In [31]:
# remove Date month
# Delete every match of the month pattern, then collapse the whitespace the
# deletions leave behind.
text = ' '.join(re.sub(useless_pattern, "", alt_text).split())
text

'harrison ford and calista flockhart attend the premiere of hollywood homicide at the american film festival in deauville france'

In [32]:
# noun_chunker_simplify
# Reduce every noun chunk to its root word (keeping a leading determiner),
# leaving chunks that mention a detected PERSON untouched.
doc = nlp(text)

person_li = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
dt_li = {token.text for token in doc if token.tag_ in ('DT', 'PDT')}

display('person_li', person_li)
display('dt_li', dt_li)

def contain_person(chunk):
    """Return True when any PERSON entity text in `person_li` is a substring of `chunk`."""
    return any(p in chunk for p in person_li)

li = []     # (original chunk text, simplified text) pairs, kept for inspection
spans = []  # (start_char, end_char, simplified text) used for the rewrite
for chunk in doc.noun_chunks:
    src = chunk.text

    if contain_person(src):
        continue

    splits = src.split()
    # Keep the first word only when it is one of the document's determiners.
    dst = ' '.join([splits[0], chunk.root.text]) \
          if splits[0] in dt_li \
          else chunk.root.text
    li.append((src, dst))
    spans.append((chunk.start_char, chunk.end_char, dst))

# Rewrite by character offsets rather than str.replace(): replace() rewrites
# every occurrence of the chunk text, including matches inside longer words
# or inside the PERSON chunks skipped above. Going right to left keeps the
# offsets of the not-yet-rewritten chunks valid.
for start, end, dst in reversed(spans):
    text = text[:start] + dst + text[end:]

text = ' '.join(text.split())
text

'person_li'

['harrison ford', 'calista flockhart']

'dt_li'

{'the'}

'harrison ford and calista flockhart attend the premiere of homicide at the festival in france'

In [33]:
# name_entity_hyper
# Replace each entity whose label is in REPLACE_CLASSES or DROP_CLASSES with
# the label itself.
doc = nlp(text)
hyper_classes = REPLACE_CLASSES + DROP_CLASSES
visual(doc, 'entity')
newString = text
# Walk the entities back to front so earlier character offsets stay valid.
for e in reversed(doc.ents):
    if e.label_ not in hyper_classes:
        continue
    start = e.start_char
    end = start + len(e.text)
    newString = ''.join([newString[:start], e.label_, newString[end:]])
text = newString
text

PERSON 	 harrison ford
PERSON 	 calista flockhart
GPE 	 france


'PERSON and PERSON attend the premiere of homicide at the festival in GPE'

In [34]:
def _multi2single_head(text):
    """
    hard code implement
    """
    text = text.replace('PERSON and PERSON', 'people')
    text = text.replace('PERSON, PERSON and PERSON', 'people')
    return text

# Apply the PERSON-list collapse to the running caption and show the result.
text = _multi2single_head(text)
print(text)

people attend the premiere of homicide at the festival in GPE


In [35]:
def pos_tag(text):
    """Tag `text` with `nlp` and return (word, tag) pairs.

    Each token's tag is mapped through `pos_dict`; tags that have no entry
    there are kept as they are.

    Args:
        text: String to tag.

    Returns:
        list of (token text, tag) tuples in token order.
    """
    return [(tok.text, pos_dict.get(tok.tag_, tok.tag_)) for tok in nlp(text)]

In [36]:
# remove_pos_tag_patterns
# Chunk the tagged caption with grammar 1 and drop every top-level item that
# contains a DROP_CLASSES placeholder.
tagged = pos_tag(text)  # separate name so `text` never holds a list of tuples
result = chunk_parser1.parse(tagged)
print(result)
# Top-level leaves are (word, tag) tuples; chunk subtrees are flattened to
# their word lists.
terms = [[e[0]] if isinstance(e, tuple) else [w for w, t in e] for e in result]
drop_classes = set(DROP_CLASSES)
res = [x for x in terms if drop_classes.isdisjoint(x)]
# Flatten with a generator: functools.reduce without an initial value raises
# TypeError when `res` is empty (empty caption, or everything dropped).
text = ' '.join(w for x in res for w in x)
text

(S
  people/NN
  attend/VBP
  the/DT
  premiere/NN
  (CHUNK of/IN homicide/NN)
  (CHUNK at/IN the/DT festival/NN)
  (CHUNK in/IN GPE/NN))


'people attend the premiere of homicide at the festival'

In [37]:
# Chunk runs of consecutive nouns with grammar 2 and keep only the last word
# of each run.
text = pos_tag(text)
result = chunk_parser2.parse(text)
print(result)
# Leaf -> its word; NP subtree -> the word of its final (word, tag) leaf.
terms = [e[0] if isinstance(e, tuple) else e[-1][0] for e in result]
text = ' '.join(terms)
text

(S
  people/NN
  attend/VBP
  the/DT
  premiere/NN
  of/IN
  homicide/NN
  at/IN
  the/DT
  festival/NN)


'people attend the premiere of homicide at the festival'

In [52]:
# Test nlp.disable_pipes
# https://spacy.io/usage/processing-pipelines#disabling
# Scratch experiment: run the model with the "tagger" and "parser" components
# disabled for the duration of the `with` block, then inspect the entities.

with nlp.disable_pipes("tagger", "parser"):
    doc = nlp("harrison ford and calista flockhart attend the premiere of hollywood homicide at the american film festival september in deauville france")

# Disabled alternatives kept from the experiment (noun chunks and tag listing).
# NOTE(review): these presumably need the components disabled above -- confirm
# before re-enabling.
# visual(doc, 'noun_chunks')
visual(doc, 'entity')
# [
#     (token.text, pos_dict[token.tag_] if token.tag_ in pos_dict else token.tag_) \
#     for token in doc
# ]

PERSON 	 harrison ford
PERSON 	 calista flockhart
GPE 	 hollywood
EVENT 	 the american film festival september
GPE 	 deauville france
