In [6]:
# Perform standard imports
import spacy
# Load the 'en_core_web_sm' pipeline once; later cells reuse `nlp` both to
# parse text and to obtain the shared vocabulary (`nlp.vocab`).
nlp = spacy.load('en_core_web_sm')


In [7]:
# Import the Matcher library
from spacy.matcher import Matcher
# Create a token-pattern matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)


In [8]:
# Three token patterns that all describe the phrase "solar power".
# 'LOWER' compares against the lowercased token text, so matching is
# case-insensitive.
pat1 = [{'LOWER': 'solarpower'}]                                      # one token
pat2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]                       # two adjacent tokens
pat3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]   # split by one punctuation token

# Register all patterns under a single key. The patterns are passed as one
# list (second positional argument): this form is accepted by spaCy >= 2.2.2
# and is the only form accepted by spaCy 3.x, which rejects the legacy
# `add(key, None, *patterns)` call.
matcher.add('SolarPower', [pat1, pat2, pat3])

In [9]:
# Sample text containing "Solar Power", "solarpower" and "Solar-power".
doc = nlp(
    'The Solar Power industry continues to grow as demand '
    'for solarpower increases. Solar-power cars are gaining popularity.'
)


In [10]:
# Run the matcher over the Doc; the result is a list of
# (match_id, start, end) tuples, as unpacked by the reporting loop further down.
found_matches = matcher(doc)
print(found_matches)


[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [15]:
# One line per match: hash, string key, start index, end index, matched span.
for match_id, start, end in found_matches:
    # Look the hash up in the vocab's string store and slice the Doc by the
    # reported indices.
    string_id, span = nlp.vocab.strings[match_id], doc[start:end]
    print(match_id, string_id, start, end, span)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [16]:
# Redefine the patterns: the punctuation token now carries 'OP': '*'
# (zero or more), so a single pattern covers "solar power", "solar-power"
# and "solar--power".
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

# Remove the old patterns to avoid duplication. The membership guard keeps
# this cell runnable when the key is not registered (e.g. on a fresh
# matcher), where an unconditional remove() would raise.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher. Patterns go in as
# one list: accepted by spaCy >= 2.2.2 and required by spaCy 3.x, which
# rejects the legacy `add(key, None, *patterns)` call.
matcher.add('SolarPower', [pattern1, pattern2])


In [22]:
# Text with repeated punctuation inside and after the phrase.
doc2 = nlp('Solar--power is solarpower!!')

In [23]:
# Re-run the matcher (now holding the 'OP': '*' patterns) on doc2.
# Note: this rebinds `found_matches`, replacing the results for `doc`.
found_matches = matcher(doc2)
print(found_matches)


[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [24]:
# Same patterns as before, except the final token is matched on its LEMMA
# instead of its lowercased text, so inflected forms whose lemma is "power"
# can match as well.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LEMMA': 'power'},
]

# Remove the old patterns to avoid duplication. The membership guard keeps
# this cell runnable when the key is not registered, where an unconditional
# remove() would raise.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher. Patterns go in as
# one list: accepted by spaCy >= 2.2.2 and required by spaCy 3.x, which
# rejects the legacy `add(key, None, *patterns)` call.
matcher.add('SolarPower', [pattern1, pattern2])


In [25]:
# Sentence with two hyphenated "solar-powered" occurrences.
doc3 = nlp('Solar-powered energy runs solar-powered cars.')

In [27]:
# Apply the LEMMA-based patterns to doc3 and show the raw
# (match_id, start, end) tuples. Rebinds `found_matches` again.
found_matches = matcher(doc3)
print(found_matches)


[(8656102463236116519, 0, 3)]


In [28]:
from spacy.matcher import PhraseMatcher
# NOTE: this rebinds `matcher`; from here on it is a PhraseMatcher, and the
# token-pattern Matcher created earlier is no longer reachable by this name.
matcher = PhraseMatcher(nlp.vocab)

In [36]:
import chardet

# Location of the sample text.
# NOTE(review): this is an absolute path on one specific machine; point it at
# your own copy of TextFiles/reaganomics.txt before running.
REAGANOMICS_PATH = '/Users/harkiratchahal/Desktop/Coding/Tutorials/NLP/UPDATED_NLP_COURSE/TextFiles/reaganomics.txt'

# Read raw bytes first, because the file's encoding is not known in advance.
# Only the read needs the open handle, so the `with` block ends right after it.
with open(REAGANOMICS_PATH, 'rb') as f:
    byte_content = f.read()

# chardet reports an encoding of None when it cannot make a guess (for
# example on an empty file); fall back to UTF-8 instead of failing inside
# bytes.decode(None) with a TypeError.
detected_encoding = chardet.detect(byte_content)['encoding'] or 'utf-8'
content = byte_content.decode(detected_encoding)

# NOTE: rebinds `doc3`, which previously held the short 'Solar-powered ...'
# sentence; the cells below expect it to be the parsed file contents.
doc3 = nlp(content)


In [38]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics',
               'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object. `nlp.make_doc` only tokenizes,
# which is all the PhraseMatcher needs for its default verbatim-text
# matching; running the full pipeline with `nlp(text)` does unnecessary work.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Register the pattern Docs under one key. They are passed as a single list:
# accepted by spaCy >= 2.2.2 and required by spaCy 3.x, which rejects the
# legacy `add(key, None, *docs)` call.
matcher.add('VoodooEconomics', phrase_patterns)

# Build a list of matches:
matches = matcher(doc3)


In [39]:
# Display the raw (match_id, start, end) tuples returned by the matcher.
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]

In [41]:
# One line per phrase match: hash, string key, start index, end index, span.
for match_id, start, end in matches:
    # Look the hash up in the vocab's string store and slice the Doc by the
    # reported indices.
    string_id, span = nlp.vocab.strings[match_id], doc3[start:end]
    print(match_id, string_id, start, end, span)


3473369816841043438 VoodooEconomics 41 45 supply-side economics
3473369816841043438 VoodooEconomics 49 53 trickle-down economics
3473369816841043438 VoodooEconomics 54 56 voodoo economics
3473369816841043438 VoodooEconomics 61 65 free-market economics
3473369816841043438 VoodooEconomics 673 677 supply-side economics
3473369816841043438 VoodooEconomics 2987 2991 trickle-down economics
