In [18]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy import displacy

# Load the pipeline once; the cells below all reuse this shared `nlp` object.
SPACY_MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

def analyze_text(text):
    """Run `text` through the shared `nlp` pipeline and return the result.

    Args:
        text: The raw string to process.

    Returns:
        Whatever calling `nlp(text)` returns (the processed document).
    """
    processed = nlp(text)
    return processed



In [2]:
# # # Tokenization Explained # # #
doc = analyze_text('It costs $1 million.')

# Each entry pairs the printed label with the token attribute it reports.
# Labels are kept verbatim (including the missing space after "Orth:") so the
# printed output is unchanged.
token_views = (
    ("Index: ", lambda token: token.i),
    ("Orth:", lambda token: token.orth),
    ("Text: ", lambda token: token.text),
    ("is_alpha: ", lambda token: token.is_alpha),
    ("is_punct: ", lambda token: token.is_punct),
    ("like_num: ", lambda token: token.like_num),
)
for view_label, read_attribute in token_views:
    print(view_label, [read_attribute(token) for token in doc])

Index:  [0, 1, 2, 3, 4, 5]
Orth: [7859011591137717335, 2772908615185555060, 11283501755624150392, 5533571732986600803, 17365054503653917826, 12646065887601541794]
Text:  ['It', 'costs', '$', '1', 'million', '.']
is_alpha:  [True, True, False, False, True, False]
is_punct:  [False, False, False, False, False, True]
like_num:  [False, False, False, True, True, False]


In [4]:
# # # spaCy Labels Explained # # #
def spacy_explain(label):
    """Print the description `spacy.explain` gives for `label`.

    Args:
        label: A tag / label string such as 'DET', 'nsubj' or 'GPE'.
    """
    description = spacy.explain(label)
    print(description)


# One example each: a part-of-speech tag, a dependency label, an entity label.
for example_label in ('DET', 'nsubj', 'GPE'):
    spacy_explain(example_label)

determiner
nominal subject
Countries, cities, states


In [3]:
# # # Linguistic Features # # #
doc = analyze_text('John Smith ate a pizza yesterday!')

for token in doc:
    word = token.text
    # Part-of-speech tag of the token itself.
    print('Part of speech', word, '-->', token.pos_)
    # Syntactic relation connecting this token (the child) to its head.
    print('Dependency parser', word, '-->', token.dep_)
    # Text of the token's head, printed before the token it governs.
    print('Head Text', token.head.text, '-->', word)


# Draw the dependency tree for the same sentence.
displacy.render(doc, style="dep")

Part of speech John --> PROPN
Dependency parser John --> compound
Head Text Smith --> John
Part of speech Smith --> PROPN
Dependency parser Smith --> nsubj
Head Text ate --> Smith
Part of speech ate --> VERB
Dependency parser ate --> ROOT
Head Text ate --> ate
Part of speech a --> DET
Dependency parser a --> det
Head Text pizza --> a
Part of speech pizza --> NOUN
Dependency parser pizza --> dobj
Head Text ate --> pizza
Part of speech yesterday --> NOUN
Dependency parser yesterday --> npadvmod
Head Text ate --> yesterday
Part of speech ! --> PUNCT
Dependency parser ! --> punct
Head Text ate --> !


In [6]:
# Look up 'npadvmod', the dependency label printed for "yesterday" in the previous cell's output.
spacy_explain('npadvmod')

noun phrase as adverbial modifier


In [7]:
# # # Predicting Named Entities # # #

doc = analyze_text('Apple is looking at buying U.K. startup for $1 Billion')

# Collect (text, label) for every predicted entity, then print one per line.
entity_rows = [(entity.text, entity.label_) for entity in doc.ents]
for entity_text, entity_label in entity_rows:
    print(entity_text, '-->', entity_label)


# Highlight the same entities inline in the sentence.
displacy.render(doc, style="ent")

Apple --> ORG
U.K. --> GPE
$1 Billion --> MONEY


In [8]:
# Look up 'GPE', the entity label printed for "U.K." in the previous cell's output.
spacy_explain('GPE')

Countries, cities, states


In [9]:
# # # LEMMA attribute # # #
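# Lemmatization reduces inflected forms to their dictionary root, e.g. "winning" and "wins" -> "win"
# (ideally "won" as well, although the run below leaves "won" unchanged).
# Stemming, by contrast, only strips suffixes: "winning" -> "winn", which is not an English word!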

def lemmatisation():
    """Print the tokens of a fixed sample string, then their lemmas."""
    tokens = list(analyze_text('winning won wins has'))
    surface_forms = [token.text for token in tokens]
    lemmas = [token.lemma_ for token in tokens]
    print("Text -->", surface_forms)
    print("Lemma -->", lemmas)


lemmatisation()

Text --> ['winning', 'won', 'wins', 'has']
Lemma --> ['win', 'won', 'win', 'have']


In [13]:
# # # Rule-based Token Matcher # # #
# Token-pattern Matcher built on the pipeline's vocabulary; its patterns are
# registered by add_matchers() in a later cell.
matcher = Matcher(nlp.vocab)

In [10]:
def sportsPatterns(sport):
    """Build a Matcher token pattern for "<digits> [<sport>] world cup <punct>".

    Args:
        sport: Name of the sport (e.g. 'fifa'). Any casing is accepted; the
            value is lower-cased because the 'LOWER' attribute is compared
            against the lower-cased token text, so a mixed-case value would
            never match.

    Returns:
        A list of five token specs: a digit token, the optional sport token,
        'world', 'cup', and a punctuation token.
    """
    sport_token = str(sport).lower()
    return [
        {'IS_DIGIT': True},
        {'LOWER': sport_token, 'OP': '?'},  # '?' makes the sport name optional
        {'LOWER': 'world'},
        {'LOWER': 'cup'},
        {'IS_PUNCT': True},
    ]


def emotionPatterns(emotion):
    """Build a one-token Matcher pattern for a verb with the given lemma.

    Args:
        emotion: Lemma to match (e.g. 'love'); it is formatted into a string.

    Returns:
        A single-element list whose token spec requires LEMMA == emotion and
        POS == 'VERB'.
    """
    verb_spec = dict(LEMMA=f'{emotion}', POS='VERB')
    return [verb_spec]


def gadgetPatterns(gadget, extensionName):
    """Build a Matcher token pattern for "<gadget> [<extensionName>]".

    Args:
        gadget: Product name token (e.g. 'iphone').
        extensionName: Optional follow-up token (e.g. 'x').

    Both values are lower-cased because the 'LOWER' attribute is compared
    against the lower-cased token text; a mixed-case value would never match.

    Returns:
        A two-element list: the gadget token spec followed by the optional
        extension token spec.
    """
    gadget_token = str(gadget).lower()
    extension_token = str(extensionName).lower()
    return [
        {'LOWER': gadget_token},
        {'LOWER': extension_token, 'OP': '?'},  # '?' makes the extension optional
    ]

In [11]:
# World-cup patterns, one per sport.
fifa, rugby = (sportsPatterns(sport=sport_name) for sport_name in ('fifa', 'rugby'))

# Verb patterns, one per emotion lemma.
love, hate = (emotionPatterns(emotion=verb) for verb in ('love', 'hate'))

# Gadget patterns: product name plus an optional extension token.
phone = gadgetPatterns('iphone', 'x')
# NOTE(review): the optional extension here is a single space, so "Mac Pro" is
# reported only as "Mac" in the recorded output -- TODO confirm whether 'pro'
# was intended.
computer = gadgetPatterns('mac', ' ')

In [15]:
def add_matchers(matcher):
    """Register the notebook's three pattern groups on `matcher`.

    Args:
        matcher: Object exposing `add(key, patterns)`; the groups are added in
            the order World_Cups, Emotion, Gadgets.

    Uses the module-level pattern lists `fifa`, `rugby`, `love`, `hate`,
    `phone` and `computer`.
    """
    rule_groups = {
        "World_Cups": [fifa, rugby],
        "Emotion": [love, hate],
        "Gadgets": [phone, computer],
    }
    for rule_name, token_patterns in rule_groups.items():
        matcher.add(rule_name, token_patterns)


add_matchers(matcher)

def showMatcher(doc):
    """Print one report per match that the module-level `matcher` finds in `doc`.

    Args:
        doc: The processed document to scan.

    Each report lists the numeric match id, its string name looked up in
    `nlp.vocab.strings`, the start/end token indices, and the matched text.
    """
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # string name for the numeric id
        span = doc[start:end]  # the matched span
        # The trailing newline + two tabs are kept so the printed output stays
        # byte-identical to the original triple-quoted template.
        report = (
            f"match_id: {match_id},\n"
            f"string_id: {string_id},\n"
            f"start: {start},\n"
            f"end: {end},\n"
            f"TEXT: {span.text}\n"
            "\t\t"
        )
        print(report)

In [16]:
def showRepresentationOfMatchers():
    """Run showMatcher over four fixed sample sentences, in order."""
    sample_sentences = (
        'Upcoming Mac Pro, has leaked the release date',
        '2018 FIFA world cup: France won!',
        'I loved dogs now I love cats more',
        'I hate tomatoes',
    )
    # Analyze every sentence first, then report matches, as the original did.
    sample_docs = [analyze_text(sentence) for sentence in sample_sentences]
    for sample_doc in sample_docs:
        showMatcher(sample_doc)


showRepresentationOfMatchers()

match_id: 14626195295482834101,
string_id: Gadgets,
start: 1,
end: 2,
TEXT: Mac
		
match_id: 12355752904639115815,
string_id: World_Cups,
start: 0,
end: 5,
TEXT: 2018 FIFA world cup:
		
match_id: 12577823007746369398,
string_id: Emotion,
start: 1,
end: 2,
TEXT: loved
		
match_id: 12577823007746369398,
string_id: Emotion,
start: 5,
end: 6,
TEXT: love
		
match_id: 12577823007746369398,
string_id: Emotion,
start: 1,
end: 2,
TEXT: hate
		


In [None]:
# # # Efficient Phrase Matcher # # #

# NOTE(review): this rebinds the global `matcher` that showMatcher() reads, so
# re-running the earlier Matcher cells after this one would use the
# PhraseMatcher instead -- consider a distinct name.
matcher = PhraseMatcher(nlp.vocab)

# NOTE(review): `pattern2` is built from the same text as `pattern`; presumably
# a different phrase was intended -- TODO confirm.
pattern = analyze_text('Golden Retriever')
pattern2 = analyze_text('Golden Retriever')
matcher.add('DOG', [pattern, pattern2])

doc = analyze_text("I have a Golden Retriever")

for _match_id, start, end in matcher(doc):
    print('Matched phrase: ', doc[start:end].text)

In [None]:
# # # Similarity # # #

def _print_similarity(left, right):
    """Print `left.similarity(right)` as a percentage rounded to two decimals."""
    print(f"{round(left.similarity(right) * 100, 2)}%")


# # 2 documents
doc1 = analyze_text('I like fast food')
doc2 = analyze_text('I love pizza')
_print_similarity(doc1, doc2)

# # 2 tokens ("pizza" vs "pasta")
doc = analyze_text('I like pizza and pasta')
_print_similarity(doc[2], doc[4])

# # document and token
doc3 = analyze_text('I love pizza')
token = analyze_text('soap')[0]
_print_similarity(doc3, token)

# # span and document
span = analyze_text('I like pizza and pasta')[2:5]
document = analyze_text("MacDonald's sells burgers")
_print_similarity(span, document)