In [22]:
Tasmia = "In the name of Allah, the Most Benoficient, the Most Merciful"
print(Tasmia)

In the name of Allah, the Most Benoficient, the Most Merciful


### Import Libraries

In [23]:
import spacy
nlp = spacy.load('en_core_web_lg')

from spacy.lang.en import English
import re
from spacy.matcher import Matcher

import nlp
import spacy
from spacy.matcher import matcher

### Basics

In [24]:
nlp = English()

doc = nlp("This is a Text.")

print(doc.text)

This is a Text.


**Print First Token**

In [25]:
nlp = English()

doc = nlp("I like tree kangaroos and narwhals.")

print(doc[0].text)

I


**Extract Percentages**

In [26]:
text = "In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are."

In [27]:
# Find Percentages
pattern = r'\d+%'

re.findall(pattern, text)

['60%', '4%']

### Statistical Models

In [28]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("He ate a pizza")

for token in doc:
    print(token.text, token.pos_)

He PRON
ate VERB
a DET
pizza NOUN


**Predicting Named Entities**

In [29]:
doc = nlp("Apple is looking to buy a UK startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
UK GPE
$1 billion MONEY


**Similarity**

In [30]:
nlp = spacy.load("en_core_web_lg")

In [31]:
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")

doc1.similarity(doc2)

0.8627203210548107

In [32]:
doc = nlp("TV and Books")

token1, token2 = doc[0], doc[2]

# Get the similarity of the tokens "TV" and "books"
similarity = token1.similarity(token2)
similarity

0.22325327

### Creating Training Data

In [33]:
matcher = Matcher(nlp.vocab)

In [34]:
text = ['How to preorder the iPhone X',
 'iPhone X is coming',
 'Should I pay $1,000 for the iPhone X?',
 'The iPhone 8 reviews are here',
 'Your iPhone goes up to 11 today',
 'I need a new phone! Any tips?']

In [41]:
# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{'LOWER': 'iphone'}, {'LOWER': 'x'}]

# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{'LOWER': 'iphone'}, {'IS_DIGIT': True, 'OP': '?'}]

# Add patterns to the matcher
matcher.add('GADGET',[pattern1, pattern1],on_match = None)

In [44]:
# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(text):
    # Find the matches in the doc
    matches = matcher(doc)
    
    # Get a list of (start, end, label) tuples of matches in the text
    entities = [(start, end, 'GADGET') for match_id, start, end in matches]
    print(doc.text, entities)    

How to preorder the iPhone X [(4, 6, 'GADGET')]
iPhone X is coming [(0, 2, 'GADGET')]
Should I pay $1,000 for the iPhone X? [(7, 9, 'GADGET')]
The iPhone 8 reviews are here []
Your iPhone goes up to 11 today []
I need a new phone! Any tips? []


In [45]:
TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(text):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, 'GADGET') for span in spans]
    
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {'entities': entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)
    
print(*TRAINING_DATA, sep='\n')    

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': []})
('Your iPhone goes up to 11 today', {'entities': []})
('I need a new phone! Any tips?', {'entities': []})
