### Import spacy

In [2]:
# import sapcy
import spacy

In [5]:
# create a language object
nlp = spacy.blank("en")

# create a document
doc = nlp("Dr. Strange loves samosa chat of karachi as it costs only 1$ per plate.")

for token in doc:
    print(token)


Dr.
Strange
loves
samosa
chat
of
karachi
as
it
costs
only
1
$
per
plate
.


In [8]:
# Accessing individual tokens
doc[0]


Dr.

In [10]:

# Accessing slices
doc[0:4]


Dr. Strange loves samosa

In [12]:

# Accessing token attributes
doc[0].text

'Dr.'

In [15]:
# Another Sentence with prefinx and suffic
# to avoid the signle quote issue in jupyter notebook \ is used

# doc2 = nlp('"Let\'s go to N.Y.!"')
# Or use tripple quotes

doc2 = nlp('''"Let's go to N.Y.!"''')

for token in doc2:
    print(token)


"
Let
's
go
to
N.Y.
!
"


In [16]:
type(doc2)

spacy.tokens.doc.Doc

In [17]:
type(nlp)

spacy.lang.en.English

In [22]:
# To get slice of tokens
doc[1:5]

# or use the text attribute
doc[1:5].text

'Strange loves samosa chat'

In [24]:
# 
doc3 = nlp("Tony gave two $ to Steve.")
toekn0 = doc3[0]
toekn0 

Tony

In [27]:
token2 = doc3[2]
token2.is_alpha

True

In [31]:
token2.text

'two'

In [33]:
# tokanizer of spacy can also detect like_num as it is a number.
token2.like_num

True

In [35]:
# can detect currency symbol as well.
token3 = doc3[3]
token3.is_currency

True

In [38]:
for token in doc3:
    print(token, "==>","is_alpha:", token.is_alpha, ", is_stop:", token.is_stop, ", is_punct:", token.is_punct, ", like_num:", token.like_num, ", is_currency:", token.is_currency)
                 

Tony ==> is_alpha: True , is_stop: False , is_punct: False , like_num: False , is_currency: False
gave ==> is_alpha: True , is_stop: False , is_punct: False , like_num: False , is_currency: False
two ==> is_alpha: True , is_stop: True , is_punct: False , like_num: True , is_currency: False
$ ==> is_alpha: False , is_stop: False , is_punct: False , like_num: False , is_currency: True
to ==> is_alpha: True , is_stop: True , is_punct: False , like_num: False , is_currency: False
Steve ==> is_alpha: True , is_stop: False , is_punct: False , like_num: False , is_currency: False
. ==> is_alpha: False , is_stop: False , is_punct: True , like_num: False , is_currency: False


In [54]:
# loading students data from text file
with open("students.txt") as f:
    text = f.readlines()
text


['LIttle_Angel high school, 8th grade students information\n',
 '\n',
 'Name\tBirthday   \temail\n',
 '-----\t------------\t------\n',
 'Abdullah   5 June, 1882    abdullah@kohli.com\n',
 'Ahmad\t12 April, 2001  ahmad@sharapova.com\n',
 'Majid  24 June, 1998   majid@williams.com \n',
 'Fardan      1 May, 1997    fardan@root.com']

In [55]:
text = " ".join(text)
text



In [57]:
doc = nlp(text)
emails = []
for token in doc:
    if token.like_email:
        emails.append(token.text)
emails


['abdullah@kohli.com',
 'ahmad@sharapova.com',
 'majid@williams.com',
 'fardan@root.com']

### work with urdu language

In [63]:
# Load the spaCy model for urdu
nlp = spacy.blank("ur") 

# Create a document
doc5 = nlp("مجھے 1,000 روپے ملے، لیکن خرچ بھی اتنے ہی ہو گئے۔")
for token in doc5:
    print(token)

مجھے
1,000
روپے
ملے
،
لیکن
خرچ
بھی
اتنے
ہی
ہو
گئے
۔


In [68]:
for token in doc5:
    print("Token", token, "currency", token.is_currency, "number", token.like_num)

Token مجھے currency False number False
Token 1,000 currency False number True
Token روپے currency False number False
Token ملے currency False number False
Token ، currency False number False
Token لیکن currency False number False
Token خرچ currency False number False
Token بھی currency False number False
Token اتنے currency False number False
Token ہی currency False number False
Token ہو currency False number False
Token گئے currency False number False
Token ۔ currency False number False


## Customizing Tokenizer

In [71]:
doc = nlp("gimme double cheese burger and 1.5L coke")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'burger', 'and', '1.5L', 'coke']

In [73]:
# Customizing the tokenizer
# to split gimme into gim and me
from spacy.symbols import ORTH
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

doc = nlp("gimme double cheese burger and 1.5L coke")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'burger', 'and', '1.5L', 'coke']

## Tokanization or Segmentation of sentences

In [74]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

    

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [75]:
nlp.pipeline

# there is nothing in the pipeline as we have loaded a blank model
# because there is an error abvoe.

[]

In [76]:
#add a sentencizer to the pipeline
nlp.add_pipe("sentencizer")
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x20433759850>)]

In [78]:
doc = nlp("Dr. Strange loves pav bhaji of Karachi. Hulk loves chat of Islamabad")
for sentence in doc.sents:
    print(sentence) 

Dr.
Strange loves pav bhaji of Karachi.
Hulk loves chat of Islamabad


## Exercise

(1) Think stats is a free book to study statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf)

This book has references to many websites from where you can download free datasets. You are an NLP engineer working for some company and you want to collect all dataset websites from this book. To keep exercise simple you are given a paragraph from this book and you want to grab all urls from this paragraph using spacy