### INSTALL LIBRARY

In [None]:
!pip install spacy 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import spacy
# Display the installed spaCy version so the recorded outputs can be tied to a release.
spacy.__version__

'2.2.4'

In [None]:
# Download the small English pipeline that is loaded later via spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


### POS (PART-OF-SPEECH)


kata benda (noun), kata kerja (verb), dll.

Tags: https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/

In [None]:
import en_core_web_sm

In [None]:
# Load the English pipeline once; every later cell reuses this `nlp` object.
nlp =  spacy.load("en_core_web_sm")

In [None]:
# Show the loaded pipeline object to confirm that loading succeeded.
nlp

<spacy.lang.en.English at 0x7fefd3f607d0>

In [None]:
# Run the pipeline on a two-sentence, all-lowercase sample text.
document = nlp('i am learning natural language processing. the course is in london')

In [None]:
# Print each token followed by its coarse-grained part-of-speech tag.
for tok in document:
  print(f"{tok.text} {tok.pos_}")

i PRON
am AUX
learning VERB
natural ADJ
language NOUN
processing NOUN
. PUNCT
the DET
course NOUN
is AUX
in ADP
london PROPN


#### others

* lemma : kata dasar
* pos : part of speech
* tag : morphological information (future, past, present)
* dep : syntactic dependency
* shape : lowercase, uppercase
* alpha : if it's alphabetic (letters only)
* stop : if it's a stop word

In [None]:
# Print the main linguistic attributes of every token, one token per line:
# text, coarse POS, fine-grained tag, dependency label, shape, is_alpha, is_stop.
for tok in document:
  attributes = (tok.text, tok.pos_, tok.tag_, tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop)
  print(*attributes)

i PRON PRP nsubj x True True
am AUX VBP aux xx True True
learning VERB VBG ROOT xxxx True False
natural ADJ JJ amod xxxx True False
language NOUN NN compound xxxx True False
processing NOUN NN dobj xxxx True False
. PUNCT . punct . False False
the DET DT det xxx True True
course NOUN NN nsubj xxxx True False
is AUX VBZ ROOT xx True True
in ADP IN prep xx True True
london PROPN NNP pobj xxxx True False


In [None]:
# Print only the tokens whose coarse POS tag is VERB.
verb_tokens = (tok for tok in document if tok.pos_ == 'VERB')
for tok in verb_tokens:
  print(tok.text)

learning


In [None]:
# Print only the tokens whose coarse POS tag is PROPN (proper noun).
proper_noun_tokens = (tok for tok in document if tok.pos_ == 'PROPN')
for tok in proper_noun_tokens:
  print(tok.text)

london


### LEMMATIZATION AND STEMMING
* Lemmatization : mengubah suatu kata menjadi kata dasar dengan mengetahui konteks dari kata tersebut (stories -> story, leaves -> leaf)
* Stemming : mengubah suatu kata menjadi kata dasar tanpa mengetahui konteks dari kata tersebut seperti memotong ujung kata-kata (stories -> stori, leaves -> leav)

In [None]:
# Print each token next to its lemma (dictionary base form).
for tok in document:
  print(f"{tok.text} {tok.lemma_}")

i i
am be
learning learn
natural natural
language language
processing processing
. .
the the
course course
is be
in in
london london


In [None]:
# Lemmatize several inflected forms of "watch" and collect the lemmas in a list.
doc = nlp('watch watches watching watched watchs')
[token.lemma_ for token in doc]

['watch', 'watch', 'watch', 'watch', 'watch']

In [None]:
import nltk

In [None]:
# Create a Porter stemmer from NLTK; it is reused in the cells below.
stemmer = nltk.PorterStemmer()

In [None]:
# Stem a single word to see what the stemmer returns.
stemmer.stem('watching')

'watch'

In [None]:
# Compare spaCy's lemma with NLTK's Porter stem for every token.
for tok in document:
  stem = stemmer.stem(tok.text)
  print(tok.text, tok.lemma_, stem)

i i i
am be am
learning learn learn
natural natural natur
language language languag
processing processing process
. . .
the the the
course course cours
is be is
in in in
london london london


### NAMED-ENTITY RECOGNITION (NER)
- List of tags: https://towardsdatascience.com/named-entity-recognition-ner-using-spacy-nlp-part-4-28da2ece57c6

In [None]:
# Sample text containing an organisation, places, a date and a money amount.
text = 'IBM is a US company on information technology. it is located in San Fransisko and revenue in 2018 was approximately 320 billion dollars'

In [None]:
# Re-bind `document` to the parsed sample text (replaces the earlier document).
document = nlp(text)

In [None]:
# Print every named entity found in the document together with its label.
for ent in document.ents:
  print(f"{ent.text} {ent.label_}")

IBM ORG
US GPE
San Fransisko GPE
2018 DATE
approximately 320 billion dollars MONEY


In [None]:
from spacy import displacy
# Render the document with its entities highlighted inline in the notebook.
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
# Second sample text: a person, a place, an ISO-formatted date and an organisation.
text = 'Bill Gates was born in Seattle on 1955-10-28 and is the founder of Microsoft'

In [None]:
# Parse the new text, then print each named entity with its label.
document = nlp(text)
for ent in document.ents:
  print(f"{ent.text} {ent.label_}")

Bill Gates PERSON
Seattle GPE
1955-10-28 DATE
Microsoft ORG


In [None]:
# Render the entities of the current document inline in the notebook.
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
# Print only the entities labelled PERSON.
person_entities = (ent for ent in document.ents if ent.label_ == 'PERSON')
for ent in person_entities:
  print(ent.text)

Bill Gates


### STOP WORDS

kata yang sering muncul dan tidak memberikan informasi apa pun dalam memahami konteks kalimat

In [None]:
# Import spaCy's English stop-word set and print all of it.
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'almost', 'both', 'last', 'next', 'six', 'fifty', 'further', 'under', 'anyone', 'than', 'again', 'whoever', 'something', 'out', 'part', "n't", 'anything', "'m", 'rather', 'whether', 'sometime', 'i', 'unless', 'all', 'over', 'thru', 'will', "'ve", 'themselves', 'hereby', 'whereby', 'several', 'else', 'with', 'you', 'show', 'two', '‘m', 'yet', 'our', '‘ll', 'nobody', 'myself', 'no', 'for', 'if', 'already', 'been', 'less', 'three', 'more', 'my', 'thereupon', 'their', 'never', 'when', 'just', 'become', 'may', 'other', 'latterly', 'often', 'ten', 'elsewhere', 'below', 'it', 'quite', 'though', 'ca', 'really', 'must', 'onto', 'which', 'done', 'he', 'as', 'such', 'across', 'once', 'perhaps', 'mostly', 'was', 'alone', 'everyone', 'whenever', 'are', 'or', 'whom', 'name', 'during', '’re', 'back', 'whence', 'everywhere', 'hereupon', 'being', 'former', 'sixty', 'nine', 'against', 'top', 'and', 'also', 'say', 'she', 'whatever', 'would', 'anywhere', 'herein', 'the', '‘ve', 'meanwhile', 'they', 'very

In [None]:
# Import spaCy's Indonesian stop-word set and print all of it.
# NOTE: this rebinds the name STOP_WORDS, so the English set imported earlier
# is no longer reachable under that name after this cell runs.
from spacy.lang.id.stop_words import STOP_WORDS
print(STOP_WORDS)

{'luar', 'jumlahnya', 'terjadi', 'entah', 'bertutur', 'memberikan', 'sekali', 'rasanya', 'satu', 'tersebutlah', 'tertuju', 'semasih', 'jelas', 'sini', 'menyangkut', 'seperti', 'balik', 'perlunya', 'masalahnya', 'dan', 'persoalan', 'meyakini', 'terdahulu', 'sebagian', 'berlalu', 'sekalian', 'waktu', 'jauh', 'ditambahkan', 'diperkirakan', 'inginkan', 'soalnya', 'tutur', 'terdiri', 'sebaik', 'sedemikian', 'sambil', 'beri', 'hendaknya', 'sedangkan', 'katakanlah', 'diakhirinya', 'dimintai', 'kalaulah', 'atau', 'masa', 'kalau', 'menginginkan', 'dirinya', 'paling', 'dari', 'setiap', 'mulai', 'inilah', 'ke', 'kira', 'pantas', 'mungkinkah', 'karenanya', 'dong', 'sebaik-baiknya', 'tinggi', 'hari', 'bermacam', 'berakhirlah', 'berlangsung', 'dituturkan', 'semuanya', 'kelamaan', 'akankah', 'mengatakannya', 'suatu', 'agak', 'siap', 'dikarenakan', 'bermacam-macam', 'tandas', 'terus', 'memihak', 'seusai', 'ibu', 'apakah', 'berikutnya', 'memang', 'dimulai', 'usah', 'berujar', 'pihak', 'tegasnya', 'bisa

In [None]:
# Membership test against the most recently imported STOP_WORDS (the Indonesian set).
'dan' in STOP_WORDS

True

In [None]:
# Number of entries in the most recently imported STOP_WORDS (the Indonesian set).
len(STOP_WORDS)

757

In [None]:
# Ask the English pipeline's vocabulary whether "it" is flagged as a stop word.
nlp.vocab['it'].is_stop

True

In [None]:
# Re-parse the first sample sentence for the stop-word filtering cells below.
document = nlp('i am learning natural language processing. the course is in london')

In [None]:
# Print the tokens that are stop words.
# Token.is_stop reads the stop-word flag directly from the token, so there is
# no need to look the text up again through nlp.vocab.
for token in document:
  if token.is_stop:
    print(token.text)

i
am
the
is
in


In [None]:
# Print the tokens that are NOT stop words.
# Token.is_stop reads the stop-word flag directly from the token, so there is
# no need to look the text up again through nlp.vocab.
for token in document:
  if not token.is_stop:
    print(token.text)

learning
natural
language
processing
.
course
london


### DEPENDENCY PARSING

relasi parent-child

#### Example 1

In [None]:
# Parse a short booking request used to explore the dependency tree.
document = nlp('book a ticket from London to Paris')

In [None]:
# Pick the two city tokens by position: index 4 is "London", index 6 is "Paris".
# These indices are hard-coded for this exact sentence.
origin = document[4]
destiny = document[6]
print(origin, destiny)

London Paris


In [None]:
# Print only the entities labelled GPE (countries, cities, states).
gpe_entities = (ent for ent in document.ents if ent.label_ == 'GPE')
for ent in gpe_entities:
  print(ent.text)

London
Paris


In [None]:
# List the ancestors of "London" in the dependency tree.
list(origin.ancestors)

[from, ticket, book]

In [None]:
# List the ancestors of "Paris" in the dependency tree.
list(destiny.ancestors)

[to, ticket, book]

In [None]:
# Check whether token 0 ("book") is an ancestor of token 2 ("ticket").
document[0].is_ancestor(document[2])

True

#### Example 2

In [None]:
# Parse a request that contains two tasks, each with its own location.
document = nlp("Book a table for the restaurant and a taxi to the hotel")

In [None]:
# Select the task and location tokens by position (indices are hard-coded for
# this exact sentence): 2 = "table", 8 = "taxi", 5 = "restaurant", 11 = "hotel".
tasks = document[2], document[8]
locations =  document[5], document[11]
print(tasks, locations)

(table, taxi) (restaurant, hotel)


In [None]:
# For each location token, print a header line followed by all of its
# ancestors in the dependency tree.
for location in locations:
  print('-----', location)
  for ancestor in location.ancestors:
    print(ancestor)

----- restaurant
for
table
Book
----- hotel
to
Book


In [None]:
# Draw the dependency tree inline; 'distance' is passed as a displaCy rendering option.
displacy.render(document, style = 'dep', jupyter=True, options={'distance': 90})

### WORD SIMILARITY

In [None]:
# Build one single-word document per word so their similarity can be compared.
# NOTE(review): the truncated warnings in the recorded outputs below look like
# spaCy's "no word vectors loaded" warning for small models; if so, a model
# with vectors (e.g. en_core_web_md) would give more meaningful scores — TODO confirm.
w1 = nlp('hi')
w2 = nlp('hello')
w3 = nlp('hey')
w4 = nlp('stop')

In [None]:
# Similarity between "hi" and "hello".
w1.similarity(w2)

  "__main__", mod_spec)


0.718274374933058

In [None]:
# Similarity between "hi" and "hey".
w1.similarity(w3)

  "__main__", mod_spec)


0.7394632387600851

In [None]:
# Similarity between "hi" and "stop".
w1.similarity(w4)

  "__main__", mod_spec)


0.32841440025588475

In [None]:
# Similarity between "hello" and "stop".
w2.similarity(w4)

  "__main__", mod_spec)


0.3951331075633348