<a href="https://colab.research.google.com/github/isnanmulia/colab-machinelearning/blob/main/ML_NLP_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial uses code from the following source, with several adjustments:
- https://realpython.com/nltk-nlp-python/

In [64]:
# Import libraries
import nltk

In [65]:
# Sentence examples
# Three English samples and one Indonesian sample, reused by the later cells.
# Adjacent string literals are concatenated by Python, so each value is still
# a single string with no embedded line breaks.
sentence_1 = (
    'Natural Language Processing allows your device to hear what you say, '
    'then understand the hidden meaning in your sentence, '
    'and finally act on that meaning.'
)
sentence_2 = (
    'Classification sorts data into specific categories using a labeled dataset. '
    'Clustering is partitioning an unlabeled dataset into groups of similar objects.'
)
sentence_3 = (
    'Erzurum is home to many different types of local cuisine, '
    'most famously, the Cag Kebab.'
)
kalimat_1 = (
    'Kesatuan Integrated System adalah SIstem informasi Akademik bagi mahasiswa, '
    'dosen dan unit administrasi perkuliahan, baik dari sisi pembayaran, '
    'jadwal mata kuliah, dan pengumuman-pengumuman.'
)

In [66]:
# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')  # Punkt models used by the tokenizers below

# Sentence-level split; sentence_2 is the sample that holds two sentences.
sent_tkn = sent_tokenize(sentence_2)

# Word-level split of every sample, in the same order as they were defined.
word_tkn_1, word_tkn_2, word_tkn_3, kata_tkn_1 = (
    word_tokenize(text)
    for text in (sentence_1, sentence_2, sentence_3, kalimat_1)
)

# One result per line, with a '---' line between consecutive results.
print(sent_tkn, word_tkn_1, word_tkn_2, word_tkn_3, kata_tkn_1, sep='\n---\n')

['Classification sorts data into specific categories using a labeled dataset.', 'Clustering is partitioning an unlabeled dataset into groups of similar objects.']
---
['Natural', 'Language', 'Processing', 'allows', 'your', 'device', 'to', 'hear', 'what', 'you', 'say', ',', 'then', 'understand', 'the', 'hidden', 'meaning', 'in', 'your', 'sentence', ',', 'and', 'finally', 'act', 'on', 'that', 'meaning', '.']
---
['Classification', 'sorts', 'data', 'into', 'specific', 'categories', 'using', 'a', 'labeled', 'dataset', '.', 'Clustering', 'is', 'partitioning', 'an', 'unlabeled', 'dataset', 'into', 'groups', 'of', 'similar', 'objects', '.']
---
['Erzurum', 'is', 'home', 'to', 'many', 'different', 'types', 'of', 'local', 'cuisine', ',', 'most', 'famously', ',', 'the', 'Cag', 'Kebab', '.']
---
['Kesatuan', 'Integrated', 'System', 'adalah', 'SIstem', 'informasi', 'Akademik', 'bagi', 'mahasiswa', ',', 'dosen', 'dan', 'unit', 'administrasi', 'perkuliahan', ',', 'baik', 'dari', 'sisi', 'pembayaran'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
# Filtering stop words
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words_en = set(stopwords.words('english'))
stop_words_id = set(stopwords.words('indonesian'))

# NLTK's stop-word lists are lower-case, so each token is case-folded before
# the membership test; otherwise a capitalised stop word (for example a
# sentence-initial "The" or "Dan") would slip through the filter.
# The tokens that are kept retain their original casing.
filtered_list_1 = [word for word in word_tkn_1 if word.casefold() not in stop_words_en]
filtered_list_2 = [word for word in word_tkn_2 if word.casefold() not in stop_words_en]
filtered_list_id_1 = [word for word in kata_tkn_1 if word.casefold() not in stop_words_id]

print(filtered_list_1)
print('---')
print(filtered_list_2)
print('---')
print(filtered_list_id_1)

['Natural', 'Language', 'Processing', 'allows', 'device', 'hear', 'say', ',', 'understand', 'hidden', 'meaning', 'sentence', ',', 'finally', 'act', 'meaning', '.']
---
['Classification', 'sorts', 'data', 'specific', 'categories', 'using', 'labeled', 'dataset', '.', 'Clustering', 'partitioning', 'unlabeled', 'dataset', 'groups', 'similar', 'objects', '.']
---
['Kesatuan', 'Integrated', 'System', 'SIstem', 'informasi', 'Akademik', 'mahasiswa', ',', 'dosen', 'unit', 'administrasi', 'perkuliahan', ',', 'sisi', 'pembayaran', ',', 'jadwal', 'mata', 'kuliah', ',', 'pengumuman-pengumuman', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
# Stemming & Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus. Nothing else in this
# notebook downloads it, so without this call the cell raises LookupError when
# run on a fresh kernel (Restart & Run All).
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming chops suffixes heuristically and may yield non-words.
stemmed_words = [stemmer.stem(word) for word in word_tkn_2]
# lemmatize() is called without a POS argument, so every token is treated as a
# noun; verb forms such as 'using' therefore come back unchanged.
lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tkn_2]

print(stemmed_words)
print(lemmatized_words)

['classif', 'sort', 'data', 'into', 'specif', 'categori', 'use', 'a', 'label', 'dataset', '.', 'cluster', 'is', 'partit', 'an', 'unlabel', 'dataset', 'into', 'group', 'of', 'similar', 'object', '.']
['Classification', 'sort', 'data', 'into', 'specific', 'category', 'using', 'a', 'labeled', 'dataset', '.', 'Clustering', 'is', 'partitioning', 'an', 'unlabeled', 'dataset', 'into', 'group', 'of', 'similar', 'object', '.']


In [72]:
# POS tagging
# nltk.pos_tag needs the averaged-perceptron tagger model. The original cell
# only fetched 'tagsets', so tagging failed with LookupError on a fresh kernel.
# NOTE(review): recent NLTK releases may look for
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('averaged_perceptron_tagger')
# 'tagsets' is only the tag documentation used by nltk.help.upenn_tagset().
nltk.download('tagsets')

pos_tagged_word_1 = nltk.pos_tag(word_tkn_1)
pos_tagged_word_3 = nltk.pos_tag(word_tkn_3)
print(pos_tagged_word_1)
print(pos_tagged_word_3)
# nltk.help.upenn_tagset()

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('allows', 'VBZ'), ('your', 'PRP$'), ('device', 'NN'), ('to', 'TO'), ('hear', 'VB'), ('what', 'WP'), ('you', 'PRP'), ('say', 'VBP'), (',', ','), ('then', 'RB'), ('understand', 'VB'), ('the', 'DT'), ('hidden', 'JJ'), ('meaning', 'NN'), ('in', 'IN'), ('your', 'PRP$'), ('sentence', 'NN'), (',', ','), ('and', 'CC'), ('finally', 'RB'), ('act', 'VB'), ('on', 'IN'), ('that', 'DT'), ('meaning', 'NN'), ('.', '.')]
[('Erzurum', 'NNP'), ('is', 'VBZ'), ('home', 'VBN'), ('to', 'TO'), ('many', 'JJ'), ('different', 'JJ'), ('types', 'NNS'), ('of', 'IN'), ('local', 'JJ'), ('cuisine', 'NN'), (',', ','), ('most', 'RBS'), ('famously', 'RB'), (',', ','), ('the', 'DT'), ('Cag', 'NNP'), ('Kebab', 'NNP'), ('.', '.')]


[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [70]:
# Named-Entity Recognition
from nltk.tree import Tree
from nltk.draw.tree import TreeView

# Fetch the chunker model and the word list before calling nltk.ne_chunk.
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

lotr_quote = 'It\'s a dangerous business, Frodo, going out your door.'

# Pipeline: raw text -> word tokens -> (word, tag) pairs -> chunk tree.
words_in_lotr_quote = word_tokenize(lotr_quote)
lotr_pos_tags = nltk.pos_tag(words_in_lotr_quote)
# binary=True marks every detected entity with the single label "NE".
tree = nltk.ne_chunk(lotr_pos_tags, binary=True)

print(lotr_pos_tags, tree, sep='\n')

[('It', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'), ('dangerous', 'JJ'), ('business', 'NN'), (',', ','), ('Frodo', 'NNP'), (',', ','), ('going', 'VBG'), ('out', 'RP'), ('your', 'PRP$'), ('door', 'NN'), ('.', '.')]
(S
  It/PRP
  's/VBZ
  a/DT
  dangerous/JJ
  business/NN
  ,/,
  (NE Frodo/NNP)
  ,/,
  going/VBG
  out/RP
  your/PRP$
  door/NN
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [71]:
# Named-Entity Recognition (Advanced)
def extract_ne(quote):
  """Return the set of named-entity strings found in ``quote``.

  The text is word-tokenized, POS-tagged and chunked with ``binary=True``.
  Every direct child of the resulting tree that is labelled "NE" is flattened
  into a single string by joining its words with spaces.
  """
  tagged_tokens = nltk.pos_tag(word_tokenize(quote, language='english'))
  chunked = nltk.ne_chunk(tagged_tokens, binary=True)

  entities = set()
  for node in chunked:
    # Plain (word, tag) tuples have no label attribute; only subtrees do.
    if not hasattr(node, "label"):
      continue
    if node.label() != "NE":
      continue
    entities.add(" ".join(leaf[0] for leaf in node))
  return entities

# Multi-paragraph sample passage passed to extract_ne below.
text_NER = """
 Men like Schiaparelli watched the red planet—it is odd, by-the-bye, that
 for countless centuries Mars has been the star of war—but failed to
 interpret the fluctuating appearances of the markings they mapped so well.
 All that time the Martians must have been getting ready.

 During the opposition of 1894 a great light was seen on the illuminated
 part of the disk, first at the Lick Observatory, then by Perrotin of Nice,
 and then by other observers. English readers heard of it first in the
 issue of Nature dated August 2."""

# Left as the last expression of the cell so the notebook displays the
# returned set of entity strings.
extract_ne(text_NER)

{'Lick Observatory', 'Mars', 'Nature', 'Perrotin', 'Schiaparelli'}