## Spacy Implementation

In [1]:
import spacy

# Load the `en_core_web_sm` pipeline once; later cells call `model(text)` to
# process a string and inspect `model.pipe_names`.
model = spacy.load("en_core_web_sm")

In [2]:
text = "I bought a pair of watch for Tom and Elizabeth which costs $50 each."
processed = model(text)

# Header first, then one "<token text> -- <coarse POS label>" line per token.
print("text -- POS\n ---------")
pos_lines = [f"{tok.text} -- {tok.pos_}" for tok in processed]
print("\n".join(pos_lines))

text -- POS
 ---------
I -- PRON
bought -- VERB
a -- DET
pair -- NOUN
of -- ADP
watch -- NOUN
for -- ADP
Tom -- PROPN
and -- CCONJ
Elizabeth -- PROPN
which -- PRON
costs -- VERB
$ -- SYM
50 -- NUM
each -- PRON
. -- PUNCT


In [3]:
# Same listing as before, extended with `pos`: the integer counterpart of the
# string label `pos_` (the same label always maps to the same integer).
print("text -- POS -- POS hash\n ---------")
rows = (f"{tok.text} -- {tok.pos_} -- {tok.pos}" for tok in processed)
print("\n".join(rows))

text -- POS -- POS hash
 ---------
I -- PRON -- 95
bought -- VERB -- 100
a -- DET -- 90
pair -- NOUN -- 92
of -- ADP -- 85
watch -- NOUN -- 92
for -- ADP -- 85
Tom -- PROPN -- 96
and -- CCONJ -- 89
Elizabeth -- PROPN -- 96
which -- PRON -- 95
costs -- VERB -- 100
$ -- SYM -- 99
50 -- NUM -- 93
each -- PRON -- 95
. -- PUNCT -- 97


In [4]:
# NLP pipelines: names of the components currently active in the loaded pipeline.
model.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Disable the dependency parser and the named-entity recognizer.
# `select_pipes` is the spaCy v3 API; `disable_pipes` is only kept as a legacy
# alias. Called outside a `with` block, the components stay disabled until
# `.restore()` is called on the object this returns.
model.select_pipes(disable=["parser", "ner"])
model.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

## Gensim Implementation

In [1]:
import gensim
from gensim.corpora import Dictionary

# Toy corpus: two short sentences.
document = [
    "Tomorrow Tom and Elizabeth are getting married, I need to buy a gift for them.",
    "I bought a pair of watch for Tom and Elizabeth which costs $50 each.",
]

# Tokenization: plain whitespace split, so case is preserved and punctuation
# stays attached to its word (e.g. 'married,' and 'each.').
tokens = [sentence.split() for sentence in document]

# Create dictionary: assigns an integer id to every unique token.
dictionary = Dictionary(tokens)
print(dictionary)

Dictionary<23 unique tokens: ['Elizabeth', 'I', 'Tom', 'Tomorrow', 'a']...>


In [2]:
# Mapping from each token string to the integer id the dictionary assigned to it.
dictionary.token2id

{'Elizabeth': 0,
 'I': 1,
 'Tom': 2,
 'Tomorrow': 3,
 'a': 4,
 'and': 5,
 'are': 6,
 'buy': 7,
 'for': 8,
 'getting': 9,
 'gift': 10,
 'married,': 11,
 'need': 12,
 'them.': 13,
 'to': 14,
 '$50': 15,
 'bought': 16,
 'costs': 17,
 'each.': 18,
 'of': 19,
 'pair': 20,
 'watch': 21,
 'which': 22}

## Vector

In [3]:
new_document = "I hope they like my gift"
# Tokenize exactly the way the dictionary was built: whitespace split with the
# original case. Lower-casing here would turn "I" into "i", which the
# dictionary does not contain, so that token would be silently dropped.
# doc2bow returns (token_id, count) pairs for the known tokens only.
vector = dictionary.doc2bow(new_document.split())
print(vector)

[(1, 1), (10, 1)]


## Model
Let's compute the tf-idf of a document and compare it with its vector representation.

In [4]:
from gensim import models
import numpy


documents = [
    'This is first line',
    'These are second lines',
    'This is third line',
]

# Token: whitespace-split every document.
tokens = [text.split() for text in documents]

# Dictionary: token -> integer id.
dictionary = Dictionary(tokens)
print("Dictionary : \n ", dictionary.token2id)
print("\n")


# Vector: bag-of-words (token_id, count) pairs for each document.
print('Vector of each document: ')
vector = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]
for bow in vector:
    print(bow)
print("\n")


# BOW of each document, with ids translated back to their token strings.
print("Vector of each document in term of token:")
for bow in vector:
    print([[dictionary[token_id], count] for token_id, count in bow])


# tfidf model fitted on the bag-of-words corpus.
tfidf = models.TfidfModel(vector)


# Output of tfidf model: weights rounded to 3 decimals. Tokens shared between
# documents come out lower than tokens unique to a single document.
print("\n")
print("tf-idf assigned to each token:")
for weighted_doc in tfidf[vector]:
    print([[dictionary[token_id], numpy.around(weight, decimals=3)]
           for token_id, weight in weighted_doc])

Dictionary : 
  {'This': 0, 'first': 1, 'is': 2, 'line': 3, 'These': 4, 'are': 5, 'lines': 6, 'second': 7, 'third': 8}


Vector of each document: 
[(0, 1), (1, 1), (2, 1), (3, 1)]
[(4, 1), (5, 1), (6, 1), (7, 1)]
[(0, 1), (2, 1), (3, 1), (8, 1)]


Vector of each document in term of token:
[['This', 1], ['first', 1], ['is', 1], ['line', 1]]
[['These', 1], ['are', 1], ['lines', 1], ['second', 1]]
[['This', 1], ['is', 1], ['line', 1], ['third', 1]]


tf-idf assigned to each token:
[['This', 0.311], ['first', 0.843], ['is', 0.311], ['line', 0.311]]
[['These', 0.5], ['are', 0.5], ['lines', 0.5], ['second', 0.5]]
[['This', 0.311], ['is', 0.311], ['line', 0.311], ['third', 0.843]]


## NLTK

In [1]:
import nltk
from nltk.corpus import gutenberg  # corpus reader



# Download the Gutenberg corpus into the local nltk_data directory.
# The call reports its progress and evaluates to True (see the cell output).
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [2]:
# List the attributes of the `gutenberg` object.
# NOTE(review): the recorded output shows only `_LazyCorpusLoader__*` internals
# and no reader methods such as `fileids`; the object appears to still be the
# lazy-loading proxy at this point. Re-running this after the corpus has been
# accessed (e.g. after `gutenberg.fileids()`) should list the reader's methods — confirm.
dir(gutenberg)

['_LazyCorpusLoader__args',
 '_LazyCorpusLoader__kwargs',
 '_LazyCorpusLoader__load',
 '_LazyCorpusLoader__name',
 '_LazyCorpusLoader__reader_cls',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_unload',
 'subdir']

A few common methods of a corpus reader are:

- fileids() lists the names of the text documents, i.e., each book in gutenberg.

- readme() returns the contents of the corpus README file.

- encoding() gives the encoding type of a text file in the corpus.

In [3]:
# File names available in the corpus, one per book.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [10]:
import nltk

# Download necessary resources
nltk.download('punkt')

separator = "*********************"
fileid = "chesterton-brown.txt"

# Raw view: the whole book as one string; only its first 40 characters are shown.
print(separator)
raw = gutenberg.raw(fileid)
print("Raw texts:\n")
print(raw[:40])
print("\n")
print(separator)

# Token view: the same book as a sequence of word tokens; only the first 10 are shown.
words = gutenberg.words(fileid)
print("Token of words:\n")
print(words[:10])
print("\n")
print(separator)


*********************
Raw texts:

[The Wisdom of Father Brown by G. K. Che


*********************
Token of words:

['[', 'The', 'Wisdom', 'of', 'Father', 'Brown', 'by', 'G', '.', 'K']


*********************


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
from nltk.corpus import udhr
# Download the `udhr` corpus into the local nltk_data directory.
nltk.download('udhr')


# File ids: one entry per language, mostly named '<Language>-<Encoding>'
# (see the output). The full list is long.
udhr.fileids()

[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\udhr.zip.


['Abkhaz-Cyrillic+Abkh',
 'Abkhaz-UTF8',
 'Achehnese-Latin1',
 'Achuar-Shiwiar-Latin1',
 'Adja-UTF8',
 'Afaan_Oromo_Oromiffa-Latin1',
 'Afrikaans-Latin1',
 'Aguaruna-Latin1',
 'Akuapem_Twi-UTF8',
 'Albanian_Shqip-Latin1',
 'Amahuaca',
 'Amahuaca-Latin1',
 'Amarakaeri-Latin1',
 'Amuesha-Yanesha-UTF8',
 'Arabela-Latin1',
 'Arabic_Alarabia-Arabic',
 'Asante-UTF8',
 'Ashaninca-Latin1',
 'Asheninca-Latin1',
 'Asturian_Bable-Latin1',
 'Aymara-Latin1',
 'Balinese-Latin1',
 'Bambara-UTF8',
 'Baoule-UTF8',
 'Basque_Euskara-Latin1',
 'Batonu_Bariba-UTF8',
 'Belorus_Belaruski-Cyrillic',
 'Belorus_Belaruski-UTF8',
 'Bemba-Latin1',
 'Bengali-UTF8',
 'Beti-UTF8',
 'Bichelamar-Latin1',
 'Bikol_Bicolano-Latin1',
 'Bora-Latin1',
 'Bosnian_Bosanski-Cyrillic',
 'Bosnian_Bosanski-Latin2',
 'Bosnian_Bosanski-UTF8',
 'Breton-Latin1',
 'Bugisnese-Latin1',
 'Bulgarian_Balgarski-Cyrillic',
 'Bulgarian_Balgarski-UTF8',
 'Cakchiquel-Latin1',
 'Campa_Pajonalino-Latin1',
 'Candoshi-Shapra-Latin1',
 'Caquinte-Latin

In [12]:
# Raw contents of the 'Nepali-UTF8' file as one string (the whole text is displayed).
udhr.raw('Nepali-UTF8')

'                     मानव अधिकारको विश्वव्यापी घोषणा \n\nप्रस्तावना \n       मानव परिवारका सबै सदस्यहरूको अन्तर्निहित मान तथा सम्मान र अवछिन्न अधिकारहरूको मान्यता नै स्वतन्त्रता, न्याय, र शान्तिको आधार भएकोले,\n \n       मानव अधिकारहरू प्रति अवहेलना तथा अनादरको परिणामबाटै नै काम भड मानव जातिको अन्त स्करणमा चिट पुर्\u200dयाइएको हुनाले र मानवहरूले धर्म र वाक स्वन्त्रता तथा भए र अभावबाट मुक्ति पाउनु पर्छ भन्ने सर्व साधारण जनताको घोषित  आकांक्षा भएकोले,\n\nअत्याचार र दमनको विरुद्ध अरू उपाय नपाएर विद्रोह गर्नू नै अन्तिम उपाय हो भन्ने नउठानुन  हो भने मानव अधिकारहरू कानुनी शासनद्वारा संरक्षित रहनु अति आवश्यक भएकोले, \n\n      राष्ट्रहरूका बीच मैत्री सम्बन्ध वृद्धि गर्न आवश्यक भएकोले \n\nसंयुक्त राष्ट्र संघका जनता हरूले मानवका मौलिक अधिकारहरू र मनुष्यको मान तथा कदर र नर -नारीहरूको सम्मान अधिकारहरू प्रति पुनः विश्वासको पुस्ट्याइँ अधिकार पत्रमा गरि बढि स्वतन्त्रताको आधारमा सामाजिक प्रगति एवं जीवनको\n\n      सदस्य राष्ट्रहरू र ती राष्ट्रहरूका अधिकारमा रहेका प्रादेशिक जनताहरूमा समेत ती अधिकारहरू 

In [13]:
from nltk.corpus import stopwords


# Download the stopwords corpus into the local nltk_data directory.
nltk.download('stopwords')

# Note: `stopwords` (the imported name) is the corpus reader object, while
# 'stopwords' (the string passed to download) is the name of the corpus.

# Stop-word list for English.
stop_words = stopwords.words('english')

# Print the whole list; every entry is lowercase (see the output).
print(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data]   Unzipping corpora\stopwords.zip.


### Tokenization

In [19]:
from nltk import word_tokenize
# Tokenizer data fetched before calling word_tokenize.
nltk.download('punkt_tab')

text = "Tomorrow Tom and Elizabeth are getting married, I need to buy a gift for them."

# Tokenize into words; punctuation such as ',' and '.' becomes its own token
# (see the output), unlike a plain whitespace split.
tokens = word_tokenize(text)
print(tokens)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


['Tomorrow', 'Tom', 'and', 'Elizabeth', 'are', 'getting', 'married', ',', 'I', 'need', 'to', 'buy', 'a', 'gift', 'for', 'them', '.']


In [20]:
from nltk import sent_tokenize

# Split the text into sentences; each sentence keeps its closing punctuation.
text = "On my God! I almost fall down."
tokens = sent_tokenize(text)
print(tokens)

['On my God!', 'I almost fall down.']


In [21]:
from nltk.tokenize import RegexpTokenizer

text = "I bought a pair of watch for Tom and Elizabeth which costs $50 each."
# Keep only runs of ASCII letters and digits as tokens; everything else
# (spaces, '$', '.') acts as a separator, so "$50" becomes "50".
tokenizer = RegexpTokenizer(r'[A-Za-z0-9]+')
tokens = tokenizer.tokenize(text)
print(tokens)

['I', 'bought', 'a', 'pair', 'of', 'watch', 'for', 'Tom', 'and', 'Elizabeth', 'which', 'costs', '50', 'each']


### Frequency distribution

In [22]:
# Word tokens of one play from the Gutenberg corpus.
words = gutenberg.words('shakespeare-caesar.txt')
# Count lower-cased tokens, skipping any token that is not purely alphabetic
# (punctuation, numbers).
freq_dist = nltk.FreqDist(word.lower() for word in words if word.isalpha())
# The ten most frequent words with their counts.
freq_dist.most_common(10)

[('and', 627),
 ('the', 579),
 ('i', 533),
 ('to', 446),
 ('you', 391),
 ('of', 354),
 ('that', 289),
 ('a', 267),
 ('not', 257),
 ('is', 253)]

### Encoding

In [23]:
# Unicode: one character written as the escape for code point U+0915.
unicode = '\u0915'
print("Unicode:", unicode)

# Round trip: str -> UTF-8 bytes -> str. The single character encodes to
# three bytes, and decoding them gives the original character back.
encoded = unicode.encode('utf8')
decoded = encoded.decode('utf8')
print("After encoding:", encoded)
print("After decoding:", decoded)

Unicode: क
After encoding: b'\xe0\xa4\x95'
After decoding: क
