## Spacy Implementation

In [1]:
import spacy

# Load the `en_core_web_sm` pipeline package; the resulting `model` object is
# called directly on raw text in the cells below.
model = spacy.load("en_core_web_sm")

In [2]:
# Run the loaded pipeline on a sample sentence; iterating over `processed`
# yields one token at a time.
text = "I bought a pair of watch for Tom and Elizabeth which costs $50 each."
processed = model(text)


# Print a header, then one "<token text> -- <POS label>" row per token.
# `token.pos_` (trailing underscore) is the string form of the tag, e.g. PRON.
print("text -- POS\n ---------")
for token in processed:
    print(f"{token.text} -- {token.pos_}")

text -- POS
 ---------
I -- PRON
bought -- VERB
a -- DET
pair -- NOUN
of -- ADP
watch -- NOUN
for -- ADP
Tom -- PROPN
and -- CCONJ
Elizabeth -- PROPN
which -- PRON
costs -- VERB
$ -- SYM
50 -- NUM
each -- PRON
. -- PUNCT


In [3]:
# Same table as before, with the integer form of the tag (`token.pos`, no
# underscore) added next to its string form (`token.pos_`).
# NOTE(review): the header calls the integer a "hash", but the values printed
# are small integers (e.g. 95, 100) that look like enum/symbol IDs rather than
# hash values — confirm against the spaCy Token attribute docs before relying
# on the label.
print("text -- POS -- POS hash\n ---------")
for token in processed:
    print(f"{token.text} -- {token.pos_} -- {token.pos}")

text -- POS -- POS hash
 ---------
I -- PRON -- 95
bought -- VERB -- 100
a -- DET -- 90
pair -- NOUN -- 92
of -- ADP -- 85
watch -- NOUN -- 92
for -- ADP -- 85
Tom -- PROPN -- 96
and -- CCONJ -- 89
Elizabeth -- PROPN -- 96
which -- PRON -- 95
costs -- VERB -- 100
$ -- SYM -- 99
50 -- NUM -- 93
each -- PRON -- 95
. -- PUNCT -- 97


In [4]:
# List the names of the components in the loaded pipeline, in order
# (displayed as the cell's output).
model.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Turn off the dependency parser and the named-entity recognizer, which are not
# needed for POS tagging. `select_pipes` is the spaCy v3 replacement for the
# deprecated `disable_pipes`. The returned handle is kept so the components can
# be re-enabled later with `disabled_pipes.restore()`.
disabled_pipes = model.select_pipes(disable=["parser", "ner"])

# Show the components that remain active.
model.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

## Gensim Implementation

In [1]:
import gensim
from gensim.corpora import Dictionary

# Two-sentence toy corpus used to build the vocabulary.
document = [
    "Tomorrow Tom and Elizabeth are getting married, I need to buy a gift for them.",
    "I bought a pair of watch for Tom and Elizabeth which costs $50 each.",
]

# Whitespace tokenization: one list of tokens per sentence. Punctuation stays
# attached to the neighbouring word and case is preserved.
tokens = [sentence.split() for sentence in document]

# Assign an integer id to every distinct token and show a summary.
dictionary = Dictionary(tokens)
print(dictionary)

Dictionary<23 unique tokens: ['Elizabeth', 'I', 'Tom', 'Tomorrow', 'a']...>


In [2]:
# Display the token -> integer id mapping held by the dictionary.
dictionary.token2id

{'Elizabeth': 0,
 'I': 1,
 'Tom': 2,
 'Tomorrow': 3,
 'a': 4,
 'and': 5,
 'are': 6,
 'buy': 7,
 'for': 8,
 'getting': 9,
 'gift': 10,
 'married,': 11,
 'need': 12,
 'them.': 13,
 'to': 14,
 '$50': 15,
 'bought': 16,
 'costs': 17,
 'each.': 18,
 'of': 19,
 'pair': 20,
 'watch': 21,
 'which': 22}

## Vector

In [3]:
# Convert an unseen sentence into a bag-of-words vector of (token id, count)
# pairs using the existing dictionary; tokens the dictionary does not contain
# are left out of the result.
# NOTE(review): the query is lower-cased here, but the dictionary was built from
# case-sensitive tokens (it has an entry for 'I', not 'i'), so only 'gift' is
# matched — confirm whether the lower() call is intended.
new_document = "I hope they like my gift"
vector = dictionary.doc2bow(new_document.lower().split())
print(vector)

[(10, 1)]


## Model
Let's compute the tf-idf of a document and compare it with its vector representation.

In [4]:
from gensim import models
import numpy


# Three-sentence toy corpus for the tf-idf demonstration.
documents = ['This is first line',
            'These are second lines',
            'This is third line']

# Token: whitespace-split each sentence into a list of tokens.
tokens = [text.split() for text in documents]

# Dictionary: map every distinct token to an integer id.
dictionary = Dictionary(tokens)
print("Dictionary : \n ", dictionary.token2id)
print("\n")


# Vector: bag-of-words form of each document, as (token id, count) pairs.
print('Vector of each document: ')
vector = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]
for bow in vector:
    print(bow)
print("\n")


# BOW of each document, with token ids translated back to the token text.
# Loop names avoid shadowing the builtin `id` and avoid overwriting the
# notebook-level `document` list defined in an earlier cell.
print("Vector of each document in term of token:")
for bow in vector:
    print([[dictionary[token_id], count] for token_id, count in bow])



# tfidf model fitted on the bag-of-words corpus.
tfidf = models.TfidfModel(vector)


# Output of tfidf model: each token paired with its tf-idf weight (3 decimals).
print("\n")
print("tf-idf assigned to each token:")
for weighted_doc in tfidf[vector]:
    print([[dictionary[token_id], numpy.around(weight, decimals=3)] for token_id, weight in weighted_doc])

Dictionary : 
  {'This': 0, 'first': 1, 'is': 2, 'line': 3, 'These': 4, 'are': 5, 'lines': 6, 'second': 7, 'third': 8}


Vector of each document: 
[(0, 1), (1, 1), (2, 1), (3, 1)]
[(4, 1), (5, 1), (6, 1), (7, 1)]
[(0, 1), (2, 1), (3, 1), (8, 1)]


Vector of each document in term of token:
[['This', 1], ['first', 1], ['is', 1], ['line', 1]]
[['These', 1], ['are', 1], ['lines', 1], ['second', 1]]
[['This', 1], ['is', 1], ['line', 1], ['third', 1]]


tf-idf assigned to each token:
[['This', 0.311], ['first', 0.843], ['is', 0.311], ['line', 0.311]]
[['These', 0.5], ['are', 0.5], ['lines', 0.5], ['second', 0.5]]
[['This', 0.311], ['is', 0.311], ['line', 0.311], ['third', 0.843]]
