In [8]:
import spacy


In [9]:
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""

In [12]:
# tokenize the word
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
token_list = [token for token in doc]

In [13]:
# remove stop words
filtered_tokens = [token for token in doc if not token.is_stop]
filtered_tokens

[,
 Dave,
 watched,
 forest,
 burned,
 hill,
 ,,
 ,
 miles,
 house,
 .,
 car,
 ,
 hastily,
 packed,
 Marta,
 inside,
 trying,
 round,
 ,
 pets,
 .,
 ",
 ?,
 ",
 wondered,
 ,
 continued,
 wait,
 Marta,
 appear,
 pets,
 .,
 ]

In [15]:
# normalize words
lemmas = [
        f"Token: {token}, lemma: {token.lemma}_"
        for token in filtered_tokens
]
lemmas

['Token: \n, lemma: 962983613142996970_',
 'Token: Dave, lemma: 15237984737769454380_',
 'Token: watched, lemma: 2054481287215635300_',
 'Token: forest, lemma: 12560106647199032635_',
 'Token: burned, lemma: 12905682277821018784_',
 'Token: hill, lemma: 1647358963876657122_',
 'Token: ,, lemma: 2593208677638477497_',
 'Token: \n, lemma: 962983613142996970_',
 'Token: miles, lemma: 15996833532744392865_',
 'Token: house, lemma: 9471806766518506264_',
 'Token: ., lemma: 12646065887601541794_',
 'Token: car, lemma: 17545852598994811774_',
 'Token: \n, lemma: 962983613142996970_',
 'Token: hastily, lemma: 16524687012062183671_',
 'Token: packed, lemma: 11929990034961539164_',
 'Token: Marta, lemma: 3686051643097225522_',
 'Token: inside, lemma: 3410355712981309345_',
 'Token: trying, lemma: 4812066089261065646_',
 'Token: round, lemma: 10404471077220350636_',
 'Token: \n, lemma: 962983613142996970_',
 'Token: pets, lemma: 8199115189604440881_',
 'Token: ., lemma: 12646065887601541794_',
 '

In [16]:
# vectorize the text
filtered_tokens[1].vector

array([-1.0732638 , -1.589313  , -0.7485428 ,  0.8033879 ,  0.19977242,
        0.00840408,  1.5419115 ,  0.78789234, -0.10507897, -0.08379465,
        1.6370184 ,  0.99815553, -0.27276033, -0.9078418 , -1.2485981 ,
       -0.5253064 ,  0.3660615 ,  0.32205266,  0.26947686, -0.6838585 ,
       -1.3466268 ,  0.011222  , -0.24088496, -0.48466784, -0.33174878,
       -0.05325326,  1.8773433 ,  0.56494987, -0.96057415,  0.78610957,
       -0.44939822, -1.4648833 , -0.38066617,  1.0480769 , -0.8341217 ,
       -0.2217494 , -0.85443366,  0.35594553, -0.1127467 ,  1.2787786 ,
       -0.8223141 ,  0.18473059, -0.08983876,  0.6325263 , -1.1029459 ,
        0.3719486 ,  0.11167981,  1.5298815 ,  0.73126984, -0.01238485,
       -0.38741022,  0.24374121,  0.66934216, -0.51473886, -0.05107652,
       -0.6836413 ,  1.2553529 , -0.4258146 ,  0.82571185, -0.40290013,
       -1.0714419 ,  0.8215423 ,  0.10354415, -0.5627635 ,  0.34108153,
       -0.46954852, -0.644461  , -0.4248718 , -0.74732137, -0.93