## What is natural language processing?

In [None]:
# Two surface forms of the same verb: a plain string comparison treats them
# as different words.
x, y = 'was', 'is'
x == y

False

## Lemmatization of words

In [None]:
import nltk
# Download the 'wordnet' NLTK data package (presumably what the
# WordNetLemmatizer in the next cell reads from — confirm).
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# The second argument is the part of speech to lemmatize the word as.
lemma1 = lemmatizer.lemmatize('vegetables', pos='n')
lemma2 = lemmatizer.lemmatize('vegetable', pos='v')

# Only the first lemma is displayed by this cell.
lemma1

'vegetable'

## Lemmatization of Sentences

In [None]:
import nltk
# Same 'wordnet' download as in the previous section; the recorded output
# shows the package being unzipped into /root/nltk_data.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
# Module-level lemmatizer instance.
lemmatizer = WordNetLemmatizer()
# Example sentence that the following cells tokenize.
sentence = 'Vegetables are types of plants.'

### Tokenizing sentences into words

In [None]:
# Download the 'punkt' tokenizer data (presumably required by
# nltk.word_tokenize in the next cell — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Lower-case the sentence, then split it into word and punctuation tokens
# (the trailing '.' becomes its own token).
sentence_tokens = nltk.word_tokenize(sentence.lower())
sentence_tokens

['vegetables', 'are', 'types', 'of', 'plants', '.']

In [None]:
# `pos_tags` was displayed here without ever being assigned at top level,
# which raises NameError on a fresh kernel. Compute it explicitly.
# The tagger model is fetched first (quietly, so the cell output stays the
# tag list) because no earlier cell downloads it.
nltk.download('averaged_perceptron_tagger', quiet=True)

# One (token, tag) pair per entry of `sentence_tokens`.
pos_tags = nltk.pos_tag(sentence_tokens)
pos_tags

[('vegetables', 'NNS'),
 ('are', 'VBP'),
 ('types', 'NNS'),
 ('of', 'IN'),
 ('plants', 'NNS'),
 ('.', '.')]

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemma_me(sent):
    """Tokenize, POS-tag and lemmatize a sentence, keeping selected word classes.

    The sentence is lower-cased, split into tokens and POS-tagged. A token is
    kept only when the lower-cased first letter of its tag is one of
    'n', 'v', 'a' or 'r'; that same letter is passed to the lemmatizer as the
    part of speech. Every other token (e.g. punctuation, prepositions) is
    dropped.

    Args:
        sent: Sentence to process, as a string.

    Returns:
        List of lemma strings for the kept tokens, in sentence order.
    """
    # NOTE(review): the tags shown in this notebook ('NNS', 'VBP', 'IN') look
    # like Penn Treebank tags, where adjectives start with 'J'. If so, 'a'
    # never matches and adjectives are silently dropped. Left unchanged to
    # keep the recorded outputs valid; confirm whether 'j' should map to 'a'.
    kept_prefixes = ('n', 'v', 'a', 'r')

    # pos_tag already returns (token, tag) pairs, so no zip with the token
    # list is needed.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent.lower()))

    sentence_lemmas = []
    for token, tag in tagged_tokens:
        pos = tag[0].lower()
        if pos in kept_prefixes:
            sentence_lemmas.append(lemmatizer.lemmatize(token, pos))

    return sentence_lemmas

In [None]:
# Lemmatize the plural-form sentence; 'of' and '.' are dropped by lemma_me.
l1 = lemma_me('Vegetables are types of plants.')
l1

['vegetable', 'be', 'type', 'plant']

In [None]:
# Same statement in singular form, to compare against l1.
l2 = lemma_me('A vegetable is a type of plant')
l2 

['vegetable', 'be', 'type', 'plant']

In [None]:
# Both sentences reduce to the same lemma list, so the comparison is True.
l1 == l2

True

## Find the most similar sentence

In [None]:
# Source text to search for an answer, and the question asked about it.
text = (
    'Originally, vegetables were collected from the wild by hunter-gatherers. '
    'Vegetables are all plants. '
    'Vegetables can be eaten either raw or cooked.'
)
question = 'What are vegetables?'

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# This cell previously relied on a `lemmatizer` created in another section.
# Create it here so the section runs on a fresh kernel.
lemmatizer = WordNetLemmatizer()


# NOTE(review): this redefines lemma_me from the "Lemmatization of Sentences"
# section; consider keeping a single definition in the notebook.
def lemma_me(sent):
    """Tokenize, POS-tag and lemmatize a sentence, keeping selected word classes.

    The sentence is lower-cased, split into tokens and POS-tagged. A token is
    kept only when the lower-cased first letter of its tag is one of
    'n', 'v', 'a' or 'r'; that same letter is passed to the lemmatizer as the
    part of speech. Every other token is dropped.

    Args:
        sent: Sentence to process, as a string.

    Returns:
        List of lemma strings for the kept tokens, in sentence order.
    """
    # NOTE(review): the tags used here look like Penn Treebank tags, where
    # adjectives start with 'J'. If so, 'a' never matches and adjectives are
    # dropped. Left unchanged to keep the recorded outputs valid; confirm.
    kept_prefixes = ('n', 'v', 'a', 'r')

    # pos_tag already returns (token, tag) pairs, so no zip with the token
    # list is needed.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent.lower()))

    sentence_lemmas = []
    for token, tag in tagged_tokens:
        pos = tag[0].lower()
        if pos in kept_prefixes:
            sentence_lemmas.append(lemmatizer.lemmatize(token, pos))

    return sentence_lemmas

In [None]:
# Split the text into sentences and put the question last, so the question
# ends up as the final row of the TF-IDF matrix built from this list.
sentence_tokens = [*nltk.sent_tokenize(text), question]
sentence_tokens

['Originally, vegetables were collected from the wild by hunter-gatherers.',
 'Vegetables are all plants.',
 'Vegetables can be eaten either raw or cooked.',
 'What are vegetables?']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Use lemma_me as the tokenizer so the TF-IDF features are lemmas rather
# than raw words.
tv = TfidfVectorizer(tokenizer=lemma_me)
tv

TfidfVectorizer(tokenizer=<function lemma_me at 0x7f31a2caf9e0>)

In [None]:
# Data packages presumably needed by lemma_me (POS tagger model and WordNet).
# NOTE(review): lemma_me is already called in earlier cells, so on a fresh
# kernel these downloads come too late — consider moving this cell up.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Fit the vectorizer on the sentences (question included) and get their
# TF-IDF matrix, one row per entry of sentence_tokens.
tf = tv.fit_transform(sentence_tokens)

In [None]:
# Display the sparse-matrix summary (shape and number of stored elements).
tf

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [None]:
# Dense view of the same matrix: one row per sentence, one column per lemma.
tf.toarray()

array([[0.27717414, 0.53114624, 0.        , 0.        , 0.53114624,
        0.53114624, 0.        , 0.27717414],
       [0.41988018, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.8046125 , 0.41988018],
       [0.32713399, 0.        , 0.62688384, 0.62688384, 0.        ,
        0.        , 0.        , 0.32713399],
       [0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678]])

In [None]:
import pandas

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tv, 'get_feature_names_out'):
    feature_names = tv.get_feature_names_out()
else:
    feature_names = tv.get_feature_names()

# Label each TF-IDF column with its lemma; rows follow sentence_tokens order.
df = pandas.DataFrame(tf.toarray(), columns=feature_names)
df



Unnamed: 0,be,collect,cook,eat,hunter-gatherer,originally,plant,vegetable
0,0.277174,0.531146,0.0,0.0,0.531146,0.531146,0.0,0.277174
1,0.41988,0.0,0.0,0.0,0.0,0.0,0.804612,0.41988
2,0.327134,0.0,0.626884,0.626884,0.0,0.0,0.0,0.327134
3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.707107


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare the last row (the question, appended last) with every row,
# itself included — hence the trailing 1.0 in the result.
values = cosine_similarity(tf[-1], tf)
values

array([[0.39198343, 0.59380024, 0.46263733, 1.        ]])

In [None]:
# Same text and question as defined at the start of this section.
text = (
    'Originally, vegetables were collected from the wild by hunter-gatherers. '
    'Vegetables are all plants. '
    'Vegetables can be eaten either raw or cooked.'
)
question = 'What are vegetables?'

In [None]:
# Position of the second-highest similarity. The highest is the question
# compared with itself, so the runner-up is the best-matching sentence.
index = values[0].argsort()[-2]
index

1

In [None]:
# 1-D copy of the similarity row; flatten() returns a copy, so sorting it
# in the next cell does not reorder `values`.
values_flat = values.flatten()
values_flat

array([0.39198343, 0.59380024, 0.46263733, 1.        ])

In [None]:
# In-place ascending sort: after this, positions in values_flat no longer
# correspond to sentence order.
values_flat.sort()
values_flat

array([0.39198343, 0.46263733, 0.59380024, 1.        ])

In [None]:
# Second-largest similarity; the largest is the question matched against
# itself (1.0). Relies on values_flat having been sorted in the cell above.
coeff = values_flat[-2]
coeff

0.593800244493221

In [None]:
# Only answer when the best match is similar enough to the question;
# nothing is printed otherwise.
MIN_SIMILARITY = 0.3
if coeff > MIN_SIMILARITY:
    print(sentence_tokens[index])

Vegetables are all plants.


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=440311b0-c5f8-41a0-9978-e85a9f4d767d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>