In [6]:
# Let's create a Python notebook to explain various NLP concepts

# preprocessing library
import nltk
import string
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer  # feature engineering library
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data files.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent
# NLTK releases (3.9+); 'punkt' is kept so older releases, which read the
# original Punkt model, continue to work. Without 'punkt_tab' the
# tokenization cell fails with a LookupError on a fresh environment.
for nltk_package in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_package)

# Sample text for demonstration
text1 = "How to create sales order"
text2 = "How to create Purchase Order"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mousumi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mousumi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mousumi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# 1. Tokenization
from nltk.tokenize import word_tokenize

# Run the same tokenizer over both sample sentences in one pass.
tokens1, tokens2 = map(word_tokenize, (text1, text2))

print("Tokens1:", tokens1)
print("Tokens2:", tokens2)

Tokens1: ['How', 'to', 'create', 'sales', 'order']
Tokens2: ['How', 'to', 'create', 'Purchase', 'Order']


In [8]:
# 2. Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lower-case every token first, then pass it to lemmatize() with its
# default arguments.
lemmatized_tokens1 = list(map(lemmatizer.lemmatize, map(str.lower, tokens1)))
lemmatized_tokens2 = list(map(lemmatizer.lemmatize, map(str.lower, tokens2)))

print("Lemmatized Tokens1:", lemmatized_tokens1)
print("Lemmatized Tokens2:", lemmatized_tokens2)

Lemmatized Tokens1: ['how', 'to', 'create', 'sale', 'order']
Lemmatized Tokens2: ['how', 'to', 'create', 'purchase', 'order']


In [9]:
# 3. Stopword Removal
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
punctuation_chars = frozenset(string.punctuation)


def _keep_token(token):
    """Return True when `token` is neither a stopword nor pure punctuation.

    A token is treated as punctuation when every one of its characters is in
    string.punctuation. A plain `token in string.punctuation` test is a
    substring check against one long string, so multi-character punctuation
    tokens such as '...' or '--' would not match and would be kept.
    Empty tokens are dropped as well.
    """
    return token not in stop_words and not punctuation_chars.issuperset(token)


filtered_tokens1 = [token for token in lemmatized_tokens1 if _keep_token(token)]
filtered_tokens2 = [token for token in lemmatized_tokens2 if _keep_token(token)]

print("Filtered Tokens1:", filtered_tokens1)
print("Filtered Tokens2:", filtered_tokens2)

Filtered Tokens1: ['create', 'sale', 'order']
Filtered Tokens2: ['create', 'purchase', 'order']


In [10]:
# 4. TF-IDF Vectorization
# Re-join each filtered token list into a single space-separated string,
# since the vectorizer is fed whole documents rather than token lists here.
documents = [' '.join(tokens) for tokens in (filtered_tokens1, filtered_tokens2)]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Dense table: one row per document, one column per learned feature name.
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("TF-IDF Matrix:")
print(df_tfidf)

TF-IDF Matrix:
     create     order  purchase      sale
0  0.501549  0.501549  0.000000  0.704909
1  0.501549  0.501549  0.704909  0.000000


In [11]:
# 5. Cosine Similarity
# Slice (rather than index) so each operand stays a 2-D, single-row matrix.
first_doc_vector = tfidf_matrix[0:1]
second_doc_vector = tfidf_matrix[1:2]

cosine_sim = cosine_similarity(first_doc_vector, second_doc_vector)
print("Cosine Similarity:", cosine_sim[0, 0])

Cosine Similarity: 0.5031026124151314
