In [17]:
!pip install word2number



In [18]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import words
from word2number import w2n

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
text = "I have 2 cars. The first car is a 2020 Toyota, and the second is a 2015 Honda. It is an amazing experience."
print("CORPUS: \n",text)

CORPUS: 
 I have 2 cars. The first car is a 2020 Toyota, and the second is a 2015 Honda. It is an amazing experience.


#Tokenization

In [20]:
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['I', 'have', '2', 'cars', '.', 'The', 'first', 'car', 'is', 'a', '2020', 'Toyota', ',', 'and', 'the', 'second', 'is', 'a', '2015', 'Honda', '.', 'It', 'is', 'an', 'amazing', 'experience', '.']


#Converting Text to Lower Case

In [21]:
lower_case_text = text.lower()
print("\nLower Case Text:", lower_case_text)


Lower Case Text: i have 2 cars. the first car is a 2020 toyota, and the second is a 2015 honda. it is an amazing experience.


#Remove Numbers

In [22]:

text_no_numbers = re.sub(r'\d+', '', text)
print("\nText without Numbers:", text_no_numbers)


Text without Numbers: I have  cars. The first car is a  Toyota, and the second is a  Honda. It is an amazing experience.


#Convert Numbers to Words

In [23]:
def convert_numbers_to_words(text):
    words_in_text = text.split()
    converted_text = []

    for word in words_in_text:
        try:
            # Check if the word is a number and convert it to words
            number = w2n.word_to_num(word)
            converted_text.append(str(number))
        except ValueError:
            converted_text.append(word)

    return ' '.join(converted_text)

text_with_numbers_in_words = convert_numbers_to_words(text)
print("\nText with Numbers Converted to Words:", text_with_numbers_in_words)



Text with Numbers Converted to Words: I have 2 cars. The first car is a 2020 Toyota, and the second is a 2015 Honda. It is an amazing experience.


#Remove Punctuation

In [24]:
text_no_punctuation = re.sub(r'[^\w\s]', '', text)
print("\nText without Punctuation:", text_no_punctuation)


Text without Punctuation: I have 2 cars The first car is a 2020 Toyota and the second is a 2015 Honda It is an amazing experience


#Remove Whitespace

In [25]:
text_no_whitespace = ' '.join(text.split())
print("\nText without Extra Whitespace:", text_no_whitespace)


Text without Extra Whitespace: I have 2 cars. The first car is a 2020 Toyota, and the second is a 2015 Honda. It is an amazing experience.


#Remove Stop Words

In [26]:
stop_words = set(stopwords.words('english'))
tokens_without_stopwords = [word for word in tokens if word.lower() not in stop_words]
print("\nTokens without Stop Words:", tokens_without_stopwords)


Tokens without Stop Words: ['2', 'cars', '.', 'first', 'car', '2020', 'Toyota', ',', 'second', '2015', 'Honda', '.', 'amazing', 'experience', '.']


#Count Word Frequency

In [27]:
from collections import Counter
word_frequency = Counter(tokens_without_stopwords)
print("\nWord Frequency:", word_frequency)


Word Frequency: Counter({'.': 3, '2': 1, 'cars': 1, 'first': 1, 'car': 1, '2020': 1, 'Toyota': 1, ',': 1, 'second': 1, '2015': 1, 'Honda': 1, 'amazing': 1, 'experience': 1})


#Stemming (Porter and Lancaster Stemmer)

In [28]:
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

porter_stems = [porter_stemmer.stem(word) for word in tokens]
lancaster_stems = [lancaster_stemmer.stem(word) for word in tokens]

print("\nPorter Stemmer Stems:", porter_stems)
print("\nLancaster Stemmer Stems:", lancaster_stems)



Porter Stemmer Stems: ['i', 'have', '2', 'car', '.', 'the', 'first', 'car', 'is', 'a', '2020', 'toyota', ',', 'and', 'the', 'second', 'is', 'a', '2015', 'honda', '.', 'it', 'is', 'an', 'amaz', 'experi', '.']

Lancaster Stemmer Stems: ['i', 'hav', '2', 'car', '.', 'the', 'first', 'car', 'is', 'a', '2020', 'toyot', ',', 'and', 'the', 'second', 'is', 'a', '2015', 'hond', '.', 'it', 'is', 'an', 'amaz', 'expery', '.']


#Lemmatization

In [29]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
print("\nLemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['I', 'have', '2', 'car', '.', 'The', 'first', 'car', 'is', 'a', '2020', 'Toyota', ',', 'and', 'the', 'second', 'is', 'a', '2015', 'Honda', '.', 'It', 'is', 'an', 'amazing', 'experience', '.']
