In [20]:
# Import libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk

In [17]:
# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /roo

True

In [4]:
# Defining a sample text
text = "Hello! My name is Random Joe, and I am a software engineer at Google in Miami. I have been working there for over five years, and I specialize in Natural Language Processing (NLP) and Machine Learning. In my free time, I enjoy hiking in the Adirondack Mountains, reading books about artificial intelligence, and exploring new coffee shops in Brooklyn. Last summer, I visited Paris, France, and it was an incredible experience. I also love collaborating with my colleagues, such as Dr. Jane Smith, who is a leading expert in computer vision. Together, we are working on a project to improve sentiment analysis algorithms using advanced NLP techniques. By the way, did you know that the Eiffel Tower was completed in 1889? It's one of the most iconic landmarks in the world!"

In [7]:
# Tokenization - Tokenization splits the text into individual sentences and words for further processing.
# Function: Tokenize the text into sentences and words
sentences = sent_tokenize(text)
words = word_tokenize(text)

print("Tokenization:")
print("Sentences:", sentences)
print("Words:", words)

Tokenization:
Sentences: ['Hello!', 'My name is Random Joe, and I am a software engineer at Google in Miami.', 'I have been working there for over five years, and I specialize in Natural Language Processing (NLP) and Machine Learning.', 'In my free time, I enjoy hiking in the Adirondack Mountains, reading books about artificial intelligence, and exploring new coffee shops in Brooklyn.', 'Last summer, I visited Paris, France, and it was an incredible experience.', 'I also love collaborating with my colleagues, such as Dr. Jane Smith, who is a leading expert in computer vision.', 'Together, we are working on a project to improve sentiment analysis algorithms using advanced NLP techniques.', 'By the way, did you know that the Eiffel Tower was completed in 1889?', "It's one of the most iconic landmarks in the world!"]
Words: ['Hello', '!', 'My', 'name', 'is', 'Random', 'Joe', ',', 'and', 'I', 'am', 'a', 'software', 'engineer', 'at', 'Google', 'in', 'Miami', '.', 'I', 'have', 'been', 'worki

In [8]:
# Stemming - Stemming reduces words to their root form, which helps in normalizing text.
# Function: Apply stemming to the word tokens using NLTK's PorterStemmer
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words]

print("\nStemming:")
print("Stemmed Words:", stemmed_words)


Stemming:
Stemmed Words: ['hello', '!', 'my', 'name', 'is', 'random', 'joe', ',', 'and', 'i', 'am', 'a', 'softwar', 'engin', 'at', 'googl', 'in', 'miami', '.', 'i', 'have', 'been', 'work', 'there', 'for', 'over', 'five', 'year', ',', 'and', 'i', 'special', 'in', 'natur', 'languag', 'process', '(', 'nlp', ')', 'and', 'machin', 'learn', '.', 'in', 'my', 'free', 'time', ',', 'i', 'enjoy', 'hike', 'in', 'the', 'adirondack', 'mountain', ',', 'read', 'book', 'about', 'artifici', 'intellig', ',', 'and', 'explor', 'new', 'coffe', 'shop', 'in', 'brooklyn', '.', 'last', 'summer', ',', 'i', 'visit', 'pari', ',', 'franc', ',', 'and', 'it', 'wa', 'an', 'incred', 'experi', '.', 'i', 'also', 'love', 'collabor', 'with', 'my', 'colleagu', ',', 'such', 'as', 'dr.', 'jane', 'smith', ',', 'who', 'is', 'a', 'lead', 'expert', 'in', 'comput', 'vision', '.', 'togeth', ',', 'we', 'are', 'work', 'on', 'a', 'project', 'to', 'improv', 'sentiment', 'analysi', 'algorithm', 'use', 'advanc', 'nlp', 'techniqu', '.', 

In [9]:
# Lemmatization - Lemmatization reduces words to their base or dictionary form, providing more accurate normalization than stemming.
# Function: Apply lemmatization to the word tokens using NLTK's WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("\nLemmatization:")
print("Lemmatized Words:", lemmatized_words)


Lemmatization:
Lemmatized Words: ['Hello', '!', 'My', 'name', 'is', 'Random', 'Joe', ',', 'and', 'I', 'am', 'a', 'software', 'engineer', 'at', 'Google', 'in', 'Miami', '.', 'I', 'have', 'been', 'working', 'there', 'for', 'over', 'five', 'year', ',', 'and', 'I', 'specialize', 'in', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'and', 'Machine', 'Learning', '.', 'In', 'my', 'free', 'time', ',', 'I', 'enjoy', 'hiking', 'in', 'the', 'Adirondack', 'Mountains', ',', 'reading', 'book', 'about', 'artificial', 'intelligence', ',', 'and', 'exploring', 'new', 'coffee', 'shop', 'in', 'Brooklyn', '.', 'Last', 'summer', ',', 'I', 'visited', 'Paris', ',', 'France', ',', 'and', 'it', 'wa', 'an', 'incredible', 'experience', '.', 'I', 'also', 'love', 'collaborating', 'with', 'my', 'colleague', ',', 'such', 'a', 'Dr.', 'Jane', 'Smith', ',', 'who', 'is', 'a', 'leading', 'expert', 'in', 'computer', 'vision', '.', 'Together', ',', 'we', 'are', 'working', 'on', 'a', 'project', 'to', 'improve', 'sent

In [12]:
# Parts of Speech (POS) Tagging - POS tagging assigns grammatical labels (e.g., noun, verb) to each word in the text.
# Function: Collect the parts of speech for each word in the text
pos_tags = pos_tag(words)

print("\nPOS Tagging:")
print("POS Tags:", pos_tags)


POS Tagging:
POS Tags: [('Hello', 'NN'), ('!', '.'), ('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Random', 'NNP'), ('Joe', 'NNP'), (',', ','), ('and', 'CC'), ('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('software', 'NN'), ('engineer', 'NN'), ('at', 'IN'), ('Google', 'NNP'), ('in', 'IN'), ('Miami', 'NNP'), ('.', '.'), ('I', 'PRP'), ('have', 'VBP'), ('been', 'VBN'), ('working', 'VBG'), ('there', 'RB'), ('for', 'IN'), ('over', 'IN'), ('five', 'CD'), ('years', 'NNS'), (',', ','), ('and', 'CC'), ('I', 'PRP'), ('specialize', 'VBP'), ('in', 'IN'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('and', 'CC'), ('Machine', 'NNP'), ('Learning', 'NNP'), ('.', '.'), ('In', 'IN'), ('my', 'PRP$'), ('free', 'JJ'), ('time', 'NN'), (',', ','), ('I', 'PRP'), ('enjoy', 'VBP'), ('hiking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('Adirondack', 'NNP'), ('Mountains', 'NNP'), (',', ','), ('reading', 'VBG'), ('books', 'NNS'), ('about', 'IN'), ('artificial

In [13]:
# Identify Punctuation - Punctuation identification separates punctuation marks from the text for further processing.
# Function: Identify punctuation marks from the tokenized words
punctuation = [word for word in words if not word.isalnum()]

print("\nPunctuation Identification:")
print("Punctuation Marks:", punctuation)


Punctuation Identification:
Punctuation Marks: ['!', ',', '.', ',', '(', ')', '.', ',', ',', ',', '.', ',', ',', ',', '.', ',', 'Dr.', ',', '.', ',', '.', ',', '?', "'s", '!']


In [14]:
# Stop Words Removal - Eliminates common words (e.g., 'the', 'is') that do not contribute significantly to the meaning of the text.
# Function: Identify and remove stop words from the word tokens
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]

print("\nStop Words Removal:")
print("Filtered Words:", filtered_words)


Stop Words Removal:
Filtered Words: ['Hello', '!', 'name', 'Random', 'Joe', ',', 'software', 'engineer', 'Google', 'Miami', '.', 'working', 'five', 'years', ',', 'specialize', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'Machine', 'Learning', '.', 'free', 'time', ',', 'enjoy', 'hiking', 'Adirondack', 'Mountains', ',', 'reading', 'books', 'artificial', 'intelligence', ',', 'exploring', 'new', 'coffee', 'shops', 'Brooklyn', '.', 'Last', 'summer', ',', 'visited', 'Paris', ',', 'France', ',', 'incredible', 'experience', '.', 'also', 'love', 'collaborating', 'colleagues', ',', 'Dr.', 'Jane', 'Smith', ',', 'leading', 'expert', 'computer', 'vision', '.', 'Together', ',', 'working', 'project', 'improve', 'sentiment', 'analysis', 'algorithms', 'using', 'advanced', 'NLP', 'techniques', '.', 'way', ',', 'know', 'Eiffel', 'Tower', 'completed', '1889', '?', "'s", 'one', 'iconic', 'landmarks', 'world', '!']


In [15]:
# Clean the Data - Removes unnecessary elements like punctuation and stop words to prepare the text for analysis.
# Function: Remove punctuation and stop words to create a clean version of the text
clean_words = [word for word in filtered_words if word.isalnum()]

print("\nCleaned Data:")
print("Clean Words:", clean_words)


Cleaned Data:
Clean Words: ['Hello', 'name', 'Random', 'Joe', 'software', 'engineer', 'Google', 'Miami', 'working', 'five', 'years', 'specialize', 'Natural', 'Language', 'Processing', 'NLP', 'Machine', 'Learning', 'free', 'time', 'enjoy', 'hiking', 'Adirondack', 'Mountains', 'reading', 'books', 'artificial', 'intelligence', 'exploring', 'new', 'coffee', 'shops', 'Brooklyn', 'Last', 'summer', 'visited', 'Paris', 'France', 'incredible', 'experience', 'also', 'love', 'collaborating', 'colleagues', 'Jane', 'Smith', 'leading', 'expert', 'computer', 'vision', 'Together', 'working', 'project', 'improve', 'sentiment', 'analysis', 'algorithms', 'using', 'advanced', 'NLP', 'techniques', 'way', 'know', 'Eiffel', 'Tower', 'completed', '1889', 'one', 'iconic', 'landmarks', 'world']


In [22]:
# Named Entity Recognition (NER) - Identifies and classifies named entities (e.g., people, locations) in the text.
# Function: Perform Named Entity Recognition on the text
ner_tags = ne_chunk(pos_tags)

named_entities = []
for chunk in ner_tags:
    if hasattr(chunk, 'label'):
        entity = " ".join(c[0] for c in chunk)
        label = chunk.label()
        named_entities.append((entity, label))

# Create DF to display the named entities in better visual (table)
ner_df = pd.DataFrame(named_entities, columns=["Entity", "Label"])

print("Named Entity Recognition (NER) Results:")
print(ner_df)

Named Entity Recognition (NER) Results:
                  Entity         Label
0                  Hello           GPE
1             Random Joe        PERSON
2                 Google  ORGANIZATION
3                  Miami           GPE
4       Natural Language  ORGANIZATION
5                    NLP  ORGANIZATION
6       Machine Learning        PERSON
7   Adirondack Mountains  ORGANIZATION
8               Brooklyn           GPE
9                  Paris           GPE
10                France           GPE
11            Jane Smith        PERSON
12                   NLP  ORGANIZATION
13          Eiffel Tower  ORGANIZATION
