# 2.1 Understanding Text Data

2.1.3 Example: Exploring Raw Text Data

In [None]:
# Explore a raw text sample: its size, its character inventory and a naive
# whitespace-based word split.

# Sample text
text = "Natural Language Processing (NLP) enables computers to understand human language."

# Display the text
print("Original Text:")
print(text)

# Length of the text, counted in characters (spaces and punctuation included).
# "\n" (not "\\n") is used so a blank line is printed rather than a literal backslash-n.
print("\nLength of the text:", len(text))

# Unique characters in the text; a set is unordered, so the printed order may vary
unique_characters = set(text)
print("\nUnique characters:", unique_characters)

# Number of words: split() breaks on whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. "(NLP)" and "language.")
words = text.split()
print("\nNumber of words:", len(words))

# Display the words
print("\nWords in the text:")
print(words)

2.1.5 Practical Example: Basic Text Preprocessing Steps

In [None]:
import string

# Basic preprocessing: lowercase -> strip punctuation -> whitespace tokenization.

# Sample text
text = "Natural Language Processing (NLP) enables computers to understand human language."

# Convert to lowercase
text = text.lower()
print("Lowercased Text:")
print(text)

# Remove punctuation: the translation table maps every character in
# string.punctuation to "deleted", so the whole string is cleaned in one pass
text = text.translate(str.maketrans('', '', string.punctuation))
# "\n" (not "\\n") so a blank line separates the sections of the output
print("\nText without Punctuation:")
print(text)

# Tokenize the text on whitespace
tokens = text.split()
print("\nTokens:")
print(tokens)

# 2.2 Text Cleaning: Stop Word Removal, Stemming, Lemmatization

2.2.1 Stop Word Removal

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the stop word corpus read by stopwords.words() below
nltk.download('stopwords')

# Sample text
text = "Natural Language Processing enables computers to understand human language."

# Tokenize the text on whitespace
tokens = text.split()

# Remove stop words. The list is turned into a set once so each membership
# test is a constant-time lookup; tokens are lowercased only for the comparison,
# so the surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print("Original Tokens:")
print(tokens)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nFiltered Tokens:")
print(filtered_tokens)

2.2.2 Stemming

In [None]:
from nltk.stem import PorterStemmer

# Sample text
text = "Natural Language Processing enables computers to understand human language."

# Tokenize the text on whitespace
tokens = text.split()

# Initialize the stemmer
stemmer = PorterStemmer()

# Stem the tokens one by one; the result list is aligned with `tokens`
stemmed_tokens = [stemmer.stem(word) for word in tokens]

print("Original Tokens:")
print(tokens)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nStemmed Tokens:")
print(stemmed_tokens)

2.2.3 Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data (and its omw-1.4 companion package) for the lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample text
text = "Natural Language Processing enables computers to understand human language."

# Tokenize the text on whitespace
tokens = text.split()

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize the tokens one by one; the result list is aligned with `tokens`
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

print("Original Tokens:")
print(tokens)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nLemmatized Tokens:")
print(lemmatized_tokens)

2.2.4 Practical Example: Combining Text Cleaning Techniques

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the corpora used below. 'omw-1.4' is fetched as well, matching the
# resources the stand-alone lemmatization example downloads.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample text
text = "Natural Language Processing enables computers to understand human language."

# Keep the untouched input: `text` is overwritten by the cleaning steps below,
# and the "Original Text" report must show what we actually started from.
original_text = text

# Convert to lowercase
text = text.lower()

# Remove punctuation in a single pass using a deletion table
text = text.translate(str.maketrans('', '', string.punctuation))

# Tokenize the text on whitespace
tokens = text.split()

# Remove stop words (tokens are already lowercase at this point)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stem each filtered token, then pass the stem through the lemmatizer
processed_tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in filtered_tokens]

print("Original Text:")
print(original_text)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nFiltered Tokens (Stop Words Removed):")
print(filtered_tokens)

print("\nProcessed Tokens (Stemmed and Lemmatized):")
print(processed_tokens)

# 2.3 Regular Expressions

2.3.1 Basics of Regular Expressions

In [None]:
import re

# Text to scan
text = "The quick brown fox jumps over the lazy dog."

# Literal pattern: the characters "fox"
pattern = r"fox"

# re.search() returns the first occurrence anywhere in the text, or None
match = re.search(pattern, text)

# Report the outcome, handling the "nothing found" case first
if match is None:
    print("No match found.")
else:
    print("Match found:", match.group())

2.3.3 Practical Examples of Regex in Python

In [None]:
import re

# Text containing the email addresses to pull out
text = "Please contact us at support@example.com or sales@example.com for further information."

# local part, "@", domain, "." and a top-level domain of two or more letters,
# delimited by word boundaries on both sides
pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"

# Compile the pattern, then collect every non-overlapping match in order
emails = re.compile(pattern).findall(text)

# Heading and result on separate lines
print("Extracted Email Addresses:", emails, sep="\n")

In [None]:
import re

# Text containing phone numbers written as "(AAA) BBB-CCCC"
text = "Contact us at (123) 456-7890 or (987) 654-3210."

# Escaped parentheses around three digits, a space, three digits, a dash, four digits
pattern = r"\(\d{3}\) \d{3}-\d{4}"

# Walk over the matches and keep the matched text of each one
phone_numbers = [found.group() for found in re.finditer(pattern, text)]

# Heading and result on separate lines
print("Extracted Phone Numbers:", phone_numbers, sep="\n")

In [None]:
import re

# Text in which the replacement is performed
text = "The quick brown fox jumps over the lazy dog. The fox is clever."

# Literal pattern: the characters "fox"
pattern = r"fox"

# Replace every occurrence of the pattern with "cat"
new_text = re.compile(pattern).sub("cat", text)

# Heading and result on separate lines
print("Modified Text:", new_text, sep="\n")

2.3.4 Advanced Regex Techniques

In [None]:
import re

# Text containing dates in two different layouts
text = "The event is scheduled for 2022-08-15. Another event is on 15/08/2022."

# Two alternatives: four-two-two digits joined by "-", or two-two-four digits joined by "/"
pattern = r"\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}"

# Walk over the matches and keep the matched text of each one
dates = [found.group() for found in re.finditer(pattern, text)]

# Heading and result on separate lines
print("Extracted Dates:", dates, sep="\n")

In [None]:
import re

# Text containing hashtags
text = "Loving the new features of this product! #excited #newrelease #tech"

# A "#" followed by one or more word characters
pattern = r"#\w+"

# Compile the pattern, then collect every non-overlapping match in order
hashtags = re.compile(pattern).findall(text)

# Heading and result on separate lines
print("Extracted Hashtags:", hashtags, sep="\n")

# 2.4 Tokenization

2.4.3 Word Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download the Punkt tokenizer resources before tokenizing
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text
text = "Natural Language Processing enables computers to understand human language."

# Split the text into word-level tokens with NLTK
tokens = word_tokenize(text)

# Heading and result on separate lines
print("Word Tokens:", tokens, sep="\n")

In [None]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing enables computers to understand human language."

# Run the pipeline, then read the text of every token in the resulting document
doc = nlp(text)
tokens = [tok.text for tok in doc]

# Heading and result on separate lines
print("Word Tokens:", tokens, sep="\n")

2.4.4 Sentence Tokenization

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt tokenizer resources. 'punkt_tab' is fetched alongside
# 'punkt', exactly as the word tokenization example does, so this cell also
# works when it is run on its own.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text
text = "Natural Language Processing enables computers to understand human language. It is a fascinating field."

# Perform sentence tokenization
sentences = sent_tokenize(text)

print("Sentences:")
print(sentences)

In [None]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing enables computers to understand human language. It is a fascinating field."

# Run the pipeline, then read the text of every sentence span in the document
doc = nlp(text)
sentences = [span.text for span in doc.sents]

# Heading and result on separate lines
print("Sentences:", sentences, sep="\n")

2.4.5 Character Tokenization

In [None]:
# Text to break into single characters
text = "Natural Language Processing"

# Unpack the string into a list with one element per character (spaces included)
characters = [*text]

# Heading and result on separate lines
print("Characters:", characters, sep="\n")

2.4.6 Practical Example: Tokenization Pipeline

In [None]:
import nltk
import spacy

# Download the Punkt tokenizer resources. 'punkt_tab' is fetched alongside
# 'punkt', exactly as the word tokenization example does, so this cell also
# works when it is run on its own.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing enables computers to understand human language. It is a fascinating field."

# Perform word tokenization using NLTK
word_tokens = nltk.word_tokenize(text)
print("Word Tokens:")
print(word_tokens)

# Perform sentence tokenization using NLTK.
# "\n" (not "\\n") is used in the headings below so a blank line separates the
# sections instead of a literal backslash-n being printed.
sentence_tokens = nltk.sent_tokenize(text)
print("\nSentence Tokens:")
print(sentence_tokens)

# Perform sentence tokenization using SpaCy; the document is parsed once and
# reused for the word-level tokens below
doc = nlp(text)
spacy_sentence_tokens = [sent.text for sent in doc.sents]
print("\nSentence Tokens (SpaCy):")
print(spacy_sentence_tokens)

# Perform word tokenization using SpaCy
spacy_word_tokens = [token.text for token in doc]
print("\nWord Tokens (SpaCy):")
print(spacy_word_tokens)

# Perform character tokenization: one list element per character of the text
char_tokens = list(text)
print("\nCharacter Tokens:")
print(char_tokens)