<a href="https://colab.research.google.com/github/isalut/couch-potato/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Import necessary libraries
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK datasets.
# NOTE: recent NLTK releases load the 'punkt_tab' resource in word_tokenize
# instead of the pickled 'punkt' models, so both are fetched here to keep the
# notebook working across NLTK versions.
nltk.download('punkt')  # Tokenizer models (older NLTK releases)
nltk.download('punkt_tab')  # Tokenizer tables (newer NLTK releases)
nltk.download('stopwords')  # List of stopwords for multiple languages
nltk.download('wordnet')  # WordNet lemmatizer data

# Initialize tools for stemming and lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Set of stopwords for English (a set gives O(1) membership tests)
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """
    Clean and normalise ``text``, printing the result of every stage.

    The string is lowercased and then stripped, in order, of URLs, e-mail
    addresses, punctuation, digits and any remaining non-letter characters.
    The cleaned string is tokenized, tokens found in ``stop_words`` are
    dropped, and the surviving tokens are both stemmed and lemmatized.

    Returns the stemmed tokens joined by single spaces; the lemmatized
    tokens are only printed for comparison.
    """
    print("Original Text:\n", text)

    # String-level clean-up stages, applied in this exact order. Each entry
    # pairs the heading printed after the stage with the transformation.
    cleanup_stages = (
        ("\nLowercased Text:\n",
         lambda s: s.lower()),
        ("\nText without URLs:\n",
         lambda s: re.sub(r'http\S+|www\S+|https\S+', '', s, flags=re.MULTILINE)),
        ("\nText without Email Addresses:\n",
         lambda s: re.sub(r'\S*@\S*\s?', '', s)),
        ("\nText without Punctuation:\n",
         lambda s: s.translate(str.maketrans('', '', string.punctuation))),
        ("\nText without Numbers:\n",
         lambda s: re.sub(r'\d+', '', s)),
        ("\nText without Special Characters:\n",
         lambda s: re.sub(r'[^A-Za-z\s]', '', s)),
    )
    for heading, transform in cleanup_stages:
        text = transform(text)
        print(heading, text)

    # Token-level stages.
    words = word_tokenize(text)
    print("\nTokenized Text:\n", words)

    content_words = list(filter(lambda w: w not in stop_words, words))
    print("\nText without Stop Words:\n", content_words)

    # Stemming and lemmatization both start from the same stop-word-free
    # tokens; they are alternatives, not successive steps.
    stems = list(map(stemmer.stem, content_words))
    print("\nStemmed Tokens:\n", stems)

    lemmas = list(map(lemmatizer.lemmatize, content_words))
    print("\nLemmatized Tokens:\n", lemmas)

    # The returned string is built from the stemmed tokens.
    processed_text = ' '.join(stems)
    print("\nProcessed Text:\n", processed_text)

    return processed_text

# Example usage: Sample text to preprocess
# The sample contains mixed case, repeated punctuation, a URL and an e-mail
# address, so the clean-up stages of preprocess_text have something to remove.
text = """
The Telugu Film Industry is buzzing!! with excitement as superstar Mahesh Babu announced his new movie.
The film, directed by SS Rajamouli, is expected to break box office records.
For more updates, visit https://www.tfinews.com or contact us at info@tfinews.com.
"""
# preprocess_text prints each intermediate stage and returns the stemmed,
# space-joined result.
processed_text = preprocess_text(text)

# Regex Extraction and Masking Example
# Define sample text containing phone numbers, emails, and URLs
text = """
Contact me at +91 8096696726 or 9573471012.
You can also reach me at tulasi.doe@example.com or bala123@example.net.
Check out our website at https://www.example.com for more info.
Alternatively, visit http://example.org.
My other number is +91 8096696726 or 9573471012.
Another number: 917250460104.
"""

# Define regex patterns for phone numbers, emails, and URLs.
#
# Phone: a 10-digit number starting with 6-9, optionally preceded by a
# +91/91 country code. The separator is only allowed *after* a country code,
# so a match can no longer begin with a stray space, and the digit
# lookarounds stop a match from starting or ending inside a longer digit run.
phone_pattern = re.compile(r'(?<!\d)(?:(?:\+91|91)[-.\s]?)?[6789]\d{9}(?!\d)')
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# URL: the last character must not be sentence punctuation, so a URL at the
# end of a sentence does not swallow the closing period.
url_pattern = re.compile(r'https?://\S*[^\s.,;:!?]')

# Step 1: Extract phone numbers
phone_numbers = phone_pattern.findall(text)
print("\nPhone Numbers:", phone_numbers)

# Step 2: Extract email addresses
email_addresses = email_pattern.findall(text)
print("\nEmail Addresses:", email_addresses)

# Step 3: Extract URLs
urls = url_pattern.findall(text)
print("\nURLs:", urls)

# Step 4: Mask phone numbers in text
masked_text = phone_pattern.sub('[PHONE NUMBER]', text)

# Step 5: Mask email addresses in text
masked_text = email_pattern.sub('[EMAIL ADDRESS]', masked_text)

# Print masked text
print("\nMasked Text:\n", masked_text)

# Step 6: Split text into sentences.
# Split on whitespace that follows '.' or '?', except after an "x.y."-style
# abbreviation or a capitalised two-letter abbreviation such as "Mr.".
sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
# The sample starts and ends with a newline, so strip each piece and drop
# the empty fragments the split would otherwise leave behind.
sentences = [
    piece.strip()
    for piece in sentence_pattern.split(text)
    if piece.strip()
]
print("\nSentences:", sentences)


Original Text:
 
The Telugu Film Industry is buzzing!! with excitement as superstar Mahesh Babu announced his new movie.
The film, directed by SS Rajamouli, is expected to break box office records.
For more updates, visit https://www.tfinews.com or contact us at info@tfinews.com.


Lowercased Text:
 
the telugu film industry is buzzing!! with excitement as superstar mahesh babu announced his new movie.
the film, directed by ss rajamouli, is expected to break box office records.
for more updates, visit https://www.tfinews.com or contact us at info@tfinews.com.


Text without URLs:
 
the telugu film industry is buzzing!! with excitement as superstar mahesh babu announced his new movie.
the film, directed by ss rajamouli, is expected to break box office records.
for more updates, visit  or contact us at info@tfinews.com.


Text without Email Addresses:
 
the telugu film industry is buzzing!! with excitement as superstar mahesh babu announced his new movie.
the film, directed by ss rajamou

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
