<a href="https://colab.research.google.com/github/isalut/couch-potato/blob/main/NLPusecase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DurgaBhavani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DurgaBhavani\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DurgaBhavani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sample TFI news text
text = """
The Telugu Film Industry is buzzing with excitement as superstar Mahesh Babu announced his new movie.
The film, directed by SS Rajamouli, is expected to break box office records.
For more updates, visit https://www.tfinews.com or contact us at info@tfinews.com.
"""

In [None]:
# Initialize necessary tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    print("Original Text:\n", text)

    # Convert to lowercase
    text = text.lower()
    print("\nLowercased Text:\n", text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    print("\nText without URLs:\n", text)

    # Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)
    print("\nText without Email Addresses:\n", text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    print("\nText without Punctuation:\n", text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)
    print("\nText without Numbers:\n", text)

    # Remove special characters
    text = re.sub(r'[^A-Za-z\s]', '', text)
    print("\nText without Special Characters:\n", text)

    # Tokenize into words
    tokens = word_tokenize(text)
    print("\nTokenized Text:\n", tokens)

    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    print("\nText without Stop Words:\n", tokens)

    # Stemming (or Lemmatization)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    print("\nStemmed Tokens:\n", stemmed_tokens)

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    print("\nLemmatized Tokens:\n", lemmatized_tokens)

    # Join tokens back to string
    processed_text = ' '.join(stemmed_tokens)
    print("\nProcessed Text:\n", processed_text)

    return processed_text

# Preprocess the sample text
processed_text = preprocess_text(text)

Original Text:
 
The Telugu Film Industry is buzzing with excitement as superstar Mahesh Babu announced his new movie. 
The film, directed by SS Rajamouli, is expected to break box office records. 
For more updates, visit https://www.tfinews.com or contact us at info@tfinews.com.


Lowercased Text:
 
the telugu film industry is buzzing with excitement as superstar mahesh babu announced his new movie. 
the film, directed by ss rajamouli, is expected to break box office records. 
for more updates, visit https://www.tfinews.com or contact us at info@tfinews.com.


Text without URLs:
 
the telugu film industry is buzzing with excitement as superstar mahesh babu announced his new movie. 
the film, directed by ss rajamouli, is expected to break box office records. 
for more updates, visit  or contact us at info@tfinews.com.


Text without Email Addresses:
 
the telugu film industry is buzzing with excitement as superstar mahesh babu announced his new movie. 
the film, directed by ss rajamoul

In [3]:
import re

# Sample text containing  phone numbers, emails, and URLs
text = """
Contact me at +91 8096696726 or 9573471012.
You can also reach me at tulasi.doe@example.com or bala123@example.net.
Check out our website at https://www.example.com for more info.
Alternatively, visit http://example.org.
My other number is +91 8096696726 or 9573471012.
Another number: 917250460104.
"""

# Define regex patterns
phone_pattern = re.compile(r'(?:\+91|91)?[-.\s]?[6789]\d{9}')
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
url_pattern = re.compile(r'https?://[^\s]+')

# Extract  phone numbers
phone_numbers = phone_pattern.findall(text)
print(" Phone Numbers:", phone_numbers)

# Extract email addresses
email_addresses = email_pattern.findall(text)
print("Email Addresses:", email_addresses)

# Extract URLs
urls = url_pattern.findall(text)
print("URLs:", urls)

# Mask Indian phone numbers
masked_text = indian_phone_pattern.sub('[ PHONE NUMBER]', text)

# Mask email addresses
masked_text = email_pattern.sub('[EMAIL ADDRESS]', masked_text)

# Print masked text
print("\nMasked Text:\n", masked_text)

# Split text into sentences
sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
sentences = sentence_pattern.split(text)
print("\nSentences:", sentences)


 Phone Numbers: ['+91 8096696726', ' 9573471012', '+91 8096696726', ' 9573471012', ' 9172504601']
Email Addresses: ['tulasi.doe@example.com', 'bala123@example.net']
URLs: ['https://www.example.com', 'http://example.org.']

Masked Text:
 
Contact me at [ PHONE NUMBER] or[ PHONE NUMBER].
You can also reach me at [EMAIL ADDRESS] or [EMAIL ADDRESS].
Check out our website at https://www.example.com for more info.
Alternatively, visit http://example.org.
My other number is [ PHONE NUMBER] or[ PHONE NUMBER].
Another number:[ PHONE NUMBER]04.


Sentences: ['\nContact me at +91 8096696726 or 9573471012.', 'You can also reach me at tulasi.doe@example.com or bala123@example.net.', 'Check out our website at https://www.example.com for more info.', 'Alternatively, visit http://example.org.', 'My other number is +91 8096696726 or 9573471012.', 'Another number: 917250460104.', '']
