In [1]:
import nltk
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nlp = spacy.load("en_core_web_sm")

# Sample Text
text = "AI is transforming finance! Email: ai_research@finance.com, Phone: 9876543210. Born on 27-March-2000. Don't ignore the impact of AI. "

# Tokenization
tokens = word_tokenize(text)
unique_tokens = set(tokens)
print("Unique Tokens Count:", len(unique_tokens))

# Count and Remove Punctuation
punctuation_count = sum(1 for token in tokens if token in string.punctuation)
text_no_punct = ''.join([char for char in text if char not in string.punctuation])
print("Punctuation Count:", punctuation_count)

# Stopwords Distribution
stop_words = set(stopwords.words('english'))
stopword_counts = Counter([word for word in tokens if word.lower() in stop_words])
sns.barplot(x=list(stopword_counts.keys()), y=list(stopword_counts.values()))
plt.title("Stopword Distribution")
plt.show()

# Remove Stopwords
filtered_text = ' '.join([word for word in tokens if word.lower() not in stop_words])
print("Filtered Text:", filtered_text)

# POS Tagging Distribution
pos_tags = nltk.pos_tag(tokens)
pos_counts = Counter(tag for _, tag in pos_tags)
sns.barplot(x=list(pos_counts.keys()), y=list(pos_counts.values()))
plt.xticks(rotation=45)
plt.title("POS Tag Distribution")
plt.show()

# Lemmatization
lemmatizer = nltk.WordNetLemmatizer()
lemmas = set(lemmatizer.lemmatize(word.lower()) for word in tokens)
print("Unique Lemma Count:", len(lemmas))

# Frequency Distribution of Top 10 Words
word_freq = Counter(tokens)
sns.barplot(x=[word for word, freq in word_freq.most_common(10)], y=[freq for _, freq in word_freq.most_common(10)])
plt.title("Top 10 Word Frequency")
plt.show()

# N-grams
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))
quadgrams = list(ngrams(tokens, 4))
print("Unique Bigrams:", len(set(bigrams)))
print("Unique Trigrams:", len(set(trigrams)))
print("Unique Quadgrams:", len(set(quadgrams)))

# Convert Dates to DD-MM-YYYY
text = re.sub(r'\b(\d{1,2})[-](\w+)[-](\d{4})\b', lambda x: f"{x.group(1)}-{'03' if 'March' in x.group(2) else x.group(2)}-{x.group(3)}", text)
print("Formatted Dates:", text)

# Extract and Validate Phone Numbers
valid_phone_pattern = re.compile(r'\b\d{10}\b')
phone_numbers = valid_phone_pattern.findall(text)
print("Valid Phone Numbers:", phone_numbers)

# Year Distribution
years = re.findall(r'\b\d{4}\b', text)
year_counts = Counter(years)
sns.barplot(x=list(year_counts.keys()), y=list(year_counts.values()))
plt.title("Year Distribution")
plt.show()

# Insights & Applications
insights = """
1. Stopwords impact text processing by increasing noise. Removing them enhances keyword relevance.
2. POS tagging aids in NLP tasks like Named Entity Recognition and parsing.
3. Lemmatization ensures words are reduced to meaningful roots for better text analysis.
4. N-grams provide insights into commonly occurring phrases, useful for predictive text models.
5. Identifying valid phone numbers and standardizing dates improve text consistency.

Applications:
1. Financial Market Analysis - NLP helps in sentiment analysis of news impacting stocks.
2. Fraud Detection - Validating numbers and entity recognition assist in fraud prevention.
3. Chatbots - Proper text processing ensures effective AI-driven conversations.
4. Automated Report Generation - Structured text helps in summarizing financial reports.
"""
print(insights)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.