In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Importing the dataset
data = pd.read_csv('sample_dataset.csv')
text = ' '.join(data['text_column'])

# Tokenization
tokens = word_tokenize(text)

# Stop-word Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

print("Tokens:", tokens[:10])
print("Filtered Tokens:", filtered_tokens[:10])
print("Lemmatized Tokens:", lemmatized_tokens[:10])


Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of']
Filtered Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', 'focuses']
Lemmatized Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', 'focus']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer

# Frequency Analysis
word_freq = Counter(lemmatized_tokens)
print("Word Frequency:", word_freq.most_common(10))

# TF-IDF
corpus = data['text_column'].tolist()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
print("TF-IDF Scores:", tfidf_matrix.toarray()[:10])

# Sentiment Analysis
sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(text)
print("Sentiment Scores:", sentiment_scores)


Word Frequency: [('NLP', 5), ('.', 5), (',', 5), ('human', 3), ('language', 2), ('computer', 2), ('report', 2), ('Natural', 1), ('processing', 1), ('(', 1)]
TF-IDF Scores: [[0.         0.         0.         0.12599134 0.         0.26440698
  0.         0.21332189 0.         0.         0.         0.
  0.21332189 0.         0.         0.         0.         0.
  0.         0.         0.         0.26440698 0.26440698 0.
  0.         0.         0.         0.         0.26440698 0.
  0.         0.26440698 0.26440698 0.         0.17707644 0.26440698
  0.         0.         0.         0.26440698 0.12599134 0.21332189
  0.26440698 0.         0.         0.26440698 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.26440698 0.17707644 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.23580313 0.         0.
  0.         0.1996247  0.24742967 0.         0.         0.24742967
  0.1996247  0.   

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

In [None]:
from transformers import pipeline

# Load summarization pipeline
summarizer = pipeline("summarization")

# Summarization
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)[0]['summary_text']
print("Summary:", summary)

# Template-based Report Generation
report_template = f"""
Automatic Report
Summary:
{summary}
Word Frequency:
{word_freq.most_common(10)}
TF-IDF Scores:
{tfidf_matrix.toarray()[:10]}
Sentiment Scores:
{sentiment_scores}
"""

print("Generated Report:")
print(report_template)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Summary:  Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans . NLP enables computers to understand and process human languages . Automatic report generation using NLP can significantly reduce the time and effort
Generated Report:

Automatic Report
Summary:
 Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans . NLP enables computers to understand and process human languages . Automatic report generation using NLP can significantly reduce the time and effort
Word Frequency:
[('NLP', 5), ('.', 5), (',', 5), ('human', 3), ('language', 2), ('computer', 2), ('report', 2), ('Natural', 1), ('processing', 1), ('(', 1)]
TF-IDF Scores:
[[0.         0.         0.         0.12599134 0.         0.26440698
  0.         0.21332189 0.         0.         0.         0.
  0.21332189 0.         0.         0.         0.         0.
  0.         0. 