In [None]:
import nltk
import gensim
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.summarization import summarize
# Fetch every NLTK resource the pipeline below relies on: the stop-word corpus
# and the Punkt models that nltk.word_tokenize needs to split text.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases look the tokenizer models up under the name 'punkt_tab';
# on older releases this call is expected to just report an unknown package.
nltk.download('punkt_tab')

# Read the article corpus and pull the 'text' column out as a plain Python list
df = pd.read_csv(filepath_or_buffer='news_articles.csv')
texts = df.loc[:, 'text'].tolist()

# Preprocess text (lowercase, remove stopwords)
_STOP_WORDS_CACHE = {}

def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']

def preprocess_text(text):
    """Lowercase ``text`` and drop stop words and non-alphanumeric tokens.

    Args:
        text: Raw article text. Anything that is not a ``str`` (for example
            ``None`` or a float NaN standing in for a missing value) is
            treated as an empty document instead of raising.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        is left.
    """
    if not isinstance(text, str):
        return ''
    # Looked up once and reused; rebuilding the set per call re-reads the corpus.
    stop_words = _english_stop_words()
    tokens = [
        word
        for word in nltk.word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]
    return ' '.join(tokens)

# Normalize every article with preprocess_text, keeping the original order
preprocessed_texts = list(map(preprocess_text, texts))

# Summarize text using TextRank algorithm
def summarize_text(text, ratio=0.2):
    """Return an extractive summary of ``text`` using gensim's ``summarize``.

    NOTE(review): ``gensim.summarization`` is only shipped with gensim
    releases before 4.0 -- confirm the pinned gensim version.

    Args:
        text: Article text with its sentence punctuation intact; the
            summarizer needs it to split the text into sentences.
        ratio: Fraction of the sentences to keep. Defaults to 0.2 (top 20%).

    Returns:
        The summary string. ``''`` for non-string or blank input. When gensim
        rejects the input with ``ValueError`` (it does so for single-sentence
        text), the stripped input itself is returned so one short article
        does not abort a whole batch.
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    try:
        return summarize(text, ratio=ratio)
    except ValueError:
        # Nothing to rank: the text is already as short as a summary could be.
        return text.strip()

# Summarize the raw articles, not the preprocessed ones: preprocess_text drops
# every non-alphanumeric token, which removes the punctuation the summarizer
# needs to split an article into sentences.
summaries = [summarize_text(text) for text in texts]

# Apply K-Means Clustering to group the summaries
# Vectorize a normalized (lowercased, stop-word-free) copy so the stored
# summaries stay readable while stop words still do not influence clustering.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([preprocess_text(summary) for summary in summaries])
# KMeans raises when asked for more clusters than there are samples.
n_clusters = min(5, X.shape[0])
# n_init is pinned explicitly because its default differs between
# scikit-learn releases (10 in older ones, 'auto' in newer ones).
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(X)
clusters = kmeans.labels_

# Print each article's summary next to the cluster it was assigned to,
# numbering the articles from 1
for article_number, (summary, cluster_id) in enumerate(zip(summaries, clusters), start=1):
    print(f"Article {article_number} Summary:\n{summary}\nCluster: {cluster_id}\n")
