In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt')

# Load the dataset of news articles into memory
news_df = pd.read_csv('news.csv')

# Define a function to preprocess the text
def preprocess_text(text):
    # Convert to string
    text = str(text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stopwords_list = stopwords.words('english')
    text_tokens = nltk.word_tokenize(text)
    text = ' '.join([word for word in text_tokens if not word in stopwords_list])
    return text

news_df['clean_text'] = news_df['content'].apply(preprocess_text)

# Tokenize the sentences in the news articles using NLTK
sentences = []
for article in news_df['clean_text']:
    article_sentences = nltk.sent_tokenize(article)
    sentences.extend(article_sentences)

# Eliminate stopwords from the news articles using NLTK
stopwords_list = stopwords.words('english')
filtered_sentences = []
for sentence in sentences:
    sentence_tokens = nltk.word_tokenize(sentence)
    filtered_sentence = ' '.join([word for word in sentence_tokens if not word in stopwords_list])
    filtered_sentences.append(filtered_sentence)

# Remove any sentences that contain no meaningful information such as sentences containing only dates or numbers
clean_sentences = []
for sentence in filtered_sentences:
    if re.search('[a-zA-Z]', sentence):
        clean_sentences.append(sentence)

# Calculate the TF-IDF scores of each sentence using TfidfVectorizer from sklearn
vectorizer = TfidfVectorizer(use_idf=True)
tf_idf_scores = vectorizer.fit_transform(clean_sentences)

# Rank the sentences based on their scores
sentence_scores = []
for i, sentence in enumerate(clean_sentences):
    score = tf_idf_scores[i].sum()
    sentence_scores.append((sentence, score))

sentence_scores.sort(key=lambda x: x[1], reverse=True)

# Select the top N sentences with the highest scores to create a summary of the news article
N = 5
top_sentences = [sentence[0] for sentence in sentence_scores[:N]]

# Print the top N sentences
print('\n'.join(top_sentences))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vinitkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


1 btsnbsp bts south korean boy band formed 2010 debuting 2013 bighit music septetmdashconsisting members jin suga jhope rm jimin v jungkookmdashcowrites coproduces much material originally hip hop group musical style evolved incorporate wide range genres lyrics often discussed mental health troubles schoolage youth coming age loss journey towards selflove individualism work also frequently references literature philosophy psychological concepts includes alternate universe storyline 2022 bts bestselling artist south korean history sold excess 30 million albums via circle chart studio album map soul 7 2020 bestselling album time south korea since inception bts emphasized hip hop musical base largely due influence rm suga39s background underground rappers visits us group received mentoring american rappers bts39 rising popularity us represents continuation ways kpop functions part global rampb tradition 2 exonbsp exo south koreanchinese boy band based seoul formed sm entertainment 2011 de