In [21]:
import gensim
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from gensim.models import CoherenceModel
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from bs4 import BeautifulSoup
import requests
import re

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [22]:
def _meta_content(soup, **attrs):
    """Return the stripped ``content`` of the first ``<meta>`` tag matching ``attrs``, or ""."""
    element = soup.find('meta', attrs=attrs)
    if element is not None and 'content' in element.attrs:
        return element['content'].strip()
    return ""


def save_article_text(url, timeout=10):
    """Download a web page and return its title, subtitle and paragraph text.

    Despite the name, nothing is written to disk: the extracted text is only
    returned to the caller.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one the request can block forever on an unresponsive server.

    Returns:
        A single string of the form ``"{title}\\n\\n{subtitle}\\n\\n{main_text}"``
        where ``main_text`` is the non-empty ``<p>`` texts joined by blank lines.
        Any part that cannot be found is an empty string.

    Raises:
        Whatever ``requests.get`` or ``response.raise_for_status()`` raise
        (connection problems, timeouts, HTTP error statuses).
    """
    # Set headers to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Send a GET request to the URL with headers; fail fast on HTTP errors
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title, subtitle and description
    title_element = soup.find('title')
    title = title_element.text.strip() if title_element is not None else ""

    subtitle = _meta_content(soup, name='description')

    # Open Graph tags are written as <meta property="og:...">; the name= form
    # is still checked afterwards for pages that use it.
    description = (
        _meta_content(soup, property='og:description')
        or _meta_content(soup, name='og:description')
    )

    # Remove elements that carry one of these classes AND whose markup contains
    # one of the phrases below (both conditions must hold).
    unwanted_elements = soup.find_all(['script', 'style', 'a', 'div', 'span'], class_=['follow-us', 'newsletter', 'advertisement'])
    patterns_to_exclude = ['next article', 'read next', 'correlated']
    for element in unwanted_elements:
        markup = str(element).lower()
        if any(pattern in markup for pattern in patterns_to_exclude):
            element.extract()

    # Remove footer containers and any "All rights reserved" text nodes
    for element in soup.find_all(['footer', 'div'], class_=['footer', 'bottom-footer']):
        element.extract()
    # `string=` is the current name of the argument formerly called `text=`
    for element in soup.find_all(string=re.compile(r'\bAll rights reserved\b', re.IGNORECASE)):
        element.extract()

    # Collect the text of every remaining non-empty paragraph
    paragraphs = (element.text.strip() for element in soup.find_all('p'))
    main_text = "\n\n".join(paragraph for paragraph in paragraphs if paragraph)

    # Fall back to the Open Graph description when there is no subtitle
    if not subtitle:
        subtitle = description

    # Concatenate the extracted strings
    return f"{title}\n\n{subtitle}\n\n{main_text}"

In [23]:
def preprocess_text(text):
    """Convert raw text into a list of adjacent-word pairs of lemmatized tokens.

    The text is split with ``nltk.word_tokenize``, every token is passed through
    ``WordNetLemmatizer.lemmatize`` (no part-of-speech argument is supplied),
    and each lemma is paired with the one that follows it.

    Args:
        text: The raw text to process.

    Returns:
        A list of 2-tuples ``(lemma_i, lemma_i+1)`` in document order.
    """
    # Split the text into word tokens
    words = nltk.word_tokenize(text)

    # Reduce each token to its lemma
    wnl = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemmas.append(wnl.lemmatize(word))

    # Pair every lemma with its successor
    return list(nltk.bigrams(lemmas))

def apply_lda_to_articles(article_text, min_topics, max_topics):
    """Pick the LDA topic count with the highest ``c_v`` coherence for one text.

    The text is run through ``preprocess_text``; one LDA model is trained for
    every topic count from ``min_topics`` through ``max_topics`` (inclusive) and
    scored with a ``CoherenceModel``. The first count whose score is strictly
    greater than all earlier ones is kept.

    NOTE(review): ``preprocess_text`` returns bigram tuples and each tuple is
    handed to ``Dictionary`` / ``doc2bow`` as its own document, so the corpus
    holds one two-token document per bigram -- confirm this is intended.

    Args:
        article_text: Raw article text.
        min_topics: Smallest number of topics to try.
        max_topics: Largest number of topics to try.

    Returns:
        The selected number of topics, or ``None`` (after printing an error)
        when the corpus is empty or no candidate was selected.
    """
    documents = preprocess_text(article_text)

    # Build the token <-> id mapping and the bag-of-words corpus
    id2word = corpora.Dictionary(documents)
    corpus = [id2word.doc2bow(document) for document in documents]

    if len(corpus) == 0:
        print("Error: No data in the corpus. Unable to compute LDA model.")
        return None

    def coherence_for(topic_count):
        # Train one model with the given topic count and score it
        model = gensim.models.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=topic_count,
            random_state=100,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        scorer = CoherenceModel(model=model, texts=documents, dictionary=id2word, coherence='c_v')
        return scorer.get_coherence()

    winner = None
    top_score = float('-inf')
    for candidate in range(min_topics, max_topics + 1):
        score = coherence_for(candidate)
        # Strict comparison: ties keep the earlier (smaller) topic count
        if score > top_score:
            winner, top_score = candidate, score

    if winner is None:
        print("Error: Unable to determine the optimal number of topics.")
    return winner



In [24]:
# Article to analyse
url = "https://www.foxnews.com/politics/biden-vetoes-bill-cancelling-student-loan-handout"

# Fetch the page text, then try 1 through 10 topics and keep the best-scoring count
article = save_article_text(url)
num_topics = apply_lda_to_articles(article, min_topics=1, max_topics=10)

# Passing num_topics as a separate argument adds print's own separator space
# after the label's trailing space.
print('Number of topics based on article coherence: ', num_topics)


Number of topics based on article coherence:  3
