Obtain Newsmax URLs

In [1]:
# Import necessary libraries
import time
import requests
import urllib
import lxml.html
from bs4 import BeautifulSoup
import newspaper
import trafilatura
from datetime import datetime

In [2]:
# Newsmax
def obtain_page_urls(year="2016", month="1"):
    """
    Obtain the urls for a given year and month from the politics section of Newsmax
    Inputs:
    -year(str): Year of the articles to search for
    -month(str): Month of the articles to search for
    Return:
    links_list(list): List of urls
    """
    url = "https://www.newsmax.com/archives/politics/1/{}/{}/".format(year, month)
    resp = requests.get(url)
    root = lxml.html.fromstring(resp.text)
    links_elements = root.cssselect("h5.archiveH5")
    links_list = []
    for element in links_elements:
        link = element.cssselect("a")
        href = link[0].get("href")
        full_link = "newsmax.com" + href
        links_list.append(full_link)
    return links_list

In [5]:
newsmax_links

{'2016-1': {'newsmax.com/politics/abrams-supreme-court-cruz-eligibility/2016/01/17/id/709816/',
  'newsmax.com/politics/adam-kinzinger-barack-obama-work-congress/2016/01/04/id/708029/',
  'newsmax.com/politics/amlegal-bbchto-bgovbillgo-bgovcodes/2016/01/20/id/710221/',
  'newsmax.com/politics/anthony-weiner-huma-abedin-hillary-clinton-campaign/2016/01/19/id/710023/',
  'newsmax.com/politics/barbara-bush-aid-jeb-bush-tweets/2016/01/25/id/710785/',
  'newsmax.com/politics/barbara-bush-praise-jeb-bush-new/2016/01/22/id/710444/',
  'newsmax.com/politics/ben-carson-campaign-disarray/2016/01/13/id/709303/',
  'newsmax.com/politics/ben-carson-car-crash/2016/01/19/id/710049/',
  'newsmax.com/politics/ben-carson-carly-fiorina-gop-power-rankings/2016/01/05/id/708209/',
  'newsmax.com/politics/ben-carson-confirms-house-speaker/2016/01/08/id/708768/',
  'newsmax.com/politics/ben-carson-fifth-graders-point/2016/01/08/id/708800/',
  'newsmax.com/politics/ben-carson-recruited-house-speaker/2016/01/07

In [4]:
def obtain_newsmax_urls(min_year=2016):
    """
    Obtain all newsmax urls from the politics section
    Inputs: None
    Return:
    newsmax_links (dict): Dictionary where the keys are str for date (year-mth)
    and values are lists with the urls of that given key
    """
    years = [*range(min_year, 2024, 1)]
    months = [*range(1, 13, 1)]

    newsmax_links = {}

    # Obtain news for
    for year in years:
        for month in months:
            date = str(year) + "-" + str(month)
            print("Obtaining news from:", date)
            newsmax_links[date] = obtain_page_urls(str(year), str(month))
            time.sleep(1)

    return newsmax_links


newsmax_links = obtain_newsmax_urls()

Obtaining news from: 2016-1
Obtaining news from: 2016-2
Obtaining news from: 2016-3
Obtaining news from: 2016-4
Obtaining news from: 2016-5
Obtaining news from: 2016-6
Obtaining news from: 2016-7
Obtaining news from: 2016-8
Obtaining news from: 2016-9
Obtaining news from: 2016-10
Obtaining news from: 2016-11
Obtaining news from: 2016-12
Obtaining news from: 2017-1
Obtaining news from: 2017-2
Obtaining news from: 2017-3
Obtaining news from: 2017-4
Obtaining news from: 2017-5
Obtaining news from: 2017-6
Obtaining news from: 2017-7
Obtaining news from: 2017-8
Obtaining news from: 2017-9
Obtaining news from: 2017-10
Obtaining news from: 2017-11
Obtaining news from: 2017-12
Obtaining news from: 2018-1
Obtaining news from: 2018-2
Obtaining news from: 2018-3
Obtaining news from: 2018-4
Obtaining news from: 2018-5
Obtaining news from: 2018-6
Obtaining news from: 2018-7
Obtaining news from: 2018-8
Obtaining news from: 2018-9
Obtaining news from: 2018-10
Obtaining news from: 2018-11
Obtaining ne

In [33]:
today = datetime.now()
month3 = today.strftime("%m")
print("Current Month with Decimal Number :", month3)

Current Month with Decimal Number : 06


In [35]:
import trafilatura

test_list = obtain_page_urls()

test_article = test_list[0]
test_article

# downloaded = trafilatura.fetch_url(test_article)
# content = trafilatura.extract(downloaded, include_images=True)
# content

'newsmax.com/politics/rent-too-high-mcmillan/2016/01/31/id/712077/'

In [32]:
for article in test_list:
    print(article)
    downloaded = trafilatura.fetch_url(article)
    content_2 = trafilatura.bare_extraction(
        downloaded, include_images=True, include_comments=False
    )
    print(content_2)

newsmax.com/politics/rent-too-high-mcmillan/2016/01/31/id/712077/
{'title': 'Rent Is Too Damn High Party Founder Backs Trump', 'author': 'Greg Richter', 'url': 'https://www.newsmax.com/politics/rent-too-high-mcmillan/2016/01/31/id/712077/', 'hostname': 'newsmax.com', 'description': 'Jimmy McMillan, the white-haired, white-bearded founder of the Rent is Too Damn High Party has endorsed Donald Trump for president, the New York Daily News reports.', 'sitename': 'Newsmax', 'date': '2016-01-31', 'categories': ['politics'], 'tags': ['rent, too, high, mcmillan, endorse, trump'], 'fingerprint': None, 'id': None, 'license': None, 'body': None, 'comments': None, 'commentsbody': None, 'raw_text': None, 'text': 'Jimmy McMillan, the white-haired, white-bearded founder of the Rent is Too Damn High Party has endorsed Donald Trump for president, the New York Daily News\nreports.\nThe Daily News found it ironic that McMillan would support a billionaire real estate developer, given the name of his party

KeyboardInterrupt: 

In [34]:
downloaded = trafilatura.fetch_url(test_article)
content_2 = trafilatura.bare_extraction(
    downloaded, include_images=True, include_comments=False
)
content_2["image"]

'https://www.newsmax.com/CMSPages/GetFile.aspx?guid=28826aff-2ff4-4054-9024-7ee3e719ebb5&SiteName=Newsmax'

In [63]:
def extract_html(
    url, article_selector, img_selector="img", p_selector=None, t_selector=None
):
    """
    Extract the image and text content from and HTML:
    Inputs:
    - url(str): Url to parse
    - article_selector(str): css selector for article container
    - img_selector(str): css selector for images living inside the article
    container
    - p_selector(str): css selector for paragraphs living inside the article container
    - t_selector(str): css selector for title living inside the article container
    Return:
    -imgs(lst): list where each element is an image represented as a tuple
    with src, alt, title, and caption as elements
    - text(tuple): Tuple where the first element is the article text and the
    second element the title
    """
    resp = requests.get(url)
    root = lxml.html.fromstring(resp.text)
    article_body = root.cssselect(article_selector)[0]
    imgs = extract_imgs(article_body, img_selector)
    text = extract_text(article_body, p_selector, t_selector)
    return imgs, text


def extract_imgs(html, selector="img"):
    """
    Extract the image content from an HTML:
    Inputs:
    - html(str): html to extract images from
    - img_selector(str): css selector for images living inside the article
    container
    Return:
    -imgs(lst): list where each element is an image represented as a tuple
    with src, alt, title, and caption as elements
    """
    imgs = []
    images = html.cssselect(selector)
    for img in images:
        src = img.get("src")
        alt = img.get("alt")
        title = img.get("title")
        caption = img.get("caption")
        image = (src, alt, title, caption)
        imgs.append(image)
    return imgs


def extract_text(html, p_selector=None, t_selector=None):
    """
    Extract the text content from an HTML:
    Inputs:

    - p_selector(str): css selector for paragraphs living inside the article container
    - t_selector(str): css selector for title living inside the article container
    Return:
    - text(tuple): Tuple where the first element is the article text and the
    second element the title
    """
    p_text = None
    t_text = None
    if p_selector:
        paragraphs = html.cssselect(p_selector)
        if paragraphs:
            p_text = ""
            for p in paragraphs:
                p_text += p.text_content()

    if t_selector:
        t_text = html.cssselect(t_selector)[0].text

    text = (p_text, t_text)

    return text

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 68)

In [61]:
test_newsmax = extract_html(
    "https://www.newsmax.com/politics/rent-too-high-mcmillan/2016/01/31/id/712077/",
    "div[itemtype='https://schema.org/NewsArticle']",
    "img",
    "div#mainArticleDiv",
    "h1.article",
)

print("Test Newsmax:", test_newsmax)


npr_url = "https://www.npr.org/sections/money/2023/07/24/1189443223/affirmative-action-for-rich-kids-its-more-than-just-legacy-admissions"
test_npr = extract_html(npr_url, "article.story", "img", "p", "h1")
print("Test NPR:", test_npr)

Test Newsmax: ([('https://www.newsmax.com/CMSPages/GetFile.aspx?guid=28826aff-2ff4-4054-9024-7ee3e719ebb5&SiteName=Newsmax&maxsidesize=600', 'Rent Is Too Damn High Party Founder Backs Trump', None, None), ('/App_Themes/NewsmaxNew/images/tooltipCloseButton.png', '', None, None), ('https://www.newsmax.com/CMSPages/GetFile.aspx?guid=28826aff-2ff4-4054-9024-7ee3e719ebb5&maxsidesize=120&SiteName=Newsmax', '', None, None)], ('\r\n                    Jimmy McMillan, the white-haired, white-bearded founder of the Rent is Too Damn High Party has endorsed Donald Trump for president, the New York Daily News reports. \r\n\r\nThe Daily News found it ironic that McMillan would support a billionaire real estate developer, given the name of his party, noting that rent for a one-bedroom apartment at the\xa0 glitzy Trump Tower runs about $6,000 a month.\r\n\r\nBut the perennial candidate for mayor of New York City, governor of New York state, and president also is a Vietnam War veteran and said Trump\'s