In [48]:
def extract_html(
    html, article_selector,img_p_selector, img_selector="img", p_selector=None,
    t_selector=None):
    """
    Extract the image and text content from and HTML:
    Inputs:
        - html(str): Full html of an artcile url
        - article_selector(str): css selector for article container
        - img_p_selector(str): css selector for the parent elements of images in article
        - img_selector(str): css selector for images living inside the article
        container
        - p_selector(str): css selector for paragraphs living inside the article container
        - t_selector(str): css selector for title living inside the article container
    Return:
        -imgs(lst): list where each element is an image represented as a dictionary
        with src, alt, title, and caption as fields
        - art_text(str): Article text
        - t_text(str): Title
    """
    article_body = html.cssselect(article_selector)[0]
    imgs = extract_imgs(article_body,img_p_selector,img_selector)
    art_text = extract_text(article_body, p_selector)
    if t_selector:
        t_text = html.cssselect(t_selector)[0].text
    return imgs, art_text, t_text


def extract_imgs(html, img_p_selector,img_selector="img"):
    """
    Extract the image content from an HTML:
    Inputs:
        - html(str): html to extract images from
        - img_p_selector(str): css selector for the parent elements of images in articles
    Return:
        - img_selector(str): css selector for the image elements
        Return:
        -imgs(lst): list where each element is an image represented as a dictionary
        with src, alt, title, and caption as fields
    """
    imgs = []
    for selector in img_p_selector:
        img_container = html.cssselect(selector)
        for container in img_container:
            images = container.cssselect(img_selector)
            for img in images:
                img_item = {}
                img_item["src"] = img.get("src")
                img_item["alt"] = img.get("alt")
                img_item["title"] = img.get("title")
                img_item["caption"] = img.get("caption")
                imgs.append(img_item)
    return imgs


def extract_text(html, p_selector=None):
    """
    Extract the article text content from an HTML:
    Inputs:
        - p_selector(str): css selector for paragraphs living inside the article container
    Return:
        - text(str): Article text
    """
    text = None
    if p_selector:
        paragraphs = html.cssselect(p_selector)
        if paragraphs:
            text = ""
            for p in paragraphs:
                text += p.text_content()

    return text

In [49]:
import requests
import lxml.html

#The Hill
#Test The Hill

url_23 = "https://thehill.com/policy/equilibrium-sustainability/4083552-referrals-could-boost-participation-in-low-income-rooftop-solar-programs-study/"
url_21 = "https://thehill.com/policy/technology/overnights/569074-hillicon-valley-millions-exposed-due-to-microsoft/" 
url_20 = "https://thehill.com/policy/finance/520444-advocates-plead-for-housing-aid-as-eviction-cliff-looms/"
resp = requests.get(url_20)
root = lxml.html.fromstring(resp.text)  
the_hill_23 = extract_html(root,article_selector="div.col-main",t_selector="h1.page-title",p_selector="p",img_p_selector=["figure.article__featured-image"])
the_hill_23


([{'src': 'https://i0.wp.com/thehill.com/wp-content/uploads/sites/2/2020/10/eviction-order_az_100720getty_housing.jpg?w=2000&ssl=1',
   'alt': '',
   'title': None,
   'caption': None}],
 'A potentially dire housing crisis could erupt if the Trump administration and Congress fail to reach a deal on further coronavirus relief that includes eviction protections and substantial rent assistance, experts warn.The Centers for Disease Control and Prevention (CDC) issued a sweeping eviction ban last month in an unprecedented flex of its emergency authorities, but the moratorium stands on shaky legal ground — and only runs through the end of the year.Uneven interpretations of the CDC’s ban among judges across the U.S. have hobbled its effectiveness, forcing thousands of families out of their rental homes already. Millions more could face the same fate when the ban expires on Jan. 1.The Trump administration and House Democrats have been locked in volatile negotiations over the past few weeks wit

In [50]:
#NPR
test_npr_w_image = "https://www.npr.org/sections/money/2023/07/24/1189443223/affirmative-action-for-rich-kids-its-more-than-just-legacy-admissions"
resp = requests.get(test_npr_w_image)
root = lxml.html.fromstring(resp.text)

npr_w_img = extract_html(root,article_selector="article.story",p_selector="p",img_p_selector=["div.image"],img_selector="img",t_selector="h1")
npr_w_img


([{'src': 'https://media.npr.org/assets/img/2023/07/21/gettyimages-1268846574_custom-a3f93f4771d72f71cb1e80fef79e45833e29ad87-s1100-c50.jpg',
   'alt': '',
   'title': None,
   'caption': None},
  {'src': None, 'alt': '', 'title': None, 'caption': None}],
 '\n      \n      Greg Rosalsky\n    \n  \n                CAMBRIDGE, MASSACHUSETTS - JUNE 29: People walk through the gate on Harvard Yard at the Harvard University campus on June 29, 2023 in Cambridge, Massachusetts.\n                \n                    \n                    Scott Eisen/Getty Images\n                    \n                \n                hide caption\n            CAMBRIDGE, MASSACHUSETTS - JUNE 29: People walk through the gate on Harvard Yard at the Harvard University campus on June 29, 2023 in Cambridge, Massachusetts.A few weeks ago, the U.S. Supreme Court ended affirmative action in college admissions. The ruling held that the race-conscious admission programs of Harvard University and the University of North 

In [51]:
#Fox
test_fox = "https://www.foxnews.com/politics/hunter-biden-gushed-extravagant-gifts-burisma-exec-focus-corruption-probe"
resp = requests.get(test_fox)
root = lxml.html.fromstring(resp.text)

test_fox = extract_html(root,article_selector="div.article-content",p_selector="p",img_p_selector=["div.m"],img_selector="img",t_selector="h1.headline")
test_fox

([{'src': 'https://a57.foxnews.com/cf-images.us-east-1.prod.boltdns.net/v1/static/694940094001/7d957f1b-57f7-48a9-b9b9-e344f2a609ce/a9df2b8b-2aaa-46dd-8042-065b8b69ae92/1280x720/match/896/500/image.jpg?ve=1&tl=1',
   'alt': 'Mainstream media completely ignores latest development in Hunter Biden probe',
   'title': None,
   'caption': None},
  {'src': 'https://static.foxnews.com/foxnews.com/content/uploads/2023/06/biden-pozharskyi.jpg',
   'alt': 'Hunter Biden and Vadym Pozharskyi emails',
   'title': None,
   'caption': None},
  {'src': 'https://static.foxnews.com/foxnews.com/content/uploads/2023/06/Hunter-Biden-and-Mykola-Zlochevski-.jpg',
   'alt': 'Biden and Zlochevski',
   'title': None,
   'caption': None},
  {'src': 'https://static.foxnews.com/foxnews.com/content/uploads/2023/06/biden-archer-12.jpg',
   'alt': 'Correct spelling of Mykola Zlochevskyi',
   'title': None,
   'caption': None},
  {'src': 'https://static.foxnews.com/foxnews.com/content/uploads/2023/06/GettyImages-12550

In [52]:
#Politico

test_politico = "https://www.politico.com/news/2023/07/02/janet-yellen-china-visit-00104539"
resp = requests.get(test_politico)
root = lxml.html.fromstring(resp.text)

test_politico = extract_html(root,article_selector="div.container__column",p_selector="p.story-text__paragraph",img_p_selector="div.fig-graphic",img_selector="img",t_selector="h2.headline")
test_politico

SelectorSyntaxError: Expected ident, got <EOF at 1> (<string>)

In [55]:
#NBC

test_nbc = "https://www.nbcnews.com/politics/congress/jan-6-committee-unveils-final-report-capping-18-month-probe-rcna62629"
resp = requests.get(test_nbc)
root = lxml.html.fromstring(resp.text)

test_nbc = extract_html(root,article_selector="article",p_selector="p.story-text__paragraph",
img_p_selector=["div.fig-graphic"],img_selector="img",t_selector="h1")
test_nbc

SelectorSyntaxError: Expected ident, got <EOF at 1> (<string>)

In [63]:
#BBC

test_bbc = "https://www.bbc.com/news/world-us-canada-65033875"
resp = requests.get(test_bbc)
root = lxml.html.fromstring(resp.text)

test_bbc = extract_html(root,article_selector="article",p_selector="p",
img_p_selector=["body.video"],img_selector="img",t_selector="h1")
test_bbc

([],
 'Donald Trump\'s prediction that he will be arrested this week has yet to come true - but as the waiting game for criminal charges goes on, the former president is plotting a strategy designed to both keep him out of jail and turbo-boost his historic bid to return to the White House.It is a daunting task, and hunkered down in his Mar-a-Lago home, Mr Trump seems resigned to becoming the first US president to be indicted for a crime. Yet it\'s also clear he will not go quietly. His past choices in moments of political crisis are a useful guide to what we might see next.When cornered by political adversaries, Mr Trump punches back. This video can not be playedWas Trump\'s Stormy Daniels payment legal?Throughout his 2016 presidential campaign, he leaned in to conflict when other politicians might have backed off. Blasted for criticising war hero John McCain? Trump upped his attacks. Accused of sexual harassment before a debate with Hillary Clinton? He held a press conference with Bil