In [6]:
from newsfaces.extract_html import Extractor
from newsfaces.utils import make_link_absolute, page_grab, make_request
from newsfaces.crawlers.crawler import Crawler
from newsfaces.models import Image, Article, ImageType


class Politico(Crawler):
    def __init__(self):
        super().__init__()

    def crawl(self):
        """
        Implement crawl here to override behavior
        """
        return self.politico_get_urls()

    def get_urls(self, url):
        """
        This function takes a URLs and returns lists of URLs
        for containing each article on that page.

        Parameters:
            * url:  a URL to a page of articles

        Returns:
            A list of article URLs on that page.
        """
        response = self.make_request(url)
        urls = []
        container = response.cssselect("div.summary")

        for j in container:
            atr = j.cssselect("a")
            if atr and len(atr) > 0:
                href = atr[0].get("href")
                urls.append(make_link_absolute(href, "https://www.politico.com"))
        return urls

    def politico_get_urls(self):
        urls=set()
        for page in range(1, 3400):
            urls = urls.union(self.get_urls(f"https://www.politico.com/politics/{page}"))
        return urls

class Politico_Extractor(Extractor):
    def __init__(self):
        super().__init__()
        self.article_body = ["div.story-text"]
        self.img_p_selector = [
            "section.media-item.media-item--story.media-item--story-lead"
        ]
        self.img_selector = ["img"]
        self.head_img_div = [
            "section.media-item.media-item--story.media-item--story-lead"
        ]
        self.video = ["div.media-item__video"]
        self.head_img_select = ["img"]
        self.p_selector = ["p"]
        self.t_selector = ["h2.headline"]

    def scrape(self, url):
        """
        Extract html and from
        """
        html = page_grab(url)
        imgs, art_text, t_text = self.extract_html(html)
        imgs += self.extract_video_imgs(html)
        article = Article(title=t_text or "", article_text=art_text or "", images=imgs)
        return article

    def extract_video_imgs(self, html):
        videos = []
        imgs = []

        for i in self.video:
            videos += html.cssselect(i)
        item = []
        cap_elements = []
        for v in videos:
            item += v.cssselect("video")
        cap_elements += html.xpath('//div[contains(@class, "vjs-dock-title")]')
        print(cap_elements, "vidoes")
        # Extract captions from cap_elements
        captions = [element.text_content() for element in cap_elements]
        print(cap_elements, "captions")

        for i, video in enumerate(item):
            
            img_item = Image(
                url=video.get("poster") or "",
                image_type=ImageType("video_thumbnail"),
                caption=captions[i] if i < len(captions) else "",
                alt_text="",
            )
            imgs.append(img_item)
        return imgs
      
a=Politico_Extractor()
a.scrape('https://www.politico.com/news/2023/08/07/bidenomics-white-house-economy-00109977')
#b=Politico()
#b.crawl()

Fetching https://www.politico.com/news/2023/08/07/bidenomics-white-house-economy-00109977
[] vidoes
[] captions


Article(title='\nThe White House plays it cool as ‘Bidenomics’ struggles to catch on\n', article_text='\nDemocrats acknowledge that slapping Joe Biden’s name on the economy is a gamble, given the prospect of it moving in the wrong direction. | Susan Walsh/AP Photo\nBy Jennifer Haberkorn\n08/07/2023 05:01 AM EDT\nLink CopiedPresident Joe Biden is risking a lot on “Bidenomics.” But, about two months in, his efforts to sell his sweeping economic agenda don’t appear to be working.Poll numbers show persistent voter skepticism about the state of the economy, and Republicans are working aggressively to take back the term, dubbing it as synonymous with tax hikes and inflation.Inside the White House, aides remain confident the bet will pay off, adopting the mantra of the hockey legend Wayne Gretzky: Skate to where the puck is going, not where it is now.', images=[Image(url='https://www.politico.com/dims4/default/e8e9ac9/2147483647/strip/true/crop/5837x3891+0+0/resize/630x420!/quality/90/?url=ht

In [37]:
from newsfaces.utils import make_link_absolute
from newsfaces.crawlers.crawler import WaybackCrawler
# Util Functions
from newsfaces.extract_html import Extractor
from newsfaces.utils import make_link_absolute, page_grab
from newsfaces.crawlers.crawler import Crawler
from newsfaces.models import Image, Article, ImageType
import datetime
import pytz
import lxml


class AP(WaybackCrawler):
    def __init__(self):
        super().__init__()
        self.urls = ["https://apnews.com/politics", "https://apnews.com/hub/politics"]
        self.start_url = ""
        self.selector = []

    def get_archive_urls(self, url, selectors):
        """
        This function takes a URLs and returns lists of URLs
        for containing each article on that page.

        Parameters:
            * url:  a URL to a page of articles
            * session: optional session object parameter
            * selectors: a list of css selectors

        Returns:
            A list of article URLs on that page.
        """
        response = self.make_request(url)
        urls = []
        selectors = [
            "div.FourColumnContainer-column",
            "div.TwoColumnContainer7030",
            "div.PageList-items",
            "article",
        ]
        for a in selectors:
            container = response.cssselect(a)
            if len(container) > 0:
                urls += self.parse_links(container)
        xpath_sel = ["TwoColumnContainer", "CardHeadline"]
        # for items that have random characters continually added at the end so we do non-exact matching
        for j in xpath_sel:
            container = response.xpath(f"//div[contains(@class, '{j}')]")
            if len(container) > 0:
                urls += self.parse_links(container)

        return urls

    def crawl(self, startdate, enddate, delta_hrs=6):
        if startdate < datetime.datetime(
            2023, 6, 26, 0, 0, tzinfo=pytz.timezone("utc")
        ):
            self.start_url = self.urls[1]
            changepage_date = datetime.datetime(
                2023, 6, 26, 0, 0, tzinfo=pytz.timezone("utc")
            )
            hub_site = super().crawl(startdate, changepage_date, delta_hrs)
        self.start_url = self.urls[0]
        current_site = super().crawl(startdate, enddate, delta_hrs)
        return hub_site.union(current_site)

    def parse_links(self, container):
        """
        Takes a list of container objects and returns the urls
        from within
        """
        urls = []
        for j in container[0]:
            atr = j.cssselect("a")
            for a in atr:
                href = a.get("href")
                if href is not None:
                    if href.startswith("/web/"):
                        href = make_link_absolute(href, "https://web.archive.org")
                    urls.append(href)
        return urls


class AP_Extractor(Extractor):
    def __init__(self):
        super().__init__()
        self.article_body = ["main.Page-main"]
        self.img_p_selector = ["figure.Figure"] 
        self.img_selector = ["img"]
        self.head_img_div = ["div.Page-lead"]
        self.head_img_select = ["img"]
        self.p_selector = ["p"]
        self.t_selector = ["h1.Page-headline"]

a= AP_Extractor()
a.scrape('https://apnews.com/article/-----804f40cacfa94097a0e5311d1604afec')

#html=page_grab('https://apnews.com/article/nepal-rice-day-festival-harvest-farmers-ae3cd725bd1784d29ea499c2ba383f21')
#print(lxml.etree.tostring(html))

# def extract_imgs(html, img_p_selector, img_selector):
#     """
#     Extract the image content from an HTML:
#     Inputs:
#         - html(str): html to extract images from
#         - img_p_selector(list): css selector for the parent elements of images
#         - img_selector(list): css selector for the image elements
#         Return:
#         -imgs(lst): each element is an image represented as an image object
#     """
#     imgs = []
#     for selector in img_p_selector:
#         img_container = html.cssselect(selector)
#         for container in img_container:
#             for j in img_selector:
#                 photos = container.cssselect(j)
#                 for i in photos:
#                     img_item = Image(
#                         url=i.get("src") or "",
#                         image_type=ImageType("main"),
#                         caption=i.get("caption") or "",
#                         alt_text=i.get("alt") or "",
#                     )
#                 imgs.append(img_item)
#     return imgs

 #a= page_grab('https://apnews.com/article/trump-georgia-election-investigation-grand-jury-willis-d39562cedfc60d64948708de1b011ed3')
# extract_imgs(a, ["figure.Figure"], ["img"])

Fetching https://apnews.com/article/-----804f40cacfa94097a0e5311d1604afec


TypeError: 'NoneType' object is not iterable

In [55]:
from newsfaces.crawlers.crawler import Crawler, WaybackCrawler
from newsfaces.utils import make_link_absolute


class BBC_Latest(Crawler):
    def __init__(self):
        super().__init__()
        self.start_url = "https://www.bbc.com/news/topics/cwnpxwzd269t?page=1"
    
    def crawl(self):
        '''
        run get_html with correct initial html from init
        '''
        return self.get_newslink(self.start_url)
    
    def get_newslink(self, url, articles=set(), videos=set()):
        """
        Takes an initial url and runs get_urls on all possible
        API queries. Gathering all possible articles and videos
        from the API into a set.
        """
        article, video = self.get_urls(url)
        articles = articles.union(article)
        videos = videos.union(video)
        begin = url.find("page=") + 5
        pagenumber = int(url[begin : len(url)])
        if pagenumber < 42:
            newlink = url[: -len(str(pagenumber))] + str(pagenumber + 1)
            article, video = self.crawl(newlink, articles, videos)
            articles = articles.union(article)
            videos = videos.union(video)
        return articles, videos

    def get_urls(self, url, articles=set(), videos=set()):
        """
        This function takes a URLs and returns lists of URLs
        for containing each article and video on that page.

        Parameters:
            * url:  a URL to a page of articles

        Returns:
            A list of URLs to each video and article on that page.
        """
        response = self.make_request(url)
        container = response.cssselect("div")
        filtered_container = [
            elem for elem in container if elem.get("type") is not None
        ]

        for j in filtered_container:
            # find video/article
            type = j.get("type")
            # find link
            if type == "article" or type == "video":
                a = j[0].cssselect("a")
                href = a[0].get("href")
                href = make_link_absolute(href, "https://www.bbc.com")
            if type == "article":
                articles.add(href)
            elif type == "video":
                videos.add(href)
        return articles, videos


class BBC(WaybackCrawler):
    def __init__(self):
        super().__init__()
        self.start_url = "https://www.bbc.com/news/topics/cwnpxwzd269t"
        self.selector = ["div.archive__item__content", "h2.node__title.node-title"]

    def get_archive_urls(self, url, selector):
        return self.get(url)


class BBC(WaybackCrawler):
    def __init__(self):
        super().__init__()
        self.start_url = "https://www.bbc.com/news/topics/cwnpxwzd269t"
        self.selector = ["div.archive__item__content", "h2.node__title.node-title"]

    def get_archive_urls(self, url, selector):
        return self.get(url)

    def get(self, url, articles=set(), videos=set()):
        """
        This function takes a URLs and returns lists of URLs
        for containing each article and video on that page.

        Parameters:
            * url:  a URL to a page of articles

        Returns:
            A list of URLs to each video and article on that page.
        """
        response = self.make_request(url)
        xpath_sel= ['article','video']
            # for items that have random characters continually added at the end so we do non-exact matching
        for j in xpath_sel:
            container = response.xpath(f"//div[contains(@type, '{j}')]")
            if container:
                for j in container:
                    a = j[0].cssselect("a")
                    href = a[0].get("href")
                    href = make_link_absolute(href, "https://web.archive.org")
                    if j == "article":
                        articles.add(href)
                    else:
                        videos.add(href)
        return articles.union(videos)
    
class BBC_Extractor(Extractor):
    def __init__(self):
        super().__init__()
        self.article_body = ["main"]
        self.img_p_selector = ["figure", "div#mediaContainer"] 
        self.img_selector = ["img"]
        self.head_img_div = []
        self.head_img_select = []
        self.p_selector = ["p"]
        self.t_selector = ["h1"]

    def extract_imgs(self, html, img_p_selector, img_selector):
        """
        Extract the image content from an HTML:
        Inputs:
            - html(str): html to extract images from
            - img_p_selector(list): css selector for the parent elements of images
            - img_selector(list): css selector for the image elements
            Return:
            -imgs(lst): each element is an image represented as an image object
        """
        imgs = []
        captions= html.cssselect("figcaption")

        for selector in img_p_selector:
            img_container = html.cssselect(selector)
            print(selector)
            print(img_container)
            if len(img_container) == 0:
                continue
            for container in img_container:
                for j in img_selector:
                    photos = container.cssselect(j)
                    for i, photo in enumerate(photos):
                        if selector == "#mediaContainer":
                            type="video_thumbnail"
                        else:
                            type="main"
                        img_item = Image(
                            url=photo.get("src") or "",
                            image_type=ImageType(type),
                            caption= captions[i].text or "",
                            alt_text=photo.get("alt") or "",
                        )
                        imgs.append(img_item)
        return imgs
    def extract_head_img(self, html, img_p_selector, img_selector):
        return []
    
a= BBC_Extractor()
#a.scrape("https://www.google.com/amp/s/www.bbc.com/news/world-us-canada-40127326.amp")
a.scrape("https://www.bbc.com/news/world-us-canada-65394456")

Fetching https://www.bbc.com/news/world-us-canada-65394456
figure
[<Element figure at 0x10e8810e0>, <Element figure at 0x108aedae0>]
[class^="mediaContainer"]
[]
2


Article(title='Biden aide gaffe leads to campaign clarification', article_text='White House Press Secretary Karine Jean-Pierre has clarified President Joe Biden\'s re-election plans after a comment she made sparked confusion. Hours after Mr Biden formally launched his campaign for a second four-year term in office she refused to say if he planned to complete the entire term. "That\'s something for him to decide," she said at a briefing. Mr Biden, 80, is the oldest president in US history. She later clarified that he would serve another full term in office if he wins. The spokeswoman\'s embarrassing about-face came only hours after Mr Biden officially announced he was running again in a pre-recorded video."Does the president plan to serve all eight years?" a reporter for Politico asked at a White House briefing on Tuesday. "That\'s something for him to decide," she began."I\'m not just not going to get ahead of it. And there\'s a 2024 campaign. Anything related to that, I would refer yo

In [98]:
from lxml import html
import requests
b= page_grab("https://www.bbc.com/news/world-us-canada-65394456")
c= b.cssselect("img")
for i in c:
    print(i.get("src"))
    i.tag
    parent=i.getparent()
    grandparent=parent.getparent()
    g=grandparent.getparent()



Fetching https://www.bbc.com/news/world-us-canada-65394456
https://ichef.bbci.co.uk/news/976/cpsprodpb/35F2/production/_129501831_karine.jpg
https://ichef.bbci.co.uk/news/976/cpsprodpb/097D/production/_129492420_gettyimages-1252051962-1.jpg
https://ichef.bbci.co.uk/news/976/cpsprodpb/3340/production/_124602131_gettyimages-1236348553.jpg
https://ichef.bbci.co.uk/news/976/cpsprodpb/102E7/production/_129497266_gettyimages-1228795132.jpg
https://ichef.bbci.co.uk/news/385/cpsprodpb/FE23/production/_130795056_afghan_promo.jpg
https://ichef.bbci.co.uk/news/385/cpsprodpb/1518/production/_130800450_gettyimages-1248276093.jpg
https://ichef.bbci.co.uk/news/385/cpsprodpb/1639F/production/_130793019_bbc-sport-index-imagery-4-split-images-gradient-3a3fde8f-2929-413b-8960-c6c999c356da.png
https://ichef.bbci.co.uk/news/385/cpsprodpb/12378/production/_130661647_20230806_105553.jpg
https://ichef.bbci.co.uk/news/385/cpsprodpb/D907/production/_130695555_a70c0d19-085e-44a1-8f78-fa9d3d748c2a.jpg
https://ich