## Dealing with different website layouts

In [350]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

class Content:
    """Scraped page: where it came from, its URL, its title and its body text."""

    def __init__(self, source, url, title, body):
        self.source, self.url = source, url
        self.title, self.body = title, body

    def print(self):
        """Write the four fields to stdout, one labelled line each."""
        report = (
            f'SOURCE:  {self.source}',
            f'TITLE: {self.title}',
            f'URL: {self.url}',
            f'BODY:\n {self.body}',
        )
        for line in report:
            print(line)

def scrapeAstroDienst(url):
    """
    Scrape one Astro-Databank page into a Content object.

    The title is the text of the first <h1>; the body is the text of the
    div with id "bodyContent". Fetch errors from urlopen are not caught,
    and a page missing either element fails on the .text access.
    """
    source = 'AstroDienst Databank'
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning)
    bs = BeautifulSoup(urlopen(url), 'html.parser')
    title = bs.find('h1').text
    body = bs.find('div', {'id': 'bodyContent'}).text
    return Content(source, url, title, body)

def scrapeAstrotheme(url):
    """
    Scrape one Astrotheme page into a Content object.

    The title is the text of the first <h1>; the body is the text of the
    <p> with id "biographie". Fetch errors from urlopen are not caught,
    and a page missing either element fails on the .text access.
    """
    source = 'Astrotheme'
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning)
    bs = BeautifulSoup(urlopen(url), 'html.parser')
    title = bs.find('h1').text
    body = bs.find('p', {'id': 'biographie'}).text
    return Content(source, url, title, body)

def scrapeAstroSeek(url):
    """
    Scrape one AstroSeek page into a Content object.

    The title is the first <h2> sibling following div.detail-info; the body
    is div.detail-rozbor. Both are stored as text (like the other scrapers)
    rather than as raw Tag objects; a missing title or body element yields
    an empty string. Fetch errors from urlopen are not caught.
    """
    source = 'AstroSeek'
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning)
    bs = BeautifulSoup(urlopen(url), 'html.parser')
    titleElem = bs.find('div', {'class': 'detail-info'}).find_next_sibling('h2')
    bodyElem = bs.find('div', {'class': 'detail-rozbor'})
    title = titleElem.text if titleElem is not None else ''
    body = bodyElem.text if bodyElem is not None else ''
    return Content(source, url, title, body)

# Run each site-specific scraper against the same person's page and print
# what it extracted. `url` and `content` are rebound for every site.

# Astro-Databank (astro.com)
url = 'https://www.astro.com/astro-databank/Downey%2C_Robert_Jr.'
content = scrapeAstroDienst(url)
content.print()

# Astrotheme
url = 'https://www.astrotheme.com/astrology/Robert_Downey_Jr.'
content = scrapeAstrotheme(url)
content.print()

# AstroSeek
url = 'https://www.astro-seek.com/birth-chart/robert-downey-jr-horoscope'
content = scrapeAstroSeek(url)
content.print()


SOURCE:  AstroDienst Databank
TITLE: Downey, Robert Jr.
URL: https://www.astro.com/astro-databank/Downey%2C_Robert_Jr.
BODY:
 
From Astro-Databank



Jump to navigation
Jump to search
 



Robert Jr. Downey,natal chart (Placidus)
natal chart English style (Equal houses)natal chart with Whole Sign houses




Name



Downey, Robert Jr.
Gender:  M



Birthname
Robert John Downey, Jr.


born on
4 April 1965 at 13:10  (= 1:10 PM )


Place
Manhattan, New York, 40n46,  73w59 


Timezone
EST h5w (is standard time)


Data source



From memory



Rodden Rating A


Collector: March




Astrology data
  14°45'   19°12 Asc. 13°51'

add Robert Jr. Downey to 'my astro'



 Robert Jr. Downey   (2014)photo: Gage Skidmore, license cc-by-sa-2.0
Biography
American actor and producer whose career has been characterized by critical and popular success in his youth, followed by a period of substance abuse and legal troubles, before a resurgence of commercial success in middle age. From 2013 to 2015, he was 

In [354]:
class Content:
    """
    Holds the URL, title and body extracted from one page
    """
    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """
        Emit the URL, title and body to stdout, each on its own labelled line
        """
        labelled = [
            f'URL: {self.url}',
            f'TITLE: {self.title}',
            f'BODY:\n{self.body}',
        ]
        for entry in labelled:
            print(entry)

class Website:
    """
    Describes one site: its display name, base URL, and the CSS selectors
    used to locate the title and the body on its pages
    """
    def __init__(self, name, url, titleTag, bodyTag):
        self.name, self.url = name, url
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [355]:
from bs4 import BeautifulSoup


class Crawler:
    """
    Stateless helpers that fetch a page and extract its title and body
    using the CSS selectors stored on a Website object.

    All methods are static, so they work both as Crawler.method(...) and
    when called on an instance.
    """

    @staticmethod
    def getPage(url):
        """
        Fetch url and parse it with html.parser.

        Returns a BeautifulSoup object, or None if the page could not be
        fetched.
        """
        try:
            response = urlopen(url)
        except Exception:
            # Best-effort crawl: any fetch failure simply means "no page"
            return None
        # Close the HTTP response once the document has been parsed
        with response:
            return BeautifulSoup(response, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Utility function used to get a content string from a Beautiful Soup
        object and a selector. The text of every matching element is joined
        with newlines. Returns an empty string if no object is found for
        the given selector
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    @staticmethod
    def getContent(website, path):
        """
        Extract content from the page at website.url + path.

        Returns a Content with empty title and body when the page could not
        be fetched.
        """
        url = website.url + path
        bs = Crawler.getPage(url)
        if bs is None:
            return Content(url, '', '')
        title = Crawler.safeGet(bs, website.titleTag)
        body = Crawler.safeGet(bs, website.bodyTag)
        return Content(url, title, body)

In [358]:
# One row per site: [name, base URL, title selector, body selector]
siteData = [
    ['AstroDienst', 'https://www.astro.com/astro-databank', 'h1', 'div#bodyContent'],
    ['Astrotheme', 'https://www.astrotheme.com/astrology', 'h1', 'div.corpsTexte'],
    ['AstroSeek', 'https://www.astro-seek.com/birth-chart', 'h2', 'div.detail-rozbor-items-profil'],
]
# Turn each row into a Website object
websites = []
for name, url, title, body in siteData:
    websites.append(Website(name, url, title, body))

# Fetch and print the same person's page from each site; the second argument
# is the path appended to the site's base URL
Crawler.getContent(websites[0], '/Downey%2C_Robert_Jr.').print()
Crawler.getContent(
    websites[1], '/Robert_Downey_Jr.').print()
Crawler.getContent(
    websites[2],
    '/robert-downey-jr-horoscope').print()

URL: https://www.astro.com/astro-databank/Downey%2C_Robert_Jr.
TITLE: Downey, Robert Jr.
BODY:

From Astro-Databank



Jump to navigation
Jump to search
 



Robert Jr. Downey,natal chart (Placidus)
natal chart English style (Equal houses)natal chart with Whole Sign houses




Name



Downey, Robert Jr.
Gender:  M



Birthname
Robert John Downey, Jr.


born on
4 April 1965 at 13:10  (= 1:10 PM )


Place
Manhattan, New York, 40n46,  73w59 


Timezone
EST h5w (is standard time)


Data source



From memory



Rodden Rating A


Collector: March




Astrology data
  14°45'   19°12 Asc. 13°51'

add Robert Jr. Downey to 'my astro'



 Robert Jr. Downey   (2014)photo: Gage Skidmore, license cc-by-sa-2.0
Biography
American actor and producer whose career has been characterized by critical and popular success in his youth, followed by a period of substance abuse and legal troubles, before a resurgence of commercial success in middle age. From 2013 to 2015, he was listed by Forbes as Hollywood's

## Crawling through sites with search

In [359]:
class Content:
    """A page found while searching for a topic, with its URL, title and body"""

    def __init__(self, topic, url, title, body):
        self.topic, self.title = topic, title
        self.body, self.url = body, url

    def print(self):
        """
        Emit the topic, URL, title and body to stdout on labelled lines
        """
        summary = (
            f'New article found for topic: {self.topic}',
            f'URL: {self.url}',
            f'TITLE: {self.title}',
            f'BODY:\n{self.body}',
        )
        for line in summary:
            print(line)

In [360]:
class Website:
    """Describes a searchable site: its URLs and the selectors used to read it"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        # Identity and entry points
        self.name, self.url, self.searchUrl = name, url, searchUrl
        # How to read the search results page
        self.resultListing, self.resultUrl = resultListing, resultUrl
        self.absoluteUrl = absoluteUrl
        # How to read an individual result page
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [368]:
from bs4 import BeautifulSoup

class Crawler:
    """
    Searches one Website for topics and remembers every result page it has
    fetched in self.found (a dict of URL -> Content).
    """

    def __init__(self, website):
        self.site = website
        self.found = {}

    @staticmethod
    def getPage(url):
        """
        Fetch url and parse it with html.parser.

        Returns a BeautifulSoup object, or None if the page could not be
        fetched.
        """
        try:
            response = urlopen(url)
        except Exception:
            # Best-effort crawl: any fetch failure simply means "no page"
            return None
        # Close the HTTP response once the document has been parsed
        with response:
            return BeautifulSoup(response, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Utility function used to get a content string from a Beautiful Soup
        object and a selector. The text of every matching element is joined
        with newlines. Returns an empty string if no object is found for
        the given selector
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def getContent(self, topic, url):
        """
        Extract content from a given page URL.

        Returns a Content with empty title and body when the page could not
        be fetched.
        """
        bs = Crawler.getPage(url)
        if bs is None:
            return Content(topic, url, '', '')
        title = Crawler.safeGet(bs, self.site.titleTag)
        body = Crawler.safeGet(bs, self.site.bodyTag)
        return Content(topic, url, title, body)

    def search(self, topic):
        """
        Searches a given website for a given topic and records all pages found.

        Every result is printed, whether it was fetched now or on an earlier
        search. Does nothing if the search page itself cannot be fetched.
        """
        bs = Crawler.getPage(self.site.searchUrl + topic)
        if bs is None:
            # Search page unavailable: nothing to record
            return
        for result in bs.select(self.site.resultListing):
            links = result.select(self.site.resultUrl)
            # Skip listings that carry no usable link instead of crashing
            if not links or 'href' not in links[0].attrs:
                continue
            url = links[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            url = url if self.site.absoluteUrl else self.site.url + url
            if url not in self.found:
                self.found[url] = self.getContent(topic, url)
            self.found[url].print()



# One row per site, in Website.__init__ order:
# [name, base URL, search URL prefix, result-listing selector,
#  result-link selector, links-are-absolute flag, title selector, body selector]
siteData = [
    ['AstroSeek', 'https://www.astro-seek.com/birth-chart', 'https://famouspeople.astro-seek.com/search/?narozeni_jmeno=', 'table',
        'table a', True, 'h2', 'div.detail-rozbor-items-profil'],
    ['AstroDienst', 'https://www.astro.com', 'https://www.astro.com/wiki/astro-databank/index.php?search=', 'div.searchresults', 
        'div.searchresults a', False, 'h1', 'div#bodyContent']
]
# Turn each row into a Website object
sites = []
for name, url, search, rListing, rUrl, absUrl, tt, bt in siteData:
    sites.append(Website(name, url, search, rListing, rUrl, absUrl, tt, bt))

# One crawler per site; topics are already URL-encoded because they are
# appended directly to each site's search URL
crawlers = [Crawler(site) for site in sites]
topics = ['robert%20downey']

# Run every topic against every site
for topic in topics:
    for crawler in crawlers:
        crawler.search(topic)


New article found for topic: robert%20downey
URL: https://www.astro-seek.com/birth-chart/robert-downey-jr-horoscope
TITLE: Robert Downey Jr. - Birth Chart

(Robert John Downey Jr)

BODY:



ASTROLOGICAL SIGNS







Sun in Aries14°45’
Sun Sign - Zodiac Sign



The ruler of Aries is the planet Mars, which symbolizes energy, health, fighting spirit and leadership skills. People born under this sign are courageous and decisive in most situations in life. -»














Moon in Taurus19°11’
Moon (Luna)



Your safety depends on stability, which is not easy to satisfy. You may have a practical nature, dependent on the material aspects of life. You have to learn to accept change as part of your life, otherwise complications can occur - changes caused by other people that you love, illness and so on. The basic problem is finding your own self-respect in order to ensure that you do not mistakenly seek it in material things. When you accept yourself for who you are, it will become easier for 

## Crawling sites through links

In [None]:
class Website:
    """Describes a site crawled by following links that match a pattern"""

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        # Identity and starting point
        self.name, self.url = name, url
        # Which links to follow, and whether their hrefs are absolute
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        # Selectors for the content of each target page
        self.titleTag, self.bodyTag = titleTag, bodyTag


class Content:
    """Holds the URL, title and body extracted from one crawled page"""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Emit the URL, title and body to stdout on labelled lines"""
        for line in (
            f'URL: {self.url}',
            f'TITLE: {self.title}',
            f'BODY:\n{self.body}',
        ):
            print(line)

In [None]:
import re


class Crawler:
    """
    Crawls one Website by following the home-page links whose href matches
    the site's targetPattern, recording each page in self.visited
    (a dict of URL -> Content).
    """

    def __init__(self, site):
        self.site = site
        self.visited = {}

    @staticmethod
    def getPage(url):
        """
        Fetch url and parse it with html.parser.

        Returns a BeautifulSoup object, or None (after printing the error)
        if the page could not be fetched.
        """
        try:
            response = urlopen(url)
        except Exception as e:
            # Best-effort crawl: report the failure and carry on
            print(e)
            return None
        # Close the HTTP response once the document has been parsed
        with response:
            return BeautifulSoup(response, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Return the newline-joined text of every element matching selector,
        or an empty string if nothing matches
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def getContent(self, url):
        """
        Extract content from a given page URL.

        Returns a Content with empty title and body when the page could not
        be fetched.
        """
        bs = Crawler.getPage(url)
        if bs is None:
            return Content(url, '', '')
        title = Crawler.safeGet(bs, self.site.titleTag)
        body = Crawler.safeGet(bs, self.site.bodyTag)
        return Content(url, title, body)

    def crawl(self):
        """
        Get pages from website home page.

        Each newly visited page is fetched, stored in self.visited and
        printed. Does nothing if the home page cannot be fetched.
        """
        bs = Crawler.getPage(self.site.url)
        if bs is None:
            # Home page unavailable: nothing to crawl
            return
        pattern = re.compile(self.site.targetPattern)
        for targetPage in bs.find_all('a', href=pattern):
            url = targetPage.attrs['href']
            # Relative links are resolved against the site's base URL; use
            # the href string here, not the <a> Tag object itself
            url = url if self.site.absoluteUrl else f'{self.site.url}{url}'
            if url not in self.visited:
                self.visited[url] = self.getContent(url)
                self.visited[url].print()


# Crawl the Brookings home page, following links to research and blog posts.
# The pattern is a raw string: '/' needs no escaping in a regex, and '\/' in
# a normal string literal is an invalid escape sequence.
brookings = Website('Brookings', 'https://brookings.edu', r'/(research|blog)/', True, 'h1', 'div.post-body')
crawler = Crawler(brookings)
crawler.crawl()

URL: https://www.brookings.edu/blog/fixgov/2023/04/05/what-we-learned-from-the-chicago-mayoral-results/
TITLE: What we learned from the Chicago mayoral results
What we learned from the Chicago mayoral results
BODY:

As Chicagoans went to the polls on Tuesday, early signs pointed to a narrow victory for Paul Vallas, the former head of the city’s public school system and noted educational reformer, over Brandon Johnson, a former social studies teacher turned organizer for the Chicago Teachers Union. Vallas led in the pre-election polls by an average of 3 percentage points, a margin that widened to 6 points when undecided voters were asked whether they leaned toward a candidate. A higher share of Vallas’s supporters said that they were certain to cast their ballots, and more of Johnson’s said that they might change their minds about their choice. Vallas enjoyed a strong lead among voters 60 and older, who are the most likely to vote of all age cohorts, while Johnson was doing best among t

URL: https://www.brookings.edu/research/addressing-the-looming-sovereign-debt-crisis-in-the-developing-world-it-is-time-to-consider-a-brady-plan/
TITLE: Addressing the looming sovereign debt crisis in the developing world: It is time to consider a ‘Brady’ plan
Addressing the looming sovereign debt crisis in the developing world: It is time to consider a ‘Brady’ plan
BODY:








Brahima Sangafowa Coulibaly

					Vice President and Director - Global Economy and Development 

					Senior Fellow - Global Economy and Development 

 Twitter
BSangafowaCoul






W



Wafa Abedin

					Research and Administrative Assistant to the Vice President and Director - Global Economy and Development 




Among the challenges facing developing countries, none is arguably more crucial than the significantly deteriorated fiscal situation that threatens to erase several years of progress on development agendas. According to some estimates, almost 60 percent of the poorest countries are either in or at hig

URL: https://www.brookings.edu/research/the-second-half-of-the-sustainable-development-goal-era-ideas-for-doing-things-differently/
TITLE: The ‘Second Half’ of the Sustainable Development Goal era: Ideas for doing things differently
The ‘Second Half’ of the Sustainable Development Goal era: Ideas for doing things differently
BODY:








John W. McArthur

					Director - Center for Sustainable Development 

					Senior Fellow - Global Economy and Development 

 Twitter
@mcarthur





This September, the U.N. will convene a midpoint summit on the Sustainable Development Goals, halfway between their 2015 launch and 2030 deadline. For many leaders gathering in the General Assembly, the mood might be somber. Stark global tensions alongside inadequate SDG progress make for a tough outlook. But a successful summit will need to focus on pragmatics more than sentiment: What has gone well, where could a burst of effort tackle gaps, and—perhaps most importantly—what needs to be done differently

URL: https://www.brookings.edu/research/caring-about-care-an-sdg-5-priority/
TITLE: Caring about Care: An SDG-5 priority
Caring about Care: An SDG-5 priority
BODY:








Caren Grown

					Senior Fellow - Global Economy and Development, Center for Sustainable Development 




Goal 5 is an ambitious and expansive approach to reducing gaps between males and females and enabling women and girls to live their lives to the fullest. It proposes a multidimensional definition of, and comprehensive set of indicators for, tracking gender equality and women’s empowerment, complemented with targets and indicators across other goals. While advances have been made toward many aspects of Goal 5, the U.N. estimates that at the current rate, it will take nearly 300 years to meet all targets.  A high priority for accelerating progress is Target 5.4, which seeks to equalize the time that women and men spend on unpaid care and domestic work, including care for children, the elderly, the sick, and those w

URL: https://www.brookings.edu/blog/brown-center-chalkboard/2023/04/05/state-of-the-states-gubernatorial-policy-priorities-in-2023/
TITLE: State of the States: Gubernatorial policy priorities in 2023
State of the States: Gubernatorial policy priorities in 2023
BODY:








Katharine Meyer

					Fellow - Governance Studies, Brown Center on Education Policy 

 Twitter
@katharinemeyer








Rachel M. Perera

					Fellow - Governance Studies, Brown Center on Education Policy - The Brookings Institution 

 Twitter
@rachelmarisa





The federal government plays a limited role in education policy—states and local governments are primarily responsible for educating our nation’s youth. The first federal laws about education governance weren’t introduced until 1965 with the Elementary and Secondary Education Act (ESEA) and Higher Education Act (HEA). And still, states are given broad latitude to determine how to best implement these federal laws in their states. Today, the federal government

URL: https://www.brookings.edu/blog/order-from-chaos/2023/04/05/when-might-us-political-support-be-unwelcome-in-taiwan/
TITLE: When might US political support be unwelcome in Taiwan?
When might US political support be unwelcome in Taiwan?
BODY:

For a time, it looked as though House Speaker Kevin McCarthy would make a high-profile visit to Taiwan this spring. There was some suggestion that this might lead Beijing to react even more coercively than it did after the previous speaker, Nancy Pelosi, visited in August 2022. Perhaps for that reason, McCarthy will now have a meeting with Taiwanese President Tsai Ing-wen when she transits through Los Angeles, California. Depending on how McCarthy frames his support for Tsai, however, the People’s Republic of China (PRC) might still escalate its military operations around Taiwan to signal its opposition to the alleged “hollowing out” of the U.S. “One China” policy. Depending on the scale of these actions, some Taiwanese voters might again concl

URL: https://www.brookings.edu/research/sdg-implementation-for-fragile-countries-needs-more-risk-taking/
TITLE: SDG implementation for fragile countries needs more risk-taking
SDG implementation for fragile countries needs more risk-taking
BODY:








Naheed Sarabi

					Visiting Fellow - Global Economy and Development, Center for Sustainable Development 

 Twitter
Sarabinaheed





In 2023, concurring economic, social, and environmental crises are disproportionately affecting fragile states, creating a grim outlook for achieving the SDGs by 2030. The Global Peace Index Report for 2022 indicates deteriorating global peacefulness since 2014, with a growing gap between the most peaceful and least peaceful countries. SDG progress has been either stagnating or declining in more than half of the fragile states. Poverty and insecurity are on the rise in conflict-affected and fragile countries, where 20 percent of the global share of those in extreme poverty live; this is expected to rise t

URL: https://www.brookings.edu/research/scaling-private-sector-engagement-in-the-sdgs/
TITLE: Scaling private sector engagement in the SDGs
Scaling private sector engagement in the SDGs
BODY:








Jane Nelson

					Nonresident Senior Fellow - Global Economy and Development, Center for Sustainable Development 







George Ingram

					Senior Fellow - Global Economy and Development, Center for Sustainable Development 

 Twitter
@GMIngramIV





Private sector investment and innovation are essential to achieving the Sustainable Development Goals (SDGs). A vanguard of companies is making public commitments and taking action. Yet, business engagement and impact are far from becoming mainstream. A concerted effort is required to scale the quantity, quality, and accountability of private sector activities that could have a measurable impact on supporting the SDGs.  
In the 12th U.N. Global Compact-Accenture CEO Study, released in 2023, 98 percent of more than 2,600 chief executives acros

URL: https://www.brookings.edu/research/a-purpose-driven-fund-to-end-extreme-poverty-by-2030/
TITLE: A purpose-driven fund to end extreme poverty by 2030
A purpose-driven fund to end extreme poverty by 2030
BODY:








Homi Kharas

					Senior Fellow - Global Economy and Development, Center for Sustainable Development 







John W. McArthur

					Director - Center for Sustainable Development 

					Senior Fellow - Global Economy and Development 

 Twitter
@mcarthur





Ending extreme poverty by 2030 is first among equals within the Sustainable Development Goals. When SDG target 1.1 was formally adopted in 2015, the number of extremely poor people was thought to be around 730 million globally and was falling by roughly 65 million a year. Continuing that trend would have cut poverty rates to zero by 2030. But progress has slowed instead. Recent projections suggest 570 million people might still be poor in 2030, far short of elimination. At the SDG midpoint, rebooting efforts to endi

## Crawling multiple page types

In [None]:
class Website:
    """
    Common base class for all page-type descriptions.

    Stores the site's name, its URL and the selector for the page title.
    bodyTag is optional (default None) so that page types without a body
    section can leave it out.
    """

    def __init__(self, name, url, titleTag, bodyTag=None):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [None]:
class Product(Website):
    """
    Contains information for scraping a product page

    productNumber and price are the selectors for those two fields; they
    are stored as productNumberTag and priceTag.
    """

    def __init__(self, name, url, titleTag, productNumber, price):
        # Product pages define no body selector, so the base bodyTag is None
        super().__init__(name, url, titleTag, None)
        self.productNumberTag = productNumber
        self.priceTag = price

class Article(Website):
    """
    Contains information for scraping an article page

    Adds a dateTag selector to the name, URL, title and body selectors
    stored by Website.
    """

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        # Website stores bodyTag itself, so pass it through to the base class
        super().__init__(name, url, titleTag, bodyTag)
        self.dateTag = dateTag