In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def getPage(url):
    """
    Utilty function used to get a Beautiful Soup object from a given URL
    """

    session = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs

## Dealing with different website layouts

In [3]:
import requests

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body


def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')


def scrapeNYTimes(url):
    bs = getPage(url)
    title = bs.find('h1').text
    lines = bs.select('div.StoryBodyCompanionColumn div p')
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find('h1').text
    body = bs.find('div', {'class', 'post-body'}).text
    return Content(url, title, body)


url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/


The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.	






Jeffrey Gutman

					Nonresident Senior Fellow - Global Economy and Development 







Adie Tomer

					Fellow - Metropolitan Policy Program 

 Twitter
AdieTomer






But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel time

Title: The Men Who Want to Live Forever
URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html

Would you like to live forever? Some billionaires, already invincible in every other way, have decided that they also deserve not to die. Today several biotech companies, fueled by Silicon Valley fortunes, are devoted to “life extension” — or as some put it, to solving “the problem of death.”
It’s a cause championed by the tech billionaire Peter Thiel, the TED Talk darling Aubrey de Gray, Google’s billion-dollar Calico longevity lab and investment by Amazon’s Jeff Bezos. The National Academy of Medicine, an independent group, recently dedicated funding to “end aging forever.”
As the longevity entrepreneur Arram Sabeti told The New Yorker: “The proposition that we can live forever is obvious. It doesn’t violate the laws of physics, so we can achieve it.” Of all the slightly creepy aspects to this trend, the strangest is the least noticed: The people publicly ch

In [4]:
class Content:
    """
    Common base class for all articles/pages
    """
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

class Website:
    """ 
    Contains information about website structure
    """

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [5]:
import requests
from bs4 import BeautifulSoup


class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Utilty function used to get a content string from a Beautiful Soup
        object and a selector. Returns an empty string if no object
        is found for the given selector
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site, url):
        """
        Extract content from a given page URL
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

In [6]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]
websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(
    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(
    websites[3], 
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')

URL: https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/
TITLE: Idea to Retire: Old methods of policy education
Idea to Retire: Old methods of policy education
BODY:

Public policy and public affairs schools aim to train competent creators and implementers of government policy. While drawing on the principles that gird our economic and political systems to provide a well-rounded education, like law schools and business schools, policy schools provide professional training. They are quite distinct from graduate programs in political science or economics which aim to train the next generation of academics. As professional training programs, they add value by imparting both the skills which are relevant to current employers, and skills which we know will be relevant as organizations and societies evolve. 
The relevance of the skills that policy programs impart to address problems of today and tomorrow bears further discussion. We are living t

URL: https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html
TITLE: Oil Boom Gives the U.S. a New Edge in Energy and Diplomacy
BODY:
HOUSTON — A substantial rise in oil prices in recent months has led to a resurgence in American oil production, enabling the country to challenge the dominance of Saudi Arabia and dampen price pressures at the pump.
The success has come in the face of efforts by Saudi Arabia and its oil allies to undercut the shale drilling spree in the United States. Those strategies backfired and ultimately ended up benefiting the oil industry.
Overcoming three years of slumping prices proved the resiliency of the shale boom. Energy companies and their financial backers were able to weather market turmoil — and the maneuvers of the global oil cartel — by adjusting exploration and extraction techniques.
After a painful shakeout in the industry that included scores of bankruptcies and a significant loss of jobs, a steadier shale-drilling industry is a

## Crawling through sites with search

In [1]:
class Content:
    """Common base class for all articles/pages"""

    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url

    def print(self):
        """
        Flexible printing function controls output
        """
        print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [2]:
class Website:
    """Contains information about website structure"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [3]:
import requests
from bs4 import BeautifulSoup

class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """
        Searches a given website for a given topic and records all pages found
        """
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()


crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                         row[3], row[4], row[5], row[6], row[7]))

topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
New article found for topic: python
URL: Leveraging the disruptive power of artificial intelligence for fairer opportunities
TITLE: 
According to President Obama’s Council of Economic Advisers (CEA), approximately 3.1 million jobs will be rendered obsolete or permanently altered as a consequence of artificial intelligence technologies. Artificial intelligence (AI) will, for the foreseeable future, have a significant disruptive impact on jobs. That said, this disruption can create new opportunities if policymakers choose to harness them—including some with the potential to help address long-standing social inequities. Investing in quality training programs that deliver premium skills, such as computational analysis and cognitive thinking, provides a real opportunity to leverage AI’s disruptive power.







Makada Henry-Nickie

					Fellow - Governance Studies 

 Twitter
mhnickie





AI’s disruption presents a clear challenge: competition to traditional skill

New article found for topic: python
URL: The Hutchins Center Explains: Budgeting for aging America
TITLE: 


For decades, we have been hearing that the baby-boom generation was like a pig moving through a python–bigger than the generations before and after. 
That’s true. But that’s also a very misleading metaphor for understanding the demographic forces that are driving up federal spending: They aren’t temporary. The generation born between 1946 and 1964 is the beginning of a demographic transition that will persist for decades after the baby boomers die, the consequence of lengthening lifespans and declining fertility. Putting the federal budget on a sustainable course requires long-lasting fixes, not short-lived tweaks.  
First, a few demographic facts.
As the chart below illustrates, there was a surge in births in the U.S. at the end of World War II, a subsequent decline, and then an uptick as baby boomers began having children.




Although the population has been rising, the numbe

New article found for topic: python
URL: Skills, success, and why your choice of college matters
TITLE: 


Amidst growing frustration with the cost of higher education, complaints also abound about its quality. One critique, launched in the book Academically Adrift by two sociologists, finds little evidence that college students score better on measures of critical thinking, writing, and reasoning after attending college. This is something of a paradox, since strong evidence shows that attending college tends to raise earnings power, even for students who start with mediocre preparation. 
Our recent study uses a different approach to assess the value of a college education. We find that the particular skills listed by a college’s alumni on their resumes predict how well graduates from those schools perform in terms of earning a living, meeting debt obligations, and working for high-paying or innovative companies. Since jobs requiring more valuable skills typically require at least some

New article found for topic: python
URL: An Atlanta organization’s mission to bring racial equity to the tech ecosystem
TITLE: 

Summary
Between the COVID-19 pandemic and the tragic death of George Floyd, the country’s ongoing crisis of racism has come into stark relief. Black Americans are disproportionately diagnosed with or dying from COVID-19 due to structural conditions, while also facing major economic risks as the racial unemployment gap between white and Black populations is the widest it’s been in five years. At the same time, Black people are still vulnerable to police violence that too often occurs without consequences. While there is a great deal of work to be done to dismantle structural racism, it is imperative to use this moment to remove racial barriers and invest in long-term prosperity for Black people, enterprises, and communities.







Reniya Dinkins

					Senior Research Assistant - Metropolitan Policy Program 

 Twitter
reniyasdinkins








Sifan Liu

					Sen

New article found for topic: python
URL: Inside the Pentagon’s Secret Afghan Spy Machine
TITLE: 
The Pentagon’s top researchers have rushed a classified and controversial intelligence program into Afghanistan. Known as “Nexus 7,” and previously undisclosed as a war-zone surveillance effort, it ties together everything from spy radars to fruit prices in order to glean clues about Afghan instability.
The program has been pushed hard by the leadership of the Defense Advanced Research Projects Agency (DARPA). They see Nexus 7 as both a breakthrough data-analysis tool and an opportunity to move beyond its traditional, long-range research role and into a more active wartime mission. 
But those efforts are drawing fire from some frontline intel operators who see Nexus 7 as little more than a glorified grad-school project, wasting tens of millions on duplicative technology that has nothing to do with stopping the Taliban. 
“There are no models and there are no algorithms,” says one person fami

New article found for topic: python
URL: Think Bigger on North Korea
TITLE: 
While the world is fixated on Iraq and the Middle East, North Korea continues to pose at least as great a threat to Western security interests. Six-party talks with the North Koreans in Beijing have just showed that the Bush administration hasn’t yet found a way out of the nuclear crisis. Although negotiations appear likely to resume in a couple of months, their prospects for success seem poor.
The basic dilemma is easy to understand. North Korea will not surrender its nuclear capabilities, which are among its only valuable national assets, unless offered a very good deal for giving them up. President Bush refuses to offer such a deal because he sees the North Korean demand as blackmail. He insists that before any talks about better diplomatic relations or economic interaction occur, North Korea first relinquish—with verification—a nuclear program it had pledged nine years ago to abandon completely. At most, B

New article found for topic: python
URL: Modeling with Data: Tools and Techniques for Scientific Computing
TITLE: 

PREFACE


Should you use the book? This book is intended to be a complement to the standard stats textbook, in three ways.
First, descriptive and inferential statistics are kept separate beginning with the first sentence of the first chapter. I believe that the fusing of the two is the number one cause of confusion among statistics students.
Once descriptive modeling is given its own space, and models do not necessarily have to be just preparation for a test, the options blossom. There are myriad ways to convert a subjective understanding of the world into a mathematical model, including simulations, models like the Bernoulli/Poisson distributions from traditional probability theory, ordinary least squares, and who knows what else.
If those options aren’t enough, simple models can be combined to form multilevel models to describe situations of arbitrary complexity. That i

New article found for topic: data science
URL: What all policy analysts need to know about data science
TITLE: 







Alex Engler

					Rubenstein Fellow - Governance Studies 

 Twitter
@AlexCEngler





Conversations around data science typically contain a lot of buzzwords and broad generalizations that make it difficult to understand its pertinence to governance and policy. Even when well-articulated, the private sector applications of data science can sound quite alien to public servants. This is understandable, as the problems that Netflix and Google strive to solve are very different than those government agencies, think tanks, and nonprofit service providers are focused on. This does not mean, however, that there is no public sector value in the modern field of data science. With qualifications, data science offers a powerful framework to expand our evidence-based understanding of policy choices, as well as directly improve service delivery.
To better understand its importance t

New article found for topic: data science
URL: Measuring racism and discrimination in economic data
TITLE: 
Although researchers in economics are increasingly cognizant that race and ethnicity are key determinants of economic outcomes, credibly assessing potential causes and identifying solutions is often complicated by the lack of high-quality data. The typical economist’s work primarily focuses on proposing relationships and testing for causal mechanisms across a broad set of economic phenomena. The study of race and the consequences of race in market interactions have long been hampered by the relative lack of longitudinal data collected on relevant markers of discrimination, racism, and related long-term outcomes.







Randall Akee

					Nonresident Fellow - Economic Studies, Center on Children and Families, Future of the Middle Class Initiative 

					Former Brookings Rubenstein Fellow										

 Twitter
indigenalysis








Marcus Casey

					Nonresident Fellow - Economic Stu

New article found for topic: data science
URL: Bridging the gender data gap
TITLE: 
More men than women are killed in car crashes each year, partly because men drive more and engage in riskier driving behavior. On the other hand, women are 17% more likely to be killed and 47% more likely to be injured in crashes than men are. Women are at increased risk simply because they are women: cars are primarily designed, built, and tested by male engineers using male data, so they are built with men in mind. Scaled-down versions of male crash test dummies, meant to represent women, were not used until 2003—and are primarily tested in the passenger seat. In car design, development, and testing, male bodies are the standard and female bodies the outlier. This creates a gender data gap with very real impacts on the lives of Americans.





J



Jeanette Gaudry Haynie

					Founder and Executive Director - Athena Leadership Project 

					Lieutenant Colonel - U.S. Marine Corps Reserve 




The gend

New article found for topic: data science
URL: Big data, meet behavioral science
TITLE: 
America’s community colleges offer the promise of a more affordable pathway to a bachelor’s degree. Students can pay substantially less for the first two years of college, transfer to a four-year college or university, and still earn their diploma in the same amount of time. At least in theory. Most community college students—80 percent of them—enter with the intention to transfer, but only 20 percent actually do so within five years of entering college. This divide represents a classic case of what behavioralists call an intention-action gap. 
Why would so many students who enter community colleges intending to transfer fail to actually do so? Put yourself in the shoes of a 20-something community college student. You’ve worked hard for the past couple years, earning credits and paying a lot less in tuition than you would have if you had enrolled immediately in a four-year college or university. Bu

IndexError: list index out of range

## Crawling Sites through Links

In [68]:
class Website:

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag


class Content:

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [69]:
import re


class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page
        """
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)


reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',
                  False, 'h1', 'div.StandardArticleBody_body_1gnLA')
crawler = Crawler(reuters)
crawler.crawl()

GETTING https://www.reuters.com
GETTING https://www.reuters.com/article/us-usa-trump-5g/trump-national-security-team-sees-building-5g-network-as-option-idUSKBN1FH103
URL: https://www.reuters.com/article/us-usa-trump-5g/trump-national-security-team-sees-building-5g-network-as-option-idUSKBN1FH103
TITLE: Trump security team sees building U.S. 5G network as option
BODY:
WASHINGTON (Reuters) - President Donald Trump’s national security team is looking at options to counter the threat of China spying on U.S. phone calls that include the government building a super-fast 5G wireless network, a senior administration official said on Sunday. The official, confirming the gist of a report from Axios.com, said the option was being debated at a low level in the administration and was six to eight months away from being considered by the president himself. The 5G network concept is aimed at addressing what officials see as China’s threat to U.S. cyber security and economic security. The Trump admini

URL: https://www.reuters.com/article/us-usa-immigration-manchin/democratic-senator-criticizes-pelosis-immigration-comment-idUSKBN1FH0RC
TITLE: Democratic senator criticizes Pelosi's immigration comment
BODY:
WASHINGTON (Reuters) - U.S. Senator Joe Manchin, a moderate Democrat, said on Sunday he thought a new White House immigration plan was a good starting point, and he criticized House Democratic leader Nancy Pelosi for dismissing it as a way to “make America white again.” “We don’t need that type of rhetoric on either side, from Nancy, (Republican House Speaker) Paul Ryan or anybody else,” said Manchin, a West Virginian and a leader of a bipartisan Senate group working on immigration. He spoke on CNN’s “State of the Union” program. Manchin’s comments highlighted differences among Democrats ahead of a Feb. 8 deadline for the U.S. Congress to pass another spending bill and try to reach an immigration agreement that would also protect up to 1.8 million illegal immigrants brought to the 

URL: https://www.reuters.com/article/us-afghanistan-blast/militants-attack-afghan-army-post-near-military-academy-in-capital-idUSKBN1FI07M?il=0
TITLE: Militants attack Afghan army post near military academy in capital
BODY:
KABUL (Reuters) - At least four militants attacked an army outpost near one of Afghanistan’s main military academies on Monday and at least one soldier was killed and three wounded, a defense ministry official said. The attack in the western outskirts of the capital, Kabul, came two days after an ambulance bomb in the center of the city killed more than 100 people and just over a week after another attack on the Hotel Intercontinental killed more than 20. Both of those attacks were claimed by the Taliban. Ministry of Defence officials said the militants attacked the outpost near the well-defended Marshal Fahim military academy just before dawn. One of the attackers blew himself up, one had been killed and two were still fighting. One soldier had been killed and thre

KeyboardInterrupt: 

## Crawling multiple page types

In [1]:
class Website:
    """Common base class for all articles/pages"""

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [2]:
class Product(Website):
    """Contains information for scraping a product page"""

    def __init__(self, name, url, titleTag, productNumber, price):
        Website.__init__(self, name, url, TitleTag)
        self.productNumberTag = productNumberTag
        self.priceTag = priceTag

class Article(Website):
    """Contains information for scraping an article page"""

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        Website.__init__(self, name, url, titleTag)
        self.bodyTag = bodyTag
        self.dateTag = dateTag

In [None]:

def parsePage(url):
    
    if '/ideas/' in url:
        

oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1' '')        