In [11]:
import requests
from bs4 import BeautifulSoup

# CLASS CONTENT
class Content:
    def __init__(self, topic, url, title, body):
        self.url = url
        self.title = title
        self.body = body
        self.topic = topic
        
    def print(self):
        #print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url.rstrip()))
        #print('TITLE: {}'.format(self.title))
        #print('BODY:\n{}'.format(self.body))


# CLASS WEBSITE
class Website:
    def __init__(self, name, url, searchUrl,  resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl=absoluteUrl
    
    def print(self):
        print(self.url)
        
# CLASS CRAWLER 
class Crawler:
    
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            try:
                url = result.select(site.resultUrl)[0].attrs['href']
            except:
                return
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()
            
           
            
    def parse(self, site, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
        if title != '' and body != '':
            content = Content(url, title, body)
            content.print()

In [13]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=', 'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=','div.search-result-content', 'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=', 'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
            ]

sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))
   
topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        targetSite.print()
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
http://oreilly.com
http://reuters.com
http://www.brookings.edu
URL: Housing finance reform: The path forward gets rolling
URL: Building guardrails for ChatGPT
URL: How open-source software shapes AI policy
URL: Preventing pandemics through biodiversity conservation and smart wildlife trade regulation
URL: Leveraging the disruptive power of artificial intelligence for fairer opportunities
URL: The Hutchins Center Explains: Budgeting for aging America
URL: An Atlanta organization’s mission to bring racial equity to the tech ecosystem
URL: Idea to Retire: Old methods of policy education
URL: The Silicon Valley Wage Premium
URL: Institutionalizing Data Analysis in German Federal Governance
URL: Making waves in India: Media and the COVID-19 pandemic
URL: Skills, success, and why your choice of college matters
URL: 
				BUILDING SKILLS FOR LIFE			
GETTING INFO ABOUT: data science
http://oreilly.com
http://reuters.com
http://www.brookings.edu
URL: Reckoning with sci