In [41]:
#Problem: Parse the title and body of the websites Brookings and NYTimes!

In [42]:
#Enumerate the getter functions!
import requests
from bs4 import BeautifulSoup


def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')

def getTextOfFirstTagMatchedInSoup(searchPattern, bs, property = "tag"):
    try:
        title = bs.find(**searchPattern).text
    except AttributeError as e:
        print(f"An attempt was made to access the text of the assumed to be {property}, None was accessed.")
        return ''
    return title

def getTextOfSecondTagMatchedInSoup(searchPattern, bs, property = "tag"):
    try:
        titles = bs.find_all(**searchPattern)
        title = titles[0].text
    except IndexError as e:
        print(f"The title tags can be indexed atmost by {len(titles)-1}, current index is {1}!")
        return ''
    except AttributeError as e:
        print(f"An attempt was made to access the text of the assumed to be {property}, None was accessed.")
        return ''
    return title

In [43]:
#Enumerate the classes to be used!

class Content:
    def __init__(self, url = "", title = "", body = ""):
        self.url = url
        self.title = title
        self.body = body
    def __str__(self):
        return 'Title: {}'.format(self.title) + "\n" + ('URL: {}\n'.format(self.url)) + "\n" + (self.body)

In [44]:
#Enumerate the types being used!
from typing import Callable, Tuple, Dict, List


Pattern = dict
Url = str
Title = str 
Body = str
Strategy = Callable
Strategies = Dict[str, Strategy]
Patterns = Dict[str, Pattern]
Component = str

In [45]:
#Create an abstract class for checking whether the dataModel has a default value or not!


import abc
import inspect
from typing import Type


class TypeWithADefaultValue(abc.ABC):
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        cls._check_init_defaults()

    @classmethod
    def _check_init_defaults(cls):
        init_signature = inspect.signature(cls.__init__)
        for name, param in init_signature.parameters.items():
            if name == 'self':
                continue
            if param.default is inspect.Parameter.empty:
                raise TypeError(f"Parameter '{name}' in {cls.__name__}.__init__ must have a default value")

In [46]:
#Define how to obtain the components of a class with a default value!
def componentsOf(dataModel : type): #TODO: Test whether base class has default value or not
    #if not issubclass(dataModel, TypeWithADefaultValue):
    #    raise TypeError("The dataModel does not have a default value.")
    return list(dataModel().__dict__.keys())

In [47]:
#Create the general function from which the scraper for each individual website will be created based on the classes to be used!

from functools import partial

def scrapeWebsiteForComponentsOfDataModel(url : Url, patterns : Patterns, strategies : Strategies, dataModel : type):
    dataModelParameters = tuple()
    for component in componentsOf(dataModel):
        dataModelParameters = strategies[component](url, patterns[component])
    return dataModel(*dataModelParameters)

#Generalized from:
#def scrapeWebsiteForTitleAndBody(url, patterns : Patterns, strategies : Strategies):
#
#    title = strategies["title"](url, patterns["title"])
#    body = strategies['body'](url, patterns['body'])
#
#    return Content(url, title, body)

In [48]:
#Based on the classes to be used create the function determining what data to look for on the websites!

#scrapeWebsiteForTitleAndBody = partial(scrapeWebsiteForComponentsOfDataModel, dataModel = Content)

In [49]:
#Enumerate the strategies to choose from!

def searchForTextOfFirstMatch(url : Url, pattern : Pattern):
    bs = getPage(url)
    tag = getTextOfFirstTagMatchedInSoup(pattern, bs)
    return tag

def searchForTagByParts(url, pattern : Pattern):
    tags = getPage(url).find_all(**pattern)
    goalTag = ''
    for tag in tags:
        goalTag += (tag.text) + "\n"
    return goalTag

In [50]:
#Determine the websites!
domains = ["brookings.edu", "nytimes.com"]

In [51]:
#Define the strategies to use for the websites!
BrookingsStrategy = {
    "title" : searchForTextOfFirstMatch, 
    "body"  : searchForTagByParts
    }

NYTimesStrategy = {
    "title" : searchForTextOfFirstMatch,
    "body" : searchForTagByParts
}
websiteStrategies = {
    "brookings.edu"  : BrookingsStrategy,
    "nytimes.com"   :NYTimesStrategy
}

In [52]:
#Define the patterns for the scrapers!

BrookingsPatterns = {
    "title" : {'name' : 'title'},
    "body"  : {'name' : 'div', 'attrs' : {'class' : "byo-block -narrow wysiwyg-block wysiwyg"}}
}

NYTimesPatterns = {
    "title" : {'name' : 'title'},
    "body"  : {'selector' : ('div.StoryBodyCompanionColumn div p')}
}
patterns = {
    "brookings.edu" : BrookingsPatterns,
    "nytimes.com"   : NYTimesPatterns
}

In [54]:
#Define the scrapers for all websites to be parsed using the known data to look for, the patterns, and the strategies!
from functools import partial


scrapers = {
    "brookings.edu" : partial(scrapeWebsiteForTitleAndBody,strategies = BrookingsStrategy, patterns = BrookingsPatterns),
    "nytimes.com"   : partial(scrapeWebsiteForTitleAndBody, strategies = BrookingsStrategy, patterns = NYTimesPatterns)
}
scrapers["brookings.edu"]('https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/')

KeyError: 'url'

In [None]:
#Define a regex pattern for finding the domain of a website!
import re
def findDomainOf(url):
    match = re.search(r'https?:\/\/(?:www\.)?([^\/:]+)', url)
    if match:
        domain = match.group(1)
        return domain 
    else:
        return None

In [None]:
# #Define printing of url's based on their website 

# #Strategies:
# #Define the strategies to use for the websites!
# BrookingsStrategy = {
#     "title" : searchForTextOfFirstMatch, 
#     "body"  : searchForTagByParts
#     }

# NYTimesStrategy = {
#     "title" : searchForTextOfFirstMatch,
#     "body" : searchForTagByParts
# }
# websiteStrategies = {
#     "brookings.edu"  : BrookingsStrategy,
#     "nytimes.com"   :   NYTimesStrategy
# }

# #Patterns:

# BrookingsPatterns = {
#     "title" : {'name' : 'title'},
#     "body"  : {'name' : 'div', 'attrs' : {'class' : "byo-block -narrow wysiwyg-block wysiwyg"}}
# }

# NYTimesPatterns = {
#     "title" : {'name' : 'title'},
#     "body"  : {'selector' : ('div.StoryBodyCompanionColumn div p')}
# }
# patterns = {
#     "brookings.edu" : BrookingsPatterns,
#     "nytimes.com"   : NYTimesPatterns
# }

# def scrapeUrlBasedOnASingleDataModelAndStrategiesAndPatternsForWebsites(url : Url, dataModel : type, websiteStrategies : Strategies, patterns : Patterns, domains : List[Url]):
#     scrapeWebsiteForTheDataModel = partial(scrapeWebsiteForComponentsOfDataModel, dataModel = dataModel)
#     scrapers = {domain : partial(scrapeWebsiteForTheDataModel, patterns = patterns[domain], strategies = websiteStrategies[domain]) for domain in domains}
#     domain = findDomainOf(url)
#     return scrapers[domain](url)

# scrapeWebsiteForTitleAndBody = partial(scrapeUrlBasedOnASingleDataModelAndStrategiesAndPatternsForWebsites, dataModel = Content, websiteStrategies = websiteStrategies, patterns = patterns, domains = domains)
# print(scrapeWebsiteForTitleAndBody('https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'))

# #TODO: Find the website from the link!

In [None]:
#Print the contents!
def ScrapeTitleAndBodyOf(url):
    domain = findDomainOf(url)
    return scrapers[domain](url)

def print_the_content_of(url):
    data = ScrapeTitleAndBodyOf(url)
    print(data)

print_the_content_of('https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/')
print_the_content_of("https://www.brookings.edu/articles/the-hamilton-project-2023-in-figures/")
print_the_content_of("https://www.brookings.edu/articles/bill-baers-testimony-before-the-u-s-senate-committee-on-the-judiciary-subcommittee-on-competition-policy-antitrust-and-consumer-rights/")
print_the_content_of('https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html')


TypeError: scrapeUrlBasedOnASingleDataModelAndStrategiesAndPatternsForWebsites() got an unexpected keyword argument 'strategies'

In [None]:
#Make the following code work
class Scraper1:
    def __init__(self, dataModel):
        self.dataModel
    
    def modifyStrategy(self, website, dataComponent, newStrategy)

scraper.modifyStrategy(website, dataComponent, newStrategy)