In [2]:
#Problem: Parse the title and body of the websites Brookings and NYTimes!

In [3]:
#Enumerate the getter functions!
import requests
from bs4 import BeautifulSoup


def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')

def getTextOfFirstTagMatchedInSoup(searchPattern, bs, property = "tag"):
    try:
        title = bs.find(**searchPattern).text
    except AttributeError as e:
        print(f"An attempt was made to access the text of the assumed to be {property}, None was accessed.")
        return ''
    return title

def getTextOfSecondTagMatchedInSoup(searchPattern, bs, property = "tag"):
    try:
        titles = bs.find_all(**searchPattern)
        title = titles[0].text
    except IndexError as e:
        print(f"The title tags can be indexed atmost by {len(titles)-1}, current index is {1}!")
        return ''
    except AttributeError as e:
        print(f"An attempt was made to access the text of the assumed to be {property}, None was accessed.")
        return ''
    return title

In [4]:
#Enumerate the classes to be used!

class Content:
    def __init__(self, url = "s", title = "s", body = "s"):
        self.url = url
        self.title = title
        self.body = body
    def __str__(self):
        return 'Title: {}'.format(self.title) + "\n" + ('URL: {}\n'.format(self.url)) + "\n" + (self.body)

In [5]:
#Enumerate the types being used!
from typing import Callable, Tuple, Dict, List


Pattern = dict
Url = str
Title = str 
Body = str
Strategy = Callable
Strategies = Dict[str, Strategy]
Patterns = Dict[str, Pattern]
Component = str
Domain = str

In [6]:
#Create an abstract class for checking whether the dataModel has a default value or not!


import abc
import inspect
from typing import Type


class TypeWithADefaultValue(abc.ABC):
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        cls._check_init_defaults()

    @classmethod
    def _check_init_defaults(cls):
        init_signature = inspect.signature(cls.__init__)
        for name, param in init_signature.parameters.items():
            if name == 'self':
                continue
            if param.default is inspect.Parameter.empty:
                raise TypeError(f"Parameter '{name}' in {cls.__name__}.__init__ must have a default value")

Content().__dict__.keys()

dict_keys(['url', 'title', 'body'])

In [7]:
#Define how to obtain the components of a class with a default value!
def componentsOf(dataModel : type): #TODO: Test whether base class has default value or not
    # if not issubclass(dataModel, TypeWithADefaultValue):
    #     raise TypeError("The dataModel does not have a default value.")
    return list(dataModel().__dict__.keys())

In [8]:
#Create the general function from which the scraper for each individual website will be created based on the classes to be used!

from functools import partial

#TODO: DO the generalization
def scrapeDomainForComponentsOfDataModel(url : Url, patterns : Patterns, strategyForComponentsforAWebsite : Strategies, dataModel : type):
    dataModelComponents = list()
    components = componentsOf(dataModel)
    for component in components:
        dataModelComponents += [strategyForComponentsforAWebsite[component](url, patterns[component])]
    return dataModel(*dataModelComponents)

# def scrapeWebsiteForTitleAndBody(url, patterns : Patterns, strategies : Strategies):

#    title = strategies["title"](url, patterns["title"])
#    body = strategies['body'](url, patterns['body'])

#    return Content(url, title, body)

In [9]:
#Based on the classes to be used create the function determining what data to look for on the websites!
#TODO:Do the application of the generalization if the generalization is done above
scrapeWebsiteForTitleAndBody = partial(scrapeDomainForComponentsOfDataModel, dataModel = Content)

In [10]:
#Enumerate the strategies to choose from!

def searchForTextOfFirstMatch(url : Url, pattern : Pattern):
    bs = getPage(url)
    tag = getTextOfFirstTagMatchedInSoup(pattern, bs)
    return tag

def searchForTagByParts(url, pattern : Pattern):
    tags = getPage(url).find_all(**pattern)
    goalTag = ''
    for tag in tags:
        goalTag += (tag.text) + "\n"
    return goalTag

def justReturnTheUrl(url, pattern : Pattern):
    return url


In [11]:
#Determine the websites!
domains = ["brookings.edu", "nytimes.com"]

In [12]:
#Define the strategies to use for the websites!
BrookingsStrategy = {
    "url"   : justReturnTheUrl,
    "title" : searchForTextOfFirstMatch, 
    "body"  : searchForTagByParts
    }

NYTimesStrategy = {
    "url"   : justReturnTheUrl,
    "title" : searchForTextOfFirstMatch,
    "body" : searchForTagByParts
}
strategies = {
    "brookings.edu"  : BrookingsStrategy,
    "nytimes.com"   :NYTimesStrategy
}

In [13]:
#Define the patterns for the scrapers!

BrookingsPatterns = {
    "url"   : {},
    "title" : {'name' : 'title'},
    "body"  : {'name' : 'div', 'attrs' : {'class' : "byo-block -narrow wysiwyg-block wysiwyg"}}
}

NYTimesPatterns = {
    "url"   : {},
    "title" : {'name' : 'title'},
    "body"  : {'selector' : ('div.StoryBodyCompanionColumn div p')}
}
domainPatterns = {
    "brookings.edu" : BrookingsPatterns,
    "nytimes.com"   : NYTimesPatterns
}

In [14]:
#Define the scrapers for all websites to be parsed using the known data to look for, the patterns, and the strategies!
from functools import partial

scrapers = {
    "brookings.edu" : partial(scrapeDomainForComponentsOfDataModel, patterns = BrookingsPatterns,  strategyForComponentsforAWebsite = BrookingsStrategy, dataModel = Content),
    "nytimes.com"   : partial(scrapeDomainForComponentsOfDataModel, patterns = NYTimesPatterns,    strategyForComponentsforAWebsite = NYTimesStrategy, dataModel = Content)
}
print(scrapers["brookings.edu"](url="https://www.brookings.edu/articles/bill-baers-testimony-before-the-u-s-senate-committee-on-the-judiciary-subcommittee-on-competition-policy-antitrust-and-consumer-rights/"))

Title: Bill Baer's testimony before the U.S. Senate Committee on the Judiciary Subcommittee on Competition Policy, Antitrust, and Consumer Rights | Brookings
URL: https://www.brookings.edu/articles/bill-baers-testimony-before-the-u-s-senate-committee-on-the-judiciary-subcommittee-on-competition-policy-antitrust-and-consumer-rights/


Chair Klobuchar, Ranking Member Lee, and distinguished members of the Subcommittee, thank you for the opportunity to appear this afternoon and address one of the many challenges we face in harnessing the power and maximizing the potential of artificial intelligence.
The growing use of pricing algorithms presents one such challenge. I am no expert in AI. But from the vantage point of this long-time antitrust enforcer, now just an antitrust worrier, there is good reason for concern that misuse of this tool is growing and puts consumers at risk of paying supracompetitive prices for all sorts of goods and services.1
As your October hearing on the rental housin

In [15]:
#Define a regex pattern for finding the domain of a website!
import re
def findDomainOf(url):
    match = re.search(r'https?:\/\/(?:www\.)?([^\/:]+)', url)
    if match:
        domain = match.group(1)
        return domain 
    else:
        return None

In [16]:
#Define printing of url's based on their website 
#TODO:Do the generalization for the given inputs!
# def scrapeUrlBasedOnASingleDataModelAndStrategiesAndPatternsForWebsites(url : Url, dataModel : type, strategies : Strategies, patterns : Patterns, domains : List[Url]):
#     scrapeWebsiteForTheDataModel = partial(scrapeWebsiteForComponentsOfDataModel, dataModel = dataModel)
#     scrapers = {domain : partial(scrapeWebsiteForTheDataModel, patterns = patterns[domain], strategies = strategies[domain]) for domain in domains}
#     domain = findDomainOf(url)
#     return scrapers[domain](url)

#scrapeWebsiteForTitleAndBody = partial(scrapeUrlBasedOnASingleDataModelAndStrategiesAndPatternsForWebsites, dataModel = Content, strategies = strategies, patterns = patterns, domains = domains)
print(scrapeWebsiteForTitleAndBody('https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/', patterns = domainPatterns, strategies = strategies))

#TODO: Find the website from the link!

TypeError: scrapeDomainForComponentsOfDataModel() got an unexpected keyword argument 'strategies'

In [None]:
#Print the contents!
def ScrapeTitleAndBodyOf(url):
    domain = findDomainOf(url)
    return scrapers[domain](url)

def print_the_content_of(url):
    data = ScrapeTitleAndBodyOf(url)
    print(data)

print_the_content_of('https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/')
print_the_content_of("https://www.brookings.edu/articles/the-hamilton-project-2023-in-figures/")
print_the_content_of("https://www.brookings.edu/articles/bill-baers-testimony-before-the-u-s-senate-committee-on-the-judiciary-subcommittee-on-competition-policy-antitrust-and-consumer-rights/")
print_the_content_of('https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html')


NameError: name 'findDomainOf' is not defined

In [None]:
#Make the following code work

#Define the scrapers for all websites to be parsed using the known data to look for, the patterns, and the strategies!
from functools import partial
import re
from typing import Callable, Tuple, Dict, List

class SingleDataModelScraper:
    Pattern = dict
    Url = str
    Title = str 
    Body = str
    Strategy = Callable
    Strategies = Dict[str, Strategy]
    Patterns = Dict[str, Pattern]
    Component = str
    Domain = str

    def componentsOf(dataModel : type): #TODO: Test whether base class has default value or not
    # if not issubclass(dataModel, TypeWithADefaultValue):
    #     raise TypeError("The dataModel does not have a default value.")
        return list(dataModel().__dict__.keys())

    def scrapeDomainForComponentsOfDataModel(url : Url, patterns : Patterns, strategyForComponentsforADomain : Strategies, dataModel : type):
        dataModelComponents = list()
        #Get the components of the data model!
        components = componentsOf(dataModel)
        #For each component of the data model, according to the strategy for the given website's component, compute the component's value!
        for component in components:
            dataModelComponents += [strategyForComponentsforADomain[component](url, patterns[component])]
        #Return the instance of the data model from the components!
        return dataModel(*dataModelComponents)
    
    def __init__(self, dataModel : type, domainStrategies : Strategies, domainPatterns : Patterns, domains : List[Domain] = []):
        print("Initializing scraper.")
        print(f"Data model's components: {componentsOf(dataModel)}")
        self.dataModel = dataModel
        print(f"Domain strategies are: {str(domainStrategies)}")
        print()
        self.domainStrategies = domainStrategies
        self.domainPatterns = domainPatterns
        #For each domain, create a scraper for the domain that is capable of scraping any website of the domain and returning for each website an instance of the desired data model!
        self.scrapers = {domain : partial(self.scrapeDomainForComponentsOfDataModel, 
                                          patterns = domainPatterns[domain], 
                                          strategyForComponentsforAWebsite = domainStrategies[domain], 
                                          dataModel = dataModel) for domain in domains}

    def __findDomainOf(self, url):
        match = re.search(r'https?:\/\/(?:www\.)?([^\/:]+)', url)
        if match:
            domain = match.group(1)
            return domain 
        else:
            return None

    def __call__(self, url):
        domain = self.__findDomainOf(url)
        try:
            return scrapers[domain](url)
        except KeyError as k:
            print("There does not exist a strategy for the given domain!")

    
    def addWebsiteStrategy(self, domain, domainStrategy):
        pass

    def addToDomainStrategy(self, domain, component, strategyForComponent):
        pass 

    def removeFromDataModel(self, component):
        pass 
    def addToDataModel(self, component):
        pass
bndomains = ['brookings.edu', 'nytimes.com']
scraper = SingleDataModelScraper(Content, domainPatterns, strategies, bndomains)
print(scraper("https://www.brookings.edu/articles/bill-baers-testimony-before-the-u-s-senate-committee-on-the-judiciary-subcommittee-on-competition-policy-antitrust-and-consumer-rights/"))

#TODO: Ensure, that the class's dataModel has to have a method for printing it's elements!
#scraper.modifyStrategy(website, dataComponent, newStrategy)

Initializing scraper.
Data model's components: ['url', 'title', 'body']
Domain strategies are: {'brookings.edu': {'url': {}, 'title': {'name': 'title'}, 'body': {'name': 'div', 'attrs': {'class': 'byo-block -narrow wysiwyg-block wysiwyg'}}}, 'nytimes.com': {'url': {}, 'title': {'name': 'title'}, 'body': {'selector': 'div.StoryBodyCompanionColumn div p'}}}
Title: Bill Baer's testimony before the U.S. Senate Committee on the Judiciary Subcommittee on Competition Policy, Antitrust, and Consumer Rights | Brookings
URL: https://www.brookings.edu/articles/bill-baers-testimony-before-the-u-s-senate-committee-on-the-judiciary-subcommittee-on-competition-policy-antitrust-and-consumer-rights/


Chair Klobuchar, Ranking Member Lee, and distinguished members of the Subcommittee, thank you for the opportunity to appear this afternoon and address one of the many challenges we face in harnessing the power and maximizing the potential of artificial intelligence.
The growing use of pricing algorithms p

In [None]:
BrookingsPatterns = {
    "url"   : {},
    "title" : {'name' : 'title'},
    "body"  : {'name' : 'div', 'attrs' : {'class' : "byo-block -narrow wysiwyg-block wysiwyg"}}
}

NYTimesPatterns = {
    "url"   : {},
    "title" : {'name' : 'title'},
    "body"  : {'selector' : ('div.StoryBodyCompanionColumn div p')}
}
domainPatterns = {
    "brookings.edu" : BrookingsPatterns,
    "nytimes.com"   : NYTimesPatterns
}

#Define the strategies to use for the websites!
BrookingsStrategy = {
    "url"   : justReturnTheUrl,
    "title" : searchForTextOfFirstMatch, 
    "body"  : searchForTagByParts
    }

NYTimesStrategy = {
    "url"   : justReturnTheUrl,
    "title" : searchForTextOfFirstMatch,
    "body" : searchForTagByParts
}
domainStrategies = {
    "brookings.edu"  : BrookingsStrategy,
    "nytimes.com"   :NYTimesStrategy
}

domains = ["brookings.edu", "nytimes.com"]

scraper = SingleDataModelScraper(Content, domainPatterns, domainStrategies, domains)

In [None]:
domains = ["ebay.com"]

class Parameter:
    def __init__(self, label, values):
        self.label = label 
        self.values = values
    def __str__(self):
        return f"Label: {str(self.label)}" + ", " + f"Value: {[value for value in self.values]}" + "\n"

class Listing:
    def __init__(self, name, price, parameters : List[Parameter]):
        self.name = name 
        self.price = price 
        self.parameters = parameters 
    
    def __str__(self):
        return f"Name: \t{str(self.name)}" + "\n" + f"Price: \t{self.price}" + "\n" + f"Parameters: {[str(parameter) for parameter in self.parameters][0]}"
parameters = [Parameter("Color", "Black"), Parameter("Color", "White")]
listing = Listing("Some name of a listing", "50 dollars", parameters)
print(listing)

#Specific strategies
def parameterTagToParameter(parameterTag):
    label = parameterTag.label.span.span.text
    values = []
    listOfValueTags = parameterTag.find({"name" : "", 
                            "attrs" : 
                                {
                                    "class" : "x-msku__select-box"
                                }})
    for valueTag in listOfValueTags:
        values.append(valueTag.text)
    return Parameter(label, values)

def getListAndTransformItsElementsToParameters(url, patterns):
    bs = getPage(url)
    parameters = bs.find(patterns["listpattern"])
    for parameterTag in parameters:
        parameter = parameterTagToParameter(parameterTag)

ebayStrategy = {
    "name"  : searchForTextOfFirstMatch,
    "price" : searchForTextOfFirstMatch,
    "parameters" : getListAndTransformItsElementsToParameters
}
ebayPatterns = {
    "name"       : {"name" : "h1", "attrs" : {"class" : "x-item-title__mainTitle"}},
    "price"      : {"name" : "span", "attrs" : {"class" : "ux-textspans"}},
    "parameters" : {"listpattern" : {"attrs" : {"class" : "vim x-msku mar-t-16", "data-testid" : "x-msku"}}},
}

domainStrategies2 = {"ebay.com" : ebayStrategy}
domainPatterns2 = {"ebay.com" : ebayPatterns}

scraper2 = SingleDataModelScraper(Listing, 
                       domainPatterns = domainPatterns2, 
                       domainStrategies = domainStrategies2, 
                       domains = ["ebay.com"])

scraper2("https://www.ebay.com/itm/175900855483?hash=item28f4820cbb:g:~HgAAOSwYZxlAcv2&amdata=enc%3AAQAIAAAA4J0jJ3Q5dqoGZEAodn%2B8vXjyi8HrMbuCrwdVPoFbx%2FBbTAOmMlxlh8cAXWBqN4C%2FgqIICnj%2Flm3OJOlaMNZCxjFsamek%2FDr0dW3iI890bPZD9OGXyACjohCI%2FvQsWfSYtXKnhoHM5dOh0s3D5mpn0FbeecvixKUzQbhbS%2FREAt1Ie9ZvXsWxJx61x0rWbo6ro4C9nw65fleMVLTLcxWAjACeMm967kwdygPnDkS%2Ft%2FmWKkOiJ5iKM9cJHH276GFWhnc7ac3BuPaCkhysP3%2Bp%2F28v802Ln9CXGn8jkSDH1alA%7Ctkp%3ABFBM0LqwuJBj&var=475333376890")

Name: 	Some name of a listing
Price: 	50 dollars
Parameters: Label: Color, Value: ['B', 'l', 'a', 'c', 'k']

There does not exist a strategy for the given domain!
