In [1]:
# Library used for making HTTP requests. To get the HTML.
import requests
# Optional, only used to display data later in table.
import pandas as pd

# lxml is a parsing library which parses HTML/XML. With
# this library we can select certain elements which we'd
# like to scrape.
from lxml import html

In [11]:
# A Session keeps the cookies (its CookieJar) between requests. This is
# needed in order for the website to 'remember' who we are: without a
# session we won't get past the cookie-wall.
session = requests.Session()

# Make the initial request to get the page.
# get(url, timeout=seconds)
# Always pass a timeout: without one, requests waits indefinitely on a
# server that stops responding and this cell would never finish.
resp = session.get('https://techcrunch.com', timeout=30)

# Fail right here on an HTTP error status (4xx/5xx) instead of saving
# and parsing an error page.
resp.raise_for_status()

# It's a good idea to write the response to a file, for the
# following reasons:
# - manually inspect the page (did you get what you expected?)
# - reprocess the page without making additional requests
#   (quicker, but also helps against blocking)

# open() returns a file object; after you are done with a file
# (reading or writing), you should ALWAYS close it (Windows users
# often know the message: "file in use"). "with" is the Python
# statement which closes the file automatically when the block ends.

# open(file, mode). Mode 'wb' = write binary. resp.content holds the
# raw bytes of the response, so the file is opened in binary mode.
with open('00_intial_page.html', 'wb') as f:
    f.write(resp.content)

# Parse the response using lxml so we can scrape the content.
page = html.fromstring(resp.content)

In [16]:
# We use css selectors to select the elements. The returned
# object is always a list, so we select the first form.
# cssselect(css_selector)
consent_forms = page.cssselect('div.techcrunch>form')
if not consent_forms:
    # Same exception type a bare [0] would raise, but with a message
    # that says what is missing.
    raise IndexError("no form matched 'div.techcrunch>form' in the parsed page")
form_consent = consent_forms[0]

# When submitting a form, your browser will gather all input
# elements and form a dictionary { name: value }.
# We do the same, but then manually form the request.
# The selector 'input[name]' guarantees the name attribute exists; the
# value attribute is optional in HTML, so a missing one becomes an
# empty string instead of raising KeyError.
data = {
    field.attrib['name']: field.get('value', '')
    for field in form_consent.cssselect('input[name]')
}

data

{'consentCollectionStep': 'EU_SINGLEPAGE',
 'previousStep': '',
 'csrfToken': 'QIHXXnSaQfXaKxKZzegl90w0RtLccMo8',
 'jurisdiction': '',
 'locale': 'en-US',
 'doneUrl': 'https://guce.techcrunch.com/copyConsent?sessionId=3_cc-session_2ef7d91e-a799-45c3-9ca9-9a22605f18e8&inline=false&lang=en-US',
 'tosId': 'eu',
 'sessionId': '3_cc-session_2ef7d91e-a799-45c3-9ca9-9a22605f18e8',
 'namespace': 'techcrunch',
 'originalDoneUrl': 'https://techcrunch.com/?guccounter=1',
 'inline': 'false',
 'startStep': 'EU_SINGLEPAGE',
 'isSDK': 'false',
 'brandBid': '5c10rrdf3ldqp',
 'userType': 'NON_REG',
 'country': 'NL',
 'ybarNamespace': 'TECHCRUNCH',
 'agree': 'agree'}

In [18]:
# We submit this data to the consent url (found while inspecting
# network traffic).
# post(url, data=..., timeout=seconds)
# The timeout keeps this cell from hanging forever when the server
# stops responding.
resp = session.post('https://consent.yahoo.com/consent', data=data, timeout=30)

# Fail right here on an HTTP error status (4xx/5xx) instead of saving
# and parsing an error page.
resp.raise_for_status()

# Again write the output to a file, after submitting the consent form.
with open('01_after_consent.html', 'wb') as f:
    f.write(resp.content)

# Again, parse the page. bytes.decode() without arguments decodes as
# UTF-8.
# NOTE(review): this assumes the returned page is UTF-8 encoded - confirm.
page = html.fromstring(resp.content.decode())

In [5]:
def parse_article(article):
    """Extract the details of one article from its div.post-block element.

    Parameters
    ----------
    article : element
        The element of a single article. It must offer
        cssselect(css_selector), returning a list of matching elements.

    Returns
    -------
    dict
        Keys 'title', 'url', 'datetime' and 'content'. A value is None
        when the element (or attribute) it is read from is not present,
        so a single incomplete block does not abort the whole scrape.
    """
    def first(selector):
        # cssselect always returns a list -> take the first match, or
        # None when nothing matched.
        matches = article.cssselect(selector)
        return matches[0] if matches else None

    def stripped_text(element):
        # - text_content() returns the text content of the element.
        # - strip() removes any leading or trailing whitespace, which
        #   often is present in HTML due to formatting (beautify).
        # Compare with "is not None": found elements are not tested for
        # truthiness here, because lxml's element truth value is based
        # on child count, not on existence.
        return element.text_content().strip() if element is not None else None

    # The title link carries both the title text and the article url,
    # so it is looked up only once.
    title_link = first('h2.post-block__title>a')
    time_element = first('time')
    content_block = first('div.post-block__content')

    # get(name) reads the content of an attribute instead of an element
    # and returns None when the attribute is absent.
    return {
        'title': stripped_text(title_link),
        'url': title_link.get('href') if title_link is not None else None,
        'datetime': time_element.get('datetime') if time_element is not None else None,
        'content': stripped_text(content_block)
    }

# Every article has a .post-block div, for each of these articles
# call parse_article to get the details.
articles = [parse_article(block) for block in page.cssselect('div.post-block')]

# Load the articles into a dataframe, so we can do further processing
# (i.e. displaying it as a table). The columns are named explicitly so
# the frame has them even when no article was found; otherwise the
# column selection below raises KeyError on an empty result.
df = pd.DataFrame(articles, columns=['title', 'url', 'datetime', 'content'])
df[['datetime', 'title', 'content']]

Unnamed: 0,datetime,title,content
0,2020-02-04T05:55:42-08:00,Tinder’s handling of user data is now under GD...,Dating app Tinder is the latest tech service t...
1,2020-02-04T05:26:45-08:00,Elon Musk promotes Texas ‘career day’ as Space...,SpaceX’s next-generation spacecraft is already...
2,2020-02-04T05:17:52-08:00,What is going on with Tesla?,Shares of American electric car company Tesla ...
3,2020-02-04T05:16:08-08:00,Google’s location tracking finally under forma...,Google’s lead data regulator in Europe has fin...
4,2020-02-04T05:00:56-08:00,Online-to-offline platform Sendoso raises $40M...,Email is garbage and we’re all buried in the s...
5,2020-02-04T05:00:31-08:00,"Hinge Health, the digital solution for chronic...","Hinge Health, the San Francisco-based startup ..."
6,2020-02-04T05:00:26-08:00,Monday.com 2.0 workflow platform lets companie...,"Monday.com, announced version 2.0 of its flexi..."
7,2020-02-04T04:32:32-08:00,UK Council websites are letting citizens be pr...,On the same day that a data ethics advisor to ...
8,2020-02-04T04:15:56-08:00,"Nomagic, a startup out of Poland, picks up $8....",Factories and warehouses have been two of the ...
9,2020-02-04T04:00:47-08:00,Payments infra startup Finix closes $35M Serie...,"This morning Finix, a software-as-a-service (S..."
