In [1]:
import os
from datetime import datetime
import pandas as pd
import requests

C:\Users\jonas\Anaconda3\envs\ML\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\jonas\Anaconda3\envs\ML\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


## Manually scraping sz.de

In [2]:
# Short address of the news site; the request below follows redirects,
# and the final URL reported by the response is https://www.sueddeutsche.de/
url = "http://sz.de"

In [3]:
# Fetch the page, following redirects. Always pass a timeout: without one,
# requests waits indefinitely if the server never answers.
response = requests.get(url, allow_redirects=True, timeout=10)

In [4]:
# HTTP status code of the response we ended up with (redirects were followed)
response.status_code

200

In [5]:
# Peek at the first 700 characters of the decoded (str) body
print(response.text[:700], "[...]")

<!doctype html>
<html lang="de">

<head>
    <meta charset="utf-8" />
    <title>Aktuelle Nachrichten, Hintergründe und Kommentare - SZ.de</title>
    <link rel="canonical" href="https://www.sueddeutsche.de" />
    <meta name="robots" content="index,follow,noarchive,noodp" />
    <meta name="author" content="Süddeutsche.de GmbH, Munich, Germany" />
    <meta name="copyright" content="Süddeutsche.de GmbH, Munich, Germany" />
    <meta name="viewport" content="width=1280" />
    <meta name="email" content="kontakt@sueddeutsche.de" />
    <meta name="description" content="News aus Deutschland und aller Welt mit Kommentaren und Hintergrundberichten auf Süddeutsche.de." />
    <meta name="keyword [...]


In [6]:
# Peek at the first 700 bytes of the raw (undecoded) body
print(response.content[:700], "[...]")

b'<!doctype html>\n<html lang="de">\n\n<head>\n    <meta charset="utf-8" />\n    <title>Aktuelle Nachrichten, Hintergr\xc3\xbcnde und Kommentare - SZ.de</title>\n    <link rel="canonical" href="https://www.sueddeutsche.de" />\n    <meta name="robots" content="index,follow,noarchive,noodp" />\n    <meta name="author" content="S\xc3\xbcddeutsche.de GmbH, Munich, Germany" />\n    <meta name="copyright" content="S\xc3\xbcddeutsche.de GmbH, Munich, Germany" />\n    <meta name="viewport" content="width=1280" />\n    <meta name="email" content="kontakt@sueddeutsche.de" />\n    <meta name="description" content="News aus Deutschland und aller Welt mit Kommentaren und Hintergrundberichten auf S\xc3\xbcddeutsche.de." />\n    <meta name="key' [...]


In [7]:
# Character encoding requests uses to decode response.content into response.text
response.encoding

'UTF-8'

In [8]:
# Final URL of the response; differs from `url` because redirects were followed
response.url

'https://www.sueddeutsche.de/'

In [9]:
# Time between sending the request and receiving the response (a timedelta)
response.elapsed

datetime.timedelta(microseconds=175513)

In [14]:
# Store the raw bytes (not the decoded text) so the file is an exact copy of
# what the server sent. Create the target directory first so the cell also
# works on a fresh checkout where "local" does not exist yet.
out_dir = "local"
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "sz.html"), "wb") as f:
    f.write(response.content)

## Automatically scraping multiple websites

In [15]:
# Directory (relative to the working directory) that receives the scraped
# HTML files and the CSV request log
STORAGE_DIR = "local"

In [16]:
# News pages to scrape: short name -> start page URL.
# The short name is reused as file-name suffix and as key in the result dicts.
newspaper_urls = {
    "sz": "https://www.sueddeutsche.de/",
    "zeit": "https://www.zeit.de/index",
    "faz": "https://www.faz.net/aktuell/",
    "ts": "https://www.tagesspiegel.de/",
    "spiegel": "https://www.spiegel.de/",
    "kronen": "https://www.krone.at/",
    # Host that does not resolve; presumably included to exercise the failure path
    "wtf": "https://asdfkajwlkejwkejklajsdflksadjfasdf.nix",
}

In [17]:
# Today's date formatted as YYYY-MM-DD; used as prefix for every output file
now = datetime.now()
now_str = f"{now:%Y-%m-%d}"
print(now_str)

2021-04-23


### Target Objects

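We will create four objects:

- content_dict: a dict with the raw HTML content (bytes) of the pages we scraped

- text_dict: a dict with the decoded HTML text of the same pages

- log_list: a list with metadata about our requests

- failing_list: a list of (name, url) pairs for pages whose scrape raised an error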

In [18]:
# Containers filled by scrape_website() and by the loop that calls it
content_dict = dict()  # name -> raw response body (bytes)
text_dict = dict()  # name -> decoded response body (str)
log_list = list()  # one metadata dict per request that completed without raising
failing_list = list()  # (name, url) pairs whose scrape raised an exception

### Scraper function

In [19]:
def scrape_website(name, url, timeout=10):
    """Download one page, save its raw HTML and record request metadata.

    Side effects on module-level objects:

    - writes ``<STORAGE_DIR>/<now_str>-<name>.html`` with the raw response bytes
      (the directory is created if it does not exist yet)
    - sets ``content_dict[name]`` (bytes) and ``text_dict[name]`` (decoded str)
    - appends one metadata dict to ``log_list``

    HTTP error statuses (4xx/5xx) are not raised; the page is still stored and
    the status code is recorded in the log entry.

    Parameters
    ----------
    name : str
        Short key of the page; used in the file name and as dict key.
    url : str
        Address to request. Redirects are followed.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a non-responding server would block forever.

    Returns
    -------
    None

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails (e.g. connection error, timeout).
    OSError
        If the storage directory or the HTML file cannot be written.
    """

    # (1) Run request
    response = requests.get(url, allow_redirects=True, timeout=timeout)

    # (2) File name to store the raw HTML (make sure the directory exists)
    os.makedirs(STORAGE_DIR, exist_ok=True)
    file_name = os.path.join(
        STORAGE_DIR,
        f"{now_str}-{name}.html",
    )

    # (3) Write raw HTML (bytes, so the file is independent of the guessed encoding)
    with open(file_name, "wb") as f:
        f.write(response.content)

    # (4) Fill content_dict and text_dict
    content_dict[name] = response.content
    text_dict[name] = response.text

    # (5) Fill log_list
    log_info = dict(
        name=name,
        date=now_str,
        file_name=file_name,
        status=response.status_code,
        url=response.url,  # final URL after redirects
        encoding=response.encoding,
    )
    log_list.append(log_info)

### Execute

In [20]:
# Scrape every configured page; a failure on one page must not stop the others.
for name, url in newspaper_urls.items():
    try:
        scrape_website(name, url)
    except Exception as exc:
        # Catch `Exception` instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still abort the loop, and
        # report the error type instead of swallowing it silently.
        print(f"Scraping {name!r} failed: {type(exc).__name__}")
        failing_list.append((name, url))

### Check

In [21]:
# One row per logged request; the columns come from the keys of the log dicts
log_df = pd.DataFrame(log_list)
log_df

Unnamed: 0,name,date,file_name,status,url,encoding
0,sz,2021-04-23,local\2021-04-23-sz.html,200,https://www.sueddeutsche.de/,UTF-8
1,zeit,2021-04-23,local\2021-04-23-zeit.html,200,https://www.zeit.de/index,UTF-8
2,faz,2021-04-23,local\2021-04-23-faz.html,200,https://www.faz.net/aktuell/,utf-8
3,ts,2021-04-23,local\2021-04-23-ts.html,200,https://www.tagesspiegel.de/,utf-8
4,spiegel,2021-04-23,local\2021-04-23-spiegel.html,200,https://www.spiegel.de/,utf-8
5,kronen,2021-04-23,local\2021-04-23-kronen.html,200,https://www.krone.at/,ISO-8859-1


In [22]:
# CSV log for today's run, stored alongside the scraped HTML files
log_file_name = os.path.join(STORAGE_DIR, now_str + ".csv")

In [23]:
# Persist the request log; with pandas' defaults the row index is written
# as the first (unnamed) column
log_df.to_csv(log_file_name)

In [24]:
# (name, url) pairs for which scrape_website raised an exception
failing_list

[('wtf', 'https://asdfkajwlkejwkejklajsdflksadjfasdf.nix')]