## Script für Scraper

In [23]:
#!/usr/bin/env python
# coding: utf-8

# Prepare scraping

import os
from datetime import datetime
import pandas as pd
import requests

SOURCES_PATH = os.path.join("input", "web-sources.csv")
STORAGE_PATH = os.path.join("data-lake")

# Make sure the storage directory exists. Without it every open() in
# scrape_website raises and each source ends up in failing_list.
os.makedirs(STORAGE_PATH, exist_ok=True)

# Read sources (scrape_wrapper reads the "name" and "url" field of each row)
web_sources = pd.read_csv(SOURCES_PATH)
web_sources.head()  # preview only; not rendered unless it is the last expression of a cell

# Current date as string; used as prefix of every file written by this run
now = datetime.now()
now_str = now.strftime("%Y-%m-%d")
print("Date:", now_str)

# Module-level collectors, filled as side effects of the scraping functions
content_dict = {}   # name -> raw response bytes
text_dict = {}      # name -> decoded response text
log_list = []       # one dict of request metadata per scraped source
failing_list = []   # (name, url) tuples of sources whose scrape raised

def scrape_website(name, url, timeout=30):
    """Download one page, store its raw bytes and record request metadata.

    Side effects: writes ``<STORAGE_PATH>/<now_str>-<name>.html``, stores the
    body under ``name`` in ``content_dict`` (bytes) and ``text_dict`` (str),
    and appends one metadata dict to ``log_list``.

    HTTP error statuses are not raised; the status code is only logged.

    Args:
        name: Short identifier of the source; used in the file name and as
            dictionary key.
        url: Address to request (redirects are followed).
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on an unresponsive host.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the HTML file cannot be written.
    """
    # (1) Run request
    response = requests.get(url, allow_redirects=True, timeout=timeout)

    # (2) File name to store the raw HTML
    file_name = os.path.join(
        STORAGE_PATH,
        f"{now_str}-{name}.html",
    )

    # (3) Write raw HTML (bytes, so no re-encoding happens here)
    with open(file_name, "wb") as f:
        f.write(response.content)

    # (4) Fill content_dict and text_dict
    content_dict[name] = response.content
    text_dict[name] = response.text

    # (5) Fill log_list
    log_info = dict(
        name=name,
        date=now_str,
        file_name=file_name,
        status=response.status_code,
        original_url=url,
        final_url=response.url,  # differs from original_url after redirects
        encoding=response.encoding,
    )
    log_list.append(log_info)

def scrape_wrapper(newspaper):
    """Scrape one source and record a failure instead of raising.

    Args:
        newspaper: Row-like mapping with the keys ``"url"`` and ``"name"``.

    On failure the ``(name, url)`` pair is appended to ``failing_list`` and
    an error line including the cause is printed.
    """
    url = newspaper["url"]
    name = newspaper["name"]  # short identifier of the source
    try:
        scrape_website(name, url)
    except Exception as exc:
        # Best effort: one broken source must not stop the whole run.
        # Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit abort the run.
        failing_list.append((name, url))
        print(
            f"[ERROR] Failed to scrape: {name} ({url})"
            f" - {type(exc).__name__}: {exc}"
        )
    else:
        print(f"[INFO] Scraped {name} ({url})")


# Scrape every source. An explicit loop is used because scrape_wrapper is
# called only for its side effects and returns nothing worth collecting.
for _, newspaper in web_sources.iterrows():
    scrape_wrapper(newspaper)

# Write the request log of this run next to the stored HTML files
log_file_name = os.path.join(
    STORAGE_PATH,
    f"{now_str}.csv",
)
log_df = pd.DataFrame(log_list)
# index=False: the positional index carries no information and would come
# back as an "Unnamed: 0" column when the file is read again.
log_df.to_csv(log_file_name, index=False)

Date: 2021-04-23
[INFO] Scraped sz (https://www.sueddeutsche.de/)
[INFO] Scraped zeit (https://www.zeit.de/index)
[INFO] Scraped faz (https://www.faz.net/aktuell/)
[INFO] Scraped ts (https://www.tagesspiegel.de/)
[INFO] Scraped spiegel (https://www.spiegel.de/)
[INFO] Scraped kronen (https://www.krone.at/)


## Script für das DWH

In [25]:
#!/usr/bin/env python
# coding: utf-8

# Data Warehouse

import os
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
import sqlite3
import requests

STORAGE_PATH = os.path.join("data-lake")
SQL_PATH = os.path.join("dwh.sqlite3")

# Current date as string; selects the log file named after today's date
now = datetime.now()
now_str = now.strftime("%Y-%m-%d")

log_file_name = os.path.join(
    STORAGE_PATH,
    f"{now_str}.csv",
)
log_file = pd.read_csv(log_file_name)
log_file.head()  # preview only; not rendered unless it is the last expression of a cell

stopwords_url = "https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt"
# Number of leading lines dropped from the downloaded file
# (presumably its comment header - TODO confirm against the file).
STOPWORDS_SKIPPED_LINES = 9
STOPWORDS_TIMEOUT = 30  # seconds; without it the download can block forever

stopwords_response = requests.get(
    stopwords_url,
    allow_redirects=True,
    timeout=STOPWORDS_TIMEOUT,
)
# Fail loudly on HTTP errors; otherwise the text of an error page would
# silently be used as the stop-word list.
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.text.split("\n")[STOPWORDS_SKIPPED_LINES:]

def read_html_file(filename, encoding="utf-8"):
    """Return the content of a stored HTML file as text.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file (default UTF-8).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
        LookupError: If ``encoding`` is not a known codec name.
    """
    # Honour the caller's encoding instead of always decoding as UTF-8.
    with open(filename, "r", encoding=encoding) as f:
        return f.read()

def process_html(text):
    """Split text into lower-cased words and drop short words and stop words.

    Newlines are treated like spaces and the text is split on single space
    characters. Tokens of length 0 or 1 and tokens contained in the
    module-level ``stopwords_list`` are removed.

    Args:
        text: Plain text to tokenize.

    Returns:
        list[str]: Remaining tokens in their original order.
    """
    # A set makes each membership test O(1) instead of scanning the list
    # once per token.
    stopwords = set(stopwords_list)
    words = text.replace("\n", " ").lower().split(" ")
    return [word for word in words if len(word) > 1 and word not in stopwords]

def process_newspaper(newspaper):
    """Count the words of one stored newspaper page.

    Args:
        newspaper: Row-like mapping with the keys ``"file_name"``,
            ``"encoding"`` and ``"name"``.

    Returns:
        pandas.DataFrame: Indexed by word, with the columns ``count``,
        ``word``, ``name`` and ``date`` (``date`` is the module-level
        ``now_str``).
    """
    filename = newspaper["file_name"]
    encoding = newspaper["encoding"]
    # The logged encoding can be missing (it then arrives as NaN after the
    # CSV round trip), in which case .lower() would raise. Fall back to
    # UTF-8, the default of read_html_file.
    if isinstance(encoding, str) and encoding:
        encoding = encoding.lower()
    else:
        encoding = "utf-8"
    text = read_html_file(filename, encoding)
    # Strip the markup and keep only the visible text of the page
    bstext = BeautifulSoup(text, "html.parser").text
    items = process_html(bstext)
    count = pd.Series(items).value_counts().to_frame()
    count.columns = ["count"]
    count["word"] = count.index
    count["name"] = newspaper["name"]  # short identifier of the newspaper
    count["date"] = now_str
    return count

collection = []  # word-count frames of all successfully processed newspapers

def process_wrapper(newspaper):
    """Process one log row and collect its word counts; never raises on bad input.

    Args:
        newspaper: Row-like mapping; must contain ``"name"`` plus whatever
            ``process_newspaper`` reads.

    On success the resulting frame is appended to ``collection``. On failure
    an error line including the cause is printed and nothing is appended.
    """
    name = newspaper["name"]
    try:
        count = process_newspaper(newspaper)
    except Exception as exc:
        # Best effort: skip the broken page but say why. Catching Exception
        # (not a bare except) keeps KeyboardInterrupt / SystemExit working.
        print(f"[ERROR] Failed to process {name} - {type(exc).__name__}: {exc}")
    else:
        print(f"[INFO] Processing {name}")
        collection.append(count)

# Process every logged page. An explicit loop is used because
# process_wrapper is called only for its side effect on `collection`.
for _, newspaper in log_file.iterrows():
    process_wrapper(newspaper)

# pd.concat raises an opaque error on an empty list; fail with context instead.
if not collection:
    raise ValueError(f"No newspaper could be processed from {log_file_name}")

data = pd.concat(collection, axis=0)
print("Data shape:", data.shape)

lockdown = data.loc[data["word"] == "lockdown"]
lockdown.head()  # preview only; not rendered unless it is the last expression of a cell

# NOTE: if_exists="append" means re-running this on the same day stores
# the same rows again.
connection = sqlite3.connect(SQL_PATH)
try:
    data.to_sql("wordcount", connection, index=False, if_exists="append")
    connection.commit()
finally:
    # Always release the database file, even if the insert fails
    connection.close()

[INFO] Processing sz
[INFO] Processing zeit
[INFO] Processing faz
[INFO] Processing ts
[INFO] Processing spiegel
[INFO] Processing kronen
Data shape: (10136, 4)
