In [261]:
import json
import time
from urllib.parse import urljoin

import html5lib
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.bild.de/themen/ereignisse/klimawandel/news-nachrichten-news-fotos-videos-16877746.bild.html'
main = 'https://www.bild.de/'

In [3]:
# get page
def get_page(url):
    return BeautifulSoup(requests.get(url).content, 'html.parser')

In [29]:
def find_and_complete_links(page):
    # each article item has its own class; each page contains 10 article links (from 1 to Last)
    items = list(map(str, range(1, 10))) + ['Last']
    div_class = ['hentry landscape t10l item' + i for i in items]
    article_links = [page.find('div', class_=class_name).find('a')['href'] for class_name in div_class]
    article_links = list([main + i for i in article_links if not i.startswith(main)] + [i for i in article_links if i.startswith(main)])
    return article_links

In [25]:
# NOTE: the last page has no "next" link, so this raises IndexError there; callers must catch it.
def find_next_page(page):
    return main + page.find_all('a', class_='next hide-text')[0]['data-ajax-href']

In [None]:
# replace tag (default <br>) with (default) space to get meaningful text
def replace_tag(s, tags=['<br>'], sub=' '):
    s = str(s)
    for tag in tags:
        s = s.replace(tag, sub)
    return BeautifulSoup(s)

In [90]:
def clean_str(s, chars='\n\t\r', strip_chars=['\.',' '], only_lstrip=False):
    for char in chars:
        s = s.replace(char, '')
    if only_lstrip:
        for char in strip_chars:
            s = s.lstrip(char)
    else:
        for char in strip_chars:
            s = s.strip(char)
    return s

In [248]:
#returns (kicker, headline, subhead)
def get_title_info(art):
    try:
        kicker = replace_tag(art.main.find_all('h1')[0].find('span', class_='kicker')).get_text()
    except:
        try:
            kicker = replace_tag(art.find('article').find('header').find('span', class_='kicker')).get_text()
        except:
            kicker = ''
    try:
        headline = replace_tag(art.main.find_all('h1')[0].find('span', class_='headline')).get_text()
    except:
        try:
            headline = replace_tag(art.find('article').find('header').find('span', class_='headline')).get_text()
        except:
            headline = ''
    try:
        subhead = replace_tag(art.main.find_all('h2', class_='subhead')[0]).get_text()
    except:
        try:
            subhead = replace_tag(art.find('article').find('header').find('h2', class_='subhead')).get_text()
        except:
            subhead = ''
    return (kicker, headline, subhead)

In [92]:
# picture descriptions are also already in text
def get_pic_description(art):
    return [clean_str(i.get_text()) for i in art.find_all('figcaption')]

In [198]:
def get_date_time(art):
    raw = list(map(clean_str, art.find('div', class_='authors').get_text().split('-')))
    date = raw[0].split(' ')[-1]
    time = raw[1].split(' ')[0]
    return [date, time]

In [117]:
# Articles have no dedicated intro element, but the first <em>/<strong> of the body text can serve as a substitute.
def get_intro(art):
    try:
        em = art.find('div', class_='txt').find('em').get_text()
    except:
        em = ''
    try:
        strong = art.find('div', class_='txt').find('strong').get_text()
    except:
        strong = ''
    intro = em + strong
    if intro == '':
        return None
    else:
        return intro

In [227]:
# get all text from article
def get_clean_text(art):
    text = ''
    for i in art.find_all('div', class_='txt')[0].findAll('p'):
        text += clean_str(i.get_text().replace('\xa0', ' '))
    if text == '':
        text = None
    return text    

In [267]:
def wait(s):
    time.sleep(s)
    #print(s)
    return 0

In [280]:
def scrape_page(url):
    page = get_page(url)
    article_links = find_and_complete_links(page)
    for link in article_links:
        #wait(3)
        print(link)
        try:
            art = get_page(link)
        except:
            print('smth is wrong with get_page')
            return 0
        try:
            kicker, headline, subhead = get_title_info(art)
        except:
            kicker, headline, subhead = '', None, ''
        try:
            date_time = get_date_time(art)
            date = date_time[0]
            time = date_time[1]
        except:
            date, time = None, None
        try:
            text = get_clean_text(art)
        except:
            text = None
        article_dict = {'headline': headline,
                        'headline_intro': kicker + ' ' + subhead, 
                        'date': date,
                        'time': time, 
                        'intro': None, 
                        'text': text,
                        'thread': None}        
        yield article_dict

In [229]:
def write_article(article_dict, filename):
    with open(filename, 'w') as outfile:
        json.dump(article_dict, outfile)

In [290]:
page_num = 0
link = url
while page_num < 15:
    page_num += 1
    print(page_num)
    for art_num, art in enumerate(scrape_page(link)):
        write_article(art, 'C:\\Users\\Даша\\Downloads\\FH\\DQDA\\Projekt\\bild_corpus\\b_article_' + str(page_num) + 'p' + str(art_num) + '.txt')
        
    try:
        link = find_next_page(get_page(link))
    except:
        break

1
https://www.bild.de//regional/muenchen/muenchen-aktuell/radweg-da-kunden-weg-muenchner-stadtrat-ruft-klimanotstand-aus-66796256.bild.html
https://www.bild.de//news/wetter/wetter/wetter-rekord-hitze-australien-so-heiss-war-es-in-down-under-noch-nie-66797366.bild.html
https://www.bild.de//politik/inland/politik-inland/greta-klimawandel-und-feminismus-was-deutschlands-first-lady-ueber-diese-themen-66797636.bild.html
https://www.bild.de//bild-plus/ratgeber/wissenschaft/ratgeber/weisse-weihnachten-gab-es-die-frueher-wirklich-oefter-66760852.bild.html
https://www.bild.de//bild-plus/lifestyle/2019/ratgeber/weihnachten-klimafreundliche-miet-weihnachtsbaeume-im-bild-test-66600560.bild.html
https://www.bild.de//bild-plus/ratgeber/wissenschaft/ratgeber/klimawandel-im-fakten-check-werden-wir-2076-noch-leben-66751584.bild.html
https://www.bild.de//bild-plus/geld/wirtschaft/wirtschaft/klimapaket-so-teuer-wird-es-fuer-verbraucher-66741454.bild.html
https://www.bild.de//politik/ausland/politik-ausla

https://www.bild.de//regional/hamburg/hamburg-aktuell/luisa-neubauer-karriere-als-politikerin-ist-moeglich-66423380.bild.html
https://www.bild.de//reise/service/service/co2-kompensation-vier-tipps-fuer-klimabewusstes-reisen-66007656.bild.html
https://www.bild.de//politik/inland/politik-inland/csu-general-fordert-gruene-sollen-zahl-der-flugreisen-offenlegen-66408730.bild.html
https://www.bild.de//politik/ausland/politik-ausland/greta-thunberg-kommt-zu-spaet-zum-klimagipfel-in-madrid-66402988.bild.html
https://www.bild.de//bild-plus/politik/inland/politik-inland/klima-wissenschaftler-erklaeren-so-schlimm-sind-unsere-klima-daten-66389612.bild.html
https://www.bild.de//regional/berlin/berlin-aktuell/klimaaktivisten-von-ende-gelaende-stuermen-tagebaue-in-der-lausitz-66382630.bild.html
https://www.bild.de//politik/inland/politik-inland/klimanotstand-und-kurzfluege-kritik-an-verhalten-von-gruenen-politiker-66370196.bild.html
https://www.bild.de//regional/berlin/berlin-aktuell/beamte-posten-fr