In [1]:
import streamlit as st
import pandas as pd
import requests

from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

In [2]:
# Root of the Projekt Gutenberg site, written WITHOUT a trailing slash: every URL
# built from it in this notebook uses the pattern f"{BASE_URL}/...", so a trailing
# slash here would yield "//" in the resulting path.
BASE_URL = "https://www.projekt-gutenberg.org"

In [3]:
# NOTE(review): calling `st.cache(...)` with only keyword options returns a
# decorator (the wrapper function shown in this cell's output). The result is
# discarded here, so this statement does not cache any function in the notebook.
# Presumably it was meant to be applied as `@st.cache(...)` to a function such
# as `scrape_author` — TODO confirm.
st.cache(suppress_st_warning = True)

<function streamlit.runtime.legacy_caching.caching.cache.<locals>.wrapper(f: ~F) -> ~F>

In [82]:
def scrape_author(author, timeout = 30):
    """Scrape an author's page on Projekt Gutenberg together with all listed books.

    Progress messages are printed, and every book that is found is additionally
    rendered as a markdown link through Streamlit.

    Args:
        author: Author name as used in the site's URL scheme; it is lower-cased
            before being inserted into the URL.
        timeout: Seconds to wait for the author page before ``requests`` gives
            up. Only applies to the request issued by this function itself.

    Returns:
        ``None`` if the page does not answer with HTTP 200 or cannot be parsed.
        Otherwise a dict with the keys ``"data"`` (DataFrame with the scraped
        rows of all books plus an ``"Autor"`` column holding the upper-cased
        author name), ``"books"``, ``"info"`` and ``"image_url"``.
    """
    # BASE_URL may end with a slash; strip it so the path does not start with "//".
    url = f"{BASE_URL.rstrip('/')}/autoren/namen/{author.lower()}.html"
    print(f"Scrape Author [{author}] {url}")

    # Without a timeout a stalled connection would block this call forever.
    res = requests.get(url, timeout = timeout)

    # Author not found
    if res.status_code != 200:
        print(f"Author {author} wurde nicht gefunden!")
        return None

    print(f"Author {author} wurde gefunden!")

    try:
        declared_encoding = EncodingDetector.find_declared_encoding(res.content, is_html = True)
        author_site = BeautifulSoup(res.content, "lxml", from_encoding = declared_encoding)
    except Exception:
        print("Error während dem Decoden")
        return None

    # Dictionary with everything we know about the author
    infos = {"data"     : None,
             "books"    : _find_books(author_site),
             "info"     : _find_info(author_site),
             "image_url": _find_image(author_site)
            }

    # Collect the per-book frames and concatenate once; concatenating inside
    # the loop copies all previously collected rows on every iteration.
    book_frames = []

    # `book_url` (not `url`) so the author page URL above is not overwritten.
    for title, book_url in infos["books"]:
        st.markdown(f"[{title}]({book_url})")
        print(f"Scrape Buch '{title}' [{book_url}]")

        book_frames.append(_scrape_book(book_url))

    # pd.concat raises on an empty list, so fall back to an empty frame.
    df_all = pd.concat(book_frames, ignore_index = True) if book_frames else pd.DataFrame()
    df_all["Autor"] = author.upper()

    infos["data"] = df_all

    print(f"Gefundene Sätze: {df_all.shape}")

    return infos
        

In [48]:
# Smoke test: scrape the author page for "Kafka" (performs live HTTP requests).
# NOTE(review): the helpers `scrape_author` relies on (`_find_books`,
# `_find_info`, `_find_image`, `_scrape_book`) are defined in cells further
# down, so with "Restart Kernel -> Run All" this call would hit a NameError as
# soon as it reaches them. Run this cell after those definitions.
scrape_author("Kafka")

Scrape Author [Kafka] https://www.projekt-gutenberg.org//autoren/namen/kafka.html
Author Kafka wurde gefunden!


In [6]:
def _find_books(books):
    """Collect the books linked on a parsed author page.

    Args:
        books: Parsed author page (the object must offer BeautifulSoup's
            ``find`` API).

    Returns:
        List of ``(title, url)`` tuples, one per ``<li>`` that contains a link
        inside the ``<div class="archived">`` element. ``url`` is the link
        target with its last path component removed. An empty list is
        returned when the page has no such ``<div>``.
    """
    archive = books.find("div", {"class": "archived"})
    if archive is None:
        return []

    base = BASE_URL.rstrip("/")   # avoid "//" when BASE_URL ends with a slash
    # Book links are presumably written relative to the author page, i.e. they
    # start with "../../" — TODO confirm against the live markup.
    parent_prefix = "../../"

    book_url = []

    for item in archive.find_all("li"):
        anchor = item.find("a", href = True)
        if anchor is None:
            # List entry without a link: nothing to scrape.
            continue

        # `.string` is None when the anchor holds nested markup; fall back to
        # the concatenated text so the title is never the literal "None".
        book_title = anchor.string
        if book_title is None:
            book_title = anchor.get_text()

        href = anchor["href"]
        # Only strip the prefix when it is really there instead of blindly
        # cutting the first six characters.
        if href.startswith(parent_prefix):
            href = href[len(parent_prefix):]

        # Drop the final path component (the page file) to get the book directory.
        url = f"{base}/{href.lstrip('/')}".rsplit("/", 1)[0]

        book_url.append((book_title, url))

    return book_url

In [36]:
def _find_info(author_site):
    try:
        return author_site.find_all("p")[1].text
    except:
        return None
    

In [47]:
def _find_image(author_site):
    """Build the URL of the author's picture from a parsed author page.

    The first ``<img>`` element that has both a ``src`` and a ``title``
    attribute is used.

    Args:
        author_site: Parsed author page (the object must offer
            BeautifulSoup's ``find`` API).

    Returns:
        The image URL below ``<BASE_URL>/autoren/``, or ``None`` when the page
        contains no matching ``<img>`` element.
    """
    image = author_site.find("img", src = True, title = True)

    # Explicit check instead of a bare `except:` around the subscript.
    if image is None:
        return None

    src = image["src"]
    # The picture is presumably referenced relative to the page's directory
    # ("../...") — TODO confirm. Strip that prefix only when it is present
    # instead of blindly cutting three characters.
    if src.startswith("../"):
        src = src[3:]

    # rstrip avoids "//" when BASE_URL ends with a slash.
    return f"{BASE_URL.rstrip('/')}/autoren/{src}"
    

In [78]:
def _get_soup(url, timeout):
    """Download ``url`` and parse it with lxml, using the encoding declared in the HTML."""
    res = requests.get(url, timeout = timeout)
    declared_encoding = EncodingDetector.find_declared_encoding(res.content, is_html = True)
    return BeautifulSoup(res.content, "lxml", from_encoding = declared_encoding)


def _scrape_book(url, timeout = 30):
    """Scrape every sub-page linked from a book's index page into sentences.

    Each ``<li>`` of the index page that contains a link is treated as a
    sub-chapter. The text of every sub-chapter is split at ``"."`` and the
    pieces are filtered through ``_correction``. A Streamlit progress bar is
    shown while the sub-chapters are downloaded.

    Args:
        url: URL of the book directory / index page.
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up.

    Returns:
        DataFrame with a single column ``"Satz"``, one row per kept sentence.
    """
    book_site = _get_soup(url, timeout)

    # Join directory and file name with exactly one "/", whether or not either
    # side already carries one; plain concatenation would glue a relative
    # href directly onto the directory name.
    base = url.rstrip("/")

    subchapters_links = []
    for sub in book_site.find_all("li"):
        link = sub.find("a", href = True)
        if link is None:
            # List entry without a link: nothing to download.
            continue
        subchapters_links.append(f"{base}/{link['href'].lstrip('/')}")

    # Gather the sentences in a plain list; appending to a DataFrame row by
    # row re-allocates the frame on every insert.
    sentences = []

    progressbar = st.progress(0)

    for index, temp_url in enumerate(subchapters_links):
        progressbar.progress((index + 1) / len(subchapters_links))

        chapter_text = _find_text(_get_soup(temp_url, timeout))
        sentences.extend(chapter_text.split("."))

    progressbar.empty()

    # Actually remove rejected sentences. Assigning `series.map(...).dropna()`
    # back to the column realigns on the index, which re-inserts the dropped
    # rows as NaN instead of removing them.
    kept = [satz for satz in sentences if _correction(satz) is not None]

    return pd.DataFrame(kept, columns = ["Satz"])

In [79]:
def _correction(string):
    if len(string) < 4:
        return None
    else:
        return string

In [80]:
def _find_text(books):
    text = ""
    
    for paragraph in books.find_all("p"):
        if paragraph.string:
            text = text + paragraph.text
    
    return text
        