In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from jupyter_progressbar import ProgressBar
import os
from dnrepresentations.parsing import ForumPost, ForumThread, Author, ForumBoard, Database

## Now let's scrape all the files with my new classes

In [3]:
def write_thread_to_database(thread: ForumThread):
    """Placeholder for writing a single thread to the database.

    Not implemented: calling it always raises ``NotImplementedError``.
    """
    raise NotImplementedError

In [4]:
def yield_path_of_html_files(root_path="data/digitalnippon.de", html_folders=None):
    """Yield the path of every thread HTML file below the forum dump.

    Args:
        root_path: Directory that contains the top-level forum folders.
        html_folders: Names of the top-level folders (relative to
            ``root_path``) to search. Defaults to the six forum sections
            that were hard-coded before. Folders that do not exist are
            silently skipped (``os.walk`` yields nothing for them).

    Yields:
        Paths (``root_path`` joined with the relative location) of files
        ending in ``.htm`` / ``.html`` whose base name does not end in
        ``"POLL"``.

    The order is deterministic: folders are visited in the order given,
    and inside each folder sub-directories and files are visited in sorted
    order. Plain ``os.walk`` returns entries in arbitrary directory-listing
    order, which made run-to-run results (and anything derived from the
    order, such as insertion order into a database) non-reproducible.
    """
    if html_folders is None:
        html_folders = (
            "anime-manga-japan",
            "digital-nippon",
            "digital-nippon-archiv",
            "medientitel",
            "rpg-forest",
            "smalltalk",
        )
    html_extensions = (".htm", ".html")
    for directory in html_folders:
        for root, dirs, files in os.walk(os.path.join(root_path, directory)):
            # Sorting in place makes os.walk (top-down) descend in sorted order.
            dirs.sort()
            for file in sorted(files):
                # Filter non-HTML files and poll pages
                basename, ext = os.path.splitext(file)
                if ext not in html_extensions or basename.endswith("POLL"):
                    continue
                yield os.path.join(root, file)

In [5]:
# Exhaust the path generator once to learn how many files there are; the
# count is passed to the progress bar as its size when the threads are read.
nr_files = len(list(yield_path_of_html_files()))
print(nr_files)

16490


In [6]:
# Read threads
# Root of the scraped dump. Board paths are stored relative to it; this
# replaces the former `path[22:]` slice, whose 22 was the length of
# "data/digitalnippon.de/" and silently broke if the root ever changed.
forum_root = "data/digitalnippon.de"
threads = []
board_paths = set()
for path in ProgressBar(yield_path_of_html_files(forum_root), size=nr_files):
    # The directory containing a thread file, relative to the root, names its board.
    # Recorded before parsing, so boards of unparsable threads are still collected.
    board_paths.add(os.path.relpath(os.path.dirname(path), forum_root))
    thread = ForumThread(path)
    try:
        thread.parse()
    except AttributeError as e:
        # Best effort: report the file and skip it. The recorded output shows
        # these as "'NoneType' object has no attribute 'group'" failures.
        print(path, e)
        continue
    except Exception:
        # Anything else is unexpected: show the offending file, then re-raise.
        print(path)
        raise
    threads.append(thread)

VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…

data/digitalnippon.de/digital-nippon/news/dn-rewatch-club-vol-4-anmeldung_t14082.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/digital-nippon/news/dn-watch-club-vol-19-anmeldung_t20929.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/digital-nippon/news/support/frhling-2011-on-air-threadfhrung_t13472.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/digital-nippon/news/support/winter-2016-on-air-threadbetreuung_t22414.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/digital-nippon-archiv/geschlossene-bereiche/discussion-area-philosophie/one-night-stands_t1242.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/smalltalk/kissaten-/wie-esst-ihr-gurken_t19100.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/smalltalk/kissaten-/bestes-werk-von-tsuzuki-masaki_t18358.html 'NoneType' object has no attribute 'group'
data/digitalnippon.de/smalltalk/kissaten-/archiv/op-sec

In [7]:
# One board path per line (set iteration order, so not sorted).
print("\n".join(board_paths))

digital-nippon/news
digital-nippon-archiv/geschlossene-bereiche/pc-internet-smartphone
digital-nippon-archiv/geschlossene-bereiche/nippon-news
rpg-forest/rpg-archiv/beta
rpg-forest/rpg-archiv/galvanizers-game
rpg-forest/rpg-archiv/nachtalben
medientitel/movies
rpg-forest/rpg-archiv/genesis
medientitel/anime
digital-nippon-archiv/geschlossene-bereiche/music
digital-nippon-archiv/geschlossene-bereiche/creative-corner
medientitel/novels-
rpg-forest/rpg-archiv/mirrors-call
medientitel/manga
rpg-forest/rpg-archiv/der-boshin-krieg
rpg-forest/rpg-archiv/pain-prophet
medientitel/games
rpg-forest/rpg-archiv/crystals-power
digital-nippon-archiv/geschlossene-bereiche/discussion-area-philosophie
medientitel/tv
rpg-forest/rpg-archiv/endzeit
digital-nippon/news/newcomers
digital-nippon/news/support
smalltalk/kissaten-/cons-usermeetings
rpg-forest/rpg-archiv/das-kartenhaus
rpg-forest/rpg-archiv/himitsu
rpg-forest/rpg-archiv/restauration
rpg-forest/rpg-archiv/dark-rift
digital-nippon-archiv/geschlosse

In [8]:
# Parse one Author per file found directly inside the members directory.
member_walker = os.walk("data/digitalnippon.de/mitglieder")
# Only the first os.walk entry (the top-level directory itself) is consumed,
# so files in sub-directories are not visited.
member_root, _, member_files = next(member_walker)
authors = []
for file in member_files:
    full_path = os.path.join(member_root, file)
    author = Author(full_path)
    try:
        author.parse()
    except Exception:
        # Name the failing profile file before letting the error propagate.
        print(full_path)
        raise
    else:
        authors.append(author)

In [9]:
# One ForumBoard per distinct board path collected while reading the threads.
boards = [ForumBoard(path=board_path) for board_path in board_paths]

In [10]:
# (post for thread in threads for post in thread.posts)

# Now for the database code

In [11]:
# Database handle from dnrepresentations.parsing, opened on a SQLite URL
# (presumably a file named digitalnippon.sql relative to the working
# directory — confirm against the Database class).
db = Database("sqlite:///digitalnippon.sql")

In [12]:
# Presumably creates the tables for the parsed classes — TODO confirm in Database.create_all.
db.create_all()

In [13]:
# Session object used for every add/commit in the cells below.
session = db.get_session()

In [14]:
# Stage the parsed authors and the boards on the session.
# Nothing is committed here; the commit happens in the following cell.
session.add_all(authors)
session.add_all(boards)

In [15]:
# Commit the staged authors and boards. Later cells read their `.id`
# attributes (presumably assigned by the database on commit — confirm).
session.commit()

In [17]:
# Add parent references to boards
# Index the boards by path once, so each parent lookup is a dict access
# instead of a scan over all boards. Iterating in reverse keeps the FIRST
# board for a given path, which is what the previous linear scan returned.
boards_by_path = {b.path: b for b in reversed(boards)}
for board in boards:
    # The parent board is the one whose path is this board's containing directory.
    parent_path = os.path.split(board.path)[0]
    board_parent = boards_by_path.get(parent_path)
    if board_parent is None:
        # No board exists for the containing directory (e.g. a top-level
        # section folder): leave parent_id untouched.
        continue
    board.parent_id = board_parent.id

In [18]:
# Commit the parent_id values assigned to the boards in the previous cell.
session.commit()

In [19]:
# Add board references to threads
# Root of the scraped dump; replaces the former `thread.path[22:]` slice,
# whose 22 was the length of "data/digitalnippon.de/".
forum_root = "data/digitalnippon.de"
# Path -> board index (reverse iteration keeps the first board per path),
# so each thread costs one dict lookup instead of a scan over all boards.
boards_by_path = {b.path: b for b in reversed(boards)}
for thread in threads:
    # A thread belongs to the board named by its containing directory,
    # expressed relative to the dump root.
    parent_board_path = os.path.relpath(os.path.dirname(thread.path), forum_root)
    # Raises KeyError naming the missing path if no such board exists.
    parent_board = boards_by_path[parent_board_path]
    thread.board_id = parent_board.id

In [20]:
# Stage and commit the threads, now carrying their board_id.
# The post loop in the next cell reads thread.id afterwards.
session.add_all(threads)
session.commit()

In [21]:
# Link every post to its thread and to an Author, collecting them in `posts`.
posts = []
# Name -> Author index. This replaces a linear scan over `authors` for every
# single post. Iterating in reverse keeps the FIRST author for a duplicated
# name, which is what the previous scan returned.
authors_by_name = {a.name: a for a in reversed(authors)}
for thread in threads:
    for post in thread.posts:
        post.thread_id = thread.id
        author_name = post.author_string
        author = authors_by_name.get(author_name)
        if author is None:
            # The poster has no parsed member profile: create a bare Author
            # (empty path) carrying only the name, so the post can reference it.
            author = Author("")
            author.name = author_name
            session.add(author)
            # Committed immediately; presumably required so that author.id
            # is populated before it is read below — confirm.
            session.commit()
            authors.append(author)
            authors_by_name[author_name] = author
        post.author_id = author.id
        posts.append(post)

In [22]:
# Stage and commit all posts, which now carry thread_id and author_id.
session.add_all(posts)
session.commit()