In [None]:
# number of article URLs in each category according to data/urls_depth_3.json

import json

with open("data/urls_depth_3.json", "r") as f:
    data = json.load(f)

# category -> number of URLs listed for that category
url_dict = {category: len(category_urls) for category, category_urls in data.items()}
for category, url_count in url_dict.items():
    print(f"{category}: {url_count}")

# URLs summed over every category
total = sum(url_dict.values())

In [None]:
# raw vs. de-duplicated URL count for the "Videnskab" category
videnskab_urls = data["Videnskab"]
len(videnskab_urls), len(set(videnskab_urls))

In [None]:
# total number of URLs summed over all categories
total

In [None]:
# Estimated time needed to process all `total` articles.
# NOTE(review): 3.26 is presumably the measured throughput in articles per second - confirm.
sec = total/3.26
minutes = sec/60 # named `minutes` (not `min`) so the built-in min() is not shadowed in later cells
hour = minutes/60
# each value is the whole duration expressed in that unit, not an h/m/s breakdown
print(f"Total time: {hour} hours, {minutes} minutes, {sec} seconds")

In [None]:
# files present in each category subfolder of data/wiki_depth_3 vs. the number of URLs listed for it
import os

for category in data:
    article_files = os.listdir(f'data/wiki_depth_3/{category}')
    print(f"{category}: {len(article_files)} / {url_dict[category]}")

In [None]:
# files present in each category subfolder of data/wiki_depth_1 vs. the number of URLs in url_dict
# NOTE(review): url_dict is built from data/urls_depth_3.json, so the right-hand number
# may not be the expected count for the depth-1 crawl - confirm.
import os

for category in data:
    article_files = os.listdir(f'data/wiki_depth_1/{category}')
    print(f"{category}: {len(article_files)} / {url_dict[category]}")

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import re
import json
import html2text

In [None]:
# crawl depth to load; selects the data/wiki_depth_{depth} folder read by the loading cell
depth = 1

In [None]:
# Shared HTML -> text converter used by clean_html.
# Option reference: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md
h = html2text.HTML2Text()
for _option, _value in {
    "ignore_links": True,
    "ignore_emphasis": True,
    "ignore_images": True,
    "ignore_tables": True,
    "body_width": 0,
    "unicode_snob": True,
    "single_line_break": True,
}.items():
    setattr(h, _option, _value)

def clean_html(raw_text: str):
    """ 
    Cleans text for html tags and returns it as a single line of plain text.

    The html is converted to text with the module-level converter ``h``. Quote
    padding, bullet markers, bracketed spans, header markers and invisible
    characters are then removed, lines that lack closing punctuation get a ".",
    and all remaining lines are joined with single spaces.
    
    Args:
        raw_text (str): Raw text with html tags
    
    Returns:
        str: Cleaned text
    """
    t = h.handle(raw_text) # convert html to text
    # remove unnecessary spaces inside quotes like: " text " --> "text".
    # Each quoted span is rewritten in place, so identical text that appears
    # outside quotes elsewhere in the document is left untouched.
    t = re.sub(r'\"(.+?)\"', lambda m: f'"{m.group(1).strip()}"', t)
    t = re.sub(" +", " ",t).strip() # collapse runs of spaces and trim both ends
    t = re.sub(r' ([.,;:?!)}\]])', r'\1', t) # remove spaces before punctuations and closing brackets
    t = t.replace("*", "").strip() # remove bullet points
    t = re.sub(r'\[.*?\]', '', t) # remove all inside [] - used for editing wikipage texts (like: [ redigér | rediger kildetekst] ) + marking links (like: [4])
    t = re.sub(r'#', '', t) # avoid #'s that mark headers

    lines = [line.strip() for line in t.split("\n")] # obtain a list of lines
    chunks = []
    curr_chunk = ""
    for l in lines: # combine lines into text chunks - empty lines separate chunks
        if "Hentet fra" in l: # skip lines containing "Hentet fra" (Danish for "Retrieved from")
            continue
        if len(l)>0:
            # add punctuations where missing
            l = l if l[-1] in [":", ";", "!", "?", "."] else l + "." 
            curr_chunk += " " + l
        else:
            if len(curr_chunk)>0:
                chunks.append(curr_chunk.strip())
            curr_chunk = ""
    if len(curr_chunk)>0: # add last text chunk
        # strip like every other chunk; otherwise its leading space yields a double space after joining
        chunks.append(curr_chunk.strip())
    t = " ".join(chunks)
    t = t.replace("\xad", "") # remove soft hyphens (https://en.wikipedia.org/wiki/Soft_hyphen)
    t = t.replace("\xa0", " ") # replace non-breaking space (https://en.wikipedia.org/wiki/Non-breaking_space)
    t = t.replace("\u200b", "") # remove zero width space (https://en.wikipedia.org/wiki/Zero-width_space)
    t = t.replace("\\", "") # remove backslashes
    return t.strip()


In [None]:
# Load the stored article files for one category and build a DataFrame from them.
# Each file was stored using .write(title + "\t" + url + "\t" + category + "\t" + text + "\n").
titles = []
texts = []
categories = []
urls = []
category_root = f"data/wiki_depth_{depth}"
for category in tqdm(os.listdir(category_root), desc="Loading data"):
    if category != "Uddannelse": # only the "Uddannelse" category is loaded
        continue
    for file in os.listdir(f"{category_root}/{category}"):
        with open(f"{category_root}/{category}/{file}", "r") as f:
            text_file = f.read()
        # split on the first three tabs only: the text is the last field and may
        # itself contain tabs, which must not truncate it
        title, url, stored_category, raw_html = text_file.split("\t", 3)
        titles.append(title)
        urls.append(url)
        categories.append(stored_category)
        texts.append(clean_html(raw_html))
# NOTE: this rebinds `data` (the URL dict loaded from JSON in the first cell) to a DataFrame
data = pd.DataFrame({"title": titles, "text": texts, "category": categories, "url": urls})

In [None]:
# display the DataFrame built from the loaded article files
data