# Download & Preprocess Wikinews articles

In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import pandas as pd
from pathlib import Path
import subprocess

In [None]:
def download(split, tag):
    """Download the Wikinews pages listed for *split* using ``wget``.

    Reads ``../data/wikinews/urls/urls_{split}.tsv`` (tab-separated) and, for
    every row whose *tag* column is not empty, saves the page into
    ``../data/wikinews/download/{split}/``. The output of ``wget`` is appended
    to ``../log/wikinews_download_{split}.log``.

    Args:
        split: Name of the data split; selects the URL table, the download
            directory and the log file.
        tag: Name of the column holding the URL. ``"url"`` saves to
            ``{topic}_{id}.html``; ``"Other candidate"`` saves to
            ``{topic}_{id}_extra.html``.

    Raises:
        ValueError: If *tag* is not one of the supported column names.
    """
    # Validate before touching the file system: an unsupported tag has no
    # file-naming rule, so nothing could be saved for it anyway.
    filename_suffixes = {"url": "", "Other candidate": "_extra"}
    if tag not in filename_suffixes:
        raise ValueError(
            f"unsupported tag {tag!r}; expected one of {sorted(filename_suffixes)}"
        )
    suffix = filename_suffixes[tag]

    filepath_csv = f"../data/wikinews/urls/urls_{split}.tsv"

    dirpath_download = Path(f"../data/wikinews/download/{split}")
    dirpath_download.mkdir(parents=True, exist_ok=True)

    filepath_log = Path(f"../log/wikinews_download_{split}.log")
    filepath_log.parent.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(filepath_csv, sep='\t')
    # Append rather than truncate: the same split may be downloaded once per
    # tag, and each call should keep the log of the previous one.
    with open(filepath_log, "a") as log:
        for _, row in df.iterrows():
            if pd.isna(row[tag]):
                continue

            topic = row['category'].strip().lower().replace(' - ', '_').replace(' ', '_')
            idx = row["id"]
            url = row[tag].strip()
            filepath_html = dirpath_download / f"{topic}_{idx}{suffix}.html"

            returncode = subprocess.call(
                ["wget", url, "-O", str(filepath_html)],
                stdout=log,
                stderr=log,
            )
            if returncode != 0:
                # Keep going with the remaining rows, but make the failure visible.
                print(f"wget failed (exit code {returncode}): {url}")

In [None]:
# Phrases that mark the start of the text following an article.
# preprocess() stops collecting paragraphs at the first <p> containing one.
article_end_phrases = [
    "Have an opinion on this story?",
    "Share this:",
    "This page is archived, and is no longer publicly editable.",
]

In [None]:
def _extract_article_text(soup):
    """Return the title and article paragraphs of a parsed page as one string.

    Returns None when the document has no <title> or no <body>.
    """
    if soup.title is None or soup.body is None:
        return None

    # title
    pieces = [soup.title.text.replace(' - Wikinews, the free news source', '') + '\n']

    for p in soup.body.find_all(['p']):
        paragraph = p.text

        # post article irrelevant text
        if any(phrase in paragraph for phrase in article_end_phrases):
            break

        # remove line breaks
        if paragraph.strip() == "":
            continue

        # date: <p><strong><span id="publishDate">...; .get() tolerates a
        # <span> that carries no id attribute at all
        span = p.strong.span if p.strong is not None else None
        if span is not None and span.get("id") == "publishDate":
            pieces.append(paragraph.replace("\n", "") + "\n")
        else:
            pieces.append(paragraph)

    return "".join(pieces).strip()


def preprocess(split):
    """Convert the downloaded HTML pages of *split* into plain-text files.

    For every ``*.html`` file in ``../data/wikinews/download/{split}/`` this
    writes, into ``../data/wikinews/preprocess/{split}/``, a ``.txt`` file with
    the page title followed by the article paragraphs, and an ``.ann`` file
    containing a single newline. Files without a ``<title>`` or ``<body>`` are
    reported and skipped.

    Args:
        split: Name of the data split; selects the input and output directories.
    """
    dirpath_download = Path(f"../data/wikinews/download/{split}")

    dirpath_preprocess = Path(f"../data/wikinews/preprocess/{split}")
    dirpath_preprocess.mkdir(parents=True, exist_ok=True)

    for filepath in dirpath_download.glob("*.html"):

        # Explicit encoding so the result does not depend on the machine's locale.
        with open(filepath, 'r', encoding="utf-8") as f:
            html_doc = f.read()

        text = _extract_article_text(BeautifulSoup(html_doc, 'html.parser'))
        if text is None:
            print(f"skipping {filepath}: no <title> or <body> found")
            continue

        with open(dirpath_preprocess / f"{filepath.stem}.txt", "w", encoding="utf-8") as f:
            f.write(text + "\n")
        with open(dirpath_preprocess / f"{filepath.stem}.ann", "w", encoding="utf-8") as f:
            f.write("\n")

In [None]:
# Create the log directory; safe to re-run when it already exists.
Path("../log/").mkdir(parents=True, exist_ok=True)

In [None]:
# dev split: fetch the pages from the "url" column, then extract their text.
download(split="dev", tag="url")
preprocess(split="dev")

In [None]:
# test split: fetch the pages from both URL columns, then extract their text.
download(split="test", tag="url")
download(split="test", tag="Other candidate")
preprocess(split="test")