# Foinse scraper pieces, ctd

> Scraping Foinse, from the Wayback Machine

- toc: false
- branch: master
- badges: true
- comments: false
- categories: [irish, scraper, foinse]

[Continued]({% post_url 2021-09-27-foinse_scraper_pieces %})

In [1]:
link = "http://web.archive.org/web/20171209002240/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois"

In [53]:
import requests
from bs4 import BeautifulSoup, Comment

In [20]:
def extract_summary(inlist):
    if len(inlist) > 2:
        if inlist[-2] == "Did you understand this story? Here are the main points:":
            return inlist[-1]
    return ""

In [19]:
def filter_para_list(inlist):
    out = []
    for para in inlist:
        if para == "":
            continue
        elif para.strip() == "Foinse - News as Gaeilge":
            return out
        elif para.strip() == "Did you understand this story? Here are the main points:":
            return out
        else:
            out.append(para)
    return out

In [None]:
def get_content(url, text=""):
    from bs4 import Comment

    out = {}
    if text:
        page_content = text
    else:
        page = requests.get(url)
        if page.status_code != 200:
            return {}
        page_content = page.text

    soup = BeautifulSoup(page_content, "lxml")

    content = soup.find("div", {"class": "item-page"})
    if not content:
        return {}
    
    breadcrumbs = soup.find("div", {"class": "ja-breadcrums"})
    if breadcrumbs:
        here = breadcrumbs.find("a", {"class": "pathway"})
        out["category"] = here.text.strip()
    
    # junk
    jc = content.find("div", {"id": "jc"})
    if jc:
        jc.extract()
    pagenav = content.find("ul", {"class": "pagenav"})
    if pagenav:
        pagenav.extract()
    for js in content.find_all("script", {"type": "text/javascript"}):
        js.extract()

    h2 = content.find("h2")
    if h2:
        title = h2.text.strip()
        if title:
            out["title"] = title
        h2.extract()

    h1 = content.find("h1")
    if h1:
        heading = h1.text.strip()
        if heading:
            out["subcategory"] = heading
        h1.extract()

    published_tag = content.find("dd", {"class": "published"})
    if published_tag:
        out["published"] = published_tag.text.strip()

    author_tag = content.find("dd", {"class": "createdby"})
    if author_tag:
        out["author"] = author_tag.text.strip()
    artinfo = content.find("dl", {"class": "article-info"})
    artinfo.clear()

    paragraphs_tags = content.find_all("p")
    paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
    out["text"] = paragraphs
    
    raw_text = content.text
    
    raw_out = []
    for raw_line in raw_text.split("\n"):
        line = raw_line.replace("\xa0", " ").strip()
        if line == "":
            continue
        raw_out.append(line)
    if paragraphs != raw_out:
        out["text"] = raw_out
        
    summary = extract_summary(out["text"])
    if summary:
        out["summary"] = summary
    out["text"] = filter_para_list(out["text"])

    vocab_list = []
    for vocab in content.find_all("a", {"class": "glossarylink"}):
        item = {}
        item["en"] = vocab.get("title").strip()
        item["ga"] = vocab.text.strip()
        vocab_list.append(item)
    out["vocab"] = vocab_list
    
    if not out:
        print(content)

    return out

In [24]:
page = requests.get(link)

soup = BeautifulSoup(page.text, "lxml")

content = soup.find("div", {"class": "item-page"})
if not content:
    print("Empty")


In [81]:
BASE_DIR = "/home/jim/Playing/foinseunpacked"
file = open(f"{BASE_DIR}/filt1", "r")
pages = []
for link in file.readlines():
    pages.append(link.strip())

In [93]:
foinse_data = []
with open("/home/jim/foinse-bad.txt", "w") as bad_list:
    for page in pages:
        print(page)
        page_path = BASE_DIR + page.strip()[1:]
        with open(page_path, "r") as pagef:
            plines = pagef.readlines()
            ptext = "\n".join(plines)
        content = get_content(page_path, ptext)
        if content:
            foinse_data.append(content)
        else:
            bad_list.write(page + "\n")

./web.archive.org/web/20171209002240/http:/www.foinse.ie/eagarfhocal-a-litreacha/litreacha/5975-jab-maith-a-dheanamh-agaibh
./web.archive.org/web/20171209002240/http:/www.foinse.ie/eagarfhocal-a-litreacha/diospoireacht-na-seachtaine/5982-colun-an-ghaeilge-agus-uachtaranacht-chomhairle-an-aontais-eorpaigh
./web.archive.org/web/20171209002240/http:/www.foinse.ie/eagarfhocal-a-litreacha/smaointe-fanacha/6038-comoradh-eiri-amach-na-casca-sa-bhliain-2016
./web.archive.org/web/20171209002240/http:/www.foinse.ie/sport/cumann-luthcleas-gael/6338-tacaionn-realtai-peile-agus-iomana-le-feachtas-gaeilge-clg
./web.archive.org/web/20171209002240/http:/www.foinse.ie/sport/eile/6417-cinneadh-deanta-ag-gatland-gan-odriscoll-a-roghnu-dfhoireann-an-tsathairn
./web.archive.org/web/20171209002240/http:/www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois
./web.archive.org/web/20171209002240/http:/www.foinse.ie/oideachas/foinse-og/6426-pleanag-glanadh-an-ti
./web.archive

./web.archive.org/web/20171209002304/http:/www.foinse.ie/oideachas/foinse-og/6426-pleanag-glanadh-an-ti
./web.archive.org/web/20171209002304/http:/www.foinse.ie/oideachas/foinse-og/6422-plean-siopa-nuachtan
./web.archive.org/web/20171209002304/http:/www.foinse.ie/oideachas/foinse-og/6425-plean-an-larionad-siopadoireachta
./web.archive.org/web/20171209002304/http:/www.foinse.ie/gneithe/tg4/6306-mor-shraith-ghrinn-do-tg4-seolta-i-dtir-chonaill
./web.archive.org/web/20171209002304/http:/www.foinse.ie/gneithe/blag-scannain/6344-man-of-steel-scannan-nua-superman-le-feiceail-sa-phictiurlann
./web.archive.org/web/20171209002304/http:/www.foinse.ie/gneithe/blag-scannain/6345-brad-pitt-sa-scannan-world-war-z
./web.archive.org/web/20171209002304/http:/www.foinse.ie/nuacht/naisiunta/6424-na-freagrai-ar-na-comhairliuchain-phoibli-i-dte-maidir-leis-an-ngaeilge-agus-an-ultais-foilsithe
./web.archive.org/web/20171209002304/http:/www.foinse.ie/nuacht/nuacht-is-deanai/6818-fear-ar-thug-an-psni-rabhadh-

./web.archive.org/web/20171209002330/http:/www.foinse.ie/gneithe/tg4/6306-mor-shraith-ghrinn-do-tg4-seolta-i-dtir-chonaill
./web.archive.org/web/20171209002330/http:/www.foinse.ie/gneithe/blag-scannain/6344-man-of-steel-scannan-nua-superman-le-feiceail-sa-phictiurlann
./web.archive.org/web/20171209002330/http:/www.foinse.ie/gneithe/blag-scannain/6345-brad-pitt-sa-scannan-world-war-z
./web.archive.org/web/20171209002330/http:/www.foinse.ie/nuacht/naisiunta/6424-na-freagrai-ar-na-comhairliuchain-phoibli-i-dte-maidir-leis-an-ngaeilge-agus-an-ultais-foilsithe
./web.archive.org/web/20171209002330/http:/www.foinse.ie/nuacht/nuacht-is-deanai/6818-fear-ar-thug-an-psni-rabhadh-faoi-an-tseachtain-seo-caite-gafa-i-ndoire
./web.archive.org/web/20171209002330/http:/www.foinse.ie/nuacht/nuacht-is-deanai/6821-fear-gafa-maidir-le-tionoisc-buille-is-teitheadh-sa-phriomhchathair
./web.archive.org/web/20171209002330/http:/www.foinse.ie/nuacht/nuacht-is-deanai/6822-seanoiri-ag-dul-i-mbun-agoide-maidir-le-

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 433: invalid start byte

In [None]:
get_content("http://web.archive.org/web/20121118021552/http://www.foinse.ie/nuacht/bluirini/5301-an-rath-ar-bhreagchuirt-ui-dhalaigh-gael-linn-2012")