# Foinse scraper pieces, continued

> Scraping Foinse, from the Wayback Machine

- toc: false
- branch: master
- badges: true
- comments: false
- categories: [irish, scraper, foinse]

Continued from [the previous post]({% post_url 2021-09-27-foinse_scraper_pieces %}).

In [1]:
# Wayback Machine snapshot of a Foinse sports article; used as the sample
# input for the cells below.
link = "http://web.archive.org/web/20171209002240/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois"

In [53]:
import requests
from bs4 import BeautifulSoup, Comment

In [20]:
def extract_summary(inlist):
    """Return the summary line that follows the "main points" marker.

    The summary is recognised only when the marker line
    "Did you understand this story? Here are the main points:" is the
    second-to-last item of ``inlist``; the last item is then the summary.

    Args:
        inlist: List of text lines/paragraphs of an article, in document order.

    Returns:
        The last item of ``inlist`` when the item before it is the marker
        (surrounding whitespace ignored); otherwise the empty string.
    """
    marker = "Did you understand this story? Here are the main points:"
    # Two items (marker + summary) are sufficient. The marker is compared
    # after strip(), matching how filter_para_list() detects the same line.
    if len(inlist) >= 2 and inlist[-2].strip() == marker:
        return inlist[-1]
    return ""

In [19]:
def filter_para_list(inlist):
    """Keep the leading article paragraphs, dropping empties and the trailer.

    Items are copied in order. Items equal to the empty string are skipped.
    Copying stops at the first item that, once stripped, equals either the
    site footer line or the "main points" marker line; that item and
    everything after it are discarded.

    Args:
        inlist: List of paragraph/line strings.

    Returns:
        A new list with the retained items.
    """
    stop_lines = (
        "Foinse - News as Gaeilge",
        "Did you understand this story? Here are the main points:",
    )
    kept = []
    for entry in inlist:
        if entry == "":
            continue
        if entry.strip() in stop_lines:
            break
        kept.append(entry)
    return kept

In [75]:
def get_content(url, timeout=30):
    """Fetch an article page and extract its metadata, text and glossary.

    Args:
        url: Address of the article page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled server could block the call indefinitely.

    Returns:
        An empty dict when the response status is not 200 or the page has
        no ``div.item-page`` element. Otherwise a dict with:

        - ``"text"``: list of non-empty, stripped text lines of the article
          body, cut off by ``filter_para_list`` (always present);
        - ``"vocab"``: list of ``{"en": ..., "ga": ...}`` dicts built from
          ``a.glossarylink`` elements (always present, possibly empty);
        - ``"category"``, ``"title"``, ``"subcategory"``, ``"published"``,
          ``"author"``, ``"summary"``: present only when the corresponding
          element/line was found.

    Every optional page element is checked for ``None`` before use, so a
    page that lacks one of them yields a partial result instead of raising
    ``AttributeError``.
    """
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        return {}

    soup = BeautifulSoup(page.text, "lxml")

    content = soup.find("div", {"class": "item-page"})
    if not content:
        return {}

    out = {}

    # The breadcrumb trail lives outside the article container, hence the
    # lookup on the whole document.
    breadcrumbs = soup.find("div", {"class": "ja-breadcrums"})
    if breadcrumbs:
        here = breadcrumbs.find("a", {"class": "pathway"})
        if here is not None:
            out["category"] = here.text.strip()

    # junk: remove elements whose text must not end up in the article body
    for tag_name, attrs in (("div", {"id": "jc"}), ("ul", {"class": "pagenav"})):
        junk = content.find(tag_name, attrs)
        if junk is not None:
            junk.extract()
    for js in content.find_all("script", {"type": "text/javascript"}):
        js.extract()

    # Headings are recorded and then removed so they are not repeated in
    # the body text collected further down.
    for tag_name, key in (("h2", "title"), ("h1", "subcategory")):
        heading = content.find(tag_name)
        if heading is None:
            continue
        heading_text = heading.text.strip()
        if heading_text:
            out[key] = heading_text
        heading.extract()

    for css_class, key in (("published", "published"), ("createdby", "author")):
        info_tag = content.find("dd", {"class": css_class})
        if info_tag is not None:
            out[key] = info_tag.text.strip()
    # Empty the info list only after reading it, so its text is excluded
    # from the body.
    artinfo = content.find("dl", {"class": "article-info"})
    if artinfo is not None:
        artinfo.clear()

    # Body text: every non-empty line of the remaining container text, with
    # non-breaking spaces normalised. Working from the container text
    # (rather than only <p> elements) keeps text that sits outside <p> tags.
    lines = []
    for raw_line in content.text.split("\n"):
        line = raw_line.replace("\xa0", " ").strip()
        if line:
            lines.append(line)
    out["text"] = lines

    # The summary must be read from the full line list, before
    # filter_para_list() cuts the list at the marker line.
    summary = extract_summary(lines)
    if summary:
        out["summary"] = summary
    out["text"] = filter_para_list(lines)

    # A glossary link without a title attribute yields an empty "en" value
    # rather than an error.
    out["vocab"] = [
        {"en": (gloss.get("title") or "").strip(), "ga": gloss.text.strip()}
        for gloss in content.find_all("a", {"class": "glossarylink"})
    ]

    return out

In [24]:
# Manual check of the first steps of get_content() on the sample link:
# download the page, parse it with lxml, and look for the article container.
page = requests.get(link)

soup = BeautifulSoup(page.text, "lxml")

content = soup.find("div", {"class": "item-page"})
if not content:
    # The page has no div.item-page element.
    print("Empty")


In [76]:
# Run the extractor on the sample sports article and display the result.
get_content(link)

{'category': 'Spórt',
 'title': 'An dornálaí John Joe Nevin rangaithe ag uimhir a haon anois',
 'subcategory': 'Eile',
 'published': 'Foilsithe an Dé Máirt, 02 Iúil 2013 16:12',
 'author': 'Scríofa ag Úna Ní Eidhin',
 'text': ['Tá sé fógartha inniu go bhfuil ardú céime faighte ag an dornálaí Éireannach, John Joe Nevin agus uimhir a haon bainte amach aige i rangú coileachmheáchain an domhain.',
  'Fuair Nevin an t-ardú céime seo tar éis dó bonn óir a bhuachan ag Craobhchomórtais Dornálaíochta Amaitéaracha na hEorpa ag tús na míosa seo caite sa Bhealarúis. D’fhoilsigh an Cumann Idirnáisiúnta Dornálaíochta Amaitéaraí an rangú athbhreithnithe seo inniu.',
  'Mar thoradh air seo, tá dhá dhornálaí ag Éirinn anois atá rangaithe ag uimhir a haon sa domhan – Nevin agus Katie Taylor. Tá na dornálaithe Éireannacha Paddy Barnes agus Michael Conlon rangaithe ag uimhir a trí ina rannáin féin maidir le meáchan.'],
 'vocab': [{'en': 'boxer', 'ga': 'dornálaí'},
  {'en': 'ranking', 'ga': 'rangú'},
  {'e

In [78]:
# Second sample: a news article whose result also contains a 'summary' entry.
get_content("http://web.archive.org/web/20140401140521/http://foinse.ie/nuacht/nuacht-is-deanai/6817-john-gilligan-scaoilte-saor-on-bpriosun")

{'category': 'Nuacht',
 'title': 'John Gilligan scaoilte saor ón bpríosún',
 'subcategory': 'Nuacht is Déanaí',
 'published': 'Foilsithe an Dé Máirt, 15 Deireadh Fómhair 2013 11:30',
 'author': 'Scríofa ag Úna Ní Eidhin',
 'text': ['Táthar tar éis an gáinneálaí drugaí ciontaithe, John Gilligan (61), a scaoileadh saor ó Phríosún Phort Laoise ar maidin inniu.',
  'Gearradh pianbhreith de 28 bliain sa phríosún ar Gilligan ar dtús i mí an Mhárta 2001 ach baineadh ocht mbliana don phianbhreith seo mar thoradh ar achomharc a rinne sé sa bhliain 2003. Gearradh pianbhreitheanna eile air ansin, as nithe a rinne sé as bealach fad is a bhí sé istigh sa phríosún.',
  'D’fhág sé an príosún ag 9.30am ar maidin agus d’imigh sé i gcarr le beirt fhear eile.',
  'Bhí Gilligan á thriail freisin as dúnmharú an iriseora, Veronica Guerin, blianta siar ach éigiontaíodh é sa bhliain 2001 sa Chúirt Choiriúil Speisialta.'],
 'summary': 'Convicted drug trafficker John Gilligan has been released from Portlaoise P