# Foinse scraper pieces

> Scraping Foinse, from the Wayback Machine

- toc: false
- branch: master
- badges: true
- comments: false
- categories: [irish, scraper, foinse]

In [1]:
# Wayback Machine snapshot of a single Foinse article; fetched in the cells below.
link = "http://web.archive.org/web/20171209002240/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois"

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the archived article page.
page = requests.get(link)
# Stop here unless the request returned HTTP 200; later cells parse page.text.
assert page.status_code == 200

Working on the raw text alone, most of the surrounding page markup can be discarded by splitting on the `<!-- CONTENT -->` and `<!-- //CONTENT -->` HTML comments:

In [4]:
# Keep only what follows the opening CONTENT marker. `trim` is always bound
# (empty when the marker is absent) so the next cell can test it without
# raising NameError.
trim = ""
if "<!-- CONTENT -->" in page.text:
    trim = page.text.split("<!-- CONTENT -->")[1]

In [5]:
# Cut off everything from the closing CONTENT marker onwards.
if trim and "<!-- //CONTENT -->" in trim:
    trim = trim.split("<!-- //CONTENT -->")[0]

... but it's easier with BeautifulSoup to just extract `<div class="item-page">`

In [6]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(page.text, "lxml")

In [7]:
# The article is held in <div class="item-page">; find() gives None if it is absent.
content = soup.find("div", {"class": "item-page"})

In [8]:
# Headline: the text of the first <h2> inside the article container.
title = content.find("h2").text.strip()

In [9]:
# <dd class="published"> holds the publication line of the article info list.
published_tag = content.find("dd", {"class": "published"})

In [10]:
# Bind `published` unconditionally (None when the page has no publication
# line) so the name is never left undefined or stale from an earlier run.
published = published_tag.text.strip() if published_tag else None

In [11]:
# <dd class="createdby"> holds the byline. `author` is always bound (None when
# the byline is missing) so displaying it in the next cell cannot raise NameError.
author_tag = content.find("dd", {"class": "createdby"})
author = author_tag.text.strip() if author_tag else None

In [12]:
author

'Scríofa ag Úna Ní Eidhin'

In [13]:
# Collect the <p class="MsoNormal"> elements inside the article container.
paragraphs_tags = content.find_all("p", {"class": "MsoNormal"})

In [14]:
# Plain text of each paragraph, with non-breaking spaces (\xa0) replaced by ordinary spaces.
paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]

In [15]:
# Collect the glossary links embedded in the paragraphs: the link text is the
# Irish term ("ga") and the link's title attribute is the English gloss ("en").
vocab_list = []
for p in paragraphs_tags:
    for vocab in p.find_all("a", {"class": "glossarylink"}):
        vocab_list.append(
            {
                # get() returns None for a missing attribute; fall back to ""
                # instead of crashing on None.strip().
                "en": (vocab.get("title") or "").strip(),
                "ga": vocab.text.strip(),
            }
        )

In [16]:
# A second archived Foinse article, fetched in the next cell.
check = "http://web.archive.org/web/20171222073817/http://www.foinse.ie/nuacht/nuacht-is-deanai/6822-seanoiri-ag-dul-i-mbun-agoide-maidir-le-ciorruithe"

In [17]:
# Download the second article and confirm the request returned HTTP 200.
page2 = requests.get(check)
assert page2.status_code == 200

In [22]:
def get_content(url, timeout=30):
    """Download a Foinse article page and extract its fields.

    Args:
        url: Address of the article page (for example a Wayback Machine
            snapshot URL).
        timeout: Seconds to wait for the server, passed to ``requests.get``
            so that a stalled connection cannot hang the notebook forever.

    Returns:
        A dict with ``"text"`` (list of paragraph strings) and ``"vocab"``
        (list of ``{"en": ..., "ga": ...}`` glossary entries), plus
        ``"title"``, ``"published"`` and ``"author"`` when the page provides
        them. An empty dict is returned when the response status is not 200
        or the page has no ``<div class="item-page">``.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (connection failures, timeouts).
    """
    out = {}
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        return {}

    soup = BeautifulSoup(page.text, "lxml")

    content = soup.find("div", {"class": "item-page"})
    if not content:
        return {}

    # find() returns None when there is no <h2>; skip the title rather than
    # crash on None.text.
    title_tag = content.find("h2")
    if title_tag:
        title = title_tag.text.strip()
        if title:
            out["title"] = title

    published_tag = content.find("dd", {"class": "published"})
    if published_tag:
        out["published"] = published_tag.text.strip()

    author_tag = content.find("dd", {"class": "createdby"})
    if author_tag:
        out["author"] = author_tag.text.strip()

    # Every <p> inside the article container counts as body text;
    # non-breaking spaces (\xa0) become ordinary spaces.
    paragraphs_tags = content.find_all("p")
    out["text"] = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]

    # Glossary links: link text is the Irish term, title attribute the
    # English gloss. A missing title attribute yields "" instead of raising.
    vocab_list = []
    for p in paragraphs_tags:
        for vocab in p.find_all("a", {"class": "glossarylink"}):
            vocab_list.append(
                {
                    "en": (vocab.get("title") or "").strip(),
                    "ga": vocab.text.strip(),
                }
            )
    out["vocab"] = vocab_list

    return out

In [24]:
# Fetch and parse the first article again; the cells below modify this
# parsed tree in place by calling clear() on elements found in it.
page = requests.get(link)

soup = BeautifulSoup(page.text, "lxml")

content = soup.find("div", {"class": "item-page"})
# Only reports the problem: the following cells still call methods on `content`.
if not content:
    print("Empty")


In [26]:
# Locate the <div id="jc"> block inside the article
# (presumably the comments section; confirm against the page markup).
jc = content.find("div", {"id": "jc"})

In [28]:
# Empty the element in place. find() returns None when the page has no such
# block, so skip quietly in that case instead of raising AttributeError.
if jc is not None:
    jc.clear()

In [30]:
# Locate the <ul class="pagenav"> list inside the article.
pagenav = content.find("ul", {"class": "pagenav"})

In [32]:
# Empty the navigation list in place. find() returns None when the page has
# no such list, so skip quietly in that case instead of raising AttributeError.
if pagenav is not None:
    pagenav.clear()

In [33]:
content

<div class="item-page">
<h1>
	Eile	</h1>
<h2>
<a href="/web/20171223171100/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois">
		An dornálaí John Joe Nevin rangaithe ag uimhir a haon anois</a>
</h2>
<dl class="article-info">
<dt class="article-info-term">Sonraí</dt>
<dd class="published">
	Foilsithe an Dé Máirt, 02 Iúil 2013 16:12	</dd>
<dd class="createdby">
				Scríofa ag Úna Ní Eidhin 		</dd>
</dl>
<p>Tá sé fógartha inniu go bhfuil ardú céime faighte ag an <a class="glossarylink" href="/web/20171223171100/http://www.foinse.ie/component/glossary/Glossary-1/D/dorn%C3%A1la%C3%AD-2914/?Itemid=593" title="boxer ">dornálaí</a> Éireannach, John Joe Nevin agus uimhir a haon bainte amach aige i <a class="glossarylink" href="/web/20171223171100/http://www.foinse.ie/component/glossary/Glossary-1/R/rang%C3%BA-2878/?Itemid=593" title="ranking ">rangú</a> coileachmheáchain an domhain.</p>
<p>Fuair Nevin an t-ardú céime seo tar éis dó <a class="glossa

In [19]:
def filter_para_list(inlist):
    """Return the paragraphs of ``inlist`` that precede the site footer.

    Empty strings are dropped. Collection stops at the first paragraph that,
    once stripped of surrounding whitespace, equals
    "Foinse - News as Gaeilge"; that paragraph and everything after it are
    left out.
    """
    kept = []
    for text in inlist:
        # The footer line marks the end of the article proper.
        if text.strip() == "Foinse - News as Gaeilge":
            break
        if text != "":
            kept.append(text)
    return kept

In [20]:
def extract_summary(inlist):
    """Return the trailing English summary paragraph, or "" if there is none.

    The summary is the last item of ``inlist``, and is only returned when the
    list has more than two items and the second-to-last item is exactly the
    "Did you understand this story? ..." lead-in line.
    """
    lead_in = "Did you understand this story? Here are the main points:"
    has_summary = len(inlist) > 2 and inlist[-2] == lead_in
    return inlist[-1] if has_summary else ""