# Foinse scraper pieces

> Scraping Foinse, from the Wayback Machine

- toc: false
- branch: master
- badges: true
- comments: false
- categories: [irish, scraper, foinse]

In [1]:
# Sample archived Foinse article (a Wayback Machine snapshot) used by the cells below.
link = "http://web.archive.org/web/20171209002240/http://www.foinse.ie/sport/eile/6412-an-dornalai-john-joe-nevin-rangaithe-ag-uimhir-a-haon-anois"

In [53]:
import requests
from bs4 import BeautifulSoup, Comment

In [3]:
# Fetch the sample article; stop here unless the server answered with HTTP 200.
page = requests.get(link)
assert page.status_code == 200

In purely text terms, much of the junk can be discarded by splitting on the `<!-- CONTENT -->` and `<!-- //CONTENT -->` HTML comments that surround the article:

In [4]:
# Keep only what follows the opening marker. `trim` is bound to None when the
# marker is missing, so cells that test `trim` afterwards do not hit a NameError.
if "<!-- CONTENT -->" in page.text:
    trim = page.text.split("<!-- CONTENT -->")[1]
else:
    trim = None

In [5]:
# Keep only what precedes the closing marker. partition() hands back the whole
# string as its first element when the marker is absent, so `trim` is then unchanged.
if trim:
    trim = trim.partition("<!-- //CONTENT -->")[0]

... but it's easier with BeautifulSoup to just extract `<div class="item-page">`

In [6]:
# Parse the fetched HTML with the lxml parser.
soup = BeautifulSoup(page.text, "lxml")

In [7]:
# The article container; find() gives None when the page has no such div.
content = soup.find("div", {"class": "item-page"})

In [8]:
# The first <h2> inside the article container is used as the title.
title = content.find("h2").text.strip()

In [9]:
# <dd class="published"> carries the publication line; None when absent.
published_tag = content.find("dd", {"class": "published"})

In [10]:
# Always bind `published` (None when the tag is absent) so the name exists
# regardless of which article was fetched.
published = published_tag.text.strip() if published_tag else None

In [11]:
# <dd class="createdby"> carries the byline; None when absent.
author_tag = content.find("dd", {"class": "createdby"})
# Always bind `author` (None when the tag is absent) so that displaying it in
# the next cell cannot raise a NameError.
author = author_tag.text.strip() if author_tag else None

In [12]:
author

'Scríofa ag Úna Ní Eidhin'

In [13]:
# Collect the <p class="MsoNormal"> tags of the article.
paragraphs_tags = content.find_all("p", {"class": "MsoNormal"})

In [14]:
# Plain text of each paragraph, with non-breaking spaces (\xa0) turned into ordinary spaces.
paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]

In [15]:
# Glossary links inside the paragraphs: the anchor's `title` attribute is stored
# under "en" and the anchor text under "ga".
vocab_list = [
    {"en": anchor.get("title").strip(), "ga": anchor.text.strip()}
    for para_tag in paragraphs_tags
    for anchor in para_tag.find_all("a", {"class": "glossarylink"})
]

In [16]:
# A second archived article, fetched in the next cell as an additional check.
check = "http://web.archive.org/web/20171222073817/http://www.foinse.ie/nuacht/nuacht-is-deanai/6822-seanoiri-ag-dul-i-mbun-agoide-maidir-le-ciorruithe"

In [17]:
# Fetch the second article; stop here unless the server answered with HTTP 200.
page2 = requests.get(check)
assert page2.status_code == 200

In [64]:
def get_content(url, timeout=None):
    """Fetch an archived Foinse article and return its parts as a dict.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float or tuple, optional
        Passed straight through to ``requests.get``. The default ``None``
        keeps the previous behaviour of waiting without a time limit.

    Returns
    -------
    dict
        Empty when the response status is not 200 or the page has no
        ``<div class="item-page">``. Otherwise it always contains:

        * ``"text"``: stripped text of every ``<p>`` in the article, with
          non-breaking spaces replaced by ordinary spaces;
        * ``"raw_text"``: the remaining non-empty lines of the article text
          that are not identical to one of the entries in ``"text"``;
        * ``"vocab"``: one ``{"en": ..., "ga": ...}`` dict per
          ``<a class="glossarylink">`` (``"en"`` from the ``title``
          attribute, ``"ga"`` from the link text).

        ``"title"`` (from ``<h2>``) and ``"h1"`` (from ``<h1>``) are added
        when the tag exists and has non-empty text; ``"published"`` and
        ``"author"`` are added when their ``<dd>`` tags exist.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        return {}

    soup = BeautifulSoup(page.text, "lxml")

    content = soup.find("div", {"class": "item-page"})
    if content is None:
        return {}

    out = {}

    # Junk: remove these elements before any text is read. Each of them may be
    # missing on a given page, so only detach what was actually found.
    for tag_name, attrs in (("div", {"id": "jc"}), ("ul", {"class": "pagenav"})):
        junk = content.find(tag_name, attrs)
        if junk is not None:
            junk.extract()
    for js in content.find_all("script", {"type": "text/javascript"}):
        js.extract()

    # Read each heading and then detach it so its text is not repeated in
    # "raw_text". "h1" is filled from the <h1> text itself, not from the title.
    for key, tag_name in (("title", "h2"), ("h1", "h1")):
        heading_tag = content.find(tag_name)
        if heading_tag is None:
            continue
        heading_text = heading_tag.text.strip()
        if heading_text:
            out[key] = heading_text
        heading_tag.extract()

    published_tag = content.find("dd", {"class": "published"})
    if published_tag:
        out["published"] = published_tag.text.strip()

    author_tag = content.find("dd", {"class": "createdby"})
    if author_tag:
        out["author"] = author_tag.text.strip()

    # Empty the article-info list once its fields have been read, so they do
    # not show up again in "raw_text".
    artinfo = content.find("dl", {"class": "article-info"})
    if artinfo is not None:
        artinfo.clear()

    paragraphs_tags = content.find_all("p")
    paragraphs = [p.text.replace("\xa0", " ").strip() for p in paragraphs_tags]
    out["text"] = paragraphs

    # Whatever text is left outside the <p> tags: keep non-empty lines that
    # are not already present verbatim in the paragraph list.
    seen_paragraphs = set(paragraphs)
    raw_out = []
    for raw_line in content.text.split("\n"):
        line = raw_line.replace("\xa0", " ").strip()
        if line == "" or line in seen_paragraphs:
            continue
        raw_out.append(line)
    out["raw_text"] = raw_out

    vocab_list = []
    for vocab in content.find_all("a", {"class": "glossarylink"}):
        item = {}
        # A glossary link without a title attribute yields an empty "en".
        item["en"] = (vocab.get("title") or "").strip()
        item["ga"] = vocab.text.strip()
        vocab_list.append(item)
    out["vocab"] = vocab_list

    return out

In [24]:
# Repeat the fetch and parse steps for the sample article and report when the
# article container is missing.
page = requests.get(link)

soup = BeautifulSoup(page.text, "lxml")

content = soup.find("div", {"class": "item-page"})
if not content:
    print("Empty")


In [65]:
get_content(link)

{'title': 'An dornálaí John Joe Nevin rangaithe ag uimhir a haon anois',
 'h1': 'An dornálaí John Joe Nevin rangaithe ag uimhir a haon anois',
 'published': 'Foilsithe an Dé Máirt, 02 Iúil 2013 16:12',
 'author': 'Scríofa ag Úna Ní Eidhin',
 'text': ['Tá sé fógartha inniu go bhfuil ardú céime faighte ag an dornálaí Éireannach, John Joe Nevin agus uimhir a haon bainte amach aige i rangú coileachmheáchain an domhain.',
  'Fuair Nevin an t-ardú céime seo tar éis dó bonn óir a bhuachan ag Craobhchomórtais Dornálaíochta Amaitéaracha na hEorpa ag tús na míosa seo caite sa Bhealarúis. D’fhoilsigh an Cumann Idirnáisiúnta Dornálaíochta Amaitéaraí an rangú athbhreithnithe seo inniu.'],
 'raw_text': ['Mar thoradh air seo, tá dhá dhornálaí ag Éirinn anois atá rangaithe ag uimhir a haon sa domhan – Nevin agus Katie Taylor. Tá na dornálaithe Éireannacha Paddy Barnes agus Michael Conlon rangaithe ag uimhir a trí ina rannáin féin maidir le meáchan.'],
 'vocab': [{'en': 'boxer', 'ga': 'dornálaí'},
  {'

In [63]:
# List every HTML comment in the page. extract() detaches each comment from the
# soup and returns it, so the parsed tree no longer contains them afterwards.
page = requests.get(link)

soup = BeautifulSoup(page.text, "lxml")

# NOTE(review): `content` is looked up here but not used by the loop below,
# which walks the whole soup.
content = soup.find("div", {"class": "item-page"})
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    print(comment.extract())


 End Wayback Rewrite JS Include 
- this template -
[if IE]>
<link rel="stylesheet" href="http://www.foinse.ie/templates/foinse_ga/css/ie.css" type="text/css" />
<![endif]
[if lt IE 7.0]>
<link rel="stylesheet" href="http://www.foinse.ie/templates/foinse_ga/css/ie7minus.css" type="text/css" />
<![endif]
[if IE 7.0]>
<style>
.clearfix {
	display: inline-block; /* IE7xhtml*/
}
</style>
<![endif]
 HEADER 
 //HEADER 
 MAIN NAVIGATION 
 //MAIN NAVIGATION 
 MAIN CONTAINER 
 CONTENT 
 //CONTENT 
 RIGHT COLUMN
 ss-large-rectangle 
 ss-large-rectangle 
 adsense
 end adsense
 RIGHT COLUMN
 //MAIN CONTAINER 
 BOTTOM SPOTLIGHT 
 Abivia multi Twitter module 1.6.1 http://www.abivia.net 
 //BOTTOM SPOTLIGHT 
 FOOTER 
 //FOOTER 

     FILE ARCHIVED ON 17:11:00 Dec 23, 2017 AND RETRIEVED FROM THE
     INTERNET ARCHIVE ON 07:07:29 Oct 05, 2021.
     JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE.

     ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C.
     SECTION 108(a)

In [19]:
def filter_para_list(inlist):
    """Return the paragraphs up to, but excluding, the site footer line.

    Entries that are exactly the empty string are dropped. Processing stops at
    the first entry whose stripped text is "Foinse - News as Gaeilge"; that
    entry and everything after it are discarded.
    """
    kept = []
    for entry in inlist:
        if entry.strip() == "Foinse - News as Gaeilge":
            break
        if entry != "":
            kept.append(entry)
    return kept

In [20]:
def extract_summary(inlist):
    """Return the final entry when it is introduced by the summary prompt.

    The list must have more than two entries and its second-to-last entry must
    equal the prompt line exactly; otherwise an empty string is returned.
    """
    prompt = "Did you understand this story? Here are the main points:"
    has_summary = len(inlist) > 2 and inlist[-2] == prompt
    return inlist[-1] if has_summary else ""