In [None]:
import json
import re

import pdfminer
from pdfminer.high_level import extract_text

In [None]:
# Load the book -> dialect -> URL mapping. The keys are Irish titles with
# accented characters, so the encoding is pinned to UTF-8 (the JSON default)
# instead of relying on the platform's locale encoding.
with open('seidean_si.json', encoding='utf-8') as infile:
    books = json.load(infile)

In [None]:
# Replace every raw URL string in ``books`` with a small record holding the
# URL and the file name taken from its last path segment.
for book, dialects in books.items():
    for dialect, url in dialects.items():
        if isinstance(url, dict):
            # Already converted by an earlier run of this cell; re-running
            # must not fail on (or re-wrap) the existing record.
            continue
        # Only the value of an existing key is replaced, so the dict does not
        # change size while it is being iterated.
        dialects[dialect] = {
            'url': url,
            'file': url.split('/')[-1],
        }

In [None]:
# Page numbers to extract for each book title. These lists are handed to
# pdfminer's ``extract_text(page_numbers=...)``, which treats them as
# zero-indexed according to the pdfminer.six documentation.
pages = {
    "Táimid mór le Chéile": list(range(4, 20)),
    "Cliabhán d'Ailbhe": list(range(3, 38)),
    "Murach an Traenáil ar fad": list(range(4, 37)),
    "Ná lig dóibh éalú": list(range(5, 40)),
    "Céard é sin?": list(range(4, 30)),
}

In [None]:
def clean_text(text):
    """Turn raw extracted page text into a list of cleaned, non-empty lines.

    Each line is stripped of surrounding whitespace. Dropped entirely are:
    blank lines, lines consisting only of digits (page numbers), lines that
    contain one of the typesetting markers listed below, and two specific
    lines that are matched verbatim. A bare ``Caibidil <n>`` chapter heading
    gets a trailing full stop. In every kept line the single-character
    ellipsis is replaced by three dots.

    Args:
        text: Text of one or more pages, with lines separated by ``\\n``.

    Returns:
        list of str: the kept lines, in their original order.
    """
    import re

    # Substrings whose presence marks a line to be discarded.
    junk_markers = ('.indd', '14/11/2006', 'ar fadl.in', '14/08/2006', '.qxd')
    # Lines discarded only when they match exactly.
    dropped_lines = ('Is fada liom go', 'dtiocfaidh an Satharn.')

    kept = []
    for raw in text.split('\n'):
        stripped = raw.strip()
        # skip blanks and page numbers
        if not stripped or re.search(r'^[0-9]+$', stripped):
            continue
        # add a full stop for chapters
        if re.match(r'^Caibidil [0-9]+$', stripped):
            stripped += "."
        if any(marker in stripped for marker in junk_markers):
            continue
        if stripped in dropped_lines:
            continue
        kept.append(stripped.replace('…', '...'))
    return kept

In [None]:
def split_sentences(cleaned, lang='en'):
    """Split a list of text lines into sentences with the Moses splitter.

    Args:
        cleaned: List of line strings (as produced by ``clean_text``).
        lang: Language code forwarded to ``MosesSentenceSplitter``. Defaults
            to ``'en'``, the value this notebook has always used, so existing
            callers get unchanged output.

    Returns:
        Whatever the splitter returns for ``cleaned``; the rest of this
        notebook treats it as a list of sentence strings.
    """
    # Imported lazily so the dependency is only needed when splitting runs.
    from mosestokenizer import MosesSentenceSplitter
    with MosesSentenceSplitter(lang) as splitsents:
        split = splitsents(cleaned)
    return split

In [None]:
def open_quote(text):
    """Return True if ``text`` has an opening ‘ but no closing ’."""
    # Plain substring tests: no regex module needed, and the result is
    # always a real bool.
    return '‘' in text and '’' not in text


def close_quote(text):
    """Return True if ``text`` has a closing ’ but no opening ‘."""
    return '’' in text and '‘' not in text


def fix_split(cleaned):
    """Re-join quotations that the sentence splitter cut in two.

    Walks ``cleaned`` left to right. When an item contains only an opening
    quote and the item right after it contains only a closing quote, the two
    are merged into one string separated by a single space. Only adjacent
    pairs are merged; a quotation spread over three or more items is left
    as it is.

    Args:
        cleaned: Sequence of sentence strings.

    Returns:
        list of str: the sentences with split quotations merged.
    """
    resplit = []
    i = 0
    last = len(cleaned) - 1
    while i <= last:
        current = cleaned[i]
        if i < last and open_quote(current) and close_quote(cleaned[i + 1]):
            resplit.append(f'{current} {cleaned[i + 1]}')
            i += 2
        else:
            resplit.append(current)
            i += 1
    return resplit

In [None]:
# Per-PDF manual corrections. Each helper edits, in place, the list returned
# by clean_text() for one specific file: it reorders and/or removes lines
# before sentence splitting. The indices are positions in that list, so they
# are only valid for the exact PDF and page range used here.
# NOTE(review): presumably these compensate for lines that pdfminer emits out
# of reading order -- confirm against the PDFs before changing any index.

def _fix_1ab98add(v):
    """Line-order corrections for 1ab98add8018901db52fd0fdf4a63e7c.pdf."""
    v[114], v[115] = v[115], v[114]
    v[53], v[54] = v[54], v[53]


def _fix_d40b6b41(v):
    """Line-order corrections for d40b6b41054d6508191c167c9fd554cd.pdf."""
    v[114], v[115] = v[115], v[114]


def _fix_23f7240d(v):
    """Line-order corrections for 23f7240dba59deace40be7dec0a814a2.pdf."""
    v[54], v[55] = v[55], v[54]
    v[73], v[74] = v[74], v[73]
    v[118], v[119] = v[119], v[118]


def _fix_df76a9ac(v):
    """Line-order corrections for df76a9acab68f481e3fc6ad09956364f.pdf."""
    v[72], v[73], v[74], v[75], v[76], v[77] = v[75], v[72], v[76], v[73], v[77], v[74]
    v[128], v[129], v[130], v[131], v[132], v[133] = v[129], v[132], v[130], v[128], v[133], v[131]
    v.pop(131)


def _fix_3c2baf46(v):
    """Line-order corrections for 3c2baf4652793d8fb44c598731f1d0f8.pdf."""
    v[71], v[72] = v[72], v[71]
    v.pop(123)
    # make it match the other two
    v[120] = v[120].replace('.', ',')


def _fix_899d2e69(v):
    """Line-order corrections for 899d2e69d0833496edf4dd8aa6f4e238.pdf."""
    v[73], v[74], v[75], v[76] = v[75], v[73], v[76], v[74]
    v[125], v[126], v[127] = v[127], v[125], v[126]
    v[133], v[132] = v[132], v[133]
    v.pop(127)
    # not a great fit, but the rest is a poor match
    v[127] = ' '.join(v[127].split(' ')[-2:])


def _fix_f4147f2a(v):
    """Line-order corrections for f4147f2a9bf90bcd1e4c331060d3e4db.pdf."""
    v[27], v[26] = v[26], v[27]


# File name -> correction helper. Files not listed need no manual fix-up.
_LINE_FIXUPS = {
    '1ab98add8018901db52fd0fdf4a63e7c.pdf': _fix_1ab98add,
    'd40b6b41054d6508191c167c9fd554cd.pdf': _fix_d40b6b41,
    '23f7240dba59deace40be7dec0a814a2.pdf': _fix_23f7240d,
    'df76a9acab68f481e3fc6ad09956364f.pdf': _fix_df76a9ac,
    '3c2baf4652793d8fb44c598731f1d0f8.pdf': _fix_3c2baf46,
    '899d2e69d0833496edf4dd8aa6f4e238.pdf': _fix_899d2e69,
    'f4147f2a9bf90bcd1e4c331060d3e4db.pdf': _fix_f4147f2a,
}

# For every book/dialect: extract the configured pages, clean the lines,
# apply the file-specific correction (if any), split into sentences, re-join
# split quotations, and store the result under the entry's 'lines' key.
for book, page_range in pages.items():
    for dialect, entry in books[book].items():
        file = entry['file']
        text = extract_text(file, page_numbers=page_range)
        v = clean_text(text)
        fixup = _LINE_FIXUPS.get(file)
        if fixup is not None:
            fixup(v)
        entry['lines'] = fix_split(split_sentences(v))

In [None]:
# Write one text file per book/dialect, one sentence per line. The file name
# is the book title (spaces replaced by underscores) plus the dialect key.
for book in pages:
    for dialect, entry in books[book].items():
        fname = '{}_{}.txt'.format(book.replace(' ', '_'), dialect)
        # ``with`` closes the file even if a write fails; UTF-8 is explicit
        # because the text contains accented characters and the platform
        # default encoding may not be able to represent them.
        with open(fname, 'w', encoding='utf-8') as outf:
            outf.writelines(f'{line}\n' for line in entry['lines'])

In [None]:
# Prefixes that identify the footer line of a page.
_MOLEABHARSAC_FOOTER_PREFIXES = (
    'Ócáidí Speisialta –',
    'An Scoil –',
    'Sa Bhaile –',
    'Siopadóireacht –',
    'Mé Féin –',
    'Caitheamh Aimsire –',
)


def moleabharsac_footer_starts(line):
    """Return True if ``line`` begins with one of the known footer prefixes."""
    # str.startswith accepts a tuple and tests each prefix in turn.
    return line.startswith(_MOLEABHARSAC_FOOTER_PREFIXES)


def clean_moleabharsac(text, page, skip_pages=(8, 9, 20)):
    """Extract the content lines of one page of the 'Mo Leabharsa' PDF.

    Args:
        text: Raw text of the page, with lines separated by ``\\n``.
        page: Page number used for the special cases below; it is compared
            against digit-only lines found in ``text``.
        skip_pages: Page numbers for which nothing is returned. Defaults to
            the pages this notebook has always skipped.

    Returns:
        list of str: the stripped lines kept for this page.

        * Pages in ``skip_pages`` yield an empty list.
        * Page 1 yields the first two stripped lines unfiltered.
        * Otherwise blank lines, digit-only lines, lines made only of
          underscores, lines containing ``CEACHT <n>`` and a lone ``?`` are
          dropped. Collection stops at the first footer line, at the line
          ``Mo Leabharsa - First Class``, or, on even pages only, at a
          digit-only line equal to ``page``.
    """
    if page in skip_pages:
        return []
    trimmed = [line.strip() for line in text.split('\n')]
    if page == 1:
        return trimmed[0:2]

    is_even = page % 2 == 0
    lines = []
    for line in trimmed:
        if line == '' or line == '?':
            continue
        if re.search(r'^[0-9]+$', line):
            # NOTE(review): presumably on even pages the page number comes
            # before footer material, hence the early stop -- confirm.
            if is_even and page == int(line):
                return lines
            continue
        if re.search(r'^_+$', line) or re.search(r'CEACHT [0-9]+', line):
            continue
        # Every footer prefix already contains the dash, so no separate
        # check for it is needed before testing the prefixes.
        if moleabharsac_footer_starts(line) or line == 'Mo Leabharsa - First Class':
            return lines
        lines.append(line)
    return lines

In [None]:
from pdfminer.high_level import extract_text
# Extract the Mo Leabharsa PDF one page at a time so that each page's raw
# text can be cleaned separately; ``outputs`` maps the page index passed to
# pdfminer to the text extracted for that page.
test_pdf = '6c319b3865d18f84101d42193367f716.pdf'
outputs = {}
for i in range(4, 57):
    text = outputs[i] = extract_text(test_pdf, page_numbers=[i])


In [None]:
def cmlc(num):
    """Clean the extracted text for page ``num`` of the Mo Leabharsa PDF.

    The raw text is read from ``outputs`` under the key ``num + 3`` and
    passed to ``clean_moleabharsac`` together with ``num`` itself.
    """
    raw_page = outputs[num + 3]
    return clean_moleabharsac(raw_page, num)

In [None]:
# Spot check: display the cleaned lines for page 24.
cmlc(24)