In [1]:
import logging, six, sys, os
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from io import StringIO
from nltk.tokenize import WordPunctTokenizer

In [2]:
def normalise(text):
    """Lower-case ``text`` and return its tokens joined by single spaces.

    Tokens are produced by a fresh ``WordPunctTokenizer`` applied to the
    lower-cased input.
    """
    tokens = WordPunctTokenizer().tokenize(text.lower())
    return ' '.join(tokens)

In [3]:
def save_file(path, text):
    """Write ``text`` to ``path`` as UTF-8, replacing any existing content."""
    handle = open(path, mode='w', encoding='utf-8')
    try:
        handle.write(text)
    finally:
        # Always release the file handle, even if the write fails.
        handle.close()

In [45]:
def _pdf_to_text(pdf_path, laparams, codec, pagenos, maxpages, password, caching):
    """Run pdfminer over one PDF file and return the extracted text as a string."""
    rsrcmgr = PDFResourceManager()
    with open(pdf_path, "rb") as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before the converter and the buffer are closed.
            return retstr.getvalue()
        finally:
            # Close the converter even when a page fails to process.
            device.close()


def convert_pdf_to_txt(files=None, outfile='-',
            _py2_no_more_posargs=None,  # unused placeholder kept for call compatibility
            no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
            word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
            output_type='text', codec='utf-8', strip_control=False,
            maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
            layoutmode='normal', output_dir=None, debug=False,
            disable_caching=False, dissertation=None,
            pdf_dir='./ug_dataset/pdf_file/', txt_dir='./ug_dataset/dissertation/',
            **other):
    """Extract the text of every PDF in ``pdf_dir`` into a ``.txt`` file in ``txt_dir``.

    Files are visited in sorted name order. For each regular file whose name
    ends with ``.pdf`` a running count and the file name are printed; the file
    is then skipped if its target name (same stem, ``.txt`` extension) is
    listed in ``dissertation``, otherwise its text is extracted and written
    with ``save_file``.

    Parameters
    ----------
    no_laparams : bool
        When true, no ``LAParams`` object is built and ``None`` is passed to
        the converter.
    all_texts, detect_vertical, word_margin, char_margin, line_margin, boxes_flow
        Optional overrides; each one that is not ``None`` is set as the
        attribute of the same name on the ``LAParams`` object.
    codec : str
        Forwarded to ``TextConverter``.
    maxpages, password
        Forwarded to ``PDFPage.get_pages``.
    page_numbers : iterable or None
        Page numbers forwarded to ``PDFPage.get_pages`` as a set; ``None`` or
        an empty iterable passes an empty set.
    disable_caching : bool
        ``PDFPage.get_pages`` is called with ``caching=not disable_caching``.
    dissertation : iterable of str or None
        Names of output files that already exist and must not be regenerated.
    pdf_dir, txt_dir : str
        Input and output directories. ``txt_dir`` is created if it is missing.
    files, outfile, _py2_no_more_posargs, output_type, strip_control, scale,
    rotation, layoutmode, output_dir, debug, **other
        Accepted for call compatibility only; not used by this function. In
        particular ``files`` is ignored: the input list always comes from
        ``pdf_dir``.

    Returns
    -------
    None
    """
    # Build the layout parameters unless the caller opted out, applying only
    # the overrides that were actually supplied.
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        overrides = {
            "all_texts": all_texts,
            "detect_vertical": detect_vertical,
            "word_margin": word_margin,
            "char_margin": char_margin,
            "line_margin": line_margin,
            "boxes_flow": boxes_flow,
        }
        for name, value in overrides.items():
            if value is not None:
                setattr(laparams, name, value)

    already_done = set(dissertation or ())
    pagenos = set(page_numbers) if page_numbers else set()

    pdf_names = sorted(
        name for name in os.listdir(pdf_dir)
        if os.path.isfile(os.path.join(pdf_dir, name))
    )
    os.makedirs(txt_dir, exist_ok=True)

    count = 0
    for fname in pdf_names:
        stem, ext = os.path.splitext(fname)
        # Match on the real extension; a substring test would also accept
        # names such as "pdf_notes.txt".
        if ext != '.pdf':
            continue
        count += 1
        print(count, fname)
        filename = stem + '.txt'
        if filename in already_done:
            continue
        content = _pdf_to_text(
            os.path.join(pdf_dir, fname), laparams, codec, pagenos,
            maxpages, password, not disable_caching,
        )
        save_file(os.path.join(txt_dir, filename), content)

In [46]:
def load_text(dir_path='./ug_dataset/dissertation/'):
    """Return the sorted names of the ``.txt`` files directly inside ``dir_path``.

    Only regular files whose name ends with ``.txt`` are returned;
    sub-directories and files that merely contain "txt" elsewhere in their
    name are excluded.

    Parameters
    ----------
    dir_path : str
        Directory to scan. Defaults to the dissertation text directory.

    Returns
    -------
    list of str
        File names (not full paths) in ascending order.
    """
    return sorted(
        name for name in os.listdir(dir_path)
        if name.endswith('.txt') and os.path.isfile(os.path.join(dir_path, name))
    )

In [47]:
# Collect the names returned by load_text() and report how many there are.
dissertation = load_text()
print(len(dissertation))

1100


In [48]:
# Every other argument keeps the function's default value; only the list of
# already-extracted text files needs to be supplied.
convert_pdf_to_txt(dissertation=dissertation)

1 Hui_Li.pdf
2 Jacques_Coney.pdf
3 Jade_Slocombe.pdf
4 Joshua_Keegan.pdf
5 Linus_Cash.pdf
6 Yineng_Zeng.pdf
7 aca00lad.pdf
8 aca01dh.pdf
9 aca02clj.pdf
10 aca02jhe.pdf
11 aca02tl.pdf
12 aca03bjc.pdf
13 aca03cup.pdf
14 aca03dsw.pdf
15 aca03mrp.pdf
16 aca03ok.pdf
17 aca03tb.pdf
18 aca03tja.pdf
19 aca04ajp.pdf
20 aca04ap.pdf
21 aca04bs.pdf
22 aca04cj.pdf
23 aca04cjm.pdf
24 aca04csg.pdf
25 aca04dnm.pdf
26 aca04dsc.pdf
27 aca04iba.pdf
28 aca04jdb.pdf
29 aca04jfc.pdf
30 aca04kc.pdf
31 aca04kem.pdf
32 aca04lh.pdf
33 aca04mdg.pdf
34 aca04mjg.pdf
35 aca04mjw.pdf
36 aca04ntr.pdf
37 aca04pb.pdf
38 aca04pds.pdf
39 aca04rcs.pdf
40 aca04rh.pdf
41 aca04rz.pdf
42 aca04saj.pdf
43 aca04tp.pdf
44 aca04yj.pdf
45 aca05ac.pdf
46 aca05ar.pdf
47 aca05asb.pdf
48 aca05ch.pdf
49 aca05cjb.pdf
50 aca05cs.pdf
51 aca05djc.pdf
52 aca05ds.pdf
53 aca05gam.pdf
54 aca05gdl.pdf
55 aca05icm.pdf
56 aca05is.pdf
57 aca05jaa.pdf
58 aca05jfg.pdf
59 aca05jo.pdf
60 aca05lsf.pdf
61 aca05mb.pdf
62 aca05mc.pdf
63 aca05mmh.pdf
64 aca

684 acb09rj.pdf
685 acb09ss.pdf
686 acb09yw.pdf
687 acb10ak.pdf
688 acb10bc.pdf
689 acb10jy.pdf
690 acb10sb.pdf
691 acb11ac.pdf
692 acb11anl.pdf
693 acb11gp.pdf
694 acb11jk.pdf
695 acb11ts.pdf
696 acb12am.pdf
697 acb12dc.pdf
698 acb12jh.pdf
699 acb12jm.pdf
700 acb13ac.pdf
701 acb13ag.pdf
702 acb13jl.pdf
703 acb13sad.pdf
704 acb14aa.pdf
705 acb14at.pdf
706 acb14cw.pdf
707 acb14db.pdf
708 acb14ds.pdf
709 acb14jc.pdf
710 acb14jd.pdf
711 acb14js.pdf
712 acb14lt.pdf
713 acb14ss.pdf
714 acb15ac.pdf
715 acb15ap.pdf
716 acb15ba.pdf
717 acb15gr.pdf
718 acb15ms.pdf
719 acb15sd.pdf
720 acc06jb.pdf
721 acc11tm.pdf
722 acc12jm.pdf
723 acc14mb.pdf
724 acc15gr.pdf
725 acc15jc.pdf
726 acc15ms.pdf
727 acd15sb.pdf
728 acp10art.pdf
729 acp10dpr.pdf
730 acp11cw.pdf
731 acp12ads.pdf
732 acp12vb.pdf
733 acq12ss.pdf
734 acq12zb.pdf
735 ara12wgs.pdf
736 ara13mlc.pdf
737 boa08cgk.pdf
738 cia10jt.pdf
739 coa07so.pdf
740 coa11sb.pdf
741 coa12sd.pdf
742 cpa12mkm.pdf
743 ecp11cc.pdf
744 ecp11se.pdf
745 ecs11jz.pdf