In [414]:
import fitz
import re
import os

In [415]:
def remove_newlines(text,replacement=''):
    """Return ``text`` with every newline character replaced.

    Args:
        text: The string to clean.
        replacement: What to put in place of each ``'\\n'``. The default,
            an empty string, deletes the newlines. Pass ``' '`` to keep
            words that were separated only by a line break from running
            together.

    Returns:
        A new string; ``text`` itself is not modified.
    """
    return text.replace('\n', replacement)

def remove_after(text,start_pattern):
    """Return ``text`` truncated at the first match of ``start_pattern``.

    Args:
        text: The string to truncate.
        start_pattern: A compiled regular expression, or a pattern string,
            marking where the part to discard begins.

    Returns:
        Everything before the first match. When the pattern does not occur,
        there is nothing to remove and ``text`` is returned unchanged.
    """
    # re.compile returns an already-compiled pattern as is, so both pattern
    # strings and compiled patterns are accepted.
    match = re.compile(start_pattern).search(text)
    if match is None:
        return text
    return text[:match.start()]

In [416]:
class paper(object):
    """A paper loaded from a PDF file.

    Attributes:
        path: Path to the PDF file, exactly as given to the constructor.
        citation: The file name without its extension.
        short_id: First space-separated token of the citation, a space,
            and the year.
        year: First four-digit number found in the citation, as an int.
        text: Text of every page, concatenated in page order.

    Each call to get_section() adds one more attribute, named after the
    section that was requested.
    """

    def __init__(self,path_to_pdf):
        self.path = path_to_pdf
        self.citation,self.short_id,self.year = self.get_paper_id()
        self.text = self.get_text()

    def __repr__(self):
        return 'A paper object for %s' %self.citation

    def get_paper_id(self):
        """Derive the identifying fields of the paper from its file name.

        Returns:
            A ``(citation, short_id, year)`` tuple; ``year`` is an int.

        Raises:
            ValueError: If the file name contains no four-digit number.
        """
        filename = os.path.basename(self.path)
        citation = os.path.splitext(filename)[0]
        year_match = re.search(r'\d{4}', citation)
        if year_match is None:
            raise ValueError(
                'No four-digit year found in file name %r' % filename)
        year = year_match.group(0)
        short_id = citation.split(' ')[0] + ' ' + year
        return citation, short_id, int(year)

    def get_text(self):
        """Return the text of all pages of the PDF, in page order."""
        page_texts = []
        with fitz.open(self.path) as infile:
            for page in infile:
                # PyMuPDF renamed Page.getText to Page.get_text; prefer the
                # new name and fall back to the old one on older versions.
                extract = getattr(page, 'get_text', None) or page.getText
                page_texts.append(extract())
        return ''.join(page_texts)

    @staticmethod
    def _compiled_patterns(section, *attr_names):
        """Return the patterns stored on ``section`` as a list.

        The first attribute in ``attr_names`` that exists and is not None is
        used. A single pattern is wrapped in a list, pattern strings are
        compiled, and None entries (meaning "no pattern") are kept.
        """
        for attr_name in attr_names:
            value = getattr(section, attr_name, None)
            if value is None:
                continue
            if not isinstance(value, (list, tuple)):
                value = [value]
            return [None if item is None else re.compile(item)
                    for item in value]
        return []

    def get_section(self,section):
        """Extract one section of the text and store it as an attribute.

        ``section`` must have a ``name`` and either the list attributes
        ``start_patterns`` / ``end_patterns`` or the single-pattern
        attributes ``start_pattern`` / ``end_pattern``. Layers are tried in
        order and the first one whose start pattern (and end pattern, when
        it has one) matches is used. The section runs from the start of the
        start match up to the start of the end match; a layer without an end
        pattern runs to the end of the text.

        Returns:
            The extracted text, which is also stored on this object under
            ``section.name``. When no layer matches, None is stored and
            returned.
        """
        starts = self._compiled_patterns(
            section, 'start_patterns', 'start_pattern')
        ends = self._compiled_patterns(
            section, 'end_patterns', 'end_pattern')

        extracted = None
        for index, start_re in enumerate(starts):
            if start_re is None:
                continue
            start_match = start_re.search(self.text)
            if start_match is None:
                continue
            start = start_match.start()
            end_re = ends[index] if index < len(ends) else None
            if end_re is None:
                extracted = self.text[start:]
                break
            # Search a slice (not search(text, pos)) so that the end pattern
            # sees the same string it did before this method handled layers.
            end_match = end_re.search(self.text[start:])
            if end_match is None:
                continue
            extracted = self.text[start:start + end_match.start()]
            break

        setattr(self, section.name, extracted)
        return extracted

In [412]:
class section(object):
    """Patterns that locate one named section inside a text.

    A section is described by one or more layers. Each layer has a start
    pattern, an optional end pattern and an optional preprocess step; the
    three lists below always have the same length, one entry per layer.

    Attributes:
        name: Name of the section.
        start_patterns: Compiled start pattern of each layer.
        end_patterns: Compiled end pattern of each layer, or None for a
            layer that has no end pattern.
        preprocess_steps: Preprocess step of each layer, or None. This
            class only stores the steps; it does not call them.
    """

    def __init__(self,name,start_pattern,end_pattern,preprocess_steps=None):
        """Create the section with its first layer.

        Args:
            name: Name of the section.
            start_pattern: Regular expression (string or compiled) that
                marks the start of the section.
            end_pattern: Regular expression that marks the end of the
                section, or None / '' when there is none.
            preprocess_steps: Preprocess step stored for the first layer.
        """
        self.name = name
        self.start_patterns = []
        self.end_patterns = []
        self.preprocess_steps = []
        self.add_layer(start_pattern, end_pattern, preprocess_steps)

    def __repr__(self):
        return 'A section object for the %s'%self.name

    @property
    def start_pattern(self):
        """Compiled start pattern of the first layer."""
        return self.start_patterns[0]

    @property
    def end_pattern(self):
        """Compiled end pattern of the first layer, or None if it has none."""
        return self.end_patterns[0]

    def add_layer(self,start_pattern,end_pattern,preprocess_step=None):
        """Append one more layer of patterns to the section.

        Args:
            start_pattern: Regular expression (string or compiled) that
                marks the start of the section.
            end_pattern: Regular expression that marks the end of the
                section, or None / '' when the layer has none.
            preprocess_step: Preprocess step stored for this layer.
        """
        # list.append works in place and returns None, so its result must
        # not be assigned back to the attribute.
        self.start_patterns.append(re.compile(start_pattern))
        self.end_patterns.append(
            re.compile(end_pattern) if end_pattern else None)
        self.preprocess_steps.append(preprocess_step)

In [413]:
# Abstract: starts at a line containing an "Abstract" or "Summary" heading
# (including letter-spaced headings such as "a b s t r a c t") and ends at the
# line that introduces the keywords or the introduction.
abstract = section(
    name = 'abstract',
    start_pattern=r'\n.*(Abstract\n|Summary\n|Abstract:|Summary:|[Aa]\s*[Bb]\s*[Ss]\s*[Tt]\s*[Rr]\s*[Aa]\s*)',
    end_pattern=r'\n.*(Keywords|Introduction)(\b.*\b|\s){0,3}\n'
)

# TODO: add a fallback layer with abstract.add_layer(...) once its patterns are
# written. The placeholder call that stood here was removed: it omitted the
# preprocess_step argument and used empty patterns, and an empty regular
# expression matches at position 0 of any text.

In [None]:
# abstract_start = re.compile(r'\n.*(Abstract\n|Summary\n|Abstract:|Summary:|[Aa]\s*[Bb]\s*[Ss]\s*[Tt]\s*[Rr]\s*[Aa]\s*)')
# abstract_end = re.compile(r'\n.*(Keywords|Introduction)(\b.*\b|\s){0,3}\n')

# introduction = section(
#     start_pattern=r'\n([\d][.][\s].*)?(Introduction)(\b.*\b|\s){0,3}\n',
#     end_pattern=r'\n([\d][.][\s].*)?(Results?|Materials|Methods)(\b.*\b|\s){0,3}\n'
# )

# # intro_start = re.compile(r'\n([\d][.][\s].*)?(Introduction)(\b.*\b|\s){0,3}\n')
# # intro_end = re.compile(r'\n([\d][.][\s].*)?(Results?|Materials|Methods)(\b.*\b|\s){0,3}\n')

# results = section(
#     start_pattern=r'\n([\d][.][\s].*)?Results?(\b.*\b|\s){0,3}\n',
#     end_pattern=r'\n([\d][.][\s]*)?(Discussion|Conclusion|Materials|Methods|Acknowledg[e]?ments|Experimental)(\b.*\b|\s){0,3}\n'
# )

# # results_start = re.compile(r'\n([\d][.][\s].*)?Results?(\b.*\b|\s){0,3}\n')
# # results_end = re.compile(r'\n([\d][.][\s]*)?(Discussion|Conclusion|Materials|Methods|Acknowledg[e]?ments|Experimental)(\b.*\b|\s){0,3}\n')

# methods = section(
#     start_pattern=r'\n([\d][.][\s].*)?(Methods|Materials|Experimental)(\b.*\b|\s){0,3}\n',
#     end_pattern=r'\n([\d][.][\s].*)?(Results|Discussion|Acknowledg[e]?ments|References|Bibliography|Citations|\b.*\b [Cc]ited)(\b.*\b|\s){0,3}\n'
# )

# # methods_start = re.compile(r'\n([\d][.][\s].*)?(Methods|Materials|Experimental)(\b.*\b|\s){0,3}\n')
# # methods_end = re.compile(r'\n([\d][.][\s].*)?(Results|Discussion|Acknowledg[e]?ments|References|Bibliography|Citations|\b.*\b [Cc]ited)(\b.*\b|\s){0,3}\n')

# references = section(
#     start_pattern=r'\n.*(References|Bibliography|Citations|\b.*\b [Cc]ited)(\b.*\b|\s){0,3}\n',
#     end_pattern=None
# )

# references_start = re.compile(r'\n.*(References|Bibliography|Citations|\b.*\b [Cc]ited)(\b.*\b|\s){0,3}\n')

In [404]:
# Folder holding the PDFs. The trailing slash matters: file paths below are
# built by plain string concatenation.
directory = '/home/jsd/Phase Separation Papers/'

# Full path of every PDF in the folder, in sorted order.
paper_paths = sorted(
    directory + filename
    for filename in os.listdir(directory)
    if filename.endswith('.pdf')
)

# Load only the first three papers.
papers = [paper(pdf_path) for pdf_path in paper_paths[:3]]

In [408]:
# get_section stores the extracted text on the paper under the section's
# name, so read it back through that name and show it.
papers[0].get_section(abstract)
print(getattr(papers[0], abstract.name))
# papers[0].abstract
# get_section(papers[0].text,abstract.start_pattern,abstract.end_pattern)
# current_paper.get_section(abstract)
# current_paper.abstract
# current_paper.text
# text = current_paper.text
# start = abstract.start_pattern.search(text).start()
# end = abstract.end_pattern.search(text[start:]).start()+start
# section = text[start:end]
# print(section)


Abstract
A direct comparison of a chromatography and an aqueous two-phase system (ATPS) processes for the partial puriﬁcation of penicillin acylase
(PA) produced by a recombinant strain of E. coli, was performed. An established chromatography process for the recovery of PA was selected
as a model system and characterised for comparison with a developed ATPS prototype process. PEG-phosphate systems were selected for the
recovery of PA over PEG-citrate systems, since higher enzyme recovery and increased purity was obtained. ATPS proved to be suitable to process
highly concentrated disrupted extract (35%, w/w) and maintain a high top phase enzyme recovery. In the direct comparison of the processes, the
superiority of the ATPS approach was highlighted since a reduction of the number of unit operations from 7 to 4 was achieved. An outline economic
analysis based on the cost of separation agent of the processes favour the ATPS process, in which a gross cost reduction of 37% (from $0.47 to
$

In [252]:
# NOTE(review): get_pdf_text and current_paper are not defined in any cell
# visible in this notebook — presumably left over from an earlier kernel
# session; confirm (or replace with paper(...).text) before re-running.
text = get_pdf_text(directory+current_paper)
# Drop every newline so the whole document becomes one continuous line.
text = remove_newlines(text)
# text = remove_after(text,references_start)
# abstract = get_section(text,abstract_start,abstract_end)
# introduction = get_section(text,intro_start,intro_end)
# results = get_section(text,results_start,results_end)
# methods = get_section(text,methods_start,methods_end)
print(text)

Journal of Biotechnology 241 (2017) 184–192Contents lists available at ScienceDirectJournal of Biotechnologyj ourna l ho me pa ge: www.elsevier.com/locate/jbiotecA simple, rapid and cost-effective process for production of latexclearing protein to produce oligopolyisoprene moleculesR. Andler a, A.  Steinbüchel a,b,∗a Institute of Molecular Microbiology and Biotechnology, Westfälische Wilhelms-Universität Münster, D-48149 Münster, Germanyb Environmental Sciences Department, King Abdulaziz University, Jeddah, Saudi Arabiaa r  t i c  l e  i  n  f  oArticle history:Received 2 November 2016Received in revised form29 November 2016Accepted 6 December 2016Available online 7 December 2016Keywords:Latex clearing proteinAuto induction mediumAmmonium sulfate precipitationRubber biodegradationPoly(cis-1,4-isoprene)a b s t  r  a c  tAiming at ﬁnding feasible alternatives for rubber waste  disposal, the  partial  enzymatic degradation ofpoly(cis-1,4-isoprene)-containing materials represents a potenti

In [256]:
# re.compile(None)

TypeError: first argument must be string or compiled pattern

In [265]:
class foo(object):
    """Scratch class for checking that setattr works from inside a method."""

    def __init__(self):
        """Create an instance with no attributes of its own."""

    def add_attr(self,attr,val):
        """Set the attribute named ``attr`` on this instance to ``val``."""
        setattr(self, attr, val)


# Attach an attribute through the method (same effect as calling
# setattr(x,'attr','val') directly), then read it back with ordinary
# attribute access so the cell displays the value.
x = foo()
x.add_attr(attr='attr', val='val')
x.attr

'val'