In [1]:
import os
import textproc
import pandas as pd
import re

In [2]:
# Application-area keywords, matched case-insensitively as word stems
# (e.g. "industr" covers industry/industrial, "pharm" covers pharmaceutical).
context = re.compile(r'(?i)food|industr|biotechnolog|agricultur|reagent|pharm|medic|therap|diagnos|prevent|treat')

## Biological entities
# Case-insensitive product terms. Besides the literal stems:
#   [^h]ase\s         - "ase" + whitespace, not preceded by "h"
#                       (presumably enzyme names while skipping "phase" - confirm)
#   [a-z][^e\s]in\s   - a word ending in "in" whose preceding character is not "e"
#                       (presumably protein names such as "...min"/"...lin" - confirm)
product = re.compile(r'(?i)product|target|crude|\sextract\s|lysate|recombinant|[^h]ase\s|ab\s|antibod|\sIgG\s|immunoglob|[a-z][^e\s]in\s')
# Case-SENSITIVE on purpose: the alternatives rely on capitalisation.
#   1) a capitalised word (or a capital followed by ".") then a lowercase word,
#      not directly after a capital letter or a full stop
#      (looks like binomial names such as "Escherichia coli" / "E. coli" - confirm)
#   2) the literal "yeast"
#   3) three capitals, optional digits, up to four intervening words, then "cell"
#      (looks like cell-line names - confirm)
#   4) the literal "host"
species_name = re.compile(r'[^A-Z][^.](\s|\()[A-Z]([a-z]+|\.)\s[a-z]{2}|yeast|[A-Z]{3}(\d+)?(\s\w*\s){0,4}\scell|host')
# Case-insensitive contaminant terms. The short abbreviations "dna", "rna" and
# "lps" must be surrounded by whitespace so they do not match inside longer words.
contaminants = re.compile(r'(?i)contamina|hcp|host\scell\sprotein|chop|unwanted|impurit|\sdna\s|\srna\s|\slps\s|lipopolysaccharide|endotoxin|hmw|lmw|aggregate|high\smolecular\sweight|low\smolecular\sweight')


## Treatments
# All patterns in this section are case-insensitive keyword/stem searches.

# Recovery/yield vocabulary. Case-insensitive like its siblings so that
# sentence-initial "Recovery" or "Yield" is matched as well.
recovery = re.compile(r'(?i)recover|yield|obtain|product')
# Purification / removal vocabulary. "clear[^l\s]" requires a character other
# than "l" or whitespace after "clear" (so "clearance" matches, "clearly" does not).
purity = re.compile(r'(?i)puri|clear[^l\s]|remov|reduc|separat|decreas|selectiv|eliminat')
# "concentrat..." followed, later in the same line, by a magnitude word
# (factor, times, fold, a standalone "log", or "by").
concentration = re.compile(r'(?i)concentrat.*(factor|times|fold|[^a-z]log[^a-z]|by)')
# Re-use vocabulary. The stem is "recycl" (it was misspelled "recyl", which never
# matched "recycle", "recycled" or "recycling").
recycling = re.compile(r'(?i)recycl|recover|reuse|cycles')
# Unit-operation vocabulary. "[^A-Z]heat" needs a non-letter before "heat"
# (presumably to skip words such as "wheat" - confirm); "filt[er]" covers both
# "filter..." and "filtr..."; "atp[se]" matches "atps"/"atpe".
operations = re.compile(r'(?i)[^A-Z]heat|precipitat|filt[er]|wash|centrifug|chromatograph|refold|atp[se]|extract|phase.*(separat.*|system)|add(ition|ed|ing)|mix')

## Miscellaneous
# Reference to a numbered table: "table", whitespace, then a digit or the letter "I"
# (presumably the start of a Roman numeral - confirm). Case-insensitive.
table = re.compile(r'(?i)\stable\s[\dI]')
# Theory / modelling vocabulary; "[^a-z]fit" requires a non-letter before "fit".
theory_model = re.compile(r'(?i)theor|model|equation|predict|[^a-z]fit')

# Note: the "\s[^s]\w{0,3}uence" alternative in `doe` looks odd because fitz
# sometimes reads the "fl" in a word like "influence" as the single ligature
# character U+FB02 instead of the two letters "f" and "l". Matching a short word
# that ends in "uence" catches both spellings; "[^s]" rejects words that start
# with "s" (presumably to skip "sequence" - confirm).
doe = re.compile(r'(?i)systematic|model|design\sof\sexperiments|[^a-z]doe[^a-z]|\s[^s]\w{0,3}uence|screen')
# Phase-diagram vocabulary: "solubility", "binodal", or "phase behavior/curve/data".
phase_curve = re.compile(r'(?i)solubility|binodal|phase\s(behavior|curve|data)')

## Units
# The unit patterns below are case-sensitive except `time`; each one is anchored
# by a leading whitespace or "/" and (mostly) a trailing non-letter so that the
# short unit symbols are not matched inside ordinary words.

# A "w", "m" or "g" symbol, then whitespace or "/", then an optional prefix
# character and "L", "v" or "V" (presumably forms like "g/L", "mg/mL", "w/v" - confirm).
mass_per_volume = re.compile(r'[\s/][\w\(]?[wmg][\s/][\w]?[LvV][^A-Za-z]')
# "g" with an optional one-character prefix (presumably "g", "mg", "kg" - confirm).
mass = re.compile(r'[\s/][\w\(]?[g][^A-Za-z]')
# "L", "v" or "V" with up to two optional prefix characters.
volume = re.compile(r'[\s/][\w\(]?[\w]?[LvV][^A-Za-z]')
# A single "C", "K" or "F" preceded by a non-letter and followed by whitespace.
temperature = re.compile(r'[^A-Za-z](C|K|F)\s')
# Case-insensitive time words and abbreviations (h, hr, min, sec, s).
# NOTE(review): this name would shadow the standard-library `time` module if that
# module were ever imported into the same namespace.
time = re.compile(r'(?i)year|month|week|day|\sh.?\s|hour|hr\.?\s|\smin\.?\s|\ssec\.?\s|\ss\.?\s')
# The literal "pH" surrounded by whitespace.
pH = re.compile(r'\spH\s')

## Improvements
# Case-insensitive vocabulary for "the best / final condition".
# Here "up to" must be surrounded by whitespace.
best = re.compile(r'(?i)optimal|best|ideal|standard|optimum|\sup\sto\s|final|maxim')
# Case-insensitive vocabulary for a reported improvement or its magnitude.
# (The alternative "best" used to be listed twice; the duplicate was redundant and
# has been dropped - the set of matching strings is unchanged.)
improvement = re.compile(r'(?i)higher|best|better|improve|achieve|up\sto|fold|factor|times|[^a-z]log[^a-z]')

## Numbers
# A digit with a non-letter on both sides (case-insensitive, so capitals count as
# letters too). Because both neighbours are required, a digit at the very start
# or end of the text is not matched.
number = re.compile(r'(?i)[^a-z]\d[^a-z]')
# A decimal number ("3.14") or a digit directly followed by an optional space and "%".
specific_number = re.compile(r'\d+\.\d+|\d\s?%')
# An integer or decimal percentage, with an optional space before "%".
# The previous form (\d+\.?\d+?\s?%) demanded at least two digits, so single-digit
# values such as "5%" or "5 %" were never matched.
percentage = re.compile(r'\d+(?:\.\d+)?\s?%')
# Any percent sign.
percent_sign = re.compile(r'%')

## Expression
# Case-insensitive solubility vocabulary: the stem "solub", a standalone "tag"
# (non-letters required on both sides), or "inclusion bod..." (body/bodies).
solubility = re.compile(r'(?i)solub|[^a-z]tag[^a-z]|inclusion\sbod')
# Case-insensitive expression-level vocabulary; the whole alternation is one
# capture group, so group 1 holds the stem that matched.
expression = re.compile(r'(?i)(titer|titre|express|productiv)')

In [3]:
def parse_pubmed_abstracts(txtfile, encoding='utf-8'):
    """Split a PubMed text export into one string per record.

    The file is read in full and cut at every line that contains "PMID": the
    text from "PMID" to the end of that line (newline included) is the separator
    and is discarded.

    Parameters
    ----------
    txtfile : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. It is given explicitly so the
        result does not depend on the platform's default encoding.
        (UTF-8 is assumed for PubMed exports - confirm for other sources.)

    Returns
    -------
    list of str
        The chunks between separators, in file order. Whatever follows the last
        "PMID" line is returned as the final element, so that element is an
        empty string when the file ends right after it.
    """
    with open(txtfile, 'r', encoding=encoding) as infile:
        text = infile.read()
    return re.split(r'PMID.*\n', text)

# Patterns that pull individual fields out of one record of the text export.
# Abstract: the first paragraph that starts with a letter and is followed by a
# blank line and a "DOI"/"PMCID"/"PMID" line.
_ABSTRACT_RE = re.compile(r'(?si)\n\n([a-z].*?)\n\n(DOI|PMCID|PMID)')
# Title: the first paragraph that starts with a letter and ends with ".".
_TITLE_RE = re.compile(r'(?si)\n\n([a-z].*?)\.\n\n')
# DOI: from a lowercase "doi:" up to the last "." on the same line.
_DOI_RE = re.compile(r'(doi:.*)\.')


def _first_group(pattern, text, default):
    """Return group 1 of the first match of `pattern` in `text`, or `default` when nothing matches."""
    found = pattern.search(text)
    return found.group(1) if found else default


def filter_abstracts(txtfile,criteria_dct,outfile):
    """Flag each record of a PubMed text export against a set of criteria and save the table to Excel.

    Parameters
    ----------
    txtfile : str or path-like
        Text export, split into records by ``parse_pubmed_abstracts``.
    criteria_dct : dict
        Maps a column label to a dict with the keys ``'Includes'`` and
        ``'Excludes'``; both values are handed unchanged to ``textproc.narrow``
        together with the abstract text.
    outfile : str or path-like
        Destination passed to ``DataFrame.to_excel``.

    Returns
    -------
    None
        The result is only written to `outfile`.

    Notes
    -----
    The output has the columns ``Title``, ``doi`` and ``entry`` followed by one
    boolean column per criterion. A criterion is True when ``textproc.narrow``
    returns anything other than an empty list. If no abstract can be located in
    a record, the whole record text is used as the entry; a missing title or
    DOI becomes an empty string.
    """
    papers = parse_pubmed_abstracts(txtfile)
    columns = ['Title','doi','entry']+list(criteria_dct.keys())

    # Collect plain dict rows and build the frame once, instead of enlarging a
    # DataFrame cell by cell inside the loop.
    rows = []
    for paper in papers:
        # Splitting on the PMID line leaves an empty chunk after the last record;
        # skip such whitespace-only chunks so they do not become blank rows.
        if not paper.strip():
            continue

        abst = _first_group(_ABSTRACT_RE, paper, paper)
        title = _first_group(_TITLE_RE, paper, '').strip()
        doi = _first_group(_DOI_RE, paper, '').strip()

        row = {'Title': title, 'doi': doi, 'entry': abst}
        for key, criteria in criteria_dct.items():
            hits = textproc.narrow(abst, criteria['Includes'], criteria['Excludes'])
            row[key] = hits != []
        rows.append(row)

    pd.DataFrame(rows, columns=columns).to_excel(outfile)
    


In [6]:
# Screening criteria: each label pairs one topic pattern with the shared
# `number` and `improvement` patterns as required matches; nothing is excluded.
criteria_dct = {
    label: {'Includes': [topic, number, improvement], 'Excludes': []}
    for label, topic in (
        ('Quant. solubility', solubility),
        ('Quant. titer', expression),
    )
}

# NOTE: both paths are absolute and specific to one machine.
filter_abstracts(
    '/home/jsd/Desktop/romel.txt',
    criteria_dct,
    '/home/jsd/Desktop/test.xlsx',
)

In [41]:
# df = pd.DataFrame(columns=['Title','doi','entry','Quant. solubility','Quant. titer'])
# sol_includes = [solubility,number,improvement]
# sol_excludes = []
# titer_includes = [expression,number,improvement]
# titer_excludes = []
# out = []
# for paper in papers:
#     try:
#         abst = re.search(r'(?si)\n\n([a-z].*?)\n\n(DOI|PMCID|PMID)',paper).groups()[0]
#     except:
#         abst = paper
#     try:
#         title = re.search(r'(?si)\n\n([a-z].*?)\.\n\n',paper).groups()[0].strip()
#     except:
#         title = ''
#     try:
#         doi = re.search(r'(doi:.*)\.',paper).groups()[0].strip()
#     except:
#         doi = ''
#     sol_hits = textproc.narrow(abst,sol_includes,sol_excludes)
#     titer_hits = textproc.narrow(abst,titer_includes,titer_excludes)
#     print(sol_hits)
#     print(titer_hits)
#     if titer_hits != []:
#         quant_titer = True
#     else:
#         quant_titer = False
#     if sol_hits != []:
#         quant_sol = True
#     else:
#         quant_sol = False
#     df.loc[len(df)] = [title,doi,abst.strip(),quant_sol,quant_titer]

# df
# # re.search(r'(?si)\n\n([a-z].*?)\n\n(DOI|PMCID|PMID)',papers[20]).groups()[0]
# # df['Quant. solubility'].value_counts()
# # df['Quant. titer'].value_counts()
# # df.loc[:,['Quant. titer','Quant. solubility']].value_counts()

# # df.loc[0,'Title'][0]
# # df.loc[df['Quant. solubility']==True,:]
# # df.to_excel('test.xlsx')