In [35]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import re
import os
from itertools import dropwhile
import numpy as np

In [15]:
from lxml import etree
from bs4 import BeautifulSoup

In [16]:
all_xml = os.listdir('./xml')
dir_xml = ['xml/' + item for item in all_xml] 
len(dir_xml)

19049

In [17]:
dir_xml[0:10]

['xml/22215277.cermxml',
 'xml/22215412.cermxml',
 'xml/22232390.cermxml',
 'xml/22233099.cermxml',
 'xml/22235270.cermxml',
 'xml/22236465.cermxml',
 'xml/22242178.cermxml',
 'xml/22246787.cermxml',
 'xml/22249592.cermxml',
 'xml/22252606.cermxml']

In [18]:
## conditions

def alphachars(x): return sum(a.isalpha() for a in x) > 4
def charchar(x): return bool(re.match(r".*[a-zA-Z][a-zA-Z].*", x)) == True
def istitle(x): return ((('method' in x) or ('material' in x) or ('result' in x) or ('discussion' in x) or ('conclusion' in x)
                        or ('references' in x) or ('bibliography' in x) 
                        or ('data' in x) or ('population' in x) or ('patient' in x) or ('cohort' in x) or ('image' in x)) 
                        and (sum(l.isalpha() for l in x) < 30))
def istextblock(x): return sum(l.isalpha() for l in x) > 100
#def methodtitle(x): return ((('method' in x)) or ('material' in x) and (sum(l.isalpha() for l in x)<30) and ('supplement' not in x)) 
def methodtitle(x): return ((('method' in x) or ('material' in x) or ('data' in x)
                            or ('population' in x) or ('patient' in x) or ('cohort' in x) or ('image' in x)) and (sum(l.isalpha() for l in x)<30)) 
## symbol cleaning

def lbracketcount(x): return x.count('{') < 4
def rbracketcount(x): return x.count('}') < 4
def tagcount(x): return x.count('"."') < 4
def hashcount(x): return x.count('#') < 4
def divcount(x): return x.count('</') < 4
def linecount(x): return x.count('||') < 4

In [19]:
pmid_list = []
methods_list = []

for xml in tqdm(dir_xml):

    base_text = []
    keep_text = []
    cull_text = []
    method_candidates = []
    
    ## (0) write the pmid to list
    pmid = xml.replace('xml/','').replace('.cermxml','')
    pmid_list.append(pmid)

    try:
        
        ## (1) parse each xml file into etree
        tree = etree.parse(xml)
    
        ## (2) remove reference and figure elements
        for elem in tree.xpath('//xref'):
            elem.getparent().remove(elem)
    
        for elem in tree.xpath('//fig'):
            elem.getparent().remove(elem)
        
        ## (3) extract only text sections from the article body
        sections = tree.xpath('/article/body/sec')

        ## it would be nice to extract methods sections per <title> tag
        ## however - section naming is inconsistent, and subsections appear at same level as sections
    
        ## (4) extract raw text contained across all sections (as a list with each element containing a 'block')
        raw_text = tree.xpath('/article/body/sec//descendant-or-self::*/text()')
    
        ## (5) remove line breaks and extra spaces
        base_text = [x.replace('\n', ' ') for x in raw_text]
        base_text = [x.replace('    ', ' ') for x in base_text]
        base_text = [x.replace('   ', ' ') for x in base_text]
        base_text = [x.replace('  ', ' ') for x in base_text]
        base_text = [x.lower() for x in base_text]
        
        ## (6) keep list elements if conforms to conditions, return 'cull_text' that is much cleaner
        for x in base_text:
            if (charchar(x) and alphachars(x)) and (istitle(x) or istextblock(x)):
                keep_text.append(x)
            else:
                continue

        for x in keep_text:
            if (lbracketcount(x) and rbracketcount(x) and tagcount(x) and hashcount(x) and divcount(x) and linecount(x)):
                cull_text.append(x)
            else:
                continue 
            
        ## (7) find method title and method section
        
        for i, x in enumerate(cull_text):    
            method_following = [] #list of all elements following and including methods title
            method_only = [] #list of all elements inbetween 'method' title and next section title

            if methodtitle(x) == True:
                method_following = cull_text[i:] 
                
                for i, x in enumerate(method_following):
                    if 'result' in x and sum(l.isalpha() for l in x) <  30:
                        #print("result title!")
                        method_only = method_following[:i]
                        break
                    elif 'discussion' in x and sum(l.isalpha() for l in x) <  30:
                        #print("discussion title!")
                        method_only = method_following[:i]                
                        break
                    elif 'conclusion' in x and sum(l.isalpha() for l in x) <  30:
                        #print("conclusion title!")
                        method_only = method_following[:i]               
                        break
                    elif 'references' in x and sum(l.isalpha() for l in x) <  30:
                        #print("reference title!")
                        method_only = method_following[:i]            
                        break
                    elif 'bibliography' in x and sum(l.isalpha() for l in x) <  30:
                        #print("reference title!")
                        method_only = method_following[:i]                 
                        break                      
                    else:
                        method_only = method_following #if none of these are found to end text block, then take the whole text block
    
                method_string = ' '.join(method_only)        
    
                if istextblock(method_string):
                    method_candidates.append(method_string)
                else:
                    continue
                
            else:
                continue
        
        print(len(method_candidates))    
        methods_list.append(method_candidates) # each paper now has a list of method candidates (each is a string)

    except:
        print('fail')
        methods_list.append('fail')
        continue            

print(len(methods_list))

  0%|          | 0/19049 [00:00<?, ?it/s]

1
2
3
0
2
1
3
1
1
3
4
3
0
2
2
0
4
1
3
1
1
1
2
3
3
1
2
2
1
3
3
2
3
3
4
1
4
1
1
2
2
0
2
1
2
2
0
3
3
5
2
9
3
1
1
2
4
2
2
8
1
2
7
1
0
2
3
1
2
0
3
2
5
2
3
3
1
1
2
3
2
1
0
2
3
0
2
1
0
2
2
2
0
0
1
1
1
2
3
1
5
3
0
0
2
1
1
4
1
2
1
1
3
4
5
0
1
3
1
2
0
2
6
3
3
1
6
1
3
7
2
2
3
1
1
5
1
0
1
3
1
3
1
1
2
4
2
3
1
2
2
1
2
1
0
1
0
1
0
1
4
3
5
1
3
3
1
1
5
1
1
3
1
0
0
1
3
1
2
2
2
2
3
2
4
1
2
3
1
1
2
5
0
0
1
1
1
1
2
2
0
2
3
3
4
0
0
1
2
4
3
2
1
1
2
1
3
3
1
3
2
1
1
2
3
5
3
0
2
0
1
1
3
2
3
2
2
0
2
1
3
1
2
1
2
3
1
3
4
1
1
2
2
1
1
3
2
3
2
1
4
2
2
1
5
6
1
2
8
4
2
1
1
3
1
0
2
2
3
1
0
1
2
1
1
6
1
6
2
1
2
1
1
2
1
2
1
3
1
2
6
5
2
1
2
4
3
1
1
3
6
1
2
4
2
4
0
1
1
2
4
2
4
1
3
1
2
2
1
3
2
4
2
2
1
0
1
2
6
1
2
3
2
3
2
5
5
1
3
2
0
1
2
2
2
1
3
2
1
3
2
3
2
2
2
1
1
2
1
0
1
1
2
2
2
2
1
1
2
1
1
1
0
2
3
2
4
1
4
1
2
3
2
4
2
2
1
2
2
2
1
1
2
2
2
2
3
1
1
3
2
2
2
1
2
3
1
1
1
3
3
6
2
2
2
2
3
4
1
2
0
0
2
2
4
2
1
3
1
1
2
2
5
2
1
1
0
0
0
2
1
4
4
3
0
0
1
3
3
1
3
1
6
2
0
1
3
4
1
0
2
2
5
0
0
5
0
1
2
3
3
3
1
5
9
3
4
3
3
1
2
3
0
2
1
0
2
0
2
3


3
3
5
1
1
2
1
2
2
2
0
4
5
2
6
1
1
3
1
2
2
3
4
5
1
2
1
4
1
2
2
5
1
1
2
1
3
2
0
2
1
0
2
1
2
1
2
2
2
2
2
2
4
3
2
2
1
2
1
0
1
5
8
3
2
2
1
3
2
1
1
4
1
2
2
1
1
3
3
6
2
0
4
1
3
0
1
1
3
1
0
2
6
2
3
4
1
4
0
5
1
4
2
4
3
4
3
2
3
2
4
1
2
3
3
3
0
2
7
7
1
2
3
2
0
1
4
2
4
1
4
3
1
1
1
3
3
2
1
0
2
0
1
3
1
1
1
1
3
1
1
4
4
2
6
2
2
1
1
3
3
2
1
3
7
2
4
1
2
2
2
9
3
3
4
1
3
2
2
2
4
2
3
6
1
4
2
2
1
2
3
4
2
3
2
3
2
1
5
1
4
1
2
2
4
2
1
2
5
1
1
0
1
2
1
3
2
1
3
1
3
1
2
2
2
1
1
3
2
2
2
1
1
2
2
2
5
4
2
2
1
1
2
2
1
7
4
1
2
1
4
1
2
2
2
4
3
3
3
1
6
2
3
4
0
1
3
0
0
2
8
1
1
0
3
1
2
2
3
1
1
2
1
2
1
3
1
2
2
1
2
1
2
0
2
3
2
1
8
1
2
1
1
6
4
0
6
1
3
3
3
4
1
3
5
4
4
3
3
2
6
2
3
3
2
3
4
1
3
3
4
4
3
3
3
2
2
3
2
3
3
3
3
0
2
2
0
3
2
3
2
2
2
5
1
1
3
2
4
4
3
3
3
3
4
1
1
3
3
1
3
1
1
1
3
2
2
2
3
2
3
3
1
1
2
2
1
3
1
4
4
4
2
1
4
4
4
1
1
0
3
3
3
2
3
1
4
5
0
2
0
3
2
3
1
2
5
1
3
2
2
2
4
1
5
2
3
4
7
2
3
1
1
6
3
2
2
3
3
2
1
1
3
6
1
0
3
4
0
0
4
2
4
2
5
0
2
2
1
2
3
3
1
5
2
2
1
7
3
2
7
3
2
1
7
2
2
8
2
1
2
2
5
2
1
1
4
4
5
2
1
2
4
1
1
9
1
2
4
4


1
1
5
2
3
4
4
2
4
4
2
1
5
1
0
1
1
2
1
4
2
5
4
3
3
1
3
3
1
1
4
6
4
1
6
2
1
2
0
3
3
1
2
1
2
2
0
3
0
1
1
2
1
2
1
6
2
2
3
2
0
3
2
2
7
3
2
2
2
5
3
2
9
6
1
4
4
1
2
3
2
2
2
3
3
2
3
1
2
5
1
1
1
3
1
2
3
3
1
3
2
3
2
4
1
2
1
1
2
0
3
1
0
3
2
1
2
3
3
1
7
3
4
0
2
2
4
2
0
3
2
1
3
3
1
2
10
2
3
3
3
1
5
3
3
1
2
2
2
1
6
1
2
4
2
1
3
2
3
0
2
2
2
2
3
0
1
2
3
2
1
1
2
4
2
1
3
1
0
1
2
0
3
1
2
2
1
2
3
3
6
4
3
3
4
2
2
2
4
2
3
3
7
3
3
2
1
1
3
0
3
5
1
2
1
3
1
1
4
2
1
2
1
5
3
2
1
4
2
1
4
1
1
1
0
1
2
5
1
2
2
3
4
4
2
10
0
2
2
1
2
4
1
6
1
1
1
4
2
3
1
2
2
1
2
2
2
6
1
2
1
0
7
2
2
5
1
4
5
4
2
8
2
0
2
4
2
2
3
0
1
2
3
2
3
0
2
4
1
3
5
2
3
2
2
3
2
4
6
5
2
5
3
4
6
4
6
3
4
1
2
3
1
1
2
5
2
4
3
2
6
2
2
1
1
4
2
3
2
7
2
4
8
2
2
2
2
0
1
3
2
2
3
1
2
6
3
2
3
11
1
2
2
2
4
3
2
2
2
1
2
5
4
0
2
2
1
7
4
1
4
2
1
5
2
2
1
2
2
4
2
3
2
1
2
3
0
2
1
3
0
2
2
2
2
0
3
3
2
3
4
2
4
2
2
1
3
1
4
1
2
3
1
2
1
4
5
4
5
4
1
5
4
3
5
4
2
4
1
1
0
1
3
4
5
3
5
5
3
2
2
2
2
2
0
2
1
2
2
2
2
2
0
2
1
3
0
2
1
2
2
2
2
3
0
5
2
1
4
2
2
1
2
2
4
4
3
4
2
1
2
0
3
3
6
4
0
1
2

1
2
3
5
2
1
3
1
2
1
1
2
4
4
3
3
4
5
6
4
0
4
2
4
1
0
2
3
1
0
2
2
1
1
2
5
5
6
1
1
1
3
8
5
2
1
4
3
4
3
3
5
1
1
2
1
2
1
5
1
3
2
1
2
1
4
8
3
5
2
2
1
2
2
5
4
3
4
5
2
5
2
1
1
1
1
2
5
3
4
4
2
2
3
1
2
0
2
3
1
2
3
5
5
1
2
1
1
2
4
3
3
2
3
3
2
2
3
3
3
1
3
9
6
7
6
3
10
3
3
3
5
4
5
6
2
2
5
3
4
3
5
1
1
4
1
1
3
4
0
1
2
2
3
3
1
3
1
3
2
3
3
2
3
0
2
3
2
2
1
3
2
1
2
2
2
6
7
1
1
2
4
2
4
1
1
2
6
1
1
2
2
1
2
2
2
4
1
4
1
3
1
5
1
2
1
3
1
4
3
2
0
4
1
3
3
4
1
6
1
1
3
1
1
1
2
2
6
5
4
3
4
2
5
1
2
2
3
3
1
2
0
3
2
2
2
7
3
2
1
2
3
2
1
0
5
3
5
5
4
6
4
5
3
4
2
2
5
6
5
6
8
3
3
1
2
2
2
4
2
1
0
2
3
2
2
1
1
2
2
2
2
2
3
0
0
1
4
1
1
2
0
4
1
2
2
0
5
2
2
0
1
2
4
3
1
1
2
2
4
1
3
4
2
2
0
2
3
2
2
2
1
2
2
2
0
2
4
3
2
3
2
3
2
1
7
3
1
4
5
6
1
5
2
4
7
4
2
4
4
2
1
3
1
3
2
5
4
2
3
2
2
2
4
2
6
2
2
3
2
2
4
1
2
2
3
2
1
2
3
3
2
2
1
0
4
1
4
3
4
4
3
6
7
2
8
3
1
1
2
2
1
7
4
4
4
4
3
5
6
2
1
2
2
2
4
6
3
2
3
1
2
3
2
2
2
5
3
1
5
2
4
2
2
3
5
2
8
1
4
4
4
2
3
3
1
3
2
1
1
1
4
5
3
5
5
1
5
6
4
4
5
5
3
3
3
3
2
1
2
0
3
3
1
3
2
1
1
3
3
3
5
2
2
1
3
3
5
3
2

2
2
3
4
2
1
1
5
4
1
6
1
2
2
2
3
4
4
4
6
1
2
2
5
4
2
1
1
6
2
1
2
3
4
4
5
3
3
11
9
5
3
3
1
3
5
1
1
4
1
6
4
5
1
1
3
3
2
2
2
1
5
2
5
1
2
0
1
6
2
3
1
1
1
1
5
1
2
2
5
2
4
1
4
3
2
5
3
4
1
3
1
4
6
2
2
6
4
5
3
4
2
0
3
4
6
4
3
5
6
4
3
1
2
1
0
1
2
2
3
1
0
2
1
5
4
4
0
3
3
1
3
0
5
2
2
4
3
5
4
2
3
1
2
2
3
3
3
2
2
3
3
3
2
0
4
4
2
4
2
3
2
4
2
3
1
4
6
3
1
2
2
2
2
3
2
4
1
2
2
2
4
4
3
2
4
3
6
2
2
4
5
5
4
2
2
1
2
2
5
4
0
0
2
1
2
1
2
4
0
4
2
2
1
6
3
4
1
2
0
3
1
2
2
4
4
1
1
3
0
1
3
1
2
4
1
2
0
3
3
3
4
4
2
2
2
2
3
5
0
4
8
2
2
3
0
10
2
3
1
2
3
3
1
2
1
2
1
2
2
4
2
4
4
2
5
1
5
4
1
1
6
6
1
4
4
3
1
1
3
1
1
5
3
6
7
1
2
4
2
0
2
3
7
5
4
5
7
6
5
4
4
1
5
3
2
4
4
6
0
1
4
7
5
4
6
5
2
4
5
0
1
1
0
3
1
3
2
1
2
3
2
1
1
2
2
2
0
1
2
5
1
0
2
5
2
5
1
2
2
2
0
1
3
1
1
2
3
2
3
4
2
1
3
1
2
1
2
6
3
1
1
4
2
5
3
1
2
2
7
3
4
4
1
2
5
1
1
1
1
0
0
1
1
1
2
1
1
2
1
5
1
3
3
1
1
2
2
1
2
3
3
3
2
5
1
4
2
2
1
1
9
0
1
2
0
1
4
3
2
1
2
1
5
2
2
5
6
6
1
6
0
1
3
2
1
1
2
2
3
2
3
0
4
2
2
4
3
5
7
4
0
1
1
2
1
6
3
2
3
2
0
3
4
1
1
2
4
2
2
1
3
4
1
2
4
1
2
2


In [28]:
print(len(methods_list))
print(len(pmid_list))

19049
19049


In [29]:
def count_x(lst, x):
    count = 0
    for ele in lst:
        if (ele == x):
            count = count + 1
    return count

count_x(methods_list,[])

1256

In [30]:
methods_df = pd.DataFrame({'pmid':pmid_list, 'methods':methods_list})
methods_df.to_csv('output/03a_pdf_methods_raw.csv')

In [54]:
explode_df = pd.DataFrame(methods_df['methods'].tolist())

In [55]:
explode_df = explode_df.astype(str).apply(lambda x: x.str[:25000])

In [56]:
explode_df['pmid'] = pmid_list

In [57]:
explode_df.replace('None', np.nan, inplace=True)

In [58]:
explode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19049 entries, 0 to 19048
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       17793 non-null  object
 1   1       13320 non-null  object
 2   2       7762 non-null   object
 3   3       4139 non-null   object
 4   4       2063 non-null   object
 5   5       1026 non-null   object
 6   6       481 non-null    object
 7   7       235 non-null    object
 8   8       116 non-null    object
 9   9       58 non-null     object
 10  10      30 non-null     object
 11  11      14 non-null     object
 12  12      9 non-null      object
 13  13      8 non-null      object
 14  14      5 non-null      object
 15  15      4 non-null      object
 16  16      3 non-null      object
 17  17      2 non-null      object
 18  18      2 non-null      object
 19  19      2 non-null      object
 20  20      1 non-null      object
 21  21      1 non-null      object
 22  22      1 non-null    

In [59]:
explode_df.to_csv('output/03b_pdf_methods_explode.csv')
explode_df.sample(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,pmid
8648,materials and methods materials a prospective ...,materials a prospective clinical trial was con...,"methods experimental tests of the hypothesis, ...",,,,,,,,...,,,,,,,,,,32484597
242,"iii. methods for all methods, an image scene c...",a. data cohort bca histopathology slides wer...,,,,,,,,,...,,,,,,,,,,23392336
11937,materials and methods patients this retrospect...,patients this retrospective study was approved...,,,,,,,,,...,,,,,,,,,,33486606
16670,methods study procedures were approved by the ...,data acquisition and processing dti data was a...,,,,,,,,,...,,,,,,,,,,34557049
15915,2 materials and models 2.1 dataset description...,2.1 dataset description the first three datase...,original ct image of patient (b) original ct i...,original ct image of patient (c) gray scale im...,gray scale image of (a) (d) gray scale image o...,gray scale image of (b) (e) original x-ray ima...,original x-ray image of patient (h) original x...,original x-ray image of patient (i) (j) gray i...,gray image of scaled (g) gray image of scaled ...,gray image of scaled (h) framework pipeline. t...,...,,,,,,,,,,34395156
5271,"materials and methods in this paper, a marc wi...",,,,,,,,,,...,,,,,,,,,,31061259
11673,materials and methods study design and methods...,study design and methods a total of 13 endosco...,image collection table 1 shows the compositio...,,,,,,,,...,,,,,,,,,,33415420
16639,materials and methods we examined 373 colorect...,nbi images polyp images images aiphp-features:...,polyp images images aiphp-features: geometry o...,,,,,,,,...,,,,,,,,,,34551512
12890,material and methods this retrospective study ...,,,,,,,,,,...,,,,,,,,,,33730292
7909,materials and methods the data for this paper ...,,,,,,,,,,...,,,,,,,,,,32193984


In [None]:
for STOP in STOP

# testing

In [39]:
## (1) parse each xml file into etree

tree = etree.parse("xml/22246787.cermxml")

In [40]:
## (2) remove reference and figure elements

for elem in tree.xpath('//xref'):
    elem.getparent().remove(elem)
    
for elem in tree.xpath('//fig'):
    elem.getparent().remove(elem)

In [41]:
## (3) extract only text sections from the article body

sections = tree.xpath('/article/body/sec')

for child in sections:
    print(child.tag, child.attrib)

sec {'id': 'sec-1'}
sec {'id': 'sec-2'}
sec {'id': 'sec-3'}
sec {'id': 'sec-4'}
sec {'id': 'sec-5'}
sec {'id': 'sec-6'}


In [42]:
## it would be nice to extract methods sections per <title> tag
## however - section naming is inconsistent, and subsections appear at same level as sections

In [51]:
## (4) extract raw text contained across all sections
## this extracts as a list with each element containing a 'block'

raw_text = tree.xpath('/article/body/sec//descendant-or-self::*/text()')
raw_text

['\n      ',
 'Introduction',
 '\n      ',
 '\n        Adolescent idiopathic scoliosis (AIS) is a complex\nthreedimensional (3D) deformation of the natural shape of the\nspinal column. AIS patients have pathological spinal curves\nin the coronal plane, alterations of the kyphosis or\nlordosis in the sagittal plane, and rotations of the vertebrae. The\nsurgical instrumentation for the AIS is a complex procedure\ninvolving many difficult decisions, such as the spinal\nsegments to instrument, the type/location/number of hooks or\nscrews, the rod diameter/length/shape, the implant\nattachment order, and the amount of rod rotation [\n        ',
 '\n      ',
 'Currently, the Lenke classification model is prevalent in\nsurgical planning to determine the appropriate region of the\nspine to be fused. The Lenke model is described by a chart,\ncalled the Lenke chart, which specifies the criteria to separate\nthe spine curve shapes into six different types. A Cobb angle\ncharacterizes the spine cu

In [52]:
## (5) remove line breaks and extra spaces

base_text = [x.replace('\n', ' ') for x in raw_text]
base_text = [x.replace('    ', ' ') for x in base_text]
base_text = [x.replace('   ', ' ') for x in base_text]
base_text = [x.replace('  ', ' ') for x in base_text]
base_text = [x.lower() for x in base_text]
base_text

[' ',
 'introduction',
 ' ',
 ' adolescent idiopathic scoliosis (ais) is a complex threedimensional (3d) deformation of the natural shape of the spinal column. ais patients have pathological spinal curves in the coronal plane, alterations of the kyphosis or lordosis in the sagittal plane, and rotations of the vertebrae. the surgical instrumentation for the ais is a complex procedure involving many difficult decisions, such as the spinal segments to instrument, the type/location/number of hooks or screws, the rod diameter/length/shape, the implant attachment order, and the amount of rod rotation [ ',
 ' ',
 'currently, the lenke classification model is prevalent in surgical planning to determine the appropriate region of the spine to be fused. the lenke model is described by a chart, called the lenke chart, which specifies the criteria to separate the spine curve shapes into six different types. a cobb angle characterizes the spine curve in one of three spine regions, namely, the proxim

In [53]:
## conditions

def alphachars(x): return sum(a.isalpha() for a in x) > 4
def charchar(x): return bool(re.match(r".*[a-zA-Z][a-zA-Z].*", x)) == True
def istitle(x): return ((('method' in x) or ('material' in x) or ('result' in x) or ('discussion' in x) or ('conclusion' in x)
                        or ('references' in x) or ('bibliography' in x) or ('data' in x)) 
                        and (sum(l.isalpha() for l in x) < 30))
def istextblock(x): return sum(l.isalpha() for l in x) > 100
#def methodtitle(x): return ((('method' in x)) or ('material' in x) and (sum(l.isalpha() for l in x)<30) and ('supplement' not in x)) 
def methodtitle(x): return ((('method' in x) or ('data' in x) or ('material' in x)) and (sum(l.isalpha() for l in x)<30)) 
## symbol cleaning

def lbracketcount(x): return x.count('{') < 4
def rbracketcount(x): return x.count('}') < 4
def tagcount(x): return x.count('"."') < 4
def hashcount(x): return x.count('#') < 4
def divcount(x): return x.count('</') < 4
def linecount(x): return x.count('||') < 4

In [54]:
keep_text = []
cull_text = []
    
for x in base_text:
    if (charchar(x) and alphachars(x)) and (istitle(x) or istextblock(x)):
        keep_text.append(x)
    else:
        continue
    
for x in keep_text:
    if (lbracketcount(x) and rbracketcount(x) and tagcount(x) and hashcount(x) and divcount(x) and linecount(x)):
        cull_text.append(x)
    else:
        continue

In [55]:
cull_text

[' adolescent idiopathic scoliosis (ais) is a complex threedimensional (3d) deformation of the natural shape of the spinal column. ais patients have pathological spinal curves in the coronal plane, alterations of the kyphosis or lordosis in the sagittal plane, and rotations of the vertebrae. the surgical instrumentation for the ais is a complex procedure involving many difficult decisions, such as the spinal segments to instrument, the type/location/number of hooks or screws, the rod diameter/length/shape, the implant attachment order, and the amount of rod rotation [ ',
 'currently, the lenke classification model is prevalent in surgical planning to determine the appropriate region of the spine to be fused. the lenke model is described by a chart, called the lenke chart, which specifies the criteria to separate the spine curve shapes into six different types. a cobb angle characterizes the spine curve in one of three spine regions, namely, the proximal thoracic (pt) defined between th

In [116]:
## (6) remove lines if they do not contain at least two letters next to each other

for line in base_text:
    matched = re.match(".*[a-zA-Z]{2}.*", line)
    
    if bool(matched) == False:
        base_text.remove(line)
    else:
        continue

In [117]:
## (7) remove lines if they do not contains at least 4 letters

for line in base_text:
    letters = sum(x.isalpha() for x in line)

    if letters < 4:
        base_text.remove(line)
    else:
        continue

In [118]:
base_text

['funding: this work was supported by national natural science foundation of china(30971047, 81000581), national high-tech r&d program (863 program, 2006aa02z430, ministry of science and technology of china), national key project for ind (investigational new drug)(2008zx09312, ministry of science and technology of china), ‘‘10th five-year plan’’ of national key technologies r&d program (2004ba720a21-02, ministry of science and technology of china), and the ‘‘climbing action’’ program (064119533,science and technology commission of shanghai municipality). the funding organizations had no role in study design, data collection and analysis, decision to publish or preparation of the manuscript.',
 'competing interests: the authors have declared that no competing interests exist. . these authors contributed equally to this work.',
 'introduction',
 ' depression affects about 10% of the population at some point in their life and is the leading cause of disability across the world [ ',
 ' alt

In [119]:
## (8) find the first element that is both short, and contains methods (i.e. a title) and drop all before

for i, x in enumerate(base_text):
    if ('ethod' in x or 'aterial' in x) and sum(l.isalpha() for l in x) <  30:
        print("method title!")
        method_following = base_text[i:]
        break
    else:
        print('not found!')
        method_following = []

not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
method title!


In [120]:
len(method_following)

30

In [123]:
method_following

['materials and methods',
 'the study was conducted at the division of mood disorders, shanghai mental health center, shanghai jiao tong university school of medicine between jan 2007 and dec 2009. outpatients were recruited from the clinic and ward of shanghai mental health center. all procedures were reviewed and approved by institutional review boards of shanghai mental health center. written informed consent was obtained from each subject before any study-related procedures were performed.',
 'subjects',
 'inclusion criteria for ssd group were: two or more depressive symptoms for at least 2 weeks with social dysfunction but without depressed mood or anhedonia, and having a total score of 17-item hamilton rating scale for depression (hrsd-17) from 8 to 16. patients were included into mdd group who met dsm-iv criteria for mdd and had the total score of hrsd-17 $17. patients were excluded if they had substance dependence, severe medical illness, organic brain disease, pregnancy. healt

In [124]:
## (9) drop everything after first instance of a results/discussion/conclusion title appears

if len(method_following) == 0:
    method_text = base_text
    print("no methods found, take all text!")

if len(method_following) > 0:
    for i, x in enumerate(method_following):
        if 'esult' in x and sum(l.isalpha() for l in x) <  30:
            print("result title!")
            method_text = method_following[:i]
            break
        elif 'iscussion' in x and sum(l.isalpha() for l in x) <  30:
            print("discussion title!")
            method_text = method_following[:i]
            break
        elif 'onclusion' in x and sum(l.isalpha() for l in x) <  30:
            print("conclusion title!")
            method_text = method_following[:i]
            break
        else:
            print('not found!')
            method_text = method_following

not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!
not found!


In [125]:
method = ' '.join(method_text)
method

'materials and methods the study was conducted at the division of mood disorders, shanghai mental health center, shanghai jiao tong university school of medicine between jan 2007 and dec 2009. outpatients were recruited from the clinic and ward of shanghai mental health center. all procedures were reviewed and approved by institutional review boards of shanghai mental health center. written informed consent was obtained from each subject before any study-related procedures were performed. subjects inclusion criteria for ssd group were: two or more depressive symptoms for at least 2 weeks with social dysfunction but without depressed mood or anhedonia, and having a total score of 17-item hamilton rating scale for depression (hrsd-17) from 8 to 16. patients were included into mdd group who met dsm-iv criteria for mdd and had the total score of hrsd-17 $17. patients were excluded if they had substance dependence, severe medical illness, organic brain disease, pregnancy. healthy control su