In [23]:
import PyPDF2
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter,HTMLConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import numpy as np
import pandas as pd

In [24]:
def find_correct_page_number(filename, start_page=6, last_page=10, marker='Job Summary'):
    """Find the first page whose PyPDF2-extracted text contains ``marker``.

    Pages ``start_page`` .. ``last_page`` (1-based, inclusive) are examined in
    order; the defaults reproduce the original hard-coded search of pages 6-10
    for 'Job Summary'.

    Parameters
    ----------
    filename : str
        Path of the PDF file to open.
    start_page : int, optional
        1-based number of the first page to examine.
    last_page : int, optional
        1-based number of the last page to examine.
    marker : str, optional
        Text that identifies the wanted page.

    Returns
    -------
    (page, page_number)
        ``page`` is the text returned by ``extractText()`` for the matching
        page and ``page_number`` is its 1-based page number.

    Raises
    ------
    ValueError
        If no examined page contains ``marker``. (Previously the text of the
        last examined page was returned together with an unrelated page
        number, or ``getPage`` failed on documents with fewer pages.)
    """
    # Keep the file open only while PyPDF2 is reading from it.
    with open(filename, "rb") as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        # Never ask for a page index beyond the end of the document.
        stop_index = min(last_page, pdf.getNumPages())
        for page_index in range(start_page - 1, stop_index):
            page = pdf.getPage(page_index).extractText()
            if marker in page:
                print('correct page')
                return page, page_index + 1
            print('no')
    raise ValueError(
        "{0!r} not found on pages {1}-{2} of {3!r}".format(
            marker, start_page, last_page, filename))

def convert_pdf_to_txt(path,page_number):
    """Extract one page of a PDF as plain text using pdfminer.

    Parameters
    ----------
    path : str
        Path of the PDF file.
    page_number : int
        1-based number of the page to convert.

    Returns
    -------
    str
        Whatever ``TextConverter`` wrote for that page (utf-8 codec); an
        empty string if the document has no such page.
    """
    page_index = page_number - 1
    rsrcmgr = PDFResourceManager()  # shared resources for interpreter and device
    retstr = StringIO()             # in-memory buffer the converter writes into
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        try:
            # `open` instead of the Python-2-only `file`; the context manager
            # closes the handle even if pdfminer raises.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for index, page in enumerate(pages):
                    if index == page_index:
                        interpreter.process_page(page)
                        break  # nothing after the requested page is needed
        finally:
            # Close the device before reading the buffer, as the original did,
            # in case the converter emits trailing output on close.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()

def convert_pdf_to_html(path,page_number):
    """Extract one page of a PDF as HTML using pdfminer.

    Parameters
    ----------
    path : str
        Path of the PDF file.
    page_number : int
        1-based number of the page to convert.

    Returns
    -------
    str
        Whatever ``HTMLConverter`` wrote for that page (utf-8 codec),
        including anything the converter writes when it is closed.
    """
    page_index = page_number - 1
    rsrcmgr = PDFResourceManager()  # shared resources for interpreter and device
    retstr = StringIO()             # in-memory buffer the converter writes into
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        try:
            # `open` instead of the Python-2-only `file`; the context manager
            # closes the handle even if pdfminer raises.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for index, page in enumerate(pages):
                    if index == page_index:
                        interpreter.process_page(page)
                        break  # nothing after the requested page is needed
        finally:
            # Close the device before reading the buffer, as the original did,
            # so any trailing output written on close ends up in the result.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()

def my_patterns():
    """Return the regular expressions used to parse the report page.

    Keys
    ----
    'stage'
        Captures the digits following ``Stage `` on a line ending in a space
        and a newline.
    'text_name'
        Captures a line that follows a newline and starts with a capital
        letter, with 1-3 digits plus a space and a capital letter, or with
        1-3 digits plus a slash.
    'seperate_value_unit'
        Group 1 captures a number: optional minus sign, digits with optional
        comma-separated groups and optional decimals, or a bare ``.ddd``.
        The number must be followed by a lowercase letter, whitespace or
        ``*``; that one character is consumed by a non-capturing group, as
        before, so ``group(0)`` still ends with it.

    The value pattern previously used an unescaped ``.``, which let the match
    swallow the character in front of the number (typically a space) and made
    values such as ``3,416.91`` or ``1,234,567`` lose their leading digits.
    Raw strings are used so the backslash escapes reach the regex engine
    unchanged.
    """
    patterns = dict()
    patterns['stage'] = r'.*Stage (\d+) \n'
    patterns['text_name'] = r'.*\n([A-Z].*|\d{1,3} [A-Z].*|\d{1,3}/.*)'
    patterns['seperate_value_unit'] = (
        r'(-?(?:\d+(?:,\d+)*(?:\.\d+)?|\.\d+))'  # the numeric value
        r'(?:[a-z\s\*])'                         # first character after it
    )
    return patterns

def find_columns_names(text_string,patterns):
    """Collect the column names and stage numbers from the pdfminer text.

    Only the part of ``text_string`` from one character before the first
    'Start Time' up to (not including) the first 'Disclaimer' is searched;
    the extra leading character keeps the newline in front of 'Start Time'
    so that ``patterns['text_name']`` can match it.

    Returns ``(columns_name_text, stage_number)``: the ``text_name`` matches
    with trailing whitespace stripped, and the raw ``stage`` matches.
    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    begin = text_string.index('Start Time') - 1
    finish = text_string.index('Disclaimer')
    table_text = text_string[begin:finish]

    raw_names = re.findall(patterns['text_name'], table_text)
    stage_number = re.findall(patterns['stage'], table_text)

    # drop the whitespace left at the end of every captured name
    cleaned_names = []
    for name in raw_names:
        cleaned_names.append(name.rstrip())
    return cleaned_names, stage_number

def remove_enter_from_string(string):
    """Return ``string`` with every newline character removed."""
    pieces = string.split('\n')
    return ''.join(pieces)

def remove_last_white_space(string,index):
    """Return ``string`` with one of its space characters removed.

    ``index`` selects which space to drop among the positions of all ' '
    characters in ``string`` (0 is the first space, -1 the last one).

    If ``string`` contains no space, or ``index`` does not address one, the
    string is returned unchanged; previously this raised IndexError from
    ``list.pop``. The unused regex search of the original was removed.
    """
    space_positions = [i for i, char in enumerate(string) if char == ' ']
    try:
        drop_at = space_positions[index]
    except IndexError:
        return string
    return string[:drop_at] + string[drop_at + 1:]


def _locate_column(strings, col):
    """Return (start, matched_name) for ``col`` inside ``strings``.

    The name is tried as given, then with its first space removed, then with
    its last space removed. Raises ValueError when none of them occurs.
    """
    if col in strings:
        return strings.index(col), col
    # Only names containing a space have space-removed variants to try.
    if ' ' in col:
        attempts = ((0, 'not in, try remove first'),
                    (-1, 'not in, try remove last one'))
        for which_space, note in attempts:
            col_new = remove_last_white_space(col, which_space)
            if col_new in strings:
                print(note)
                print(col_new)
                return strings.index(col_new), col_new
    raise ValueError("column name {0!r} not found in page text".format(col))


def find_data_names_from_string(strings,columns):
    """Cut the text that follows each column name out of ``strings``.

    For every name in ``columns`` the returned piece starts right after the
    first occurrence of the name and ends where the next column's name starts;
    the piece for the last column runs to the end of ``strings``.

    Parameters
    ----------
    strings : str
        Page text in which the names are searched.
    columns : list of str
        Column names, in the order their values should be returned.

    Returns
    -------
    list of str
        One piece of ``strings`` per column (still containing value and unit).

    Raises
    ------
    ValueError
        If a column name cannot be located. Previously this case either
        failed on an unbound local or silently reused the position of the
        previous column, producing wrong slices.
    """
    name_spans = []  # (start of name, end of name) for every column
    for col in columns:
        start, matched = _locate_column(strings, col)
        name_spans.append((start, start + len(matched)))

    data = []
    for position, (_, name_end) in enumerate(name_spans):
        if position + 1 < len(name_spans):
            value_end = name_spans[position + 1][0]
        else:
            value_end = len(strings)
        data.append(strings[name_end:value_end])
    return data

def split_value_unit_from_string(data,pattern,columns):
    """Split every entry of ``data`` into a value string and a unit string.

    Parameters
    ----------
    data : list of str
        Text pieces, one per column.
    pattern : str
        Regular expression locating the number. If it has a capturing group,
        group 1 is taken as the number; otherwise the whole match is used.
    columns : list of str
        Column names, parallel to ``data``.

    Returns
    -------
    (data_value, data_unit)
        Two lists as long as ``data``.

    Notes
    -----
    * Entries whose column name contains 'Start Time' or 'End Time' are kept
      whole as the value and get the unit 'sec'.
    * The number is taken from the capture group rather than from the whole
      match: the whole match also contains the delimiter character after the
      number, which turned '.96psi/ft' into value '.96p' and unit 'si/ft'.
    * Commas are removed from the value; value and unit are stripped of
      surrounding whitespace.
    * An entry in which ``pattern`` finds nothing keeps its stripped text as
      the value with an empty unit instead of raising AttributeError.
    """
    data_value = ['']*len(data)  # record of values
    data_unit = ['']*len(data)   # record of units
    for i,elem in enumerate(data):
        if 'Start Time' in columns[i] or 'End Time' in columns[i]:
            data_value[i] = elem.strip()
            data_unit[i] = 'sec'
            continue

        match = re.search(pattern,elem)
        if match is None:
            data_value[i] = elem.strip()
            continue

        number_group = 1 if match.re.groups else 0
        data_value[i] = match.group(number_group).strip().replace(',','')
        # everything after the number itself is the unit
        data_unit[i] = elem[match.end(number_group):].strip()
    return data_value,data_unit


In [25]:
def main(filename):
    """Build the name / value / unit table for the report in ``filename``.

    The page is located with ``find_correct_page_number``; column names come
    from the pdfminer text (``convert_pdf_to_txt`` + ``find_columns_names``),
    while the values are cut out of the PyPDF2 page text, truncated at
    'Disclaimer' and with newlines removed.

    Returns a pandas DataFrame with the columns '1name', 'data_value' and
    'unit'.
    """
    regexes = my_patterns()

    # PyPDF2 text of the page and its 1-based page number
    raw_page, page_no = find_correct_page_number(filename)

    # pdfminer text of the same page, used only for the column names
    miner_text = convert_pdf_to_txt(filename, page_no)
    names, stage = find_columns_names(miner_text, regexes)

    # keep what precedes 'Disclaimer' and flatten it to a single line
    body = remove_enter_from_string(raw_page.split('Disclaimer')[0])

    # text following each name, then split into value and unit
    fields = find_data_names_from_string(body, names)
    values, units = split_value_unit_from_string(
        fields, regexes['seperate_value_unit'], names)

    DataFrame = pd.DataFrame({
        '1name': names,
        'data_value': np.array(values),
        'unit': pd.Series(units),
    })
    return DataFrame
# Run the extraction on one report file and print the resulting table.
# `filename` and `DataFrame` are notebook-level names.
filename = 'TCC Encana Oil  Gas - Horseshoe Hill 10H-1 Stg 15.pdf'
DataFrame = main(filename)
print(DataFrame)

no
correct page
                            1name           data_value    unit
0                      Start Time   06-Mar-12 09:29:41     sec
1                        End Time   06-Mar-12 13:31:29     sec
2                       Pump Time               190.52     min
3           Max Treating Pressure                11429     psi
4           Avg Treating Pressure                10139     psi
5                 Max Slurry Rate                 73.8     bpm
6                 Avg Slurry Rate                 62.1     bpm
7                   Slurry Volume               424446     gal
8                         Avg HHP                15427      hp
9      Max Proppant Concentration                 3.76  lb/gal
10  BH Max Proppant Concentration                 3.76  lb/gal
11           Proppant Mass Pumped              3416.91  100*lb
12                       100 Mesh                40300      lb
13             Premium White 4/70               274480      lb
14                  Premium 30/50      

In [4]:
# Step-by-step version of the statements inside main(), run at notebook level
# so that every intermediate result stays available as a notebook variable.
#
# input file
filename = 'TCC Encana Oil  Gas - Horseshoe Hill 10H-1 Stg 3.pdf'
# regular expressions used below
patterns = my_patterns()  
# PyPDF2 text of the page plus its page number
page,page_number=find_correct_page_number(filename)
# pdfminer text of the same page, used for the column names
text_string = convert_pdf_to_txt(filename,page_number)
# column names (and stage matches) from the pdfminer text
columns, stage = find_columns_names(text_string,patterns)
# the values are taken from the PyPDF2 text, using the names found above;
# keep only what precedes 'Disclaimer'
page = page.split('Disclaimer')[0] 
# remove \n from the page
page_no_enter = remove_enter_from_string(page)
# text following each column name (value and unit together)
data = find_data_names_from_string(page_no_enter,columns)
# split each piece into value and unit
data_value,data_unit = split_value_unit_from_string(data,patterns['seperate_value_unit'],columns)
# assemble the table
DataFrame= pd.DataFrame({'1name':columns,'data_value':np.array(data_value),'unit':pd.Series(data_unit)}) 
print(DataFrame)


#html_string = convert_pdf_to_html(filename,page_number)

#patterns = dict()
#patterns['stage']='.*Stage (\d+) \n'
#patterns['text_name']='.*\n([A-Z].*|\d{1,3} [A-Z].*|\d{1,3}/.*)'
#patterns['text_name']='.*\n([A-Z].*|\d{1,3} [A-Z].*|\d{1,3}/.*)'
#patterns['html_name'] = '.*<br>.*12px">(\d.*\n)'
#text_split1 = text_string.split('Disclaimer')[0]

#text_string_0 = text_string.index('Start Time') # find the first Start Time string
#text_string_1 = text_string.index('Disclaimer')
#text_split1 = text_string[text_string_0-1:text_string_1] # in order to contains \n for the Start Time

#html_string_0 = html_string.index('Start Time')
#html_string_1 = html_string.index('Disclaimer')
#html_split1 = html_string[html_string_0-1:html_string_1]

#page_string_0 = page.index('Start Time')
#page_string_1 = page.index('Disclaimer')
#page_split1 = page[page_string_0-1:page_string_1]

#print(string_0)
# miner from text
#columns_name_text = re.findall(patterns['text_name'],text_split1)
#stage_number = re.findall(patterns['stage'],text_split1)


#columns_name_text = [e.rstrip() for e in columns_name_text]

#print(columns_name_text,len(columns_name_text))
#print(page)

#page_no_enter = page
#print(page_no_enter)



#a = page_no_enter.index('Water Frac G (5)')
#print('index',a)
#columns = ['Start Time','End Time','Max Treating Pressure']


#print('data',data)
#print('data',data1)
#
# find the value in the data 

#data_number 
#print(data_value)
#print(text_split1)
#print(page)

#print(a)


no
correct page
                            1name           data_value    unit
0                      Start Time   02-Mar-12 21:11:33     sec
1                        End Time   03-Mar-12 00:03:18     sec
2                       Pump Time                  172     min
3           Max Treating Pressure                 9983     psi
4           Avg Treating Pressure                 9124     psi
5                 Max Slurry Rate                 61.0     bpm
6                 Avg Slurry Rate                 60.1     bpm
7                   Slurry Volume               378137     gal
8                         Avg HHP                13440      hp
9      Max Proppant Concentration                 3.96  lb/gal
10  BH Max Proppant Concentration                 3.95  lb/gal
11           Proppant Mass Pumped               360000      lb
12                       100 Mesh                40000      lb
13            40/70 Premium White               280000      lb
14                30/50 Interprop      

In [3]:
# Scratch string; none of the cells visible in this notebook reads `a`.
a = '123 3344 ddg'


NameError: name 'columns_name_text' is not defined

In [206]:
# Probe the value/unit pattern on a sample where the unit follows the number
# without a space.
pattern_unit = '(\d+|\d*.\d+|\d*,\d*)(?:[a-z\s\*])'
match=re.search(pattern_unit,'.96psi/ft')
# group(0) is the whole match, so it also contains the single character
# consumed by the trailing non-capturing group (recorded output: '.96p').
match.group(0)

'.96p'

In [None]:
# Inline version of find_data_names_from_string / split_value_unit_from_string
# working on the notebook-level variables `columns` and `page_no_enter`.
#
# Fixes compared with the previous revision of this cell:
#   * it referenced `columns_name_text`, which is not defined at notebook
#     level (NameError); the column names live in `columns`;
#   * a column that cannot be located now raises ValueError instead of
#     reusing the position found for the previous column;
#   * the value is taken from the capture group, so the character after the
#     number no longer ends up in the value ('.96psi/ft' gave '.96p').
index_start = list()  # where the text after each column name begins
index_end = list()    # where that text ends (start of the next column name)
for index,col in enumerate(columns):
    if col in page_no_enter:
        col_new = col
    elif ' ' in col and remove_last_white_space(col,0) in page_no_enter:
        print('not in, try remove first')
        col_new = remove_last_white_space(col,0)
        print(col_new)
    elif ' ' in col and remove_last_white_space(col,-1) in page_no_enter:
        print('not in, try remove last one')
        col_new = remove_last_white_space(col,-1)
    else:
        raise ValueError("column name {0!r} not found in page text".format(col))
    the_index_start = page_no_enter.index(col_new)
    the_index_end = the_index_start + len(col_new)

    index_end.append(the_index_start)
    if index == len(columns)-1: # end of column
        # the last piece runs to the end of the text; the first recorded
        # position belongs to the first name itself and is dropped
        index_end.append(len(page_no_enter))
        index_end.pop(0)
    index_start.append(the_index_end)

my_list = [(index_start[index],e) for index,e in enumerate(index_end)]
data = [page_no_enter[e1:e2] for e1,e2 in my_list]

pattern_unit = r'(\d+|\d*.\d+|\d*,\d*)(?:[a-z\s\*])'
match=re.search(pattern_unit,'.96psi/ft')
data_value = ['']*len(data)
data_unit = ['']*len(data)
for i,e in enumerate(data):
    if 'Start Time' in columns[i] or 'End Time' in columns[i]:
        data_value[i] = data[i].rstrip( )
        data_unit[i] = 'sec'
    else:
        print('col',e)
        match=re.search(pattern_unit,e)
        if match is None:
            # no number in this piece: keep the text, leave the unit empty
            data_value[i] = e.strip()
            continue
        data_value[i] = match.group(1).strip().replace(',','')
        data_unit[i] = data[i][match.end(1):].strip() # unit is what follows the number

In [2]:
# Exploration cell: locate the 'Job Summary' page with PyPDF2 and slice the
# text between consecutive column names.
filename = 'TCC Encana Oil  Gas - Horseshoe Hill 10H-1 Stg 13.pdf'
# NOTE(review): the file object passed to PdfFileReader is never closed here.
pdf = PyPDF2.PdfFileReader(open(filename, "rb")) 
# page_number is passed to getPage() as-is in this cell, i.e. it is used
# directly as the page index.
page_number = 5
find_page = False
while page_number<10 and not find_page: 
    page = pdf.getPage(page_number).extractText()
    if 'Job Summary' in page:
        print('yes')
        find_page = True
    else:
        print('no')
        page_number += 1

# keep only the text between 'Job Summary' and 'Disclai'
# NOTE(review): if the loop above found no page, split(...)[1] would raise
# IndexError.
page = page.split('Job Summary')[1]
page = page.split('Disclai')[0]
#print(page.split('\n'))
test_page ='  Start Time123444End Time555555'
columns = ['Start Time','End Time','Pump Time','Max Treating Pressure']
#print(page)
index_start = list()  # position one past the character following each name
index_end = list()    # start position of the following name
for index,col in enumerate(columns):
    # the column name is used as a regular expression, unescaped
    match = re.search(col,page)
    print(match.start(),match.end())
    
    index_end.append(match.start())
    if index == len(columns)-1:
        # the last slice runs to the end of the page; the first recorded start
        # belongs to the first name itself and is dropped
        index_end.append(len(page))
        index_end.pop(0)
    #elif index > 0: 
    index_start.append(match.end()+1)
#for index,e in enumerate(index_end):
# (start, end) pairs delimiting the text that follows each column name
my_list = [(index_start[index],e) for index,e in enumerate(index_end)]
data = [page[e1:e2] for e1,e2 in my_list]
#for index,e in enumerate(data):
    #while '\n' in e:
    #data[index]=e.replace('\n','')
print(my_list)
print(data[:])
#print(index_start)
#print(index_end)

    #print(page[match.end()+1:match.end()+7])
#data = page.split('\n')
#number_column = 7 
#column_names = data[2:7]
# 

no
yes
(2, 12)
(39, 47)
(74, 83)
(95, 116)
[(13, 39), (48, 74), (84, 95), (117, 713)]
[u' 05\n-Mar\n-12 \n20:37\n:51\n  ', u' 05\n-Mar\n-12 23:\n05\n:55\n  ', u' 146\n min\n ', u' 9,495\n psi\n Avg Treating Pressure\n 8,933\n psi\n Max Slurry \nRate\n 70.7\n bpm Avg Slurry Rate\n 69.9\n bpm Slurry Volume\n 415\n,753\n gal\n Avg HHP\n 15,304\n hp Max Proppant Concentration\n 4.84\n lb/gal\n BH Max Proppant Concentration\n 4.85\n lb/gal\n Proppant Mass Pumped\n 455,110\n 100*lb\n 100 Mesh\n 40,060\n lb 40/70 Premium White\n 370,000\n lb 30/50 Interprop\n 45,050\n lb Load to Recover\n 395\n,369\n gal\n Water Frac G (5)\n 256,012\n gal\n Hybor G (27)\n 139,357\n gal\n NaCl\n 13,020\n gal\n Slurry Water Frac G (5)\n 263,451\n gal\n Slurry Hybor G (27)\n 152,302\n gal\n ISIP\n 5,516\n Psi\n Frac Gradient\n 0.\n857\n psi/ft\n  ']


{}