In [6]:
from docx import Document
import pandas as pd
import numpy as np
import os

In [7]:
def get_title(table):
    
    visited_row_cells = []
    title_row = table.rows[0]
    title_text = ""
    
    for cell in title_row.cells:         
        if cell not in visited_row_cells:
            title_text += cell.text
            visited_row_cells.append(cell)
            
    return title_text

def extract_temp(title_text, temp_string):
    
    temp_index = title_text.find(temp_string) - 2 # skip space
    temp_begin = temp_index
        
    while title_text[temp_begin].isdigit() or title_text[temp_begin] == '.':
        temp_begin -= 1
        
    temp = title_text[temp_begin + 1:temp_index + 1]
    
    return temp

def get_temp(title_text):

    if 'oC' in title_text:
        return extract_temp(title_text, 'oC')
        
    elif 'o C' in title_text:
        return extract_temp(title_text, 'o C')
    
    else:
        return 'n/a'
    
def separate_tables(table_rows):
    
    subtables = []
    slice_index = 0
    visited_row_cells = []

    index = slice_index
    
    while index < len(table_rows):
        curr_row = table_rows[index]
        empty_cells = 0
        for cell in curr_row.cells:
            if cell.text == '':
                empty_cells += 1
        if empty_cells == len(curr_row.cells):
            subtables.append(table_rows[slice_index:index])
            slice_index = index + 1
            break
        
        index += 1
        
    subtables.append(table_rows[index:])
    return subtables #each subtable is just a list of rows, no longer a table

def extract_titles(row):
    
    visited_row_cells = []
    row_titles = []
    
    for cell in row.cells:
        if cell not in visited_row_cells:
            row_titles.append(cell.text.strip())
            visited_row_cells.append(cell)
    
    return row_titles

def check_if_decimal(title):
    
    if title.isdigit():
        return True
    
    decimal_pt = title.find('.')
    
    if decimal_pt == -1 or decimal_pt == len(title) - 1: #no dot or last character is dot; cannot be decimal
        return False
    elif title[decimal_pt+1:].isdigit(): #point precedes numbers, certainly decimal point
        return True
    
    return False
        

def get_column_titles(subtable):
    
    first_row_titles = extract_titles(subtable[0])
    second_row_titles = extract_titles(subtable[1])
    
    for title in second_row_titles:
        if check_if_decimal(title):
            return first_row_titles #second row is data; column titles are in first row
        
    column_titles = []
        
    while len(first_row_titles) > 0 and len(second_row_titles) > 0: #compare title by title
        if first_row_titles[0] == second_row_titles[0]:
            column_titles.append(first_row_titles[0])
            second_row_titles = second_row_titles[1:]
            
        else:
            column_titles.append(first_row_titles[0] + '_' + second_row_titles[0])
            column_titles.append(first_row_titles[0] + '_' + second_row_titles[1])
            if len(second_row_titles) > 2 and second_row_titles[2]!= second_row_titles[0]:
                column_titles.append(first_row_titles[0] + '_' + second_row_titles[2])
                if len(second_row_titles) > 3 and second_row_titles[3] != second_row_titles[0]:
                    column_titles.append(first_row_titles[0] + '_' + second_row_titles[3])
                    second_row_titles = second_row_titles[4:]
                else:
                    second_row_titles = second_row_titles[3:]
            else:
                first_row_titles = first_row_titles[1:]
                second_row_titles = second_row_titles[2:]
    
    
    return column_titles

def dataframe_rows(index, subtable):
    
    df_rows = []
    
    for row in subtable[index:]:
        row_data = []
        visited_row_cells = []
        for cell in row.cells:
            if cell not in visited_row_cells:
                row_data.append(cell.text)
            visited_row_cells.append(cell)
                
        df_rows.append(row_data)
        
    return df_rows
    

In [8]:
def dataframe(subtable, columns):
    
    if columns == extract_titles(subtable[0]): #data starts from second row           
        df_rows = dataframe_rows(2, subtable)
        return pd.DataFrame(np.array(df_rows), columns=get_column_titles(subtable))
                    
    else:
        df_rows = dataframe_rows(3, subtable)
        return pd.DataFrame(np.array(df_rows), columns=get_column_titles(subtable))

In [35]:
def get_text_index(text, table_rows):
    index = 0
    
    for row in table_rows:
        for cell in row.cells:
            if text in cell.text:
                return index
        index += 1
        
    return len(table_rows)

def interruption_index(table_rows):
    index = 0
      
    for row in table_rows:
        for cell in row.cells:
            if cell.text == '':
                return index
        index += 1
    return len(table_rows)

def get_extras(last_index, table_rows, column_titles):
    extra_info_dict = {}
    
    for i in range(last_index, len(table_rows)):
        curr_row = table_rows[i]
        visited_cells = []
        curr_property = column_titles[i]
        property_vals = []
        for cell in curr_row.cells:
            if cell not in visited_cells and cell.text != '':
                property_vals.append(cell.text)
            visited_cells.append(cell)
        extra_info_dict[curr_property] = property_vals[1:]
    return extra_info_dict

def get_num(index, text):
    start_ind = index - 1
    index += 1

    while start_ind >= 0 and text[start_ind].isdigit():
        start_ind -= 1
    
    while index < len(text) and text[index].isdigit():
        index += 1

    return text[start_ind:index+1]
    

def get_num_val(index, table_rows): #works only for one particular numval in row
    
    curr_row = table_rows[index]
    for cell in curr_row.cells:
        if check_if_decimal(cell.text):
            return (get_num(cell.text.find('.'), cell.text))
    return 'n/a'

In [39]:
DIR_NAME = 'data_docs'

comparative_dict = {} # filename to (temperature, dataframe): experimental & simulated results comparison
oil_phase_dict = {}   # filename to (temperature, dataframe, api): differential liberation of oil phase
gas_phase_dict = {}   # filename to (temperature, dataframe,): diff. lib. of gas phase
molar_comp_dict = {}  # filename to (temperature, dataframe, extra_vals_dict) : molar composition of fluid

for some_filename in os.listdir(DIR_NAME):
    document = Document(DIR_NAME + '/' + some_filename) # make sure no file is open
    tables = document.tables
    table_to_data_dict = {}
    
    comp_index = 0 #to not overwrite dict key if multiple such tables in file
    oil_index = 0
    gas_index = 0
    molar_index = 0

    for table in tables:
        table_title = get_title(table)

        if "fase óleo na" in table_title: # process oil phase data
            
            oil_index += 1
            column_row = table.rows[1]
            column_titles = []
            table_temp = get_temp(table_title)
            
            visited_row_cells = []

            for cell in column_row.cells:
                if cell not in visited_row_cells:
                    column_titles.append(cell.text)
                visited_row_cells.append(cell)

            table_rows = table.rows[2:]

            index = get_text_index("API", table_rows)                
            df_rows = dataframe_rows(0, table_rows[:index])
            API = get_num_val(index, table_rows)

            oil_phase_dict[some_filename + '_' + str(oil_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), API)

        elif "entre resultados" in table_title: # process comparative data
            
            comp_index += 1
            table_temp = get_temp(table_title)
            obs_index = get_text_index("Obs.", table.rows[1:]) 

            table_rows = table.rows[1:obs_index]
            subtables = separate_tables(table_rows)

            name_index = 0

            for subtable in subtables:
                titles = get_column_titles(subtable)
                comparative_dict[some_filename + '_' + str(comp_index) + '_' + str(name_index)] = (table_temp, dataframe(subtable, titles))
                name_index += 1

        elif "fase gás na" in table_title: # process gas phase data
            
            gas_index += 1
            column_row = table.rows[1]
            column_titles = []
            table_temp = get_temp(table_title)
            
            visited_row_cells = []

            for cell in column_row.cells:
                if cell not in visited_row_cells:
                    column_titles.append(cell.text)
                visited_row_cells.append(cell)

            table_rows = table.rows[2:]

            index = get_text_index("Densidade", table_rows)                
            df_rows = dataframe_rows(0, table_rows[:index])
            
            gas_phase_dict[some_filename + '_' + str(gas_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), Densidade)
            
        elif "Composições molares" in table_title: # process molar composition
            
            molar_index += 1
            column_row = table.columns[0]
            column_titles = []
            table_temp = get_temp(table_title)
            
            visited_row_cells = []

            for cell in column_row.cells:
                if cell not in visited_row_cells:
                    column_titles.append(cell.text)
                visited_row_cells.append(cell)
                
            column_titles = column_titles[1:] #remove title row
            df_rows = []
            
            for column in table.columns:
                row_text = []
                visited_cells = []
                row_cells = column.cells[1:]
                for cell in row_cells:
                    if cell not in visited_cells:
                        row_text.append(cell.text)
                    visited_cells.append(cell)
                df_rows.append(row_text)
                
            df_rows = df_rows[1:] #remove titles
            
            molar_comp_dict[some_filename + '_' + str(molar_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), {})
            
            
        elif "Propriedades e composições molares" in table_title:
            
            molar_index += 1
            column_row = table.columns[0]
            column_titles = []
            table_temp = get_temp(table_title)
            
            visited_row_cells = []

            for cell in column_row.cells:
                if cell not in visited_row_cells:
                    column_titles.append(cell.text)
                visited_row_cells.append(cell)

            last_index = interruption_index(table.rows[1:])
            column_titles_df = column_titles[1:last_index + 1] #remove title row and latter info
            df_rows = []
            
            for column in table.columns:
                row_text = []
                visited_cells = []
                row_cells = column.cells[1:last_index + 1]
                for cell in row_cells:
                    if cell not in visited_cells:
                        row_text.append(cell.text)
                    visited_cells.append(cell)
                df_rows.append(row_text)
            
            df_rows = df_rows[1:] #remove titles  
            extra_info_dict = get_extras(last_index, table.rows, column_titles)
            
            molar_comp_dict[some_filename + '_' + str(molar_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles_df), extra_info_dict)

        else:
            continue
            

text is API do óleo residual: 38.48
text is API do óleo residual:  18.80
text is API do óleo residual: 37.9


In [12]:
comparative_dict.items()

dict_items([('PVT_report_example3.docx_1_0', ('70.0',       Pressão(a) Pressão(a)_Bo(b) Pressão(a)_Rs(c) Pressão(a)_o(d)  \
0          390.0           1.1527            67.70             8.66   
1          360.0           1.1554            67.70             8.30   
2          330.0           1.1583            67.70             7.98   
3          300.0           1.1611            67.70             7.66   
4          280.0           1.1632            67.70             7.42   
5          270.0           1.1641            67.70             7.30   
6          260.0           1.1652            67.70             7.20   
7   252.3 (Psat)           1.1661            67.70             7.12   
8          220.0           1.1511            59.70             8.14   
9          190.0           1.1376            52.27             9.26   
10         160.0           1.1234            44.71            10.21   
11         130.0           1.1088            36.91            11.30   
12         100.0       

In [14]:
comparative_dict['PVT_report_example3.docx_1_0'][1]

Unnamed: 0,Pressão(a),Pressão(a)_Bo(b),Pressão(a)_Rs(c),Pressão(a)_o(d),Pressão(a)_o(e),Pressão(a)_Bo(b).1,Pressão(a)_Rs(c).1,Pressão(a)_o(d).1,Pressão(a)_o(e).1,Pressão(a)_Bo(b).2,Pressão(a)_Rs(c).2,Pressão(a)_o(d).2,Pressão(a)_o(e).2
0,390.0,1.1527,67.7,8.66,0.8586,1.1608,68.27,8.35,0.8573,0.7,0.84,-3.63,-0.15
1,360.0,1.1554,67.7,8.3,0.8567,1.1643,68.27,8.04,0.8547,0.77,0.84,-3.1,-0.23
2,330.0,1.1583,67.7,7.98,0.8545,1.168,68.27,7.74,0.8521,0.83,0.84,-3.02,-0.29
3,300.0,1.1611,67.7,7.66,0.8525,1.1719,68.27,7.43,0.8492,0.93,0.84,-2.95,-0.39
4,280.0,1.1632,67.7,7.42,0.8509,1.1747,68.27,7.23,0.8472,0.99,0.84,-2.55,-0.44
5,270.0,1.1641,67.7,7.3,0.8502,1.1761,68.27,7.13,0.8462,1.03,0.84,-2.35,-0.48
6,260.0,1.1652,67.7,7.2,0.8495,1.1776,68.27,7.03,0.8451,1.06,0.84,-2.41,-0.52
7,252.3 (Psat),1.1661,67.7,7.12,0.8488,1.179,68.27,6.93,0.8441,1.11,0.85,-2.69,-0.56
8,220.0,1.1511,59.7,8.14,0.8549,1.1626,60.29,7.74,0.851,1.0,0.99,-4.89,-0.46
9,190.0,1.1376,52.27,9.26,0.8604,1.146,52.37,8.79,0.8583,0.74,0.18,-5.07,-0.24
