In [2]:
from docx import Document
import pandas as pd
import numpy as np
import os
import glob
import win32com.client

In [3]:
def get_title(table):
    
    visited_row_cells = []
    title_row = table.rows[0]
    title_text = ""
    
    for cell in title_row.cells:         
        if cell not in visited_row_cells:
            title_text += cell.text
            visited_row_cells.append(cell)
            
    return title_text

def extract_temp(title_text, temp_string):
    
    temp_index = title_text.find(temp_string) - 2 # skip space
    temp_begin = temp_index
        
    while title_text[temp_begin].isdigit() or title_text[temp_begin] == '.':
        temp_begin -= 1
        
    temp = title_text[temp_begin + 1:temp_index + 1]
    
    return temp

def get_temp(title_text):

    if 'oC' in title_text:
        return extract_temp(title_text, 'oC')
        
    elif 'o C' in title_text:
        return extract_temp(title_text, 'o C')
    
    else:
        return 'n/a'
    
def separate_tables(table_rows):
    
    subtables = []
    slice_index = 0
    visited_row_cells = []

    index = slice_index
    
    while index < len(table_rows):
        curr_row = table_rows[index]
        empty_cells = 0
        for cell in curr_row.cells:
            if cell.text == '':
                empty_cells += 1
        if empty_cells == len(curr_row.cells):
            subtables.append(table_rows[slice_index:index])
            slice_index = index + 1
            break
        
        index += 1
        
    subtables.append(table_rows[index:])
    return subtables #each subtable is just a list of rows, no longer a table

def extract_titles(row):
    
    visited_row_cells = []
    row_titles = []
    
    for cell in row.cells:
        if cell not in visited_row_cells:
            row_titles.append(cell.text.strip())
            visited_row_cells.append(cell)
    
    return row_titles

def check_if_decimal(title):
    
    if title.isdigit():
        return True
    
    decimal_pt = title.find('.')
    
    if decimal_pt == -1 or decimal_pt == len(title) - 1: #no dot or last character is dot; cannot be decimal
        return False
    elif title[decimal_pt+1:].isdigit(): #point precedes numbers, certainly decimal point
        return True
    
    return False
        

def get_column_titles(subtable):
    
    first_row_titles = extract_titles(subtable[0])
    second_row_titles = extract_titles(subtable[1])
    
    for title in second_row_titles:
        if check_if_decimal(title):
            return first_row_titles #second row is data; column titles are in first row
        
    column_titles = []
        
    while len(first_row_titles) > 0 and len(second_row_titles) > 0: #compare title by title
        if first_row_titles[0] == second_row_titles[0]:
            column_titles.append(first_row_titles[0])
            second_row_titles = second_row_titles[1:]
            
        else:
            column_titles.append(first_row_titles[0] + '_' + second_row_titles[0])
            column_titles.append(first_row_titles[0] + '_' + second_row_titles[1])
            if len(second_row_titles) > 2 and second_row_titles[2]!= second_row_titles[0]:
                column_titles.append(first_row_titles[0] + '_' + second_row_titles[2])
                if len(second_row_titles) > 3 and second_row_titles[3] != second_row_titles[0]:
                    column_titles.append(first_row_titles[0] + '_' + second_row_titles[3])
                    second_row_titles = second_row_titles[4:]
                else:
                    second_row_titles = second_row_titles[3:]
            else:
                first_row_titles = first_row_titles[1:]
                second_row_titles = second_row_titles[2:]
    
    
    return column_titles

def dataframe_rows(index, subtable):
    
    df_rows = []
    
    for row in subtable[index:]:
        row_data = []
        visited_row_cells = []
        for cell in row.cells:
            if cell not in visited_row_cells:
                row_data.append(cell.text)
            visited_row_cells.append(cell)
                
        df_rows.append(row_data)
        
    return df_rows
    

In [4]:
def dataframe(subtable, columns):
    
    if columns == extract_titles(subtable[0]): #data starts from second row           
        df_rows = dataframe_rows(2, subtable)
        return pd.DataFrame(np.array(df_rows), columns=get_column_titles(subtable))
                    
    else:
        df_rows = dataframe_rows(3, subtable)
        return pd.DataFrame(np.array(df_rows), columns=get_column_titles(subtable))

In [5]:
def get_text_index(text, table_rows):
    index = 0
    
    for row in table_rows:
        for cell in row.cells:
            if text in cell.text:
                return index
        index += 1
        
    return len(table_rows)

def interruption_index(table_rows):
    index = 0
      
    for row in table_rows:
        for cell in row.cells:
            if cell.text == '':
                return index
        index += 1
    return len(table_rows)

def get_extras(last_index, table_rows, column_titles):
    extra_info_dict = {}
    
    for i in range(last_index, len(column_titles)):
        curr_row = table_rows[i]
        visited_cells = []
        curr_property = column_titles[i]
        property_vals = []
        for cell in curr_row.cells:
            if cell not in visited_cells and cell.text != '':
                property_vals.append(cell.text)
            visited_cells.append(cell)
        extra_info_dict[curr_property] = property_vals[1:]
    return extra_info_dict

def get_num(index, text):
    start_ind = index - 1
    index += 1

    while start_ind > 0 and text[start_ind].isdigit():
        start_ind -= 1
    
    while index < len(text) and text[index].isdigit():
        index += 1

    return text[start_ind:index+1]
    

def get_num_val(index, table_rows): #works only for one particular numval in row
    curr_row = table_rows[index]
    for cell in curr_row.cells:
        if check_if_decimal(cell.text):
            return (get_num(cell.text.find('.'), cell.text))
    return 'n/a'

def get_text(document):
    text = []
    for paragraph in document.paragraphs:
        text.append(paragraph.text)
    return text

def get_report_code(text):
    for item in text:
        if "CT" in item:
            return item
        
def get_well_name(text):
    for item in text:
        if "poço" in item.lower():
            return item


In [28]:
DIR_NAME = 'data_docs'

comparative_dict = {} # filename to (temperature, dataframe): experimental & simulated results comparison
oil_phase_dict = {}   # filename to (temperature, dataframe, api): differential liberation of oil phase
gas_phase_dict = {}   # filename to (temperature, dataframe, density): diff. lib. of gas phase
molar_comp_dict = {}  # filename to (temperature, dataframe, extra_vals_dict) : molar composition of fluid

word = win32com.client.Dispatch("Word.Application")
word.Visible = 0

subfolders = [DIR_NAME + '\\' + dI for dI in os.listdir(DIR_NAME) if os.path.isdir(os.path.join(DIR_NAME,dI))]

for subfolder in subfolders:    
    print(subfolder)
    for i, doc in enumerate(glob.iglob(f"{subfolder}/*.doc")):
        in_file = os.path.abspath(doc)
        wb = word.Documents.Open(in_file)
        out_file = os.path.abspath(f"{subfolder}/out{i}.docx")
        wb.SaveAs2(out_file, FileFormat=16) # file format for docx
        wb.Close()
        os.remove(doc)
        
        word.Quit()

    file_to_key_dict = {} # (report code, well name) to dict of {dict_name to list of relevant dict_keys}

    for some_filename in os.listdir(subfolder):
        print(some_filename)
        document = Document(subfolder + '/' + some_filename) # make sure no file is open
        tables = document.tables
        text = get_text(document)

        report_code = get_report_code(text)
        well_name = get_well_name(text)

        comp_index = 0 #to not overwrite dict key if multiple such tables in file
        oil_index = 0
        gas_index = 0
        molar_index = 0


        name_to_value_dict = {}
        name_to_value_dict['comparative_dict'] = []
        name_to_value_dict['oil_phase_dict'] = []
        name_to_value_dict['gas_phase_dict'] = []
        name_to_value_dict['molar_comp_dict'] = []

        for table in tables:
            table_title = get_title(table)

            if "fase óleo na" in table_title: # process oil phase data

                oil_index += 1
                column_row = table.rows[1]
                column_titles = []
                table_temp = get_temp(table_title)

                visited_row_cells = []

                for cell in column_row.cells:
                    if cell not in visited_row_cells:
                        column_titles.append(cell.text)
                    visited_row_cells.append(cell)

                table_rows = table.rows[2:]

                index = get_text_index("API", table_rows)                
                df_rows = dataframe_rows(0, table_rows[:index])
                API = get_num_val(index, table_rows)

                oil_phase_dict[some_filename + '_' + str(oil_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), API)
                name_to_value_dict['oil_phase_dict'].append(some_filename + '_' + str(oil_index))

            elif "entre resultados" in table_title: # process comparative data

                comp_index += 1
                table_temp = get_temp(table_title)
                obs_index = get_text_index("Obs.", table.rows[1:]) 

                table_rows = table.rows[1:obs_index]
                subtables = separate_tables(table_rows)

                name_index = 0

                for subtable in subtables:
                    titles = get_column_titles(subtable)
                    comparative_dict[some_filename + '_' + str(comp_index) + '_' + str(name_index)] = (table_temp, dataframe(subtable, titles))
                    name_index += 1

                name_to_value_dict['comparative_dict'].append(some_filename + '_' + str(comp_index) + '_' + str(name_index))

            elif "fase gás na" in table_title: # process gas phase data

                gas_index += 1
                column_row = table.rows[1]
                column_titles = []
                table_temp = get_temp(table_title)

                visited_row_cells = []

                for cell in column_row.cells:
                    if cell not in visited_row_cells:
                        column_titles.append(cell.text)
                    visited_row_cells.append(cell)

                table_rows = table.rows[2:]

                index = get_text_index("Densidade", table_rows)                
                df_rows = dataframe_rows(0, table_rows[:index])

                density_row = table_rows[index]
                density = "n\a"

                for cell in density_row.cells:
                    if check_if_decimal(cell.text):
                        density = cell.text
                        break

                gas_phase_dict[some_filename + '_' + str(gas_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), density)
                name_to_value_dict['gas_phase_dict'].append(some_filename + '_' + str(gas_index))

            elif "Composições molares" in table_title: # process molar composition

                molar_index += 1
                column_row = table.columns[0]
                column_titles = []
                table_temp = get_temp(table_title)

                visited_row_cells = []

                for cell in column_row.cells:
                    if cell not in visited_row_cells:
                        column_titles.append(cell.text)
                    visited_row_cells.append(cell)

                column_titles = column_titles[1:] #remove title row
                df_rows = []

                for column in table.columns:
                    row_text = []
                    visited_cells = []
                    row_cells = column.cells[1:]
                    for cell in row_cells:
                        if cell not in visited_cells:
                            row_text.append(cell.text)
                        visited_cells.append(cell)
                    df_rows.append(row_text)

                df_rows = df_rows[1:] #remove titles

                molar_comp_dict[some_filename + '_' + str(molar_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), {})
                name_to_value_dict['molar_comp_dict'].append(some_filename + '_' + str(molar_index))

            elif "Propriedades e composições molares" in table_title:

                molar_index += 1
                column_row = table.columns[0]
                column_titles = []
                table_temp = get_temp(table_title)

                visited_row_cells = []

                for cell in column_row.cells:
                    if cell not in visited_row_cells:
                        column_titles.append(cell.text)
                    visited_row_cells.append(cell)

                last_index = interruption_index(table.rows[1:])
                column_titles_df = column_titles[1:last_index + 1] #remove title row and latter info
                df_rows = []

                for column in table.columns:
                    row_text = []
                    visited_cells = []
                    row_cells = column.cells[1:last_index + 1]
                    for cell in row_cells:
                        if cell not in visited_cells:
                            row_text.append(cell.text)
                        visited_cells.append(cell)
                    df_rows.append(row_text)

                df_rows = df_rows[1:] #remove titles  
                extra_info_dict = get_extras(last_index, table.rows, column_titles)

                molar_comp_dict[some_filename + '_' + str(molar_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles_df), extra_info_dict)
                name_to_value_dict['molar_comp_dict'].append(some_filename + '_' + str(molar_index))

            else:
                continue

        file_to_key_dict[(report_code, well_name)] = name_to_value_dict


data_docs\docs_1
out0.docx
PVT report-example4.docx
data_docs\new new
PVT_report_example3.docx
report_1.docx


In [29]:
file_to_key_dict.items()

dict_items([(('CT TR YYY/2005', 'ANÁLISE PVT SIMULADA DO FLUIDO DO POÇO 7-XXX-0###HP-XXX'), {'comparative_dict': ['PVT_report_example3.docx_1_2'], 'oil_phase_dict': ['PVT_report_example3.docx_1'], 'gas_phase_dict': ['PVT_report_example3.docx_1'], 'molar_comp_dict': ['PVT_report_example3.docx_1']}), (('CT TR Nº ###/####', '#########/2002 – Análise PVT do fluido do poço 1-XXX-0###A-BSS'), {'comparative_dict': ['report_1.docx_1_2'], 'oil_phase_dict': ['report_1.docx_1'], 'gas_phase_dict': ['report_1.docx_1'], 'molar_comp_dict': ['report_1.docx_1', 'report_1.docx_2']})])

In [33]:
def contains_alpha(string):
    for char in string:
        if not (char.isdigit() or char == '.'):
            return True
        
    return False

def make_list_floats(data):
    floats_list = []
    
    for string in data:
        dot = string.find('.')
        
        if contains_alpha(string):
            floats_list.append(float(get_num(dot, string)))
            
        else:
            floats_list.append(float(string))
        
    return floats_list
            

In [34]:
def predict_bo_standing(Rs, dg, API, T):
    inter = Rs * 5.61 * np.sqrt(dg / (141.5 / (API + 131.5)))+ 1.25 * T
    return 0.972 + 1.47 * pow(10, -4) * Rs * 5.61 * pow(inter, 1.175)

def predict_bo_vazquez(Rs, dg, API, T, T_flash):
    dg_Vazquez = dg * (1 + 5.912 * pow(10, -5) * -0.89 * API * (T_flash * 1.8 + 32))
    
    if  API <= 30:
        return 1 + 4.677 * pow(10, -4) * Rs * 5.61 + 1.751 * pow(10, -5) * (T - 60) * (API / dg_Vazquez) - 1.811 * pow(10, -8) * Rs * 5.61 * (T - 60) * (API / dg_Vazquez)
    
    return 1 + 4.67 * pow(10, -4) * Rs * 5.61 + 1.1 * pow(10, -5) * (T-60) * (API / dg_Vazquez) + 1.337 * pow(10, -9) * Rs * 5.61 * (T-60) * (API/dg_Vazquez)

def predict_bo_glaso(Rs, dg, API, T):
    do = 141.5 / (API + 131.5)
    return 1 + 10**(-6.58511+2.91329*np.log10(Rs * 5.61 * ((dg/do)**0.526) + 0.968 * T) - 0.27683*(np.log10(Rs * 5.61 * ((dg/do) ** 0.526) + 0.968 *T))**2)

In [35]:
print(oil_phase_dict['out0.docx_1'][1])

           P (kgf/cm2)(a)  Bo (b)   Rs (c) o  (cP) (d) o  (g/cm3) (e)
0                   640.0  1.9025   323.63      0.6872         0.6131
1                   600.0  1.9326   323.63      0.6501         0.6048
2                   560.0  1.9626   323.63      0.6131         0.5965
3                   520.0  1.9927   323.63      0.5761         0.5882
4                   480.0  2.0228   323.63      0.5390         0.5799
5                   440.0  2.0528   323.63      0.5020         0.5716
6                   400.0  2.0828   323.63      0.4650         0.5632
7                   360.0  2.1130   323.63      0.4280         0.5550
8                   350.0  2.1200   323.63      0.4187         0.5530
9                   340.0  2.1280   323.63      0.4094         0.5505
10                  330.0  2.1350   323.63      0.4002         0.5485
11                  320.0  2.1430   323.63      0.3909         0.5468
12           316.8 (Psat)  2.1450   323.63      0.3850         0.5460
13                  

In [36]:
file_to_bo_dict = {} # (report no, well no) to {bo: [bo by row], bo_predicted_1: [predicted bo by row] ...} 

for item in file_to_key_dict.items():
    file_info = item[0] # (report no, well no) tuple
    dict_of_dicts = item[1]
    
    info_dict = {} # pressure, T, dg, API, Rs
    bo_predictions_dict = {}
    
    bo_predictions_dict["standing"] = []
    bo_predictions_dict["vazquez"] = []
    bo_predictions_dict["glaso"] = []
     
    dg = 0 # no dg found yet
    API = 0 # no API found yet
    
    for item in dict_of_dicts.items():
        if item[0] == 'oil_phase_dict':
            key = item[1]
            T = oil_phase_dict[key[0]][0]
            oil_df = oil_phase_dict[key[0]][1]
            
            bo_predictions_dict['bo'] = oil_df['Bo (b)'].values.tolist() # currently all strings
            if '/' in T:
                info_dict['T'] = T
            else:
                info_dict['T'] = float(T)
            info_dict['P'] = make_list_floats(oil_df['P (kgf/cm2)(a)'].values.tolist())
            info_dict['Rs'] = make_list_floats(oil_df['Rs (c)'].values.tolist())
        
        elif item[0] == 'molar_comp_dict':
            keys = item[1]
            
            for key in keys:
                extra_info = molar_comp_dict[key][2]
                T_flash = float(molar_comp_dict[key][0])

                if extra_info != {}:
                    for item in extra_info.items():
                        if item[0] == "Densidade do gás":
                            dg = extra_info[item[0]][0]

                        elif item[0] == "API":
                            API = extra_info[item[0]][0]
                
            info_dict['API'] = float(API)
            info_dict['dg'] = float(dg)
    
    for i in range(len(info_dict['P'])):
        Rs = info_dict['Rs'][i]
        T = info_dict['T']
        if type(T) == str:
            continue
        API = info_dict['API']
        P = info_dict['P'][i]
        dg = info_dict['dg']
        
        T = T * 1.8 + 32 # convert to fahrenheit
        
        bo_predictions_dict["vazquez"].append(predict_bo_vazquez(Rs, dg, API, T, T_flash))
        bo_predictions_dict["standing"].append(predict_bo_standing(Rs, dg, API, T))
        bo_predictions_dict["glaso"].append(predict_bo_glaso(Rs, dg, API, T))
    
    file_to_bo_dict[file_info] = bo_predictions_dict
    
for item in file_to_bo_dict.items():
    print(item)

(('CT TR YYY/2005', 'ANÁLISE PVT SIMULADA DO FLUIDO DO POÇO 7-XXX-0###HP-XXX'), {'standing': [96.47223153351007, 96.47223153351007, 96.47223153351007, 96.47223153351007, 96.47223153351007, 96.47223153351007, 96.47223153351007, 96.47223153351007, 88.90007960544415, 71.2027700147435, 56.50692313081454, 43.4578912380409, 32.108114469921816, 22.42825282185013, 14.430003354690985, 7.1316342824133265, 0.972], 'vazquez': [1.2220050458446436, 1.2220050458446436, 1.2220050458446436, 1.2220050458446436, 1.2220050458446436, 1.2220050458446436, 1.2220050458446436, 1.2220050458446436, 1.2142432380374784, 1.1946429459190873, 1.1763554818274273, 1.1579528573825457, 1.1395272008670196, 1.121078512280849, 1.102721951977256, 1.081417286631179, 1.0552528543791317], 'glaso': [1.1748540258870002, 1.1748540258870002, 1.1748540258870002, 1.1748540258870002, 1.1748540258870002, 1.1748540258870002, 1.1748540258870002, 1.1748540258870002, 1.166938442434803, 1.147266878054944, 1.1293723914224902, 1.1118822614937