In [1]:
from docx import Document
import pandas as pd
import numpy as np
import os
import glob
import win32com.client
import pythoncom
from traceback import print_exc
from pythoncom import com_error

In [2]:
def get_title(table):
    
    visited_row_cells = []
    title_row = table.rows[0]
    title_text = ""
    
    for cell in title_row.cells:         
        if cell not in visited_row_cells:
            title_text += cell.text
            visited_row_cells.append(cell)
            
    return title_text

def extract_temp(title_text, temp_string):
    
    temp_index = title_text.find(temp_string) - 2 # skip space
    temp_begin = temp_index
        
    while title_text[temp_begin].isdigit() or title_text[temp_begin] == '.':
        temp_begin -= 1
        
    temp = title_text[temp_begin + 1:temp_index + 1]
    
    return temp

def get_temp(title_text):

    if 'oC' in title_text:
        return extract_temp(title_text, 'oC')
        
    elif 'o C' in title_text:
        return extract_temp(title_text, 'o C')
    
    else:
        return 'n/a'
    
def separate_tables(table_rows):
    
    subtables = []
    slice_index = 0
    visited_row_cells = []

    index = slice_index
    
    while index < len(table_rows):
        curr_row = table_rows[index]
        empty_cells = 0
        for cell in curr_row.cells:
            if cell.text == '':
                empty_cells += 1
        if empty_cells == len(curr_row.cells):
            subtables.append(table_rows[slice_index:index])
            slice_index = index + 1
            break
        
        index += 1
        
    subtables.append(table_rows[index:])
    return subtables #each subtable is just a list of rows, no longer a table

def extract_titles(row):
    
    visited_row_cells = []
    row_titles = []
    
    for cell in row.cells:
        if cell not in visited_row_cells:
            row_titles.append(cell.text.strip())
            visited_row_cells.append(cell)
    
    return row_titles

def check_if_decimal(title):
    
    if title.isdigit():
        return True
    
    decimal_pt = title.find('.')
    
    if decimal_pt == -1 or decimal_pt == len(title) - 1: #no dot or last character is dot; cannot be decimal
        return False
    elif title[decimal_pt+1:].isdigit(): #point precedes numbers, certainly decimal point
        return True
    
    return False
        

def get_column_titles(subtable):
    
    first_row_titles = extract_titles(subtable[0])
    second_row_titles = extract_titles(subtable[1])
    
    for title in second_row_titles:
        if check_if_decimal(title):
            return first_row_titles #second row is data; column titles are in first row
        
    column_titles = []
        
    while len(first_row_titles) > 0 and len(second_row_titles) > 0: #compare title by title
        if first_row_titles[0] == second_row_titles[0]:
            column_titles.append(first_row_titles[0])
            second_row_titles = second_row_titles[1:]
            
        else:
            column_titles.append(first_row_titles[0] + '_' + second_row_titles[0])
            column_titles.append(first_row_titles[0] + '_' + second_row_titles[1])
            if len(second_row_titles) > 2 and second_row_titles[2]!= second_row_titles[0]:
                column_titles.append(first_row_titles[0] + '_' + second_row_titles[2])
                if len(second_row_titles) > 3 and second_row_titles[3] != second_row_titles[0]:
                    column_titles.append(first_row_titles[0] + '_' + second_row_titles[3])
                    second_row_titles = second_row_titles[4:]
                else:
                    second_row_titles = second_row_titles[3:]
            else:
                first_row_titles = first_row_titles[1:]
                second_row_titles = second_row_titles[2:]
    
    
    return column_titles

def dataframe_rows(index, subtable):
    
    df_rows = []
    
    for row in subtable[index:]:
        row_data = []
        visited_row_cells = []
        for cell in row.cells:
            if cell not in visited_row_cells:
                row_data.append(cell.text)
            visited_row_cells.append(cell)
                
        df_rows.append(row_data)
        
    return df_rows
    

In [3]:
def dataframe(subtable, columns):
    
    if columns == extract_titles(subtable[0]): #data starts from second row           
        df_rows = dataframe_rows(2, subtable)
        return pd.DataFrame(np.array(df_rows), columns=get_column_titles(subtable))
                    
    else:
        df_rows = dataframe_rows(3, subtable)
        return pd.DataFrame(np.array(df_rows), columns=get_column_titles(subtable))

In [16]:
def get_text_index(text, table_rows):
    index = 0
    
    for row in table_rows:
        for cell in row.cells:
            if text in cell.text:
                return index
        index += 1
        
    return len(table_rows)

def interruption_index(table_rows):
    index = 0
      
    for row in table_rows:
        for cell in row.cells:
            if cell.text == '':
                return index
        index += 1
    return len(table_rows)

def get_extras(last_index, table_rows, column_titles):
    extra_info_dict = {}
    
    for i in range(last_index, len(column_titles)):
        curr_row = table_rows[i]
        visited_cells = []
        curr_property = column_titles[i]
        property_vals = []
        for cell in curr_row.cells:
            if cell not in visited_cells and cell.text != '':
                property_vals.append(cell.text)
            visited_cells.append(cell)
        extra_info_dict[curr_property] = property_vals[1:]
    return extra_info_dict

def get_num(index, text):
    start_ind = index - 1
    index += 1

    while start_ind > 0 and text[start_ind].isdigit():
        start_ind -= 1
    
    while index < len(text) and text[index].isdigit():
        index += 1

    return text[start_ind:index+1]
    

def get_num_val(index, table_rows): #works only for one particular numval in row
    curr_row = table_rows[index]
    for cell in curr_row.cells:
        if check_if_decimal(cell.text):
            return (get_num(cell.text.find('.'), cell.text))
    return 'n/a'

def get_text(document):
    text = []
    for paragraph in document.paragraphs:
        text.append(paragraph.text)
    return text

def get_report_code(text):
    for item in text:
        if "CT" in item:
            return item
        
def get_well_name(text):
    for item in text:
        if "poço" in item.lower():
            return item


In [14]:
DIR_NAME = 'data_docs'

comparative_dict = {} # filename to (temperature, dataframe): experimental & simulated results comparison
oil_phase_dict = {}   # filename to (temperature, dataframe, api): differential liberation of oil phase
gas_phase_dict = {}   # filename to (temperature, dataframe, density): diff. lib. of gas phase
molar_comp_dict = {}  # filename to (temperature, dataframe, extra_vals_dict) : molar composition of fluid

word = win32com.client.Dispatch("Word.Application")
word.Visible = 0

subfolders = [DIR_NAME + '\\' + dI for dI in os.listdir(DIR_NAME) if os.path.isdir(os.path.join(DIR_NAME,dI))]

for subfolder in subfolders:    
    print("Subfolder:", subfolder)
    subsubfolders = [subfolder + '\\' + dI for dI in os.listdir(subfolder) if os.path.isdir(os.path.join(subfolder,dI))]
    for subsubfolder in subsubfolders:
        print("Next:", subsubfolder)
        for dir_path, dirs, files in os.walk(subsubfolder):
            for file_name in files:

                file_path = os.path.join(dir_path, file_name)
                print("To convert:", file_name)
                file_name, file_extension = os.path.splitext(file_path)

                if "~$" not in file_name:
                    if file_extension.lower() == '.doc': 
                        docx_file = '{0}{1}'.format(file_path, 'x')

                        if not os.path.isfile(docx_file): # Skip conversion where docx file already exists

                            file_path = os.path.abspath(file_path)
                            docx_file = os.path.abspath(docx_file)
                            try:
                                wordDoc = word.Documents.Open(file_path)
                                wordDoc.SaveAs2(docx_file, FileFormat = 16)
                                wordDoc.Close()
                            except Exception as e:
                                print('Failed to Convert: {0}'.format(file_path))
                                print(e)
                        
                        os.remove(file_path)


        file_to_key_dict = {} # (report code, well name) to dict of {dict_name to list of relevant dict_keys}
        
        for some_filename in os.listdir(subsubfolder):
            print("Processing:", some_filename)
            document = Document(subsubfolder + '\\' + some_filename) # make sure no file is open
            tables = document.tables
            text = get_text(document)
            report_code = get_report_code(text)
            well_name = get_well_name(text)

            comp_index = 0 #to not overwrite dict key if multiple such tables in file
            oil_index = 0
            gas_index = 0
            molar_index = 0
            

            print("Initialize name_to_value_dict")
            name_to_value_dict = {}
            name_to_value_dict['comparative_dict'] = []
            name_to_value_dict['oil_phase_dict'] = []
            name_to_value_dict['gas_phase_dict'] = []
            name_to_value_dict['molar_comp_dict'] = []

            for table in tables:
                print("Processing table", table)
                table_title = get_title(table)

                if "fase óleo na" in table_title: # process oil phase data

                    oil_index += 1
                    column_row = table.rows[1]
                    column_titles = []
                    table_temp = get_temp(table_title)

                    visited_row_cells = []

                    for cell in column_row.cells:
                        if cell not in visited_row_cells:
                            column_titles.append(cell.text)
                        visited_row_cells.append(cell)

                    table_rows = table.rows[2:]

                    index = get_text_index("API", table_rows)                
                    df_rows = dataframe_rows(0, table_rows[:index])
                    API = get_num_val(index, table_rows)

                    oil_phase_dict[some_filename + '_' + str(oil_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), API)
                    print("Update name_to_value_dict")
                    name_to_value_dict['oil_phase_dict'].append(some_filename + '_' + str(oil_index))

                elif "entre resultados" in table_title: # process comparative data

                    comp_index += 1
                    table_temp = get_temp(table_title)
                    obs_index = get_text_index("Obs.", table.rows[1:]) 

                    table_rows = table.rows[1:obs_index]
                    subtables = separate_tables(table_rows)

                    name_index = 0

                    for subtable in subtables:
                        titles = get_column_titles(subtable)
                        comparative_dict[some_filename + '_' + str(comp_index) + '_' + str(name_index)] = (table_temp, dataframe(subtable, titles))
                        name_index += 1

                    print("Update name_to_value_dict")
                    name_to_value_dict['comparative_dict'].append(some_filename + '_' + str(comp_index) + '_' + str(name_index))

                elif "fase gás na" in table_title: # process gas phase data

                    gas_index += 1
                    column_row = table.rows[1]
                    column_titles = []
                    table_temp = get_temp(table_title)

                    visited_row_cells = []

                    for cell in column_row.cells:
                        if cell not in visited_row_cells:
                            column_titles.append(cell.text)
                        visited_row_cells.append(cell)

                    table_rows = table.rows[2:]

                    index = get_text_index("Densidade", table_rows)
                    df_rows = dataframe_rows(0, table_rows[:index])

                    
                    density = "n\a"
                    if index < len(table_rows):
                        density_row = table_rows[index]

                        for cell in density_row.cells:
                            if check_if_decimal(cell.text):
                                density = cell.text
                                break

                    gas_phase_dict[some_filename + '_' + str(gas_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), density)
                    print("Update name_to_value_dict")
                    name_to_value_dict['gas_phase_dict'].append(some_filename + '_' + str(gas_index))

                elif "Composições molares" in table_title: # process molar composition

                    molar_index += 1
                    column_row = table.columns[0]
                    column_titles = []
                    table_temp = get_temp(table_title)

                    visited_row_cells = []

                    for cell in column_row.cells:
                        if cell not in visited_row_cells:
                            column_titles.append(cell.text)
                        visited_row_cells.append(cell)

                    column_titles = column_titles[1:] #remove title row
                    df_rows = []

                    for column in table.columns:
                        row_text = []
                        visited_cells = []
                        row_cells = column.cells[1:]
                        for cell in row_cells:
                            if cell not in visited_cells:
                                row_text.append(cell.text)
                            visited_cells.append(cell)
                        df_rows.append(row_text)

                    df_rows = df_rows[1:] #remove titles

                    molar_comp_dict[some_filename + '_' + str(molar_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles), {})
                    print("Update name_to_value_dict")
                    name_to_value_dict['molar_comp_dict'].append(some_filename + '_' + str(molar_index))

                elif "Propriedades e composições molares" in table_title:

                    molar_index += 1
                    column_row = table.columns[0]
                    column_titles = []
                    table_temp = get_temp(table_title)

                    visited_row_cells = []

                    for cell in column_row.cells:
                        if cell not in visited_row_cells:
                            column_titles.append(cell.text)
                        visited_row_cells.append(cell)

                    last_index = interruption_index(table.rows[1:])
                    column_titles_df = column_titles[1:last_index + 1] #remove title row and latter info
                    df_rows = []

                    for column in table.columns:
                        row_text = []
                        visited_cells = []
                        row_cells = column.cells[1:last_index + 1]
                        for cell in row_cells:
                            if cell not in visited_cells:
                                row_text.append(cell.text)
                            visited_cells.append(cell)
                        df_rows.append(row_text)

                    df_rows = df_rows[1:] #remove titles  
                    extra_info_dict = get_extras(last_index, table.rows, column_titles)

                    molar_comp_dict[some_filename + '_' + str(molar_index)] = (table_temp, pd.DataFrame(np.array(df_rows), columns=column_titles_df), extra_info_dict)
                    print("Update name_to_value_dict")
                    name_to_value_dict['molar_comp_dict'].append(some_filename + '_' + str(molar_index))

                else:
                    continue
                    
            print("Update file_to_key_dict")
            file_to_key_dict[(report_code, well_name)] = name_to_value_dict


Subfolder: data_docs\sub1
Next: data_docs\sub1\new new
To convert: out0.docx
To convert: PVT_report_example3.docx
To convert: report_1.docx
To convert: RJS-704 CT_TR_049_13.docx
To convert: RJS-704 CT_TR_080_13.docx
Processing: out0.docx
Initialize name_to_value_dict
Processing table <docx.table.Table object at 0x11258210>
Processing table <docx.table.Table object at 0x11258070>
Update name_to_value_dict
Processing table <docx.table.Table object at 0x11258050>
Update name_to_value_dict
Processing table <docx.table.Table object at 0x11258950>
Update name_to_value_dict
Processing table <docx.table.Table object at 0x11258830>
Processing table <docx.table.Table object at 0x11258970>
Update name_to_value_dict
Processing table <docx.table.Table object at 0x11258B30>
Update file_to_key_dict
Processing: PVT_report_example3.docx
Initialize name_to_value_dict
Processing table <docx.table.Table object at 0x112B84F0>
Processing table <docx.table.Table object at 0x112B8310>
Update name_to_value_dic

In [6]:
file_to_key_dict.items()

dict_items([(('CT TR 098/12', 'PESQUISA E DESENVOLVIMENTO DE GEOENGENHARIA E ENGENHARIA DE POÇO'), {'comparative_dict': [], 'oil_phase_dict': ['out0.docx_1'], 'gas_phase_dict': ['out0.docx_1'], 'molar_comp_dict': ['out0.docx_1', 'out0.docx_2']}), (('CT TR 013/14', 'PESQUISA E DESENVOLVIMENTO DE GEOENGENHARIA E ENGENHARIA DE POÇO'), {'comparative_dict': [], 'oil_phase_dict': ['PVT report-example4.docx_1'], 'gas_phase_dict': ['PVT report-example4.docx_1'], 'molar_comp_dict': ['PVT report-example4.docx_1', 'PVT report-example4.docx_2']})])

In [7]:
def contains_alpha(string):
    for char in string:
        if not (char.isdigit() or char == '.'):
            return True
        
    return False

def make_list_floats(data):
    floats_list = []
    
    for string in data:
        dot = string.find('.')
        
        if contains_alpha(string):
            floats_list.append(float(get_num(dot, string)))
            
        else:
            floats_list.append(float(string))
        
    return floats_list
            

In [8]:
def predict_bo_standing(Rs, dg, API, T):
    inter = Rs * 5.61 * np.sqrt(dg / (141.5 / (API + 131.5)))+ 1.25 * T
    return 0.972 + 1.47 * pow(10, -4) * Rs * 5.61 * pow(inter, 1.175)

def predict_bo_vazquez(Rs, dg, API, T, T_flash):
    dg_Vazquez = dg * (1 + 5.912 * pow(10, -5) * -0.89 * API * (T_flash * 1.8 + 32))
    
    if  API <= 30:
        return 1 + 4.677 * pow(10, -4) * Rs * 5.61 + 1.751 * pow(10, -5) * (T - 60) * (API / dg_Vazquez) - 1.811 * pow(10, -8) * Rs * 5.61 * (T - 60) * (API / dg_Vazquez)
    
    return 1 + 4.67 * pow(10, -4) * Rs * 5.61 + 1.1 * pow(10, -5) * (T-60) * (API / dg_Vazquez) + 1.337 * pow(10, -9) * Rs * 5.61 * (T-60) * (API/dg_Vazquez)

def predict_bo_glaso(Rs, dg, API, T):
    do = 141.5 / (API + 131.5)
    return 1 + 10**(-6.58511+2.91329*np.log10(Rs * 5.61 * ((dg/do)**0.526) + 0.968 * T) - 0.27683*(np.log10(Rs * 5.61 * ((dg/do) ** 0.526) + 0.968 *T))**2)

In [9]:
print(oil_phase_dict['out0.docx_1'][1])

           P (kgf/cm2)(a)  Bo (b)    Rs (c) o (cP) (d) o (g/cm3) (e)
0                   650.0  1.6609    268.08      0.709        0.7429
1                   600.0  1.6722    268.08      0.684        0.7378
2                   550.0  1.6842    268.08      0.658        0.7326
3                   500.0  1.6976    268.08      0.632        0.7268
4                   460.0  1.7090    268.08      0.611        0.7220
5                   430.0  1.7181    268.08      0.595        0.7182
6                   410.0  1.7247    268.08      0.584        0.7154
7                   400.0  1.7282    268.08      0.578        0.7139
8            391.5 (Psat)  1.7311    268.08      0.573        0.7127
9                   360.0  1.6464    234.31      0.630        0.7218
10                  330.0  1.5804    207.30      0.691        0.7297
11                  300.0  1.5230    183.69      0.757        0.7378
12                  250.0  1.4437    150.60      0.886        0.7523
13                  200.0  1.3742 

In [10]:
file_to_bo_dict = {} # (report no, well no) to {bo: [bo by row], bo_predicted_1: [predicted bo by row] ...} 

for item in file_to_key_dict.items():
    file_info = item[0] # (report no, well no) tuple
    dict_of_dicts = item[1]
    
    info_dict = {} # pressure, T, dg, API, Rs
    bo_predictions_dict = {}
    
    bo_predictions_dict["standing"] = []
    bo_predictions_dict["vazquez"] = []
    bo_predictions_dict["glaso"] = []
     
    dg = 0 # no dg found yet
    API = 0 # no API found yet
    
    for item in dict_of_dicts.items():
        if item[0] == 'oil_phase_dict':
            key = item[1]
            T = oil_phase_dict[key[0]][0]
            oil_df = oil_phase_dict[key[0]][1]
            
            bo_predictions_dict['bo'] = oil_df['Bo (b)'].values.tolist() # currently all strings
            if '/' in T:
                info_dict['T'] = T
            else:
                info_dict['T'] = float(T)
            info_dict['P'] = make_list_floats(oil_df['P (kgf/cm2)(a)'].values.tolist())
            info_dict['Rs'] = make_list_floats(oil_df['Rs (c)'].values.tolist())
        
        elif item[0] == 'molar_comp_dict':
            keys = item[1]
            
            for key in keys:
                extra_info = molar_comp_dict[key][2]
                T_flash = float(molar_comp_dict[key][0])

                if extra_info != {}:
                    for item in extra_info.items():
                        if item[0] == "Densidade do gás":
                            dg = extra_info[item[0]][0]

                        elif item[0] == "API":
                            API = extra_info[item[0]][0]
                
            info_dict['API'] = float(API)
            info_dict['dg'] = float(dg)
    
    for i in range(len(info_dict['P'])):
        Rs = info_dict['Rs'][i]
        T = info_dict['T']
        if type(T) == str:
            continue
        API = info_dict['API']
        P = info_dict['P'][i]
        dg = info_dict['dg']
        
        T = T * 1.8 + 32 # convert to fahrenheit
        
        bo_predictions_dict["vazquez"].append(predict_bo_vazquez(Rs, dg, API, T, T_flash))
        bo_predictions_dict["standing"].append(predict_bo_standing(Rs, dg, API, T))
        bo_predictions_dict["glaso"].append(predict_bo_glaso(Rs, dg, API, T))
    
    file_to_bo_dict[file_info] = bo_predictions_dict
    
for item in file_to_bo_dict.items():
    print(item)

(('CT TR 098/12', 'PESQUISA E DESENVOLVIMENTO DE GEOENGENHARIA E ENGENHARIA DE POÇO'), {'standing': [1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1484.7573455306654, 1132.5877888433567, 887.4315336504734, 699.1914085178028, 475.3589230861327, 313.4938195531997, 192.73284559765003, 105.01830196087498, 43.982734573927836, 0.972], 'vazquez': [1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.6612971004638193, 1.5875390132275393, 1.5285456482215598, 1.4769783269408794, 1.404705448449659, 1.3399896615141844, 1.2787248007804664, 1.2196003879440405, 1.1600609903112007, 1.0757753940290553], 'glaso': [1.8979279306174073, 1.8979279306174073, 1.8979279306174073, 1.8979279306174073, 1.8979279306174073, 1.8979279306174073, 1.8979279306174073, 1.8979279306174073, 1.89792793061