In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path

In [2]:
# Anchor on the kernel's working directory (in Jupyter this may differ
# from the folder that contains the notebook file).
current_dir = Path.cwd()

# Climb three levels, descend into the data folder, then normalise the
# result to an absolute path with the '..' segments collapsed.
table_types_filepath = current_dir.joinpath(
    '..', '..', '..',
    'Data', 'Working_with_Files', 'File_Types_Handled_by_GEO.htm',
).resolve()

In [3]:
def extract_geo_file_types(htm_file_path):
    """
    Extract the file-type table from a GEO HTM file into a DataFrame.

    The first ``<table>`` carrying the CSS class ``Table_Style_1`` is read.
    Its first ``<tr>`` supplies the column names; every following ``<tr>``
    that contains at least one cell becomes one DataFrame row.

    Args:
        htm_file_path (str | os.PathLike): Path to the HTM file. The file
            is read as UTF-8.

    Returns:
        pandas.DataFrame: One row per table row, one column per header
            cell. Cell text is whitespace-stripped.

    Raises:
        FileNotFoundError: If ``htm_file_path`` does not exist.
        ValueError: If the document has no ``Table_Style_1`` table, the
            table has no rows, or its first row has no cells.
    """
    # Header rows are commonly marked up with <th>, body rows with <td>;
    # accept both so either markup produces a complete row.
    cell_tags = ['th', 'td']

    # Read the HTM file
    with open(htm_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main table (it has class "Table_Style_1")
    table = soup.find('table', {'class': 'Table_Style_1'})
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"No <table class='Table_Style_1'> found in {htm_file_path!r}"
        )

    rows = table.find_all('tr')
    if not rows:
        raise ValueError(
            f"Table 'Table_Style_1' in {htm_file_path!r} contains no rows"
        )
    header_row, *body_rows = rows

    # Extract table headers. Header text is joined without a separator so
    # column names stay exactly as they were before this change.
    headers = [
        cell.get_text(strip=True) for cell in header_row.find_all(cell_tags)
    ]
    if not headers:
        raise ValueError(
            f"Header row of table 'Table_Style_1' in {htm_file_path!r} "
            "contains no cells"
        )

    # Extract table rows; nested text fragments are joined with a single
    # space so words from adjacent inline elements do not run together.
    data = []
    for row in body_rows:
        cells = row.find_all(cell_tags)
        if not cells:
            # Spacer / empty <tr>: nothing to record.
            continue
        data.append([cell.get_text(' ', strip=True) for cell in cells])

    # Create a DataFrame
    return pd.DataFrame(data, columns=headers)

In [4]:
# Parse the GEO file-types page located above into a DataFrame.
table_types_df = extract_geo_file_types(htm_file_path=table_types_filepath)

In [5]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
table_types_df

Unnamed: 0,FILE TYPE,EXTENSION,DESCRIPTION,LOAD,EXPORT
0,LIS-79,"LIS, TAP, NTI, TIF",LIS data (usually wireline data) is converted ...,No,No
1,Canadian Well Log ASCII (CWLAS),LASData_load,This is a special ASCII implementation of wire...,Yes,Yes
2,Text (ASCII),"ASC, TXTData_load","Mudlog, MWD and well test data are usually in ...",Yes,Yes
3,Comma Separated Values (ASCII),CSVData_load,Some service companies present mudlog data as ...,Yes,Yes
4,Tab Delimited (ASCII),TXTData_load,One form of ASCII formats. Tab delimited files...,Yes,Yes
5,Metafile,CGMExport_CGM,The ISO-8632 implementation of Computer Graphi...,No,Yes
6,Windows Metafile,WMFExport_WMF,Both placeable and standard Windows Metafiles ...,No,Yes
7,Enhanced Metafile,EMFExport_EMF,Enhanced Metafiles are supported. This is anot...,Yes,Yes
8,Scalable Vector Graphics,SVGExport_SVG,SVG is a language for describing two-dimension...,,Yes
9,Extensible Mark-up Language,XMLExport_XML,The Extensible Markup Language (XML) is the un...,No,No
