In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from functools import reduce
import tabula
import pandas as pd
import os
import re

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 52, Finished, Available, Finished)

## Read all the PDF files from the Lakehouse

In [None]:
# Folder that is scanned for the source PDF files.
lakehouse_path = "/lakehouse/default/Files/"

# Canonical column names of the standardized output, in output order.
master_lst = [
    'API_WellNo', 'Lease_ID', 'Well_Name', 'County', 'Oil_Field', 'Operator',
    'Day', 'Month_Prod', 'Year', 'Oil_Prod', 'Gas_Prod', 'Water_Prod',
]

# Canonical column name -> header spellings accepted as that column.
mapping = {
    'API_WellNo': ['api', 'Api_ID'],
    'Lease_ID': ['Lease_ID', 'LeaseNo', 'LeaseID'],
    'well_name': ['wellname', 'well_name', 'well_Nm'],
    'county': ['countyname', 'county'],
    'Oil_Field': ['fieldname', 'oilfield', 'field'],
    'Operator': ['CoName', 'Company', 'Company_Name', 'Operators'],
    'Day': ['Day_Prod', 'Day'],
    'Month_Prod': ['Month', 'MonthProd'],
    'Year': ['Year', 'YearProd'],
    'Oil_Prod': ['Oil', 'OilProd'],
    'Gas_Prod': ['Gas', 'GasProd'],
    'Water_Prod': ['Water', 'WaterProd'],
}

# Title-case every name so that later comparisons do not depend on the casing
# used above (e.g. 'API_WellNo' becomes 'Api_Wellno', 'well_name' becomes 'Well_Name').
master_lst = list(map(str.title, master_lst))
mapping = {
    canonical.title(): list(map(str.title, aliases))
    for canonical, aliases in mapping.items()
}

# Function to get all PDF files in the specified directory
def get_pdf_files(directory):
    """Return the paths of the PDF files located directly inside ``directory``.

    Args:
        directory: Folder to scan. Sub-folders are not searched.

    Returns:
        list[str]: ``directory`` joined with each matching file name, ordered by
        file name so that repeated runs process the files in the same order.

    Raises:
        OSError: If ``directory`` does not exist or cannot be listed.
    """
    pdf_paths = []
    for entry in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, entry)
        # Compare the extension case-insensitively so a file such as
        # "REPORT.PDF" is not silently skipped, and ignore directories whose
        # name merely ends in ".pdf".
        if entry.lower().endswith('.pdf') and os.path.isfile(full_path):
            pdf_paths.append(full_path)
    return pdf_paths

# List of all PDF file paths found directly under lakehouse_path.
# The listing is taken once, when this cell runs; re-run the cell to pick up
# files added afterwards.
files = get_pdf_files(lakehouse_path)

# Function for extracting tabular data from a PDF file
def read_pdf(file_path):
    """Extract the tabular data of one PDF into a single pandas DataFrame.

    All pages are read with tabula in lattice mode, using the first extracted
    row as the header.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        pandas.DataFrame | None: The extracted rows, or ``None`` when nothing
        was extracted, the result has no rows, or extraction raised an error
        (the error is printed, not propagated).
    """
    try:
        tables = tabula.read_pdf(
            file_path,
            pages="all",
            lattice=True,
            pandas_options={"header": [0]},
            # NOTE(review): no `area` is passed, so relative_area presumably has
            # no effect here - confirm before removing.
            relative_area=True,
            multiple_tables=False
        )
        # Defensive: accept a bare DataFrame as well as a list of DataFrames.
        # `not tables` on a DataFrame would raise and the file would be dropped.
        if isinstance(tables, pd.DataFrame):
            tables = [tables]
        if not tables:
            return None
        pdf_df = pd.concat(tables, ignore_index=True)
        # Report a row-less extraction as "no data" so the caller, which only
        # checks for None, never receives an empty frame.
        if pdf_df.empty:
            return None
        return pdf_df
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
        return None


StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 53, Finished, Available, Finished)

In [3]:
dataframes = []    # per-file frames in which every master column holds data
removed_rows = []  # per-file frames with at least one entirely-null master column

# Process each PDF file
for file_path in files:
    pdf_df = read_pdf(file_path)  # pandas DataFrame, or None when nothing could be read
    if pdf_df is None:
        continue

    # Convert pandas DF to Spark DF
    df = spark.createDataFrame(pdf_df)

    # Map every master column to the source column whose title-cased header is
    # either the master name itself or one of its known variations. When
    # several source columns match the same master column, the last one wins.
    alias_map = {master: None for master in master_lst}
    for original in df.columns:
        normalized = original.title()
        for master, variations in mapping.items():
            if normalized in variations or normalized == master:
                alias_map[master] = original
                break

    # Standardize the frame to the master columns; a master column without a
    # matching source column becomes an all-null placeholder.
    standardized_columns = [
        col(alias_map[master]).alias(master) if alias_map[master] is not None else lit(None).alias(master)
        for master in master_lst
    ]
    aliased_df = df.select(*standardized_columns)

    # Count the non-null values of every master column in ONE aggregation
    # instead of launching a separate Spark job per column.
    non_null_counts = aliased_df.agg(
        *[count(col(name)).alias(name) for name in aliased_df.columns]
    ).first()
    null_columns = [name for name in aliased_df.columns if non_null_counts[name] == 0]

    # Route the whole file: a frame with any entirely-null master column is
    # rejected as a unit, otherwise it is kept as valid.
    if null_columns:
        removed_rows.append(aliased_df)
    else:
        dataframes.append(aliased_df)

# Combine valid frames into final_master_df (None when there are none)
final_master_df = reduce(lambda df1, df2: df1.unionByName(df2), dataframes) if dataframes else None

# Combine rejected frames into invalid_master_df (None when there are none)
invalid_master_df = reduce(lambda df1, df2: df1.unionByName(df2), removed_rows) if removed_rows else None

StatementMeta(, , , Waiting, , Waiting)

Failed to import jpype dependencies. Fallback to subprocess.
No module named 'org.apache.commons'


In [None]:
# Preview the combined valid rows. final_master_df is None when no PDF produced
# a fully populated frame, and calling .show() on None would raise AttributeError.
if final_master_df is not None:
    final_master_df.show()
else:
    print("No valid PDF data to display")

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 54, Finished, Available, Finished)

+--------------+--------+--------------------+-----------+---------------+--------------------+---+----------+----+--------+--------+----------+
|    Api_Wellno|Lease_Id|           Well_Name|     County|      Oil_Field|            Operator|Day|Month_Prod|Year|Oil_Prod|Gas_Prod|Water_Prod|
+--------------+--------+--------------------+-----------+---------------+--------------------+---+----------+----+--------+--------+----------+
|31013100000000|  7681JV|     Smallback NY 38| Chautauqua|      Lakeshore|PPP Future Develo...| 23|        12|2023|       0|    $854|         0|
|31029300000000|  4524TT|              Voss 1|       Erie|Alden-Lancaster|Alden Aurora Gas ...|  6|        12|2023|       0|     764|       $13|
|31009200000000|  3435OG|         Hostetler 2|Cattaraugus|      Lakeshore|Empire Energy E &...| 19|         9|2023|       0|     508|         0|
|31013200000000|  5382CJ|           Hammond 1| Chautauqua|      Lakeshore|PPP Future Develo...|  4|        12|2023|       0|     5

## Outputs

In [None]:
def _clear_table(table_name):
    """Delete every row of ``table_name`` if the table exists; otherwise do nothing."""
    # DELETE FROM raises when the table has never been created, so check first.
    if spark.catalog.tableExists(table_name):
        spark.sql(f"DELETE FROM {table_name}")


# --- Master and transaction tables -------------------------------------------
# Kept in its own try block so that a failure here does not silently skip the
# unauthorized-data handling below.
try:
    if 'final_master_df' in locals() and final_master_df is not None and final_master_df.count() > 0:

        # Master data: one row per well with its descriptive attributes
        final_master = final_master_df.dropDuplicates(subset=['API_WellNo'])
        final_master1 = final_master.select(['API_WellNo', 'Lease_ID', 'Well_Name', 'County', 'Oil_Field', 'Operator'])
        final_master1.write.format("delta").option('mergeSchema', 'true').mode("overwrite").saveAsTable("MasterData_PDF")

        # Transaction data: the production figures of every extracted row
        transaction_columns = ['API_WellNo', 'Day', 'Month_Prod', 'Year', 'Oil_Prod', 'Gas_Prod', 'Water_Prod']
        transaction_data = final_master_df.select(
            *[col(name).alias(name) for name in transaction_columns]
        )
        # No extra count() is needed: this is a plain projection of
        # final_master_df, which was just verified to be non-empty.
        transaction_data.write.format("delta").option('mergeSchema', 'true').mode("overwrite").saveAsTable("TransactionData_PDF")

    else:
        print("Master Data Unavailable")
        _clear_table("MasterData_PDF")
        _clear_table("TransactionData_PDF")

except Exception as e:
    print(f"An error occurred: {str(e)}")

# --- Unauthorized (rejected) data ---------------------------------------------
try:
    # Process invalid_master_df if it exists and has data
    if 'invalid_master_df' in locals() and invalid_master_df is not None and invalid_master_df.count() > 0:
        # Rows without a well identifier are dropped before writing.
        invalid_master_df = invalid_master_df.filter(col("API_WellNo").isNotNull())
        invalid_master_df.write.format("delta").option('mergeSchema', 'true').mode("overwrite").saveAsTable("UnauthorizedData_PDF")
        print("Removed data from Final_Master due to unauthorized headers")

    else:
        _clear_table("UnauthorizedData_PDF")

except Exception as e:
    print(f"An error occurred: {str(e)}")

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 60, Finished, Available, Finished)

In [None]:
# Show the first ten rows of the master table, then its total row count.
master_preview = spark.sql("select * from masterData_pdf limit 10")
display(master_preview)

master_row_count = spark.sql("select count(*) from masterData_pdf")
master_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 61, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 27f6b852-012e-4887-85b2-e9757a783e6b)

+--------+
|count(1)|
+--------+
|      72|
+--------+



In [None]:
# Show the first ten rows of the transaction table, then its total row count.
transaction_preview = spark.sql("select * from TransactionData_PDF limit 10")
display(transaction_preview)

transaction_row_count = spark.sql("select count(*) from TransactionData_PDF")
transaction_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 62, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 040d5eb4-53a0-42f1-95bc-6fa186ab7d32)

+--------+
|count(1)|
+--------+
|   10510|
+--------+



In [None]:
# Print the rejected rows, then how many of them there are.
unauthorized_rows = spark.sql("select * from unauthorizedData_pdf")
unauthorized_rows.show()

unauthorized_row_count = spark.sql("select count(*) from unauthorizedData_pdf")
unauthorized_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 58, Finished, Available, Finished)

+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+
|API_WellNo|Lease_ID|Well_Name|County|Oil_Field|Operator|Month_Prod|Year|Oil_Prod|Gas_Prod|Water_Prod|
+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+
+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [None]:
%%sql
-- Look up the transaction rows whose Gas_Prod is exactly the string '$854',
-- i.e. a production value that was stored with a leading dollar sign.
select * from transactiondata_pdf where Gas_Prod = '$854'

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 59, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 7 fields>