In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from functools import reduce
import os
import re

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 24, Finished, Available, Finished)

## Read all the CSV files from the Lakehouse

In [None]:
lakehouse_path = "Files/"

# List the files under the Lakehouse folder and keep only paths ending in ".csv".
# Only the "path" column of the binaryFile listing is selected.
csv_listing = (
    spark.read.format("binaryFile")
    .load(lakehouse_path)
    .filter("path like '%.csv'")
    .select("path")
)
files = csv_listing.collect()

# Load every CSV into its own DataFrame (first row is the header, column types are inferred)
CSV_Data = []
for csv_file in files:
    CSV_Data.append(spark.read.csv(csv_file['path'], header=True, inferSchema=True))


# Canonical (master) column names every input file is standardized to, in output order.
master_lst = [
    'API_WellNo', 'Lease_ID', 'Well_Name', 'County', 'Oil_Field', 'Operator',
    'Day', 'Month_Prod', 'Year', 'Oil_Prod', 'Gas_Prod', 'Water_Prod',
]

# Alternative header spellings, keyed by the master column they should be renamed to.
# Keys use exactly the same spelling as master_lst so the two stay in sync.
mapping = {
    'API_WellNo': ['api', 'Api_ID', 'Well_ID', 'APINumber'],
    'Lease_ID': ['Lease', 'LeaseID', 'LeaseNo'],
    'Well_Name': ['wellname', 'well_name', 'well_Nm'],
    'County': ['countyname'],
    'Oil_Field': ['fieldname', 'oilfield', 'field'],
    'Operator': ['CoName', 'Company', 'Company_Name', 'Operators'],
    'Day': ['Day_Prod', 'Day'],
    'Month_Prod': ['Month', 'MonthProd'],
    'Year': ['Year', 'YearProd'],
    'Oil_Prod': ['Oil', 'OilProd'],
    'Gas_Prod': ['Gas', 'GasProd'],
    'Water_Prod': ['Water', 'WaterProd'],
}

# Title-case both the master names and every variant so header matching is
# case-insensitive (file headers are title-cased the same way before comparison).
# The loop variables deliberately avoid the name `col`, which is also the
# star-imported pyspark.sql.functions.col.
master_lst = [name.title() for name in master_lst]
mapping = {
    master.title(): [variant.title() for variant in variants]
    for master, variants in mapping.items()
}

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 26, Finished, Available, Finished)

In [None]:
dataframes = []    # standardized files in which every master column holds some data
removed_rows = []  # standardized files with at least one missing or entirely-null master column

# Standardize each source DataFrame to the master schema, then route it to one of the two lists
for df in CSV_Data:
    # Master column -> source column that feeds it (None when the file has no matching header)
    alias_map = {master: None for master in master_lst}

    for original in df.columns:
        normalized = original.title()  # same casing that master_lst / mapping were normalized to
        for master, variations in mapping.items():
            if normalized == master or normalized in variations:
                # If several source headers resolve to the same master column, the last one wins
                alias_map[master] = original
                break

    # Project onto the master schema; master columns with no source become NULL literals
    standardized_columns = [
        col(alias_map[master]).alias(master) if alias_map[master] is not None else lit(None).alias(master)
        for master in master_lst
    ]
    aliased_df = df.select(*standardized_columns)

    # Determine which master columns carry no data at all.
    # A column with no source header is NULL by construction, so no Spark job is needed for it.
    null_columns = [master for master in master_lst if alias_map[master] is None]
    if not null_columns:
        # count(column) counts only non-null values; a single aggregation covers every
        # column at once instead of launching one job per column.
        non_null_counts = aliased_df.agg(
            *[count(col(master)).alias(master) for master in master_lst]
        ).first()
        null_columns = [master for master in master_lst if non_null_counts[master] == 0]

    # The whole file is rejected as soon as one master column is empty
    if null_columns:
        removed_rows.append(aliased_df)
    else:
        dataframes.append(aliased_df)

# Union the accepted files; None signals that no file passed validation
final_master_df = reduce(lambda df1, df2: df1.unionByName(df2), dataframes) if dataframes else None

# Union the rejected files; None signals that nothing was rejected
invalid_master_df = reduce(lambda df1, df2: df1.unionByName(df2), removed_rows) if removed_rows else None

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 27, Finished, Available, Finished)

In [None]:
# Preview the standardized data. final_master_df is None when no file passed validation,
# so test for None explicitly rather than relying on the truthiness of a DataFrame object.
if final_master_df is not None:
    final_master_df.show()

# if invalid_master_df is not None:
#     invalid_master_df.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 28, Finished, Available, Finished)

+----------+--------+--------------------+---------+--------------------+--------------------+---+----------+----+--------+--------+----------+
|Api_Wellno|Lease_Id|           Well_Name|   County|           Oil_Field|            Operator|Day|Month_Prod|Year|Oil_Prod|Gas_Prod|Water_Prod|
+----------+--------+--------------------+---------+--------------------+--------------------+---+----------+----+--------+--------+----------+
|     25018|  1915SZ|PRIVATE  WALPOLE ...|Haldimand|     Gachsaran Field|ref_04d0e2967c501...| 25|         1|2022|       0|   $138 |         0|
|      8352|  2391DR|Jasperson No. 3 -...|    Essex|        Marlim Field|     Jasperson, Bon | 10|         1|2022|       0|       7|         0|
|      8365|  9753TJ|  Jasperson - M. Fox|    Essex|            Minagish|     Jasperson, Bon | 27|         1|2022|       0|      10|         0|
|      5706|  6123ZW|Drake & Walker- W...|    Essex|        Dukhan Field|     Drake & Walker | 24|         1|2022|     $2 |       0|    

In [None]:
# final_master_df is None when no input file passed validation; guard so this cell
# does not raise AttributeError in that case.
if final_master_df is not None:
    final_master_df.printSchema()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 29, Finished, Available, Finished)

root
 |-- Api_Wellno: integer (nullable = true)
 |-- Lease_Id: string (nullable = true)
 |-- Well_Name: string (nullable = true)
 |-- County: string (nullable = true)
 |-- Oil_Field: string (nullable = true)
 |-- Operator: string (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month_Prod: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Oil_Prod: string (nullable = true)
 |-- Gas_Prod: string (nullable = true)
 |-- Water_Prod: string (nullable = true)



### Outputs

In [None]:
def _has_rows(df):
    """Return True when df is a DataFrame holding at least one row; None counts as empty."""
    # head(1) fetches at most one row, so emptiness is decided without counting the whole dataset
    return df is not None and bool(df.head(1))


def _clear_table(table_name):
    """Delete every row of table_name; do nothing when the table has not been created yet."""
    if spark.catalog.tableExists(table_name):
        spark.sql(f"DELETE FROM {table_name} WHERE true")


# NOTE: errors are reported with print() and not re-raised, so this cell still finishes
# successfully when a write fails. Check the cell output after each run.
try:
    if 'final_master_df' in locals() and _has_rows(final_master_df):

        # Master data: one row per well (first occurrence kept by dropDuplicates), descriptive columns only
        final_master = final_master_df.dropDuplicates(subset=['API_WellNo'])
        final_master1 = final_master.select(['API_WellNo', 'Lease_ID','Well_Name', 'County', 'Oil_Field', 'Operator'])
        final_master1.write.format("delta").option('mergeSchema','true').mode("overwrite").saveAsTable("MasterData_CSV")

        # Transaction data: production figures per well and date.
        # Aliasing pins the exact output spelling of each column name.
        transaction_columns = ['API_WellNo', 'Day', 'Month_Prod', 'Year', 'Oil_Prod', 'Gas_Prod', 'Water_Prod']
        transaction_data = final_master_df.select(
            *[col(name).alias(name) for name in transaction_columns]
        )
        # transaction_data is a pure projection of final_master_df, which is already known
        # to be non-empty, so no second emptiness check (and no second scan) is needed.
        transaction_data.write.format("delta").option('mergeSchema','true').mode("overwrite").saveAsTable("TransactionData_CSV")

    else:
        print("Master Data Unavailable")
        # Empty the tables so stale rows from an earlier run are not mistaken for current data
        _clear_table("MasterData_CSV")
        _clear_table("TransactionData_CSV")

    # Unauthorized data: files rejected because a master column was missing or entirely null
    if 'invalid_master_df' in locals() and _has_rows(invalid_master_df):
        # Rows without a well identifier are dropped before the table is written
        invalid_master_df = invalid_master_df.filter(col("API_WellNo").isNotNull())
        invalid_master_df.write.format("delta").option('mergeSchema','true').mode("overwrite").saveAsTable("UnauthorizedData_CSV")

        print("Removed data from Final_Master due to unauthorized headers")

    else:
        _clear_table("UnauthorizedData_CSV")

except Exception as e:
    print(f"An error occurred: {str(e)}")

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 40, Finished, Available, Finished)

In [None]:
# Show the first ten rows of the well master table
master_preview = spark.sql("select *  from masterdata_csv limit 10")
display(master_preview)

# Print the total number of rows in the well master table
master_row_count = spark.sql("select count(*)  from masterdata_csv")
master_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 41, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8d4fa2fe-f64e-4f9f-9551-43a0082fec84)

+--------+
|count(1)|
+--------+
|   13688|
+--------+



In [None]:
# Show the first ten rows of the production transaction table
transaction_preview = spark.sql("select * from transactiondata_csv limit 10")
display(transaction_preview)

# Print the total number of rows in the production transaction table
transaction_row_count = spark.sql("select count(*) from transactiondata_csv")
transaction_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 42, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e214d087-efe7-4bee-8796-4b586020315e)

+--------+
|count(1)|
+--------+
|   13688|
+--------+



In [None]:
# Print the rows that were rejected during standardization (empty when every file was accepted)
unauthorized_rows = spark.sql("select * from unauthorizedData_csv")
unauthorized_rows.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 43, Finished, Available, Finished)

+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+
|API_WellNo|Lease_ID|Well_Name|County|Oil_Field|Operator|Month_Prod|Year|Oil_Prod|Gas_Prod|Water_Prod|
+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+
+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+



In [None]:
%%sql
-- Spot check: production rows recorded for a single well
SELECT *
FROM transactiondata_csv
WHERE API_WellNo = '25018'

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 44, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 7 fields>