In [None]:
from pyspark.sql.functions import col, lit, when, last, create_map, concat_ws
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd
import os

# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 75, Finished, Available, Finished)

## Previous Code

In [None]:
# Load every Excel workbook found in the lakehouse Files folder, stack all of
# their sheets into one pandas frame, and publish the result as the Spark
# DataFrame `final_master_df` (set to None when there is nothing to load).
lakehouse_path = "/lakehouse/default/Files"

# Sorted so the load order (and therefore the forward-fill below and any later
# de-duplication) does not depend on the arbitrary order of os.listdir().
excel_files = sorted(
    os.path.join(lakehouse_path, f)
    for f in os.listdir(lakehouse_path)
    if f.endswith((".xlsx", ".xls"))
)

all_dataframes = []

# A dedicated loop variable keeps `lakehouse_path` pointing at the base folder.
for excel_path in excel_files:
    # The context manager closes the workbook handle; each sheet is parsed from
    # the already-open workbook instead of re-opening the file by path.
    with pd.ExcelFile(excel_path) as workbook:
        for sheet_name in workbook.sheet_names:
            sheet_data = pd.read_excel(workbook, sheet_name=sheet_name, header=None)

            # Sheet layout read below: row 0 = (month, year) in the first two
            # columns, row 1 = column headers, data from row 2 onwards.
            # Sheets too small to hold that layout are skipped rather than
            # failing the whole load with an IndexError.
            if sheet_data.shape[0] < 2 or sheet_data.shape[1] < 2:
                print(f"Skipping sheet '{sheet_name}' in {excel_path}: no month/year/header rows")
                continue

            month = sheet_data.iloc[0, 0]
            year = sheet_data.iloc[0, 1]
            header_row = sheet_data.iloc[1]
            sheet_data.columns = header_row

            sheet_data = sheet_data.iloc[2:].reset_index(drop=True)
            sheet_data["Month"] = month
            sheet_data["Year"] = year
            all_dataframes.append(sheet_data)

num_cols = ['Oil','Gas','Water','Day']
# for num_col in num_cols:
#   combined_df[num_col] = combined_df[num_col].fillna(0)
# combined_df[num_cols] = combined_df[num_cols].astype('int64')

month_mapping = {
    "Jan": '1', "Feb": '2', "Mar": '3', "Apr": '4', "May": '5', "Jun": '6',
    "Jul": '7', "Aug": '8', "Sep": '9', "Oct": '10', "Nov": '11', "Dec": '12'
}

combined_df = None
if all_dataframes:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    # Forward-fill must run before the Lease_Id filter: rows without a
    # Lease_Id may be the ones carrying the OilField value.
    combined_df["OilField"] = combined_df["OilField"].ffill()
    combined_df = combined_df.dropna(subset=['Lease_Id']).reset_index(drop=True)

    # Month labels that are not keys of month_mapping become NaN here.
    combined_df['Month'] = combined_df['Month'].map(month_mapping)
    # Everything is handed to Spark as strings; note that astype(str) renders
    # missing values as the literal text 'nan'.
    # NOTE(review): the spot-check output in this notebook shows the same well
    # as both '0902120210' and '902120210' - presumably one workbook stores
    # APINumber as a number. Confirm whether it should be zero-padded here.
    combined_df = combined_df.astype(str)

if combined_df is None or combined_df.empty:
    # pd.concat([]) and spark.createDataFrame(<empty frame>) both raise, so the
    # "no data" case is signalled with None, which the write cell checks for.
    print(f"No Excel data found under {lakehouse_path}")
    final_master_df = None
else:
    spark_df = spark.createDataFrame(combined_df)
    final_master_df = spark_df.select(col('APINumber').alias('API_WellNo'),
                                            col('Lease_Id').alias('Lease_ID'),
                                            col('WellName').alias('Well_Name'),
                                            col('County').alias('County'),
                                            col('OilField').alias('Oil_Field'),
                                            col('Operator').alias('Operator'),
                                            col('Month').alias('Month_Prod'),
                                            col('Day').alias('Day'),
                                            col('Year').alias('Year'),
                                            col('Oil').alias('Oil_Prod'),
                                            col('Gas').alias('Gas_Prod'),
                                            col('Water').alias('Water_Prod'),
    )
    # # New step to concatenate API_WellNo with Month_Prod
    # final_master_df = final_master_df.withColumn(
    #     'API_WellNo',
    #     concat_ws('', col('API_WellNo'), lit('M'), col('Month_Prod'))
    # )

    display(final_master_df.limit(5))

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 76, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6c1124db-1a55-4ae2-9778-cdb224d953a0)

In [None]:
# Spot-check: print every row whose Gas_Prod equals the string '184'
# (all columns are strings at this point, hence the string comparison).
gas_184_rows = final_master_df.where(col('Gas_Prod') == '184')
gas_184_rows.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 78, Finished, Available, Finished)

+-------------+--------+--------------------+-------+-----------+--------------------+----------+---+----+--------+--------+----------+
|   API_WellNo|Lease_ID|           Well_Name| County|  Oil_Field|            Operator|Month_Prod|Day|Year|Oil_Prod|Gas_Prod|Water_Prod|
+-------------+--------+--------------------+-------+-----------+--------------------+----------+---+----+--------+--------+----------+
|   0902120210|   1345H|   NBI Collier 2-12H|Collier|Bear Island|           Breitburn|         1| 27|2024|    1189|     184|     66137|
|0905120051-02|   645DH| Lehigh Acres Lan...|    Lee| West Felda|BreitBurn Operati...|         2|  4|2024|    1982|     184|     55517|
|    902120210|   1345H|   NBI Collier 2-12H|Collier|Bear Island|           Breitburn|         1| 27|2024|    1189|     184|     66137|
|0905120051-02|   645DH| Lehigh Acres Lan...|    Lee| West Felda|BreitBurn Operati...|         2|  4|2024|    1982|     184|     55517|
+-------------+--------+--------------------+---

In [None]:
def _has_rows(df):
    """Return True when the Spark DataFrame `df` contains at least one row.

    Fetches at most one row instead of counting the whole dataset.
    """
    return len(df.head(1)) > 0


def _clear_table(table_name):
    """Delete every row of `table_name`; do nothing if the table does not exist yet."""
    if spark.catalog.tableExists(table_name):
        spark.sql(f"DELETE FROM {table_name} WHERE true")


# Persist the loaded data into three Delta tables:
#   MasterData_XLSX       - one row per API_WellNo with the well attributes
#   TransactionData_XLSX  - production figures per well / day / month / year
#   UnauthorizedData_XLSX - rows of `invalid_master_df`, when that frame exists
# When a source frame is missing, None or empty, the matching table is emptied.
try:
    # .get() covers both "never defined" and "defined as None".
    master_source = locals().get('final_master_df')

    if master_source is not None and _has_rows(master_source):

        # Process the Final Master DataFrame
        final_master = master_source.dropDuplicates(subset=['API_WellNo'])
        final_master1 = final_master.select(['API_WellNo', 'Lease_ID','Well_Name', 'County', 'Oil_Field', 'Operator'])
        final_master1.write.format("delta").option('mergeSchema','true').mode("overwrite").saveAsTable("MasterData_XLSX")

        # Process transaction data
        transaction_data = (
                            master_source.select(col('API_WellNo').alias('API_WellNo'),
                                                    col('Day').alias('Day'),
                                                    col('Month_Prod').alias('Month_Prod'),
                                                    col('Year').alias('Year'),
                                                    col('Oil_Prod').alias('Oil_Prod'),
                                                    col('Gas_Prod').alias('Gas_Prod'),
                                                    col('Water_Prod').alias('Water_Prod'),
                )
        )
        # A select() keeps the row count, and the source frame was just
        # verified to be non-empty, so no second emptiness check is needed.
        transaction_data.write.format("delta").option('mergeSchema','true').mode("overwrite").saveAsTable("TransactionData_XLSX")

    else:
        print("Master Data Unavailable")
        _clear_table("MasterData_XLSX")
        _clear_table("TransactionData_XLSX")

    # Unauthorized data
    unauthorized_source = locals().get('invalid_master_df')
    if unauthorized_source is not None and _has_rows(unauthorized_source):
        invalid_master_df = unauthorized_source.filter(col("API_WellNo").isNotNull())
        invalid_master_df.write.format("delta").option('mergeSchema','true').mode("overwrite").saveAsTable("UnauthorizedData_XLSX")

        print("Removed data from Final_Master due to unauthorized headers")

    else:
        _clear_table("UnauthorizedData_XLSX")

except Exception as e:
    # spark.sql("DELETE FROM MasterData_XLSX WHERE true")
    # spark.sql("DELETE FROM TransactionData_XLSX WHERE true")
    print(f"An error occurred: {str(e)}")
    # Re-raise so a failed write marks the cell (and any scheduled run) as
    # failed instead of finishing "successfully" with stale tables.
    raise


StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 12, Finished, Available, Finished)

In [None]:
# Preview the first ten master rows, then print the table's total row count.
master_preview = spark.sql("Select * from MasterData_XLSX limit 10")
display(master_preview)

master_row_count = spark.sql("Select count(*) from MasterData_XLSX")
master_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 17, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, fc53fd6d-9529-4a6b-a710-cbd37c3cb5e9)

+--------+
|count(1)|
+--------+
|     105|
+--------+



In [None]:
# Preview the first ten transaction rows, then print the table's total row count.
transaction_preview = spark.sql("Select * from TransactionData_XLSX  limit 10")
display(transaction_preview)

transaction_row_count = spark.sql("Select count(*) from TransactionData_XLSX")
transaction_row_count.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 19, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e2b39af7-879d-44aa-994a-cc1cf5cd7f64)

+--------+
|count(1)|
+--------+
|     828|
+--------+



In [None]:
# Print the contents of the unauthorized-data table (show() prints up to 20 rows).
unauthorized_rows = spark.sql("Select * from UnauthorizedData_XLSX")
unauthorized_rows.show()

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 18, Finished, Available, Finished)

+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+
|API_WellNo|Lease_ID|Well_Name|County|Oil_Field|Operator|Month_Prod|Year|Oil_Prod|Gas_Prod|Water_Prod|
+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+
+----------+--------+---------+------+---------+--------+----------+----+--------+--------+----------+



## Validations

In [None]:
%%sql
-- Validation: list all transaction rows stored for one well.
-- NOTE(review): the earlier spot-check output also shows this well as
-- '902120210' (no leading zero); such rows would not match this filter.
SELECT *
FROM transactiondata_xlsx
WHERE API_WellNo = '0902120210'

StatementMeta(, d1da372f-9793-46ad-a380-dfdb15135b2a, 68, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 7 fields>