In [74]:
import os
from dotenv import load_dotenv
# Pull settings from a local .env file (when present) into the process environment.
load_dotenv()

# Root folder of the project; the next cell builds the utilities-notebook path from it.
project_path = os.getenv("PROJECT_PATH")
if not project_path:
    # Stop here with a clear message instead of letting a later cell try to
    # open a path that starts with "None".
    raise EnvironmentError(
        "PROJECT_PATH is not set; define it in the environment or in a .env file."
    )

In [75]:
# Run the shared utilities notebook; IPython substitutes {project_path} with the
# value of the project_path variable before executing the magic.
# NOTE(review): presumably this is what defines test_data_path, hf_token,
# sheet_spark_prompt and the helper functions called in later cells - confirm
# against utils.ipynb.
# The backslash separators make this path Windows-specific.
%run "{project_path}\llm_custom_apps\llm_apps\oh_sheet_its_spark\utils.ipynb"

In [76]:
# Collect, per worksheet, each header name and the letter of the column it sits in.
# The path is built with os.path.join because a bare backslash followed by "S"
# inside a normal (f-)string literal is an invalid escape sequence, and a
# hard-coded separator ties the cell to Windows.
xls_col_info = get_excel_col_header(os.path.join(test_data_path, "SalesData1.xlsx"))
xls_col_info

{'SalesInfo': {'StoreID': {'ColumnID': 'A'},
  'SalesYear': {'ColumnID': 'B'},
  'SalesValue': {'ColumnID': 'C'},
  'StoreLocation': {'ColumnID': 'D'},
  'Owner': {'ColumnID': 'E'}},
 'RegionInfo': {'ID': {'ColumnID': 'A'}, 'Location': {'ColumnID': 'B'}},
 'StoreInfo': {'ID': {'ColumnID': 'A'},
  'RegionID': {'ColumnID': 'B'},
  'StoreOwner': {'ColumnID': 'C'}},
 'SaleGrowthReport1': {'StoreID': {'ColumnID': 'A'},
  'AverageSales': {'ColumnID': 'B'},
  'MinSales': {'ColumnID': 'C'},
  'MaxSales': {'ColumnID': 'D'}},
 'SaleGrowthReport2': {'StoreLocation-SalesYear': {'ColumnID': 'A'},
  'Year': {'ColumnID': 'B'},
  'Location': {'ColumnID': 'C'},
  'AverageSales': {'ColumnID': 'D'},
  'MinSales': {'ColumnID': 'E'},
  'MaxSales': {'ColumnID': 'F'}}}

In [77]:
# Extend every column entry from xls_col_info with a 'ColumnValue': either a
# literal cell value or the text of the formula stored in that column.
# The path is built with os.path.join because a bare backslash followed by "S"
# inside a normal (f-)string literal is an invalid escape sequence, and a
# hard-coded separator ties the cell to Windows.
xls_str = process_excel_columns_xlwings(
    os.path.join(test_data_path, "SalesData1.xlsx"),
    xls_col_info,
)
xls_str

{'SalesInfo': {'StoreID': {'ColumnID': 'A', 'ColumnValue': '11'},
  'SalesYear': {'ColumnID': 'B', 'ColumnValue': '2020'},
  'SalesValue': {'ColumnID': 'C', 'ColumnValue': '15789'},
  'StoreLocation': {'ColumnID': 'D',
   'ColumnValue': '=IFERROR(INDEX(RegionInfo!B:B, MATCH(INDEX(StoreInfo!B:B, MATCH(A2, StoreInfo!A:A, 0)), RegionInfo!A:A, 0)), "")'},
  'Owner': {'ColumnID': 'E',
   'ColumnValue': '=IFERROR(INDEX(StoreInfo!C:C, MATCH(A2, StoreInfo!A:A, 0)), "")'}},
 'RegionInfo': {'ID': {'ColumnID': 'A', 'ColumnValue': '1'},
  'Location': {'ColumnID': 'B', 'ColumnValue': 'Halifax'}},
 'StoreInfo': {'ID': {'ColumnID': 'A', 'ColumnValue': '11'},
  'RegionID': {'ColumnID': 'B', 'ColumnValue': '1'},
  'StoreOwner': {'ColumnID': 'C', 'ColumnValue': 'Ethan Carter'}},
 'SaleGrowthReport1': {'StoreID': {'ColumnID': 'A',
   'ColumnValue': '=_xlfn.UNIQUE(_xlfn._xlws.FILTER(SalesInfo!A:A, SalesInfo!A:A<>""))'},
  'AverageSales': {'ColumnID': 'B',
   'ColumnValue': '=AVERAGEIF(SalesInfo!A:A, A2, S

In [78]:
# Chat payload for the code model: the system turn carries the conversion
# instructions, the user turn embeds the workbook description built above.
messages = [
    {
        "role": "system",
        "content": sheet_spark_prompt,
    },
    {
        "role": "user",
        "content": f"Convert the excel logic representation to equivalent pyspark code : {xls_str}",
    },
]

In [79]:
# Send the chat payload to the hosted Qwen coder model and keep its reply.
# NOTE(review): 5000 is presumably the response token budget - confirm against
# generate_responses_from_inf in utils.ipynb.
cd_resp = generate_responses_from_inf(
    hf_token,
    messages,
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    5000,
)

In [80]:
# Show the model's reply: a per-sheet explanation followed by the generated PySpark code.
cd_resp

### Explanation of Sheets

1. **SalesInfo**:
   - **Hardcoded Columns**: StoreID, SalesYear, SalesValue
   - **Derived Columns**: StoreLocation, Owner

2. **RegionInfo**:
   - **Hardcoded Columns**: ID, Location
   - **Derived Columns**: None

3. **StoreInfo**:
   - **Hardcoded Columns**: ID, RegionID, StoreOwner
   - **Derived Columns**: None

4. **SaleGrowthReport1**:
   - **Hardcoded Columns**: None
   - **Derived Columns**: StoreID, AverageSales, MinSales, MaxSales

5. **SaleGrowthReport2**:
   - **Hardcoded Columns**: None
   - **Derived Columns**: StoreLocation-SalesYear, Year, Location, AverageSales, MinSales, MaxSales

### PySpark Code

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Initialize Spark Session
spark = SparkSession.builder.appName("ExcelToSpark").getOrCreate()

# Read Excel sheets into DataFrames if they have hardcoded columns
sales_info = spark.read.format("excel").option("sheetName", "SalesInfo").load().select("A", "B", "C")
region_info = spark.read.format("excel").option("sheetName", "RegionInfo").load().select("A", "B")
store_info = spark.read.format("excel").option("sheetName", "StoreInfo").load().select("A", "B", "C")

# Create derived columns for SalesInfo
sales_info_df = (
    sales_info
    .withColumn("StoreID", col("A"))  # Pseudo Code: StoreID is the value in column A
    .withColumn("SalesYear", col("B"))  # Pseudo Code: SalesYear is the value in column B
    .withColumn("SalesValue", col("C"))  # Pseudo Code: SalesValue is the value in column C
    .join(store_info, sales_info.A == store_info.A, "left")
    .withColumn("Owner", col("C"))  # Pseudo Code: Owner is derived from StoreInfo
    .join(region_info, store_info.B == region_info.A, "left")
    .withColumn("StoreLocation", col("B"))  # Pseudo Code: StoreLocation is derived from RegionInfo
    .select("StoreID", "SalesYear", "SalesValue", "StoreLocation", "Owner")
)

# Create derived columns for SaleGrowthReport1
sale_growth_report_1_df = (
    sales_info_df
    .groupBy("StoreID")
    .agg(
        avg("SalesValue").alias("AverageSales"),  # Pseudo Code: AverageSales is the average of SalesValue for each StoreID
        min("SalesValue").alias("MinSales"),      # Pseudo Code: MinSales is the minimum of SalesValue for each StoreID
        max("SalesValue").alias("MaxSales")       # Pseudo Code: MaxSales is the maximum of SalesValue for each StoreID
    )
)

# Create derived columns for SaleGrowthReport2
sale_growth_report_2_df = (
    sales_info_df
    .withColumn("StoreLocation-SalesYear", concat(col("StoreLocation"), lit("-"), col("SalesYear")))  # Pseudo Code: StoreLocation-SalesYear is a concatenation of StoreLocation and SalesYear
    .groupBy("StoreLocation-SalesYear")
    .agg(
        last(split(col("StoreLocation-SalesYear"), "-").getItem(1)).alias("Year"),  # Pseudo Code: Year is the part after "-" in StoreLocation-SalesYear
        last(split(col("StoreLocation-SalesYear"), "-").getItem(0)).alias("Location"),  # Pseudo Code: Location is the part before "-" in StoreLocation-SalesYear
        avg("SalesValue").alias("AverageSales"),  # Pseudo Code: AverageSales is the average of SalesValue for each StoreLocation-SalesYear
        min("SalesValue").alias("MinSales"),      # Pseudo Code: MinSales is the minimum of SalesValue for each StoreLocation-SalesYear
        max("SalesValue").alias("MaxSales")       # Pseudo Code: MaxSales is the maximum of SalesValue for each StoreLocation-SalesYear
    )
)
```

This code initializes a Spark session, reads the necessary sheets from an Excel file into DataFrames, and applies transformations to derive the required columns as specified in the dictionary.