In [1]:
import os
from dotenv import load_dotenv
# Load settings via python-dotenv, then read the project root that the
# following cell interpolates into the helper-notebook path.
load_dotenv()
project_path = os.environ.get("PROJECT_PATH")  # None when PROJECT_PATH is not set

In [2]:
# Execute the shared helper notebook; IPython expands {project_path} from the variable defined in the first cell.
# NOTE(review): later cells use `openpyxl` and `xw` without importing them in this notebook -- presumably they come from common_func.ipynb; confirm.
# NOTE(review): the backslash separators make this path Windows-specific.
%run "{project_path}\llm_custom_apps\common_code\common_func.ipynb"

In [3]:
# System prompt used by gen_initial_msg for the Excel -> PySpark conversion chat.
# The example dictionary must stay a valid Python dict shaped like the data that is
# actually sent: {sheet name: {column header: {'ColumnID': ..., 'ColumnValue': ...}}}.
sheet_spark_prompt = """
You are a helpful assistant to a Data Scientist who is working on a project to transform Excel Spreadsheets to PySpark Dataframes. 
The Data Scientist has provided you with a dictionary of dictionaries which contains details about Excel spreadsheets.
Each top-level key is the name of a Sheet and its value is a dictionary with multiple keys.
Each key is the name of the Column Header in the Excel Spreadsheet. The value is again a dictionary with two keys.
The first key is 'ColumnID' with the value being the associated ColumnID in Excel i.e. A, B, C, or etc.
The second key is 'ColumnValue' with the value being the associated formula for generating the column or the hardcoded value in the absence of the formula.
The dictionary looks something like:

{'SheetNum1': {'X': {'ColumnID': 'A', 'ColumnValue': 11},
   'Y': {'ColumnID': 'B', 'ColumnValue': 2020},
   'Z': {'ColumnID': 'C', 'ColumnValue': 15789},
   'P': {'ColumnID': 'D', 'ColumnValue': 'KIO'},
   'Q': {'ColumnID': 'E', 'ColumnValue': 'SECCC'}},
 'SheetNum2': {'ID': {'ColumnID': 'A', 'ColumnValue': 1},
   'Location': {'ColumnID': 'B', 'ColumnValue': 'LOPP'}}}

You have to convert the spreadsheet transformation logic to pyspark dataframe keeping the following in mind

1) Parsing the dictionary figure out the dependency between sheets and between columns to identify the order in which the transformations need to be defined and the columns that are hardcoded.
2) In your response at the beginning provide explanation about the Sheets that are present.For each Sheet specify what columns are hardcoded and what columns are derived .Do not specify the values of the columns but only the column names.
3) Now respond with code with the code block starting with ```python and ending with ``` . Ensure:
  a) All the python dependencies that will be required to accomplish this task are imported and after declaring dependencies, also create a placeholder to read the spreadsheet creating multiple pyspark dataframes , one for each sheet in the spreadsheet if and only if it has atleast one hardcoded column (i.e. not derived using formula) and only read/select the hardcoded columns from dataframe.Rememeber I want you to create Dataframes from the individual sheets of spreadsheet and not creating them explictly from the hardcoded values in the dictionary provided
  b) Now create new dataframes from the dataframes declared as input and derive transformed columns for each of the column that is derived using formulas based on dictionary provided , with column name in pyspark being same as Column Header provided in the dictionary.
  c) Always have in comments the Psuedo Code(Logic in plain english used to derive the column,followed by excel formula.) on the same line against each transformation (withColumn,agg methods, join etc. ) .
  d) Subsequently the user might ask for modications for removing /adding /modifying existing transformations to which you should comply accordingly by updating entire code but keep explanation short and bare minimum.



"""


In [4]:
def get_excel_col_header(file_path):
    """
    Collects the header row of every sheet in an Excel workbook.

    The result maps each sheet name to a dictionary keyed by the header text found
    in the sheet's first row; each header maps to {'ColumnID': <column letter>},
    e.g. {'Sheet1': {'Name': {'ColumnID': 'A'}, 'Age': {'ColumnID': 'B'}}}.

    Cells in the first row that have no value are skipped, so a sheet with an empty
    first row maps to an empty dictionary. If the same header text appears more than
    once in a sheet, the right-most column wins because later keys overwrite earlier ones.

    :param file_path: Path to the Excel file
    :return: Dictionary with sheet names and column headers
    """
    # data_only=True is kept from the original call; it asks openpyxl for stored
    # values rather than formula text when a cell holds a formula.
    workbook = openpyxl.load_workbook(file_path, data_only=True)

    result = {}
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        sheet_data = {}

        # sheet[1] is the first row; positions are 1-based so they map straight to letters.
        for column_idx, cell in enumerate(sheet[1], 1):
            header = cell.value
            if header is None:
                # A blank header cell is not a column name; recording it would create
                # a bogus None key shared by every blank column.
                continue
            col_letter = openpyxl.utils.get_column_letter(column_idx)
            sheet_data[header] = {'ColumnID': col_letter}

        result[sheet_name] = sheet_data

    return result

# Example Usage
# file_path = "path_to_your_excel_file.xlsx"
# result = get_excel_col_header(file_path)
# print(result)


In [5]:
def process_excel_columns_xlwings(file_path, sheet_columns_dict):
    """
    Adds a 'ColumnValue' entry to every column in sheet_columns_dict, read from row 2.

    For each sheet/column pair the cell "<ColumnID>2" is read through xlwings: the
    cell's formula text is stored when that text is non-empty, otherwise the cell's
    value is stored. The input dictionary is updated in place and also returned.

    The workbook is closed and the Excel application is quit on every exit path,
    including when the workbook cannot be opened.

    :param file_path: Path to the Excel spreadsheet.
    :param sheet_columns_dict: Dictionary of the form
        {sheet name: {column header: {'ColumnID': column letter}}}.
    :return: The same dictionary object, with 'ColumnValue' added to each column entry.
    :raises ValueError: If a sheet named in sheet_columns_dict is not in the workbook.
    """
    app = xw.App(visible=False)
    try:
        # Open inside the try so a failed open still quits the hidden Excel instance.
        wb = app.books.open(file_path)
        try:
            # Collect the sheet names once instead of rebuilding the list per sheet.
            available_sheets = {sheet.name for sheet in wb.sheets}

            for sheet_name, columns in sheet_columns_dict.items():
                if sheet_name not in available_sheets:
                    raise ValueError(f"Sheet '{sheet_name}' not found in the Excel file.")

                ws = wb.sheets[sheet_name]

                for column_info in columns.values():
                    # Second-row cell of this column, e.g. "B2".
                    cell = ws.range(f"{column_info['ColumnID']}2")

                    # Read the formula once; fall back to the value only when it is empty.
                    formula = cell.formula
                    column_info['ColumnValue'] = formula if formula != "" else cell.value

            return sheet_columns_dict
        finally:
            # Nested so that app.quit() below still runs even if close() raises.
            wb.close()
    finally:
        app.quit()


In [6]:
def gen_initial_msg(file_path):
    """
    Builds the opening chat messages for converting a workbook to PySpark code.

    The workbook's headers are collected with get_excel_col_header and enriched
    with row-2 formulas/values by process_excel_columns_xlwings; the resulting
    dictionary is embedded, via its string form, in the user message.

    :param file_path: Path to the Excel spreadsheet.
    :return: A two-item list: the system message carrying sheet_spark_prompt,
        followed by the user message containing the sheet details.
    """
    header_map = get_excel_col_header(file_path)
    sheet_details = process_excel_columns_xlwings(file_path, header_map)

    system_msg = {"role": "system", "content": sheet_spark_prompt}
    user_msg = {
        "role": "user",
        "content": f"Convert the excel logic representation to equivalent pyspark code : {sheet_details}",
    }
    return [system_msg, user_msg]