In [16]:
import os
import glob
import logging
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate
from datetime import datetime
from langchain_text_splitters import RecursiveJsonSplitter
import pandas as pd
import json
import csv
import time
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
from io import StringIO

# Set up directory structure
# Portable alternative (kept for reference): derive the project root from this
# file's location. `__file__` is not defined when the code runs as a notebook
# cell, which is presumably why the absolute path below is used instead.
# base_dir = str(Path(__file__).parent.parent)
# logs_dir = os.path.join(base_dir, "logs")

# NOTE(review): machine-specific absolute path; main() resolves its input
# directory (<base_dir>/data/test) and the report location from these two names.
base_dir = r'C:\Users\ppahil01\genai_de'
logs_dir = os.path.join(base_dir, "logs")
# Create the logs directory up front so main() can open the report file there.
os.makedirs(logs_dir, exist_ok=True)

# Load settings from a .env file (python-dotenv) before main() reads the
# AZURE_OPENAI_* values from os.environ.
load_dotenv()


def _collect_sql_files(input_dir: str) -> List[str]:
    """Return the path of every ``.sql`` file found under *input_dir*.

    The tree is traversed with ``os.walk`` and paths are returned in the order
    the walk yields them, i.e. the same order a nested walk-and-filter loop
    would visit the files.
    """
    sql_paths: List[str] = []
    for root, _dirs, names in os.walk(input_dir):
        sql_paths.extend(
            os.path.join(root, name) for name in names if name.endswith(".sql")
        )
    return sql_paths


def main():
    """Run two LLM reviews over every SQL file under ``<base_dir>/data/test``.

    For each ``.sql`` file the Azure OpenAI chat model is invoked twice: once
    with a syntax/structure prompt and once with a security/quality/performance
    prompt. Both responses are appended to
    ``<logs_dir>/sql_analysis_report.txt``, which is overwritten on every run.

    A failure while processing one file is logged and recorded in the report,
    and processing continues with the next file. If the input directory does
    not exist, an error is printed and the function returns without creating
    the report.

    Relies on the module-level ``base_dir`` and ``logs_dir`` and on the
    ``AZURE_OPENAI_*`` environment variables.
    """
    # Load environment variables
    env_vars = {
        "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
        "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
        "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
    }

    # Only warn: the client may have its own fallbacks for some settings, so a
    # missing value is surfaced here but left for the client to accept or reject.
    missing = [name for name, value in env_vars.items() if not value]
    if missing:
        logging.warning("Environment variables not set: %s", ", ".join(missing))

    model = AzureChatOpenAI(
        azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
        azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
        openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
        openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
    )

    prompt = """Analyze the following SQL code for potential issues. 
	    Please check for:

        - Syntax errors (missing semicolons, commas after column names, incorrect keywords, etc.)
        - Missing JOIN/WHERE columns, table names
        - check for commas, FROM/where clause, proper join and case statements in nested subqueries.

        For each issue found, provide:
        - The specific line or section where the issue occurs, only mention the issue header if the issue is found.

        SQL Code to analyze:
        {code}
        strictly ignore code that is enclosed in these symbols /* */ or code that is commented out.
        strictly Don't give any other information apart from above mentioned issues. Ignore comments and irrelevant information from the code. 
        Please format your response as a structured analysis with clear sections for each type of issue found."""

    sensitive_prompt = """
        Please analyze the following Snowflake SQL query focusing on these key areas:

        ## Data Security & Compliance
        1. Identify any sensitive data fields (PII, financial data, protected health information) in the query output
        2. Highlight columns that should be masked or encrypted
        3. Suggest appropriate masking techniques for each sensitive field

        ## Code Quality Analysis
        1. Identify and list all hardcoded values in the view code
        2. For each hardcoded value:
        - Explain the potential risks
        - Suggest parameterization or dynamic alternatives
        - Provide sample code for implementation

        ## Query Structure Optimization
        1. Review nested queries and subqueries:
        - Map the query execution flow
        - Identify performance bottlenecks in nested operations
        - Suggest flattening or restructuring opportunities
        - Provide alternative query structures with explanations

        2. Analyze column usage:
        - Flag any 'SELECT *' statements
        - List unused columns in JOIN conditions- [columns list]
        - Identify columns fetched but not used in final output
        - Provide optimized SELECT statements with specific columns
        - Provide specific unused column names.

        ## Join Analysis
        1. Review all JOIN operations:
        - Evaluate join conditions and their necessity
        - Identify unused columns from joined tables
        - Suggest removal of unnecessary joins
        - Recommend appropriate join types (LEFT, INNER, etc.)

        ## Performance Enhancement Recommendations
        1. Suggest specific coding standards improvements:
        - Table aliases and naming conventions
        - Proper indentation and formatting
        - Use of appropriate indexes
        - Temporary table vs CTE usage
        - Materialized view opportunities

        2. Provide performance-focused recommendations:
        - Partitioning strategies
        - Clustering keys
        - Query result caching
        - Execution plan optimization

        Please provide the analysis with:
        - Clear before/after code examples
        - Expected performance impact of each suggestion
        - Priority ranking for implementing changes (High/Medium/Low)
        - Any potential risks or dependencies to consider

        Provide Solution with specific case or Provide Optimized SQL query.
        Original Query:
        {sensitive_code}
    """

    # Set up directory paths
    input_dir = os.path.join(base_dir, "data", "test")

    # Set up analysis report file
    report_path = os.path.join(logs_dir, "sql_analysis_report.txt")
    print(f"Analysis report will be saved to: {report_path}")

    # Check if input directory exists
    if not os.path.exists(input_dir):
        print(f"ERROR: Input directory not found: {input_dir}")
        return

    # The templates do not depend on the file being analysed, so build them once.
    prompt_template = PromptTemplate(
        input_variables=["code"],
        template=prompt,
    )
    sensitive_prompt_template = PromptTemplate(
        input_variables=["sensitive_code"],
        template=sensitive_prompt,
    )

    print("\n=== Starting SQL file processing ===")
    # Discover every file up front so the total stays correct when the input
    # directory has subdirectories (a per-directory count would be overwritten
    # on each step of the walk).
    sql_files = _collect_sql_files(input_dir)
    sql_files_count = len(sql_files)
    processed_files_count = 0
    print(f"Found {sql_files_count} SQL files to process")

    # Explicit UTF-8: model output may contain characters the platform's
    # default codec cannot encode.
    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write(f"SQL Analysis Report - Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        report_file.write("="*80 + "\n\n")

        for file_path in sql_files:
            print(f"\nProcessing file: {file_path}")

            try:
                # Read and close the source file before the (slow) model calls.
                # errors="replace" keeps a stray non-UTF-8 byte from aborting
                # the analysis of an otherwise readable file.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as sql_file:
                    sql_code = sql_file.read()

                start_time = time.time()
                result = model.invoke(prompt_template.format(code=sql_code))
                sensitive_result = model.invoke(
                    sensitive_prompt_template.format(sensitive_code=sql_code)
                )
                analysis_text = result.content
                sensitive_text = sensitive_result.content
                end_time = time.time()
                print(f"Analysis completed in {end_time - start_time:.2f} seconds")

                # Write to report file
                report_file.write(f"File: {file_path}\n")
                report_file.write(f"Analysis Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                report_file.write("-"*80 + "\n")
                report_file.write(analysis_text + "\n")
                report_file.write(sensitive_text + "\n")
                report_file.write("="*80 + "\n\n")

                processed_files_count += 1
                print(f"Progress: {processed_files_count}/{sql_files_count} files processed")

            except Exception as e:
                # Per-file boundary: record the failure (with traceback in the
                # log) and carry on with the remaining files.
                error_message = f"Error processing file {file_path}: {str(e)}"
                logging.exception(error_message)
                report_file.write(f"ERROR processing {file_path}: {str(e)}\n")
                report_file.write("="*80 + "\n\n")

    print("\n=== Processing completed ===")
    print(f"Total files processed: {processed_files_count}/{sql_files_count}")
    print(f"Report file location: {os.path.abspath(report_path)}")

if __name__ == "__main__":
    # Top-level boundary: any failure escaping main() is reported here rather
    # than propagated, so the run always ends with a one-line status message.
    try:
        main()
    except Exception as e:
        print(f"\nScript execution failed: {str(e)}")
        # logging.exception keeps the traceback, which a bare error message loses.
        logging.exception(f"Script execution failed: {str(e)}")
    else:
        print("\nScript executed successfully")

Analysis report will be saved to: C:\Users\ppahil01\genai_de\logs\sql_analysis_report.txt

=== Starting SQL file processing ===
Found 6 SQL files to process

Processing file: C:\Users\ppahil01\genai_de\data\test\EDW_PHARMACY_ECOMMERCE_ANALYSIS.sql
Analysis completed in 16.15 seconds
Progress: 1/6 files processed

Processing file: C:\Users\ppahil01\genai_de\data\test\ITG_CHW_ECOMM_DATA.sql
Analysis completed in 52.65 seconds
Progress: 2/6 files processed

Processing file: C:\Users\ppahil01\genai_de\data\test\ITG_METCASH_IND_GROCERY.sql
Analysis completed in 14.77 seconds
Progress: 3/6 files processed

Processing file: C:\Users\ppahil01\genai_de\data\test\itg_perenso_account_custom_list.sql
Analysis completed in 12.41 seconds
Progress: 4/6 files processed

Processing file: C:\Users\ppahil01\genai_de\data\test\itg_perenso_account_fields.sql
Analysis completed in 40.39 seconds
Progress: 5/6 files processed

Processing file: C:\Users\ppahil01\genai_de\data\test\test.sql
Analysis completed i