In [132]:


from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
# from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO

load_dotenv()


# Snapshot of the settings this notebook reads from the process environment.
# Every entry is keyed by its environment-variable name, except the Azure
# storage connection string, which is exposed under "connection_string".
# Unset variables map to None.
env_vars = {
    name: os.getenv(name)
    for name in (
        "SNOWFLAKE_USER",
        "SNOWFLAKE_PASSWORD",
        "SNOWFLAKE_ACCOUNT",
        "SNOWFLAKE_WAREHOUSE",
        "SNOWFLAKE_DATABASE",
        "SNOWFLAKE_SCHEMA",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_4o_DEPLOYMENT_NAME",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_API_KEY",
    )
}
env_vars["connection_string"] = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

# conn = snowflake.connector.connect(
#         user=env_vars.get("SNOWFLAKE_USER"),
#         password=env_vars.get("SNOWFLAKE_PASSWORD"),
#         account=env_vars.get("SNOWFLAKE_ACCOUNT"),
#         warehouse=env_vars.get("SNOWFLAKE_WAREHOUSE"),
#         database=env_vars.get("SNOWFLAKE_DATABASE"),
#         schema=env_vars.get("SNOWFLAKE_SCHEMA"),
#     )

# model = AzureChatOpenAI(
#         azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
#         azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
#         openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
#         openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
#     )





# cursor = conn.cursor()
# azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

# adls_client = DataLakeServiceClient.from_connection_string(azure_storage_connection_string)

# cursor.execute("""
#         SELECT table_name 
#         FROM information_schema.tables
#         WHERE table_schema = 'TEST' AND table_type = 'BASE TABLE'
#     """)

# tables =  cursor.fetchall()
# print(metadata)



In [133]:
import snowflake.connector
import pandas as pd
from openai import ChatCompletion
from sqlalchemy import create_engine

class SnowflakeAIAgent:
    """Ask an Azure OpenAI chat model to spot anomalies in the tables of a Snowflake schema.

    For every table listed in ``INFORMATION_SCHEMA.COLUMNS`` for ``schema`` the
    agent pulls the column metadata and a small row sample, sends both to the
    chat model and collects the model's answer (solution text plus a Snowflake
    query).  All database access goes through the SQLAlchemy engine/connection
    that the caller passes to each method.

    Note: the chat model is configured from the notebook-level ``env_vars``
    mapping; ``openai_api_key`` is stored on the instance but not read by any
    method of this class.
    """

    def __init__(self, user, password, account, warehouse, database, schema, openai_api_key):
        """Store the connection settings on the instance.

        Args:
            user: Snowflake user name.
            password: Snowflake password.
            account: Snowflake account identifier.
            warehouse: Snowflake warehouse name.
            database: Snowflake database name.
            schema: Schema whose tables are analysed.
            openai_api_key: API key kept on the instance (currently unused).
        """
        # No trailing commas here: `self.user = user,` would store a 1-tuple.
        self.user = user
        self.password = password
        self.account = account
        self.warehouse = warehouse
        self.database = database
        self.schema = schema
        self.openai_api_key = openai_api_key
        # Chat model is created on first use and then reused for every table.
        self._model = None

    def fetch_metadata(self, engine):
        """Return one row per column (table_name, COLUMN_NAME, DATA_TYPE) of ``self.schema``.

        Args:
            engine: Connectable accepted by ``pandas.read_sql``.

        Returns:
            pandas.DataFrame with the query result.
        """
        # NOTE: self.schema is interpolated into the SQL text as-is, so it must
        # come from trusted configuration, never from end-user input.
        query = f"""
        SELECT table_name, COLUMN_NAME, DATA_TYPE
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{self.schema}';
        """
        return pd.read_sql(query, engine)

    def fetch_sample_data(self, table_name, engine):
        """Return a random sample of up to 10 rows from ``self.schema``.``table_name``.

        Args:
            table_name: Unqualified table name (interpolated into the SQL text as-is).
            engine: Connectable accepted by ``pandas.read_sql``.

        Returns:
            pandas.DataFrame with the sampled rows.
        """
        # In Snowflake `SAMPLE (10)` means "10 percent of the rows"; the fixed-size
        # form needs the ROWS keyword.  Ten rows is what the prompt is built from.
        query = f"SELECT * FROM {self.schema}.{table_name} SAMPLE (10 ROWS);"
        return pd.read_sql(query, engine)

    def _get_model(self):
        """Return the AzureChatOpenAI client, building it from ``env_vars`` on first use."""
        # getattr keeps this working for instances created without __init__.
        model = getattr(self, "_model", None)
        if model is None:
            model = AzureChatOpenAI(
                azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
                azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
                openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
                openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
            )
            self._model = model
        return model

    def generate_queries_with_llm(self, table_name, schema_details, sample_data):
        """Send the table's metadata and sample rows to the chat model.

        Args:
            table_name: Name of the table being analysed.
            schema_details: Column metadata, formatted into the prompt via ``str()``.
            sample_data: Sample rows, formatted into the prompt via ``str()``.

        Returns:
            The ``content`` of the model's reply.
        """
        prompt = f"""
        The following metadata and sample data are from Snowflake:

        Table: {table_name}
        Schema Details: {schema_details}
        Sample Data: {sample_data}

        Identify anomalies and generate efficient SQL queries to detect them. Also, recommend solutions.
        give solution in following format:
            table_name : <table name>
            solution :  <solution>
            snowflake query : < snowflake query>
        Strictly follow the format provided.
        give solution in concise way.
        Also generate SQL query which is strictly snowflake friendly to get anomalies.
        Provide snowflake query tablewise not column or anomalywise.
        
        Only provide table name , solution and Snowflake query.
        """

        # `invoke` is the supported entry point; calling the model object directly
        # is the deprecated form.
        response = self._get_model().invoke(prompt)
        return response.content

    def analyze_table(self, table_name, schema_details, engine):
        """Sample ``table_name`` and ask the model to analyse it.

        Args:
            table_name: Name of the table to analyse.
            schema_details: DataFrame slice of the metadata for this table.
            engine: Connectable accepted by ``pandas.read_sql``.

        Returns:
            The model's reply text for this table.
        """
        sample_data = self.fetch_sample_data(table_name, engine)
        return self.generate_queries_with_llm(
            table_name,
            schema_details.to_dict(),
            # head(10) caps what goes into the prompt regardless of what the query returned.
            sample_data.head(10).to_dict()
        )

    def run_analysis(self, engine):
        """Analyse every table of the schema and write the replies to ``hst001.txt``.

        The report file in the current working directory is overwritten on each
        run; each table's reply is followed by a separator line.

        Args:
            engine: Connectable accepted by ``pandas.read_sql``.

        Returns:
            dict mapping table name to the cleaned-up model reply.
        """
        results = {}
        # A single handle for the whole run: the `with` block guarantees it is
        # closed (and flushed) even if one of the tables raises.
        with open("hst001.txt", "w", encoding="utf-8") as hs:
            metadata = self.fetch_metadata(engine)
            tables = metadata["table_name"].unique()

            for table in tables:
                schema_details = metadata[metadata["table_name"] == table]
                # Drop the Markdown code fences the model tends to wrap its answer in.
                op = self.analyze_table(table, schema_details, engine).replace("```plaintext", "").replace("```", "").strip()
                hs.write(op + "\n" + "========================================================================================================================================" + "\n"+"\n")
                results[table] = op
        return results

    def perform_task(self, engine):
        """Run the complete analysis, printing a start and an end message.

        Args:
            engine: Connectable accepted by ``pandas.read_sql``.

        Returns:
            dict mapping table name to the cleaned-up model reply.
        """
        print("AI Agent is performing tasks to analyze anomalies in Snowflake.")
        # run_analysis fetches the metadata itself; no separate query is needed here.
        results = self.run_analysis(engine)

        print("Analysis complete.")
        return results


In [134]:
# Build the agent from the settings collected in env_vars.
# env_vars has no "OPENAI_API_KEY" entry (that lookup always yielded None);
# the key it does define is "AZURE_OPENAI_API_KEY".
agent = SnowflakeAIAgent(
    user=env_vars.get("SNOWFLAKE_USER"),
    password=env_vars.get("SNOWFLAKE_PASSWORD"),
    account=env_vars.get("SNOWFLAKE_ACCOUNT"),
    warehouse=env_vars.get("SNOWFLAKE_WAREHOUSE"),
    database=env_vars.get("SNOWFLAKE_DATABASE"),
    schema=env_vars.get("SNOWFLAKE_SCHEMA"),
    openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY")
)

In [135]:
# engine = agent.connect_to_snowflake()


In [136]:
def connect_to_snowflake():
    """Create a SQLAlchemy engine for the Snowflake target described by ``env_vars``.

    The URL has the form ``snowflake://<user>:<password>@<account>/<database>/<schema>``.
    User name and password are percent-encoded so that characters such as
    ``@``, ``:`` or ``/`` cannot corrupt the URL.

    Returns:
        The engine returned by ``create_engine``.

    Raises:
        ValueError: If any of the required Snowflake settings is missing or empty.
    """
    from urllib.parse import quote

    required = (
        "SNOWFLAKE_USER",
        "SNOWFLAKE_PASSWORD",
        "SNOWFLAKE_ACCOUNT",
        "SNOWFLAKE_DATABASE",
        "SNOWFLAKE_SCHEMA",
    )
    # Fail early: an unset value would otherwise end up as the text "None" in the URL.
    missing = [name for name in required if not env_vars.get(name)]
    if missing:
        raise ValueError("Missing Snowflake settings: " + ", ".join(missing))

    # safe="" also encodes "/"; spaces become %20, which decodes correctly
    # whether the URL parser uses unquote or unquote_plus.
    user = quote(env_vars["SNOWFLAKE_USER"], safe="")
    password = quote(env_vars["SNOWFLAKE_PASSWORD"], safe="")
    connection_string = f"snowflake://{user}:{password}@{env_vars['SNOWFLAKE_ACCOUNT']}/{env_vars['SNOWFLAKE_DATABASE']}/{env_vars['SNOWFLAKE_SCHEMA']}"
    engine = create_engine(connection_string)
    return engine

# Notebook-level engine; it is handed to the agent explicitly when the analysis is run.
engine = connect_to_snowflake()

In [137]:
# Runs the analysis for every table; the returned dict (table name -> model reply)
# is shown as the cell output, and the same text is written to hst001.txt.
agent.perform_task(engine)

AI Agent is performing tasks to analyze anomalies in Snowflake.
Analysis complete.


{'DAILYSALES': 'table_name: DAILYSALES\nsolution: The sample data provided for the DAILYSALES table is entirely empty, which suggests a potential issue with data extraction or loading. Ensure data is correctly extracted from the source and loaded into Snowflake. Check for any ETL pipeline issues and verify source connectivity and data mapping.\nsnowflake query: \nSELECT \n    COUNT(*) AS total_rows,\n    COUNT(DISTINCT salesmancode) AS distinct_salesmancode,\n    COUNT(DISTINCT salinvno) AS distinct_salinvno,\n    COUNT(DISTINCT prdcode) AS distinct_prdcode,\n    COUNT(DISTINCT rtrcode) AS distinct_rtrcode,\n    COUNT(DISTINCT deliveryroutecode) AS distinct_deliveryroutecode\nFROM \n    DAILYSALES;',
 'SDL_CSL_DAILYSALES': "sql\ntable_name: SDL_CSL_DAILYSALES\nsolution: Identify and fix missing or inconsistent data entries, such as null or anomalous values, using appropriate data cleaning techniques. Ensure proper constraints and data validation rules are applied to prevent future anom