In [140]:


from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
# from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO

# Load key/value pairs from a local .env file (if present) so the
# os.environ lookups that follow can see them.
load_dotenv()


# Central settings lookup for the notebook. Every entry mirrors an environment
# variable of the same name, except:
#   * SNOWFLAKE_SCHEMA, which is pinned to a fixed schema, and
#   * connection_string, which is read from AZURE_STORAGE_CONNECTION_STRING.
env_vars = {
    **{
        name: os.environ.get(name)
        for name in (
            "SNOWFLAKE_USER",
            "SNOWFLAKE_PASSWORD",
            "SNOWFLAKE_ACCOUNT",
            "SNOWFLAKE_WAREHOUSE",
            "SNOWFLAKE_DATABASE",
        )
    },
    "SNOWFLAKE_SCHEMA": 'TEST3',
    **{
        name: os.environ.get(name)
        for name in (
            "AZURE_OPENAI_ENDPOINT",
            "AZURE_OPENAI_4o_DEPLOYMENT_NAME",
            "AZURE_OPENAI_API_VERSION",
            "AZURE_OPENAI_API_KEY",
        )
    },
    "connection_string": os.environ.get("AZURE_STORAGE_CONNECTION_STRING"),
}





In [141]:
# import snowflake.connector
# import pandas as pd
# from sqlalchemy import create_engine
# import os

# # Snowflake connection
# def connect_to_snowflake():
#     connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
#     engine = create_engine(connection_string)
#     return engine

In [142]:
import snowflake.connector
import pandas as pd
from openai import ChatCompletion
from sqlalchemy import create_engine

class SnowflakeAIAgent:
    """Walk every table of one Snowflake schema and ask a chat model about anomalies.

    The agent reads column metadata from INFORMATION_SCHEMA, pulls each table's
    rows, sends them to the model in fixed-size chunks and collects the model's
    answers. Every answer is also appended to ``OUTPUT_FILE``.
    """

    # Report file: truncated by run_analysis, appended to by analyze_table.
    OUTPUT_FILE = "test.txt"
    # Divider written after each model answer in the report file.
    SEPARATOR = "========================================================================================================================================"

    def __init__(self, user, password, account, warehouse, database, schema, openai_api_key):
        """Store the Snowflake connection settings and the model API key.

        Args:
            user: Snowflake login name.
            password: Snowflake password.
            account: Snowflake account identifier.
            warehouse: Warehouse to run queries on (may be empty/None).
            database: Database that contains ``schema``.
            schema: Schema whose tables are analysed.
            openai_api_key: API key used when perform_task builds its own model.
        """
        # No trailing commas: `self.user = user,` would store a 1-tuple.
        self.user = user
        self.password = password
        self.account = account
        self.warehouse = warehouse
        self.database = database
        self.schema = schema
        self.openai_api_key = openai_api_key

    def _connection_url(self):
        """Build the ``snowflake://`` SQLAlchemy URL from the instance settings.

        Raises:
            ValueError: if user, password, account, database or schema is unset.
        """
        from urllib.parse import quote_plus

        required = {
            "user": self.user,
            "password": self.password,
            "account": self.account,
            "database": self.database,
            "schema": self.schema,
        }
        missing = [name for name, value in required.items() if not value]
        if missing:
            raise ValueError(
                "Missing Snowflake connection setting(s): " + ", ".join(missing)
            )

        # Credentials may contain characters such as '@' or '/' that would
        # otherwise be parsed as URL structure, so they are percent-encoded.
        url = (
            f"snowflake://{quote_plus(str(self.user))}:{quote_plus(str(self.password))}"
            f"@{self.account}/{self.database}/{self.schema}"
        )
        if self.warehouse:
            url += f"?warehouse={quote_plus(str(self.warehouse))}"
        return url

    def connect_to_snowflake(self):
        """Return a SQLAlchemy engine for the configured Snowflake account.

        Uses the values given to the constructor rather than module globals.

        Raises:
            ValueError: if a required connection setting is unset.
        """
        return create_engine(self._connection_url())

    def fetch_metadata(self, engine):
        """Return a DataFrame of (table name, column name, data type) for ``self.schema``.

        Args:
            engine: Connection/engine accepted by ``pandas.read_sql``.
        """
        # NOTE(review): self.schema is interpolated into the SQL text; it comes
        # from notebook configuration here. Do not pass untrusted values.
        query = f"""
        SELECT table_name, COLUMN_NAME, DATA_TYPE
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{self.schema}';
        """
        return pd.read_sql(query, engine)

    def fetch_sample_data(self, table_name, engine):
        """Return all rows of ``self.schema``.``table_name`` as a DataFrame.

        Args:
            table_name: Table to read (interpolated into the SQL text as-is).
            engine: Connection/engine accepted by ``pandas.read_sql``.
        """
        query = f"SELECT * FROM {self.schema}.{table_name};"
        return pd.read_sql(query, engine)

    def generate_queries_with_llm(self, table_name, schema_details, anomalies, model,
                                  sample_data=None):
        """Ask the chat model for anomalies in one table and return its text answer.

        Args:
            table_name: Name of the table being analysed.
            schema_details: Column metadata, formatted into the prompt as-is.
            anomalies: Findings gathered so far, formatted into the prompt as-is.
            model: Chat model. ``model.invoke(prompt)`` is used when available,
                otherwise ``model(prompt)``; the result must expose ``.content``.
            sample_data: Optional text rendering of table rows. When omitted the
                prompt is exactly the original one.

        Returns:
            The ``content`` attribute of the model response.
        """
        sample_section = ""
        if sample_data is not None:
            sample_section = f"\n        Sample Data (CSV):\n{sample_data}"

        # NOTE(review): the instructions below contradict each other ("recommend
        # solutions" vs. "Dont provide any solution"). The wording is kept
        # unchanged because it steers model output; confirm the intended contract.
        prompt = f"""
        The following metadata and sample data are from Snowflake:

        Table: {table_name}
        Schema Details: {schema_details}
        anomalies: {anomalies}{sample_section}

        analize anomalies and generate efficient SQL queries to detect them. Also, recommend solutions.
        give solution in following format:
            table_name : <table name>
            solution :  <solution>
            snowflake query : < snowflake query>
        Strictly follow the format provided.
        give solution in concise way.
        Also generate SQL query which is strictly snowflake friendly to get anomalies.
        Provide snowflake query tablewise not column or anomalywise.
        
        Dont provide any solution, Just provide what are the anmalies present in specific table.
        provide Specific solution for specific anomalies.
        Only provide table name , solution and Snowflake query.
        
        Analyze the database for potential failure scenarios, including data inconsistencies, missing or 
        null critical values, referential integrity violations, duplicate records, schema issues, 
        anomalous patterns, and performance bottlenecks. Identify root causes and suggest preventive measures.
        
        Go through the data and find anomalies. dont provide any suggestion that it could be there , etc.
        """

        # Prefer the Runnable-style `invoke`; fall back to a plain call so any
        # callable that returns an object with `.content` keeps working.
        invoke = getattr(model, "invoke", None)
        response = invoke(prompt) if callable(invoke) else model(prompt)

        return response.content

    def analyze_table(self, table_name, schema_details, engine, model, chunk_size=1000):
        """Analyse one table chunk by chunk and return the model's answers.

        Each chunk of rows is rendered as CSV and sent to the model together with
        the column metadata and the answers already collected for this table.
        Every answer is appended to ``OUTPUT_FILE`` as soon as it arrives.

        Args:
            table_name: Table to analyse.
            schema_details: DataFrame slice of the metadata for this table.
            engine: Connection/engine used to read the table.
            model: Chat model passed through to generate_queries_with_llm.
            chunk_size: Rows per model call; lower it if prompts get too large.

        Returns:
            List of answer strings, one per chunk (empty for an empty table).

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        df = self.fetch_sample_data(table_name, engine)
        print(len(df))

        schema_as_dict = schema_details.to_dict()
        anomalies = []

        for start in range(0, df.shape[0], chunk_size):
            chunk = df.iloc[start:start + chunk_size]
            answer = self.generate_queries_with_llm(
                table_name,
                schema_as_dict,
                anomalies,
                model,
                sample_data=chunk.to_csv(index=False),
            )
            # Drop Markdown code fences the model may wrap its answer in.
            answer = answer.replace("```plaintext", "").replace("```", "").strip()
            # Append the answer itself (not the list) so results accumulate.
            anomalies.append(answer)
            # Re-open per chunk so progress is on disk during long runs.
            with open(self.OUTPUT_FILE, "a", encoding="utf-8") as report:
                report.write(answer + "\n" + self.SEPARATOR + "\n" + "\n")

        return anomalies

    def run_analysis(self, engine, model):
        """Analyse every table listed in the schema metadata.

        Args:
            engine: Connection/engine used for all queries.
            model: Chat model passed through to analyze_table.

        Returns:
            Dict mapping table name to the list returned by analyze_table.
        """
        metadata = self.fetch_metadata(engine)

        # Start from an empty report; the context manager closes the handle.
        with open(self.OUTPUT_FILE, "w", encoding="utf-8"):
            pass

        results = {}
        for table in metadata["table_name"].unique():
            schema_details = metadata[metadata["table_name"] == table]
            results[table] = self.analyze_table(table, schema_details, engine, model)

        return results

    def perform_task(self, engine, model=None):
        """Run the whole analysis and return the per-table results.

        Args:
            engine: Connection/engine used for all queries.
            model: Optional ready-made chat model. When omitted, an
                AzureChatOpenAI client is built from ``env_vars``, using the
                key given to the constructor if there is one.

        Returns:
            Dict mapping table name to a list of model answers.
        """
        if model is None:
            model = AzureChatOpenAI(
                azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
                azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
                openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
                openai_api_key=self.openai_api_key or env_vars.get("AZURE_OPENAI_API_KEY"),
            )

        print("AI Agent is performing tasks to analyze anomalies in Snowflake.")
        results = self.run_analysis(engine, model)

        print("Analysis complete.")
        return results


In [143]:
# Build the agent from the shared settings dict.
# env_vars has no "OPENAI_API_KEY" entry, so that lookup always produced None;
# the key stored under "AZURE_OPENAI_API_KEY" is the one this notebook loads.
agent = SnowflakeAIAgent(
    user=env_vars.get("SNOWFLAKE_USER"),
    password=env_vars.get("SNOWFLAKE_PASSWORD"),
    account=env_vars.get("SNOWFLAKE_ACCOUNT"),
    warehouse=env_vars.get("SNOWFLAKE_WAREHOUSE"),
    database=env_vars.get("SNOWFLAKE_DATABASE"),
    schema=env_vars.get("SNOWFLAKE_SCHEMA"),
    openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
)

In [144]:
# Create the SQLAlchemy engine that all Snowflake queries below go through.
engine = agent.connect_to_snowflake()


In [145]:
# Azure OpenAI chat client configured entirely from the shared settings dict.
model = AzureChatOpenAI(
    azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
    openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
    openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
)
        

In [None]:
# Run the full analysis over every table in the schema. Model responses are
# returned per table and are also written to test.txt.
agent.perform_task(engine)

AI Agent is performing tasks to analyze anomalies in Snowflake.
66612
