In [83]:


from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
# from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO

# Load variables from a local .env file (when one exists) so the os.environ lookups below can read them.
load_dotenv()


# Names of the settings this notebook reads, in the order they appear in env_vars.
_ENV_KEYS = (
    "SNOWFLAKE_USER",
    "SNOWFLAKE_PASSWORD",
    "SNOWFLAKE_ACCOUNT",
    "SNOWFLAKE_WAREHOUSE",
    "SNOWFLAKE_DATABASE",
    "SNOWFLAKE_SCHEMA",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_4o_DEPLOYMENT_NAME",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_API_KEY",
)

# Snapshot of the configuration; a variable that is not set is stored as None.
env_vars = {setting: os.environ.get(setting) for setting in _ENV_KEYS}
# The schema is pinned to "TEST" and is never taken from the environment.
env_vars["SNOWFLAKE_SCHEMA"] = "TEST"

# conn = snowflake.connector.connect(
#         user=env_vars.get("SNOWFLAKE_USER"),
#         password=env_vars.get("SNOWFLAKE_PASSWORD"),
#         account=env_vars.get("SNOWFLAKE_ACCOUNT"),
#         warehouse=env_vars.get("SNOWFLAKE_WAREHOUSE"),
#         database=env_vars.get("SNOWFLAKE_DATABASE"),
#         schema=env_vars.get("SNOWFLAKE_SCHEMA"),
#     )

# model = AzureChatOpenAI(
#         azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
#         azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
#         openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
#         openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
#     )





# cursor = conn.cursor()
# azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

# adls_client = DataLakeServiceClient.from_connection_string(azure_storage_connection_string)

# cursor.execute("""
#         SELECT table_name 
#         FROM information_schema.tables
#         WHERE table_schema = 'TEST' AND table_type = 'BASE TABLE'
#     """)

# tables =  cursor.fetchall()
# print(metadata)



In [84]:
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import os

# Snowflake connection
def connect_to_snowflake():
    """Create a SQLAlchemy engine for the Snowflake target described by ``env_vars``.

    The user name and password are percent-encoded before they are placed in
    the URL, so credentials containing characters such as ``@``, ``:`` or ``/``
    do not corrupt the connection string. When ``SNOWFLAKE_WAREHOUSE`` is set
    it is appended as the ``warehouse`` query parameter.

    Returns:
        The object returned by ``create_engine`` for the assembled URL.

    Raises:
        ValueError: If the user, password, account, database or schema setting
            is missing or empty (previously the text "None" was embedded in
            the URL instead).
    """
    # Local import keeps this cell self-contained; quote(..., safe="") escapes
    # every reserved character, including "/" and "+".
    from urllib.parse import quote

    required = (
        "SNOWFLAKE_USER",
        "SNOWFLAKE_PASSWORD",
        "SNOWFLAKE_ACCOUNT",
        "SNOWFLAKE_DATABASE",
        "SNOWFLAKE_SCHEMA",
    )
    missing = [name for name in required if not env_vars.get(name)]
    if missing:
        raise ValueError("Missing Snowflake settings: " + ", ".join(missing))

    user = quote(env_vars["SNOWFLAKE_USER"], safe="")
    password = quote(env_vars["SNOWFLAKE_PASSWORD"], safe="")
    connection_string = (
        f"snowflake://{user}:{password}"
        f"@{env_vars['SNOWFLAKE_ACCOUNT']}"
        f"/{env_vars['SNOWFLAKE_DATABASE']}/{env_vars['SNOWFLAKE_SCHEMA']}"
    )

    warehouse = env_vars.get("SNOWFLAKE_WAREHOUSE")
    if warehouse:
        connection_string += f"?warehouse={quote(warehouse, safe='')}"

    return create_engine(connection_string)

# Fetch metadata
def get_table_metadata(engine, database, schema, table):
    """Read the INFORMATION_SCHEMA.COLUMNS rows describing a single table.

    Args:
        engine: Connection/engine object handed to ``pd.read_sql``.
        database: Database whose INFORMATION_SCHEMA is queried.
        schema: Value matched against TABLE_SCHEMA.
        table: Value matched against TABLE_NAME.

    Returns:
        Whatever ``pd.read_sql`` returns for the column-metadata query.
    """
    # schema and table are interpolated as SQL string literals, database as an identifier.
    columns_sql = f"""
        SELECT 
            TABLE_NAME, column_name, DATA_TYPE, IS_NULLABLE ,CHARACTER_MAXIMUM_LENGTH
        FROM {database}.INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{schema}' and TABLE_NAME = '{table}'
    """
    column_frame = pd.read_sql(columns_sql, engine)
    return column_frame

In [85]:
def analyze_metadata(metadata_df):
    """Flag columns that are declared NOT NULL yet hold free-form text.

    Args:
        metadata_df: Frame with ``table_name``, ``column_name``, ``data_type``
            and ``is_nullable`` columns, one row per database column.

    Returns:
        A list with one warning string per row whose ``is_nullable`` is "NO"
        and whose ``data_type`` (case-insensitive) is "varchar" or "text".
    """
    free_text_types = ("varchar", "text")
    return [
        f"Column {rec['column_name']} in table {rec['table_name']} is non-nullable but allows free text, which might cause issues."
        for _, rec in metadata_df.iterrows()
        if rec["is_nullable"] == "NO" and rec["data_type"].lower() in free_text_types
    ]

In [86]:
# import pandas as pd

# data = {
#     "Name": ["Alice", "Bob", "Charlie"],
#     "Age": [25, 30, 35],
#     "Height": [5.5, 6.0, 5.8],
#     "City": ["NY", "LA", "SF"],
#     "Score": [90.5, 88.0, 85.5],
# }

# df = pd.DataFrame(data)
# print(df)

# numeric_columns = []
# text_columns = []

# # Iterate through the columns to check if they are numeric or non-numeric
# for column in df.columns:
#     if pd.to_numeric(df[column], errors='coerce').notna().all():  # Check if all values can be converted to numeric
#         numeric_columns.append(column)
#     else:
#         text_columns.append(column)
# # Select numeric data

# numeric_data = df.select_dtypes(include=["number"])
# numeric_data1 = df[numeric_columns]
# print(numeric_data)
# print(numeric_data1)


In [87]:
from sklearn.ensemble import IsolationForest

# Null value check
def check_nulls(df, table_name, metadata):
    """Compare the nulls present in ``df`` with the nullability declared in ``metadata``.

    Args:
        df: Data of the table being inspected.
        table_name: Name used to select the matching rows of ``metadata``.
        metadata: Frame with ``table_name``, ``column_name`` and ``is_nullable``
            columns.

    Returns:
        A list of messages: one for every column of ``df`` without a metadata
        row, and one for every column declared "NO" for ``is_nullable`` that
        nevertheless contains nulls.
    """
    findings = []
    for col_name in df.columns:
        # Metadata rows for exactly this table/column pair.
        row_mask = (metadata["table_name"] == table_name) & (metadata["column_name"] == col_name)
        matches = metadata[row_mask]

        if matches.empty:
            findings.append(f"No metadata found for column {col_name} in table {table_name}.")
            continue

        # Only the first matching row is consulted.
        declared_not_null = matches["is_nullable"].iloc[0] == "NO"
        if declared_not_null and df[col_name].isnull().any():
            findings.append(f"Column {col_name} in {table_name} has null values but is non-nullable.")

    return findings

# Anomaly detection
def detect_anomalies(df, table_name="Unnamed Table"):
    """Run an Isolation Forest over the numeric columns of ``df`` and describe the outliers.

    A column is treated as numeric when it is not a datetime column and every
    one of its values can be parsed by ``pd.to_numeric``; a single null or
    unparseable value excludes the column. The selected columns are converted
    to a float matrix before fitting, so numeric values stored as strings or
    other objects reach the estimator as real numbers rather than as an
    object array.

    Args:
        df: Rows to inspect.
        table_name: Label used in the returned message.

    Returns:
        A message string: either that no numeric data is available, that no
        anomalies were detected, or the anomaly count followed by the flagged
        rows of ``df`` rendered with ``to_string(index=False)``.
    """
    numeric_columns = [
        column
        for column in df.columns
        if not pd.api.types.is_datetime64_any_dtype(df[column])
        and pd.to_numeric(df[column], errors="coerce").notna().all()
    ]

    numeric_data = df[numeric_columns]
    # True when there are no qualifying columns or no rows at all.
    if numeric_data.empty:
        return f"No numeric data available for anomaly detection in table '{table_name}'."

    # Every value was verified as parseable above, so the conversion cannot
    # introduce NaN. A plain array (no column names) is passed to the model.
    numeric_array = numeric_data.apply(pd.to_numeric, errors="coerce").to_numpy(dtype=float)

    # Fixed random_state keeps the flagged rows reproducible between runs.
    model = IsolationForest(
        contamination=0.01,
        max_features=0.5,
        max_samples=0.5,
        n_estimators=50,
        random_state=42,
    )

    # fit_predict labels outliers with -1.
    anomalies = model.fit_predict(numeric_array)

    anomaly_indices = numeric_data.index[anomalies == -1]
    anomaly_rows = df.loc[anomaly_indices]
    anomaly_count = len(anomaly_indices)

    if anomaly_count > 0:
        rows_str = anomaly_rows.to_string(index=False)
        return (
            f"Detected {anomaly_count} anomalies in the dataset of table '{table_name}'.\n"
            f"Anomalous rows:\n{rows_str}"
        )
    return f"No anomalies detected in table '{table_name}'."





In [88]:
from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
from io import StringIO

# Load the .env file again; env_vars was built in an earlier cell and is not
# rebuilt by this call.
load_dotenv()

# Keyword arguments for the shared Azure OpenAI chat client, read from env_vars.
_azure_chat_kwargs = {
    "azure_endpoint": env_vars.get("AZURE_OPENAI_ENDPOINT"),
    "azure_deployment": env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
    "openai_api_version": env_vars.get("AZURE_OPENAI_API_VERSION"),
    "openai_api_key": env_vars.get("AZURE_OPENAI_API_KEY"),
}

# Module-level chat model used by generate_insights.
model = AzureChatOpenAI(**_azure_chat_kwargs)


In [89]:
import openai

# Generate insights using GPT
def generate_insights(prompt):
    """Send ``prompt`` to the module-level chat ``model`` and return the reply text.

    Uses ``model.invoke`` rather than calling the model object directly:
    calling a LangChain chat model as a function is deprecated and is typed
    for a list of messages, whereas ``invoke`` accepts a plain prompt string.

    Args:
        prompt: Prompt text to send.

    Returns:
        The ``content`` attribute of the message returned by the model.
    """
    response = model.invoke(prompt)
    return response.content

# Example prompt
def create_prompt_for_issues(issues):
    """Build the prompt that asks the model for per-table fixes and a Snowflake SQL query.

    Args:
        issues: Description of the detected problems; it is interpolated into
            the template as-is by the f-string.

    Returns:
        The prompt string. The indentation and blank lines of the template are
        part of the literal, so they are sent to the model exactly as written.
    """
    prompt = f"""The following issues were detected in the database:\n\n{issues}\n.
                Give specific solution based on the anomalies.
                Dont add any extra line other than solution to the anomaly.
                Give tablewise solution.
                dont mix up solution for different tables.
                Ensure the format intact for every table same.
                Provide specific issue with wrong values.
                
                give solution in concise way.
                Also generate SQL query which is strictly snowflake friendly to get anomalies. 
                
                Sample output:
            
                table_name : <table name>
                solution : solution for issues provided.

                SQL Query:
                <sql query>
                

                


                """
    return prompt


In [90]:
def create_prompt_for_symantic_meaning(data,schema_details,table_name):
    """Build the prompt asking the model to check that column contents match their names.

    Args:
        data: Sample records; interpolated into the prompt through the
            f-string, i.e. using the object's string representation.
        schema_details: Column metadata; interpolated the same way.
        table_name: Accepted but not used anywhere in the template.

    Returns:
        The prompt string, including the template's own indentation and blank
        lines.
    """
    # NOTE(review): if ``data`` is a large DataFrame, its string form may be
    # abbreviated by pandas — confirm the model sees the rows it is asked to scan.
    prompt = f""" 
                Given a column name, please provide a description of its likely semantic meaning, including what type of data it represents and its expected data type or format. Your response should focus on:
                
                Sample data: 
                {data}
                
                metadata: 
                {schema_details}

                1. Scan throught the records of each column to check if the data it holds aligns with its semantic meaning of its column name.
                2. Highlight errors ONLY IF the semantic meaning does not align with the column name.
                3. Skip the columns where the semantic meaning and the data it holds is valid.
                4. DONT SKIP text columns in the table.
                5. ONLY provide column names and its issues.
                6. Go through all the columns.
                7. Ensure the format intact .
                8. Please provide details of columns which has issues.
                Sample output:
                Issue: <issue>

                Please provide concise output


                """
    return prompt

In [91]:
from langchain.sql_database import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain

# Create a LangChain SQL agent
def create_sql_agent(engine):
    """Create a natural-language-to-SQL chain bound to ``engine``.

    The chain is driven by the module-level chat ``model``. The previous
    version passed the string "gpt-4" as the LLM and used the keyword
    ``database=``; ``SQLDatabaseChain.from_llm`` expects an LLM object and
    names its database parameter ``db``.

    Args:
        engine: SQLAlchemy engine wrapped in a ``SQLDatabase``.

    Returns:
        The chain returned by ``SQLDatabaseChain.from_llm``.
    """
    db = SQLDatabase(engine)
    return SQLDatabaseChain.from_llm(llm=model, db=db)

# Execute natural language query
def execute_query_with_agent(agent, query):
    """Pass ``query`` to ``agent.run`` and hand back its result unchanged.

    Args:
        agent: Object exposing a ``run`` method.
        query: Natural-language question forwarded to the agent.

    Returns:
        Whatever ``agent.run(query)`` returns.
    """
    answer = agent.run(query)
    return answer


In [92]:
def generate_report(metadata_issues, null_issues, anomaly_issues):
    """Assemble the plain-text data-quality report.

    Args:
        metadata_issues: Iterable of strings, one per line of the metadata section.
        null_issues: Iterable of strings, one per line of the null-value section.
        anomaly_issues: Any object; it is formatted into the final line as-is.

    Returns:
        The report as a single string ending with a newline.
    """
    metadata_section = "\n".join(metadata_issues)
    null_section = "\n".join(null_issues)
    return (
        "Data Quality Analysis Report\n\n"
        f"Metadata Issues:\n{metadata_section}\n\n"
        f"Null Value Issues:\n{null_section}\n\n"
        f"Anomaly Detection: {anomaly_issues}\n"
    )


In [93]:
def get_table_names(engine, database, schema):
    """Read the INFORMATION_SCHEMA.COLUMNS rows for every table of a schema.

    Despite the name, the result has one row per column (with its table name),
    not one row per table.

    Args:
        engine: Connection/engine object handed to ``pd.read_sql``.
        database: Database whose INFORMATION_SCHEMA is queried.
        schema: Value matched against TABLE_SCHEMA.

    Returns:
        Whatever ``pd.read_sql`` returns for the query.
    """
    schema_columns_sql = f"""
        SELECT 
            TABLE_NAME, column_name, DATA_TYPE, IS_NULLABLE ,CHARACTER_MAXIMUM_LENGTH
        FROM {database}.INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{schema}'
    """
    schema_columns = pd.read_sql(schema_columns_sql, engine)
    return schema_columns

In [94]:
def fetch_metadata(engine, schema):
    """Read name, data type and maximum length of every column in ``schema``.

    The query targets the unqualified INFORMATION_SCHEMA, so the database is
    whichever one the connection resolves it against.

    Args:
        engine: Connection/engine object handed to ``pd.read_sql``.
        schema: Value matched against TABLE_SCHEMA.

    Returns:
        Whatever ``pd.read_sql`` returns for the query.
    """
    column_types_sql = f"""
        SELECT table_name, COLUMN_NAME, DATA_TYPE,CHARACTER_MAXIMUM_LENGTH
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{schema}';
        """
    column_types = pd.read_sql(column_types_sql, engine)
    return column_types

In [95]:
def main(output_path="test3_copy.txt", chunk_size=5000):
    """Scan every table of the configured schema and write the model's findings to a file.

    For each table the full contents are loaded and split into chunks of
    ``chunk_size`` rows. Each chunk goes through ``detect_anomalies``; the
    model is then asked (a) whether the column contents match their names and
    (b) for a fix plus SQL for the anomaly result. Both replies are written as
    one record followed by a separator line.

    The output file is opened once, as UTF-8, and closed when the function
    exits. Previously it was reopened in append mode for every chunk and never
    closed explicitly. Each record is flushed so progress stays visible while
    the run is in flight.

    Args:
        output_path: File that receives the report; truncated on start.
        chunk_size: Maximum number of rows per chunk.
    """
    database = env_vars.get("SNOWFLAKE_DATABASE")
    schema = env_vars.get("SNOWFLAKE_SCHEMA")

    engine = connect_to_snowflake()
    metadata = get_table_names(engine, database=database, schema=schema)
    schema_metadata = fetch_metadata(engine, schema)

    separator = "========================================================================================================================================"

    with open(output_path, "w", encoding="utf-8") as report_file:
        for table in metadata["table_name"].unique():
            df = pd.read_sql(f"SELECT * FROM {database}.{schema}.{table}", engine)
            schema_details = schema_metadata[schema_metadata["table_name"] == table]

            for start in range(0, df.shape[0], chunk_size):
                chunk = df[start:start + chunk_size]

                anomaly_result = detect_anomalies(chunk, table_name=table)

                # NOTE(review): the chunk is formatted into the prompt via its string
                # form, which pandas may abbreviate — confirm the model sees enough rows.
                semantic_prompt = create_prompt_for_symantic_meaning(chunk, schema_details, table_name=table)
                semantic_issues = _strip_code_fences(generate_insights(semantic_prompt))

                # The model is queried even when no anomaly was detected, as before.
                gpt_prompt = create_prompt_for_issues(anomaly_result)
                gpt_response = _strip_code_fences(generate_insights(gpt_prompt))

                report_file.write(gpt_response + "\n" + semantic_issues + "\n" + separator + "\n" + "\n")
                report_file.flush()


def _strip_code_fences(text):
    """Remove Markdown code-fence markers from a model reply and trim surrounding whitespace."""
    return text.replace("```plaintext", "").replace("```", "").strip()

        
        
    

# Run the pipeline when this code executes as the top-level program (which
# includes running the cell in a notebook), but not when it is imported.
if __name__ == "__main__":
    main()


  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)


In [96]:
# def connect_to_snowflake():
#     connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
#     engine = create_engine(connection_string)
#     return engine

# engine = connect_to_snowflake()

# query = f"SELECT * FROM RAW.TEST3.SALES_DATA"
# df = pd.read_sql(query, engine)
# df

In [97]:
# def summarize_chunk(chunk):
#     # Generate a concise summary of the chunk (e.g., descriptive statistics)
#     summary = chunk.describe(include="all").to_string()
#     return summary

# # Chunking the data
# chunk_size = 40000  # Adjust based on your dataset size and requirements
# chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]

# summaries = [summarize_chunk(chunk) for chunk in chunks]


In [98]:
# def generate_insights(summary, prompt_template):
#     # Construct the prompt
#     prompt = f"{prompt_template}\n\nSummary of the data:\n{summary}"
    
#     # Get insights from the AI model
#     model = AzureChatOpenAI(
#         azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
#         azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
#         openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
#         openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
#     )
#     response = model(prompt)
#     return response.content

# # Define a general prompt template
# prompt_template = "Based on the following data summary, provide key insights, trends, and any notable patterns."

# # Generate insights for each chunk
# insights = [generate_insights(summary, prompt_template) for summary in summaries]


In [99]:
# # Combine all insights
# full_insight_report = "\n\n".join(insights)

# # Save the report to a file
# with open("insights_report.txt", "w") as file:
#     file.write(full_insight_report)

# print("Insight generation complete. Check insights_report.txt for details.")
