In [1]:
from langchain_openai import AzureChatOpenAI
import snowflake.connector
import pandas as pd
from dotenv import load_dotenv
import os
import time
from datetime import datetime
from sqlalchemy import create_engine
import json
from io import StringIO
import warnings
warnings.filterwarnings("ignore")
load_dotenv(override=True)

True

In [2]:
required_vars = {
    "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
    "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
    "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
    "SNOWFLAKE_USER": os.environ.get("SNOWFLAKE_USER"),
    "SNOWFLAKE_PASSWORD": os.environ.get("SNOWFLAKE_PASSWORD"),
    "SNOWFLAKE_ACCOUNT": os.environ.get("SNOWFLAKE_ACCOUNT"),
    "SNOWFLAKE_WAREHOUSE": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "SNOWFLAKE_DATABASE": os.environ.get("SNOWFLAKE_DATABASE"),
    "SNOWFLAKE_SCHEMA": os.environ.get("SNOWFLAKE_SCHEMA")
}

missing_vars = [key for key, value in required_vars.items() if value is None]
if missing_vars:
    raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
print("Environment variables loaded successfully")

Environment variables loaded successfully


In [3]:
connection_string = (
    f"snowflake://{required_vars['SNOWFLAKE_USER']}:"
    f"{required_vars['SNOWFLAKE_PASSWORD']}@"
    f"{required_vars['SNOWFLAKE_ACCOUNT']}/"
    f"{required_vars['SNOWFLAKE_DATABASE']}/"
    f"{required_vars['SNOWFLAKE_SCHEMA']}?warehouse="
    f"{required_vars['SNOWFLAKE_WAREHOUSE']}"
)

engine = create_engine(connection_string)
print("Connected to Snowflake")

Connected to Snowflake


In [4]:
query = f"""
    SELECT 
        c.TABLE_NAME, c.COLUMN_NAME, c.DATA_TYPE, c.IS_NULLABLE, c.CHARACTER_MAXIMUM_LENGTH
    FROM {required_vars['SNOWFLAKE_DATABASE']}.INFORMATION_SCHEMA.COLUMNS c
    JOIN {required_vars['SNOWFLAKE_DATABASE']}.INFORMATION_SCHEMA.TABLES t 
        ON c.TABLE_NAME = t.TABLE_NAME
    WHERE t.TABLE_TYPE = 'BASE TABLE' 
    AND c.TABLE_SCHEMA = '{required_vars['SNOWFLAKE_SCHEMA']}'
"""
metadata = pd.read_sql(query, engine)
metadata.columns = [col.lower() for col in metadata.columns]

print("\nAvailable tables:", metadata['table_name'].unique())

AttributeError: 'Engine' object has no attribute 'cursor'

In [19]:
for i in metadata['table_name'].unique():
    print(i)

SALES_DATA
PATIENT_ADMISSIONS
ADDRESSES


In [None]:
table_name = metadata['table_name'].unique()[0]  # Get first table only

print(f"\nRetrieving data from table: {table_name}")

query = f"SELECT * FROM {required_vars['SNOWFLAKE_DATABASE']}.{required_vars['SNOWFLAKE_SCHEMA']}.{table_name}"
df = pd.read_sql(query, engine)
print(f"Retrieved {len(df)} rows")



Retrieving data from table: SALES_DATA
Retrieved 66612 rows


In [21]:
# # if you want to load all tables >>
# for i in metadata['table_name'].unique():
#     print(i)
#     print(f"\nRetrieving data from table: {table_name}")

#     query = f"SELECT * FROM {required_vars['SNOWFLAKE_DATABASE']}.{required_vars['SNOWFLAKE_SCHEMA']}.{table_name}"
#     df = pd.read_sql(query, engine)
#     print(f"Retrieved {len(df)} rows")

In [23]:
sample_size = min(int(len(df) * 0.05), 1000)
sampled_df = df.sample(n=sample_size, random_state=42)
print(f"\nSampled {len(sampled_df)} rows from original {len(df)} rows")
print("\nSample of data:")
print(sampled_df.head())


Sampled 1000 rows from original 66612 rows

Sample of data:
      sls_doc_typ billing_type  cust_no  fisc_yr  fisc_mo     cal_day  \
57924        ZORT         ZF2T   107399     2024        7  2024-07-23   
40124        ZF2K         ZF2K   140722     2024        7  2024-07-19   
44867        ZORH         ZF2H   100002     2024        7  2024-07-22   
23578        ZF2K         ZF2K   125816     2024        7  2024-07-03   
51710        ZORH         ZF2H   110256     2024        7  2024-07-24   

       fisc_wk_num  sls_ofc_cv_cd            sls_ofc_cv sls_grp_cv_cd  ...  \
57924            4           1240                 Hyper           T42  ...   
40124            3           3220  Neighborhood Channel           K50  ...   
44867            3           1150                CHAINS           H52  ...   
23578            1           3210          Modern Trade           K12  ...   
51710            4           1160         GENERAL TRADE           H63  ...   

           country             

In [24]:
model = AzureChatOpenAI(
    azure_endpoint=required_vars["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=required_vars["AZURE_OPENAI_4o_DEPLOYMENT_NAME"],
    openai_api_version=required_vars["AZURE_OPENAI_API_VERSION"],
    openai_api_key=required_vars["AZURE_OPENAI_API_KEY"],
)

In [25]:
data_str = StringIO()
sampled_df.head(100).to_csv(data_str, index=False)
data_str = data_str.getvalue()

In [26]:
table_metadata = metadata[metadata['table_name'] == table_name]

In [None]:
data_quality_prompt = f"""
Analyze this data sample ({len(sampled_df)} total rows, showing first 100):

Table Data:
{data_str}

Table Metadata:
{table_metadata[['column_name', 'data_type']].to_string()}

Please provide a comprehensive analysis focusing on:
1. Data quality issues
2. Pattern anomalies
3. Potential sensitive data fields
4. Suggested improvements

Format your response as JSON with these keys:
- data_quality_issues
- recommended_solutions
- sql_queries
- sensitive_data_recommendations
"""

system_prompt_quality = """You are a specialized data analyst expert in Snowflake databases.
Analyze the provided data sample focusing on data quality and patterns.
Keep responses focused and brief. Ensure JSON format."""

messages_quality = [
    {"role": "system", "content": system_prompt_quality},
    {"role": "user", "content": data_quality_prompt}
]

print("Generating data quality insights...")
quality_response = model.invoke(messages_quality)
print("\nData quality insights generated")
print(quality_response)

Generating data quality insights...

Data quality insights generated


In [33]:
semantic_prompt = f"""
Analysis for Snowflake table '{table_name}':
Consider Snowflake-specific data types and variant/array columns.

Sample data:
{data_str}

Metadata:
{table_metadata.to_string()}

Analyze each column focusing on:
1. Scan through the records of each column to check if the data aligns with its semantic meaning.
2. Highlight errors ONLY IF the semantic meaning does not align with the column name.
3. Skip the columns where the semantic meaning and the data it holds is valid.
4. Check for Snowflake-specific data type optimizations.
5. ONLY provide column names and their issues.
6. Go through all the columns and all the tables.
7. Ensure the format intact.
8. Please provide details of columns which has issues.
9. Provide all the inconsistencies present with their values.
10. In example provide all the discrepancy values.

Format your response focusing only on columns with issues, providing:
1. Column name
2. Current semantic meaning
3. Data type issues
4. Example of inconsistent values
5. Recommended improvements
"""

system_prompt_semantic = """You are a data semantic analysis expert.
Focus on semantic meaning of columns and data type consistency.
Provide clear, specific examples of any misalignments found.
Keep analysis focused on columns with actual issues."""

messages_semantic = [
    {"role": "system", "content": system_prompt_semantic},
    {"role": "user", "content": semantic_prompt}
]

print("\nGenerating semantic analysis...")
semantic_response = model.invoke(messages_semantic)
print("Semantic analysis generated")


Generating semantic analysis...
Semantic analysis generated


In [34]:
semantic_prompt = f"""
Analysis for Snowflake table '{table_name}':
Consider Snowflake-specific data types and variant/array columns.

Sample data:
{data_str}

Metadata:
{table_metadata.to_string()}

Analyze each column focusing on:
1. Scan through the records of each column to check if the data aligns with its semantic meaning.
2. Highlight errors ONLY IF the semantic meaning does not align with the column name.
3. Skip the columns where the semantic meaning and the data it holds is valid.
4. Check for Snowflake-specific data type optimizations.
5. ONLY provide column names and their issues.
6. Go through all the columns and all the tables.
7. Ensure the format intact.
8. Please provide details of columns which has issues.
9. Provide all the inconsistencies present with their values.
10. In example provide all the discrepancy values.

Format your response focusing only on columns with issues, providing:
1. Column name
2. Current semantic meaning
3. Data type issues
4. Example of inconsistent values
5. Recommended improvements
"""

system_prompt_semantic = """You are a data semantic analysis expert.
Focus on semantic meaning of columns and data type consistency.
Provide clear, specific examples of any misalignments found.
Keep analysis focused on columns with actual issues."""

messages_semantic = [
    {"role": "system", "content": system_prompt_semantic},
    {"role": "user", "content": semantic_prompt}
]

print("\nGenerating semantic analysis...")
semantic_response = model.invoke(messages_semantic)
print("Semantic analysis generated")


Generating semantic analysis...
Semantic analysis generated


In [35]:
# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# results = {
#     'table_name': table_name,
#     'timestamp': timestamp,
#     'data_quality_analysis': quality_response.content,
#     'semantic_analysis': semantic_response.content
# }

# # Save to JSON file
# with open(f'analysis_results_{timestamp}.json', 'w') as f:
#     json.dump(results, f, indent=2)
# print(f"\nResults saved to analysis_results_{timestamp}.json")


Results saved to analysis_results_20250217_131953.json


In [37]:
### json to csv >>>>>>>>>>>>


timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Parse the quality analysis response if it's in JSON format
try:
    quality_data = json.loads(quality_response.content)
except json.JSONDecodeError:
    quality_data = quality_response.content

# Create a flattened dictionary for CSV
flattened_results = {
    'table_name': [table_name],
    'timestamp': [timestamp]
}

# Add quality analysis data
if isinstance(quality_data, dict):
    for key, value in quality_data.items():
        flattened_results[f'quality_{key}'] = [value]
else:
    flattened_results['data_quality_analysis'] = [quality_data]

# Add semantic analysis
flattened_results['semantic_analysis'] = [semantic_response.content]

# Convert to DataFrame
df = pd.DataFrame(flattened_results)

# Save to CSV
csv_filename = f'analysis_results_{timestamp}.csv'
df.to_csv(csv_filename, index=False, encoding='utf-8')
print(f"\nResults saved to {csv_filename}")

# Display preview
print("\nPreview of CSV content:")
print(df.head())


Results saved to analysis_results_20250217_132252.csv

Preview of CSV content:
   table_name        timestamp  \
0  SALES_DATA  20250217_132252   

                               data_quality_analysis  \
0  ```json\n{\n  "data_quality_issues": {\n    "i...   

                                   semantic_analysis  
0  1. **CATEGROY_2**\n   - **Current semantic mea...  


In [38]:
engine.dispose()
print("\nDatabase connection closed")


Database connection closed
