In [21]:


from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
# from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO

load_dotenv()


env_vars = {
        "SNOWFLAKE_USER": os.environ.get("SNOWFLAKE_USER"),
        "SNOWFLAKE_PASSWORD": os.environ.get("SNOWFLAKE_PASSWORD"),
        "SNOWFLAKE_ACCOUNT": os.environ.get("SNOWFLAKE_ACCOUNT"),
        "SNOWFLAKE_WAREHOUSE": os.environ.get("SNOWFLAKE_WAREHOUSE"),
        "SNOWFLAKE_DATABASE": os.environ.get("SNOWFLAKE_DATABASE"),
        "SNOWFLAKE_SCHEMA": "TESTS",
        "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
        "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
        "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
    }

# conn = snowflake.connector.connect(
#         user=env_vars.get("SNOWFLAKE_USER"),
#         password=env_vars.get("SNOWFLAKE_PASSWORD"),
#         account=env_vars.get("SNOWFLAKE_ACCOUNT"),
#         warehouse=env_vars.get("SNOWFLAKE_WAREHOUSE"),
#         database=env_vars.get("SNOWFLAKE_DATABASE"),
#         schema=env_vars.get("SNOWFLAKE_SCHEMA"),
#     )

# model = AzureChatOpenAI(
#         azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
#         azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
#         openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
#         openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
#     )





# cursor = conn.cursor()
# azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

# adls_client = DataLakeServiceClient.from_connection_string(azure_storage_connection_string)

# cursor.execute("""
#         SELECT table_name 
#         FROM information_schema.tables
#         WHERE table_schema = 'TEST' AND table_type = 'BASE TABLE'
#     """)

# tables =  cursor.fetchall()
# print(metadata)



In [22]:
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import os

# Snowflake connection
def connect_to_snowflake():
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    return engine

# Fetch metadata
def get_table_metadata(engine,database,schema, table):
    query = f"""
        SELECT 
            TABLE_NAME, column_name, DATA_TYPE, IS_NULLABLE 
        FROM {database}.INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{schema}' and TABLE_NAME = '{table}'
    """
    return pd.read_sql(query, engine)

In [23]:
def analyze_metadata(metadata_df):
    issues = []
    for _, row in metadata_df.iterrows():
        if row["is_nullable"] == "NO" and row["data_type"].lower() in ("varchar", "text"):
            issues.append(
                f"Column {row['column_name']} in table {row['table_name']} is non-nullable but allows free text, which might cause issues."
            )
    return issues

In [48]:
import pandas as pd

data = {
    "Name": ["Alice", "Bob", 1],
    "Age": [25, 30, 35],
    "Height": [5.5, 6.0, 5.8],
    "City": [1, "LA", "SF"],
    "Score": [90.5, 88.0, 90.0],
}

df = pd.DataFrame(data)
print(df)

numeric_columns = []
text_columns = []

# Iterate through the columns to check if they are numeric or non-numeric
for column in df.columns:
    if pd.to_numeric(df[column], errors='coerce').notna().all():  # Check if all values can be converted to numeric
        numeric_columns.append(column)
    else:
        text_columns.append(column)
# Select numeric data

print(numeric_columns,text_columns)
# numeric_data = df.select_dtypes(include=["number"])
# numeric_data1 = df[numeric_columns]
# print(numeric_data)
# print(numeric_data1)


    Name  Age  Height City  Score
0  Alice   25     5.5    1   90.5
1    Bob   30     6.0   LA   88.0
2      1   35     5.8   SF  chomu
['Age', 'Height'] ['Name', 'City', 'Score']


In [25]:
from sklearn.ensemble import IsolationForest

# Null value check
def check_nulls(df, table_name, metadata):
    null_issues = []
    for column in df.columns:
        # Filter the metadata for the current table and column
        column_metadata = metadata[(metadata["table_name"] == table_name) & (metadata["column_name"] == column)]
        
        if column_metadata.empty:
            # Skip if no metadata is found for the column
            null_issues.append(f"No metadata found for column {column} in table {table_name}.")
            continue
        
        # Access the first row's value for 'is_nullable' safely
        nullable = column_metadata["is_nullable"].iloc[0]
        
        if nullable == "NO" and df[column].isnull().sum() > 0:
            null_issues.append(f"Column {column} in {table_name} has null values but is non-nullable.")
    
    return null_issues

# Anomaly detection
def detect_anomalies(df, table_name="Unnamed Table"):
    # Select numeric data for anomaly detection
    # numeric_data = df.select_dtypes(include=["number"])
    
    numeric_columns = []
    text_columns = []

    # Iterate through the columns to check if they are numeric or non-numeric
    for column in df.columns:
    # Check if the column is not datetime and all values can be converted to numeric
        if not pd.api.types.is_datetime64_any_dtype(df[column]) and pd.to_numeric(df[column], errors='coerce').notna().all():
            numeric_columns.append(column)
        else:
            text_columns.append(column)

    # Output the lists
    # final_columns = numeric_columns+text_columns
    
    numeric_data = df[numeric_columns]
    # Check if there is numeric data to process
    if numeric_data.empty:
        return f"No numeric data available for anomaly detection in table '{table_name}'."
    
    # Convert numeric data to a NumPy array to avoid feature name issues
    numeric_array = numeric_data.to_numpy()
    
    # Initialize the Isolation Forest model
    model = IsolationForest(contamination= 0.01, 
                            max_features= 0.5,
                            max_samples = 0.5,
                            n_estimators =50,
                            random_state = 42 )
    
    # Fit the model and predict anomalies
    anomalies = model.fit_predict(numeric_array)
    
    # Identify the indices of anomalous rows
    anomaly_indices = numeric_data.index[anomalies == -1]
    anomaly_rows = df.loc[anomaly_indices]
    
    # Count anomalies
    anomaly_count = len(anomaly_indices)
    
    if anomaly_count > 0:
        # Format the anomaly rows as a string
        rows_str = anomaly_rows.to_string(index=False)
        return (
            f"Detected {anomaly_count} anomalies in the dataset of table '{table_name}'.\n"
            f"Anomalous rows:\n{rows_str}"
        )
    else:
        return f"No anomalies detected in table '{table_name}'."





In [26]:
from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
from io import StringIO

load_dotenv()

model = AzureChatOpenAI(
        azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
        azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
        openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
        openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
    )


KeyboardInterrupt: 

In [103]:
import openai

# Generate insights using GPT
def generate_insights(prompt):
    response = model(prompt)
    return response.content

# Example prompt
def create_prompt_for_issues(issues):
    prompt = f"""The following issues were detected in the database:\n\n{issues}\n.
                Give specific solution based on the anomalies.
                Dont add any extra line other than solution to the anomaly.
                Give tablewise solution.
                dont mix up solution for different tables.
                Ensure the format intact for every table same.
                give solution in following format:
                table_name : <table name>
                solution :  <solution>
                
                give solution in concise way.
                Also generate SQL query which is strictly snowflake friendly to get anomalies. 
                
                sample outoput:
                table_name : <table name>
                solution : solution for issues provided.

                SQL Query:
                <sql query>
                """
    return prompt


In [104]:
from langchain.sql_database import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain

# Create a LangChain SQL agent
def create_sql_agent(engine):
    db = SQLDatabase(engine)
    return SQLDatabaseChain.from_llm(llm="gpt-4", database=db)

# Execute natural language query
def execute_query_with_agent(agent, query):
    return agent.run(query)


In [105]:
def generate_report(metadata_issues, null_issues, anomaly_issues):
    report = "Data Quality Analysis Report\n\n"
    report += "Metadata Issues:\n" + "\n".join(metadata_issues) + "\n\n"
    report += "Null Value Issues:\n" + "\n".join(null_issues) + "\n\n"
    report += f"Anomaly Detection: {anomaly_issues}\n"
    return report


In [106]:
def get_table_names(engine,database,schema):
    query = f"""
        SELECT 
            TABLE_NAME, column_name, DATA_TYPE, IS_NULLABLE 
        FROM {database}.INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{schema}'
    """
    return pd.read_sql(query, engine)

In [107]:
def main():
    MY_DATABASE = env_vars.get("SNOWFLAKE_DATABASE")
    MY_SCHEMA = env_vars.get("SNOWFLAKE_SCHEMA")
    
    engine = connect_to_snowflake()
    metadata = get_table_names(engine, database=env_vars.get("SNOWFLAKE_DATABASE"), schema=env_vars.get("SNOWFLAKE_SCHEMA"))
    
    # #Analyze metadata
    # metadata_issues = analyze_metadata(metadata)
    
    # Analyze table data
    null_issues = []
    anomaly_issues = []
    report = ""
    hs = open("test.txt","w+")
    for table in metadata["table_name"].unique():
        query = f"SELECT * FROM {MY_DATABASE}.{MY_SCHEMA}.{table}"
        df = pd.read_sql(query, engine)
        
        metadata1 = get_table_metadata(engine, database=env_vars.get("SNOWFLAKE_DATABASE"), schema=env_vars.get("SNOWFLAKE_SCHEMA"),table=table)
    
        #Analyze metadata
        # metadata_issues = analyze_metadata(metadata1)
            
        # Null checks
        # null_results = check_nulls(df, table, metadata1)
        # null_issues.extend(null_results)
        
        #Anomaly detection
        chunk_size = 5000  # Adjust based on your dataset size and requirements
        chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]
        
        for chunk in chunks:
            anomaly_result = detect_anomalies(chunk,table_name=table)
            if "Detected" in anomaly_result:
                anomaly_issues.append(f"{table}: {anomaly_result}")
                
            gpt_prompt = create_prompt_for_issues(anomaly_result)
            gpt_response = generate_insights(gpt_prompt).replace("```plaintext", "").replace("```", "").strip()  
            
            hs = open("test.txt","a")
            hs.write(gpt_response + "\n" + "========================================================================================================================================" + "\n"+"\n")  

        # issues = list(null_results) + list(anomaly_result)
        # Generate insights with GPT
        # print(anomaly_issues)
        # print(len(anomaly_issues))
        # gpt_prompt = create_prompt_for_issues(anomaly_result)
        # gpt_response = generate_insights(gpt_prompt).replace("```plaintext", "").replace("```", "").strip()
        
        
        # report+=gpt_response
        
        # hs = open("test1.txt","a")
        # hs.write(gpt_response + "\n" + "========================================================================================================================================" + "\n"+"\n")
    hs.close()
        
    

if __name__ == "__main__":
    main()


  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)


In [108]:
# def connect_to_snowflake():
#     connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
#     engine = create_engine(connection_string)
#     return engine

# engine = connect_to_snowflake()

# query = f"SELECT * FROM RAW.TEST3.SALES_DATA"
# df = pd.read_sql(query, engine)
# df

In [109]:
# def summarize_chunk(chunk):
#     # Generate a concise summary of the chunk (e.g., descriptive statistics)
#     summary = chunk.describe(include="all").to_string()
#     return summary

# # Chunking the data
# chunk_size = 40000  # Adjust based on your dataset size and requirements
# chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]

# summaries = [summarize_chunk(chunk) for chunk in chunks]


In [110]:
# def generate_insights(summary, prompt_template):
#     # Construct the prompt
#     prompt = f"{prompt_template}\n\nSummary of the data:\n{summary}"
    
#     # Get insights from the AI model
#     model = AzureChatOpenAI(
#         azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
#         azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
#         openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
#         openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
#     )
#     response = model(prompt)
#     return response.content

# # Define a general prompt template
# prompt_template = "Based on the following data summary, provide key insights, trends, and any notable patterns."

# # Generate insights for each chunk
# insights = [generate_insights(summary, prompt_template) for summary in summaries]


In [111]:
# # Combine all insights
# full_insight_report = "\n\n".join(insights)

# # Save the report to a file
# with open("insights_report.txt", "w") as file:
#     file.write(full_insight_report)

# print("Insight generation complete. Check insights_report.txt for details.")


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import snowflake.connector
from langchain_openai import AzureOpenAIEmbeddings

text_list = ["sales amount", "product name", "total price"]
model = AzureOpenAIEmbeddings(
    deployment=os.environ["AZURE_OPENAI_EMBED_DEPLOYMENT_NAME"],
    model=os.environ["AZURE_OPENAI_EMBED_MODEL_NAME"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    openai_api_type=os.environ["AZURE_OPENAI_API_TYPE"]
)
columns_embeddings = [model.embed_query(col) for col in text_list]


columns_embeddings


[[0.0076023126021027565,
  -0.017620189115405083,
  0.009870258159935474,
  -0.011527602560818195,
  -0.028315763920545578,
  0.005384691990911961,
  -0.027483737096190453,
  -0.01033324096351862,
  -0.006904483772814274,
  -0.02418246679008007,
  0.03325424715876579,
  0.024236146360635757,
  -0.01767386682331562,
  0.00011134234955534339,
  0.008997971192002296,
  0.0008756414754316211,
  0.014292079955339432,
  -0.03298585116863251,
  -0.017445731908082962,
  -0.02288074791431427,
  -0.012420018203556538,
  0.003864900441840291,
  -0.030248213559389114,
  0.007803609594702721,
  0.00043236894998699427,
  -0.007723091170191765,
  0.031536512076854706,
  -0.022035300731658936,
  0.007978066802024841,
  -0.01403710339218378,
  0.008789964951574802,
  -0.010413759388029575,
  -0.019082946702837944,
  -0.018385116010904312,
  -0.028423121199011803,
  0.007535214070230722,
  -0.0191634651273489,
  -0.0005292430869303644,
  0.0018653512233868241,
  -0.01227240078151226,
  0.024974234402179

In [45]:
from sentence_transformers import SentenceTransformer
import numpy as np
import snowflake.connector
from sqlalchemy import create_engine

# Step 1: Fetch Column Names and Data Samples
def fetch_column_data_samples(table_name, sample_size=10):
    """
    Fetch column names and a sample of data from the specified table.
    """
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    
    query = f"SELECT * FROM {table_name} LIMIT {sample_size}"
    data = pd.read_sql(query,engine)
    
    column_names =data.columns.tolist()
    print("fetch data samples")
    return column_names, data

def fetch_column_names(table_name, sample_size=10):
    """
    Fetch column names and a sample of data from the specified table.
    """
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    
    query = f"SELECT * FROM {table_name} LIMIT {sample_size}"
    data = pd.read_sql(query,engine)
    
    column_names =data.columns.tolist()
    print("fetch column names")
    
    return column_names

# Step 2: Generate Embeddings for Column Names and Data
def generate_embeddings(text_list,model):
    """
    Generate embeddings for a list of text using a SentenceTransformer model.
    """
    columns_embeddings = [model.embed_query(col) for col in text_list]


    columns_embeddings
    print("embeddings generated")
    return columns_embeddings

# Step 3: Calculate Semantic Alignment
def calculate_alignment(column_names, data_samples,table_name):
    """
    Calculate the semantic alignment between column names and data.
    """
    model = AzureOpenAIEmbeddings(
        deployment=os.environ["AZURE_OPENAI_EMBED_DEPLOYMENT_NAME"],
        model=os.environ["AZURE_OPENAI_EMBED_MODEL_NAME"],
        openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
        openai_api_type=os.environ["AZURE_OPENAI_API_TYPE"]
        )
    column_embeddings = generate_embeddings(column_names,model)

    alignment_results = {}
    for idx, column_name in enumerate(column_names):
        # Get a single column's data as text
        idx = len(column_names)-1  # Example index for "address"
        column_data = []

        for row in data_samples:
            # Ensure the index is within the bounds of the row
            if idx < len(row):
                column_data.append(str(row[idx]))
            else:
                column_data.append("N/A")
        column_data = data_samples.columns.tolist()
        data_embedding = generate_embeddings(column_data,model)

        # Calculate average similarity for this column
        similarities = [
            np.dot(column_embeddings[idx], row_emb) / 
            (np.linalg.norm(column_embeddings[idx]) * np.linalg.norm(row_emb))
            for row_emb in data_embedding
        ]
        average_similarity = np.mean(similarities)
        alignment_results[column_name] = average_similarity
    
    print("alignmnet")
    return alignment_results

# Step 4: Flag Misaligned Columns
def find_misaligned_columns(alignment_results, threshold=0.7):
    """
    Identify columns where the average similarity falls below the threshold.
    """
    misaligned_columns = {
        column: similarity
        for column, similarity in alignment_results.items()
        if similarity < threshold
    }
    
    print("misaligned_columns")
    return misaligned_columns

# Main Function
if __name__ == "__main__":
    table_name = "RAW.TEST.sales_data_format"
    column_names, data_samples = fetch_column_data_samples(table_name, sample_size=100)
    print("done 1")
    print(data_samples)

    alignment_results = calculate_alignment(column_names, data_samples,table_name)
    print("done 2")
    misaligned_columns = find_misaligned_columns(alignment_results, threshold=0.7)
    print("done 3")

    print("Alignment Results:", alignment_results)
    print("Misaligned Columns:", misaligned_columns)


fetch data samples
done 1
  sls_doc_typ billing_type  cust_no  fisc_yr
0         typ            1        5        3
1        typ1         what      100        3
2        typ1          are        6        3
embeddings generated
embeddings generated
embeddings generated
embeddings generated
embeddings generated
alignmnet
done 2
misaligned_columns
done 3
Alignment Results: {'sls_doc_typ': 0.824226956560921, 'billing_type': 0.824226956560921, 'cust_no': 0.824226956560921, 'fisc_yr': 0.824226956560921}
Misaligned Columns: {}


In [35]:
import snowflake.connector
import pandas as pd
import openai
from typing import List
import random

# Snowflake connection details
def create_snowflake_connection(user: str, password: str, account: str, warehouse: str, database: str, schema: str):
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    return engine

# Function to fetch column names dynamically from Snowflake
def get_columns_from_snowflake(connection, table_name: str) -> List[str]:
    query = f"""
        SELECT COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_NAME = '{table_name}'
    """
    df = pd.read_sql(query, connection)
    return df['COLUMN_NAME'].tolist()

# Function to fetch semantic meaning of the table using OpenAI
def get_table_semantic_meaning(columns: List[str]) -> str:
    """
    Fetches the semantic meaning of a table based on its column names using OpenAI API.
    """
    prompt = f"Based on the following columns, describe the purpose of the table and the type of data it is likely to contain: {', '.join(columns)}"
    response = openai.Completion.create(
        model="text-davinci-003",  # Replace with the appropriate model
        prompt=prompt,
        max_tokens=100
    )
    return response.choices[0].text.strip()

# Function to fetch data from Snowflake for a given table
def fetch_data_from_snowflake(connection, table_name: str, sample_size: int = 100):
    query = f"SELECT * FROM {table_name} LIMIT {sample_size}"
    df = pd.read_sql(query, connection)
    return df

# Function to check if data aligns with the inferred semantic meaning of the table
def validate_table_data(df: pd.DataFrame, table_name: str) -> str:
    """
    Validates if the data in the table aligns with the inferred semantic meaning of the table.
    """
    columns = df.columns.tolist()
    
    # Get the semantic meaning of the table based on its columns
    table_semantic_meaning = get_table_semantic_meaning(columns)
    
    # Check if each column's data aligns with the expected data type based on semantic meaning
    misalignment = []
    for col in columns:
        if "numeric" in table_semantic_meaning.lower():
            if not pd.api.types.is_numeric_dtype(df[col]):
                misalignment.append(f"Column '{col}' is expected to contain numeric values.")
        elif "date" in table_semantic_meaning.lower():
            if not pd.api.types.is_datetime64_any_dtype(df[col]):
                misalignment.append(f"Column '{col}' is expected to contain datetime values.")
        elif "text" in table_semantic_meaning.lower():
            if not pd.api.types.is_string_dtype(df[col]):
                misalignment.append(f"Column '{col}' is expected to contain string values.")
    
    return misalignment

# Main function to scan the table dynamically
def scan_table_for_semantics(connection, table_name: str, sample_size: int = 100):
    # Fetch columns dynamically from Snowflake
    columns = get_columns_from_snowflake(connection, table_name)
    
    # Fetch sample data from Snowflake
    df = fetch_data_from_snowflake(connection, table_name, sample_size)
    
    # Validate table data and check alignment with inferred semantic meaning
    misalignment = validate_table_data(df, table_name)
    
    # Output the results
    if misalignment:
        print(f"Issues in table '{table_name}':")
        for issue in misalignment:
            print(f"  - {issue}")
    else:
        print(f"Table '{table_name}' data aligns with its semantic meaning.")

# Example usage
if __name__ == "__main__":
    # Snowflake connection details

    # Create Snowflake connection
    conn = create_snowflake_connection(user, password, account, warehouse, database, schema)
    
    # Table name you want to scan
    table_name = 'YOUR_TABLE_NAME'
    
    # Scan the table for semantic alignment
    scan_table_for_semantics(conn, table_name, sample_size=100)


['New York', 'Los Angeles', 'N/A']
