In [15]:


from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO

# Load settings from a local .env file into the process environment.
load_dotenv()

# Snapshot of every setting this notebook reads; a value is None when the
# corresponding environment variable is not set.
env_vars = {
    key: os.environ.get(key)
    for key in (
        "SNOWFLAKE_USER",
        "SNOWFLAKE_PASSWORD",
        "SNOWFLAKE_ACCOUNT",
        "SNOWFLAKE_WAREHOUSE",
        "SNOWFLAKE_DATABASE",
        "SNOWFLAKE_SCHEMA",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_4o_DEPLOYMENT_NAME",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_API_KEY",
    )
}

# Snowflake session shared by the query cells of this notebook.
conn = snowflake.connector.connect(
    user=env_vars["SNOWFLAKE_USER"],
    password=env_vars["SNOWFLAKE_PASSWORD"],
    account=env_vars["SNOWFLAKE_ACCOUNT"],
    warehouse=env_vars["SNOWFLAKE_WAREHOUSE"],
    database=env_vars["SNOWFLAKE_DATABASE"],
    schema=env_vars["SNOWFLAKE_SCHEMA"],
)

# Azure OpenAI chat model used later to generate the synthetic rows.
model = AzureChatOpenAI(
    azure_endpoint=env_vars["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=env_vars["AZURE_OPENAI_4o_DEPLOYMENT_NAME"],
    openai_api_version=env_vars["AZURE_OPENAI_API_VERSION"],
    openai_api_key=env_vars["AZURE_OPENAI_API_KEY"],
)

cursor = conn.cursor()

# Data Lake client built from the storage connection string in the environment.
azure_storage_connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
adls_client = DataLakeServiceClient.from_connection_string(azure_storage_connection_string)

# List the base tables of the TEST schema; the table name is the only
# column selected, so it sits at position 0 of every fetched row.
cursor.execute("""
        SELECT table_name 
        FROM information_schema.tables
        WHERE table_schema = 'TEST' AND table_type = 'BASE TABLE'
    """)

tables = cursor.fetchall()
# print(metadata)



In [16]:
# `tables` is expected to hold the rows fetched from information_schema.tables
# (table name at position 0 of each row).
table_names = [row[0] for row in tables]

# Snapshot of every table, keyed by table name -> DataFrame with its rows.
all_data = {}

for table_name in table_names:
    # information_schema reports names exactly as stored, so the identifier is
    # double-quoted (embedded quotes doubled) to keep case-sensitive or
    # special-character names resolvable.
    quoted_name = '"' + table_name.replace('"', '""') + '"'

    # NOTE: no LIMIT is applied - the complete table is pulled into memory.
    cursor.execute(f"SELECT * FROM {quoted_name}")
    fetched_rows = cursor.fetchall()

    # cursor.description carries one entry per result column; its first field
    # is the column name.
    column_labels = [col[0] for col in cursor.description]
    df = pd.DataFrame(fetched_rows, columns=column_labels)
    all_data[table_name] = df

In [18]:
# Fetch column-level metadata for every object in the TEST schema.
cursor.execute("""
    SELECT 
        TABLE_NAME, 
        COLUMN_NAME, 
        DATA_TYPE, 
        IS_NULLABLE,
        CHARACTER_MAXIMUM_LENGTH 
    FROM 
        INFORMATION_SCHEMA.COLUMNS
    WHERE 
        TABLE_SCHEMA = 'TEST'
    ORDER BY table_name
""")

metadata = cursor.fetchall()

# Group the column descriptions by owning table:
# {table name: [column description dict, ...]}.
tables = {}
for tbl, col, dtype, nullable, max_len in metadata:
    column_info = {
        "column_name": col,
        "data_type": dtype,
        "is_nullable": nullable,
        "character_maximum_length": max_len,
    }
    tables.setdefault(tbl, []).append(column_info)

In [19]:
# Serialize the per-table column metadata as pretty-printed JSON
# (4-space indent) and show it.
json_data = json.dumps(obj=tables, indent=4)
print(json_data)



{
    "CUST_CUSTOMER": [
        {
            "column_name": "CUSTOMERID",
            "data_type": "TEXT",
            "is_nullable": "YES",
            "character_maximum_length": 64
        },
        {
            "column_name": "EMAIL",
            "data_type": "TEXT",
            "is_nullable": "YES",
            "character_maximum_length": 128
        },
        {
            "column_name": "SECONDARYTRADECODE",
            "data_type": "TEXT",
            "is_nullable": "YES",
            "character_maximum_length": 128
        },
        {
            "column_name": "DISTRICT",
            "data_type": "TEXT",
            "is_nullable": "YES",
            "character_maximum_length": 256
        },
        {
            "column_name": "FETCHEDSEQUENCE",
            "data_type": "NUMBER",
            "is_nullable": "YES",
            "character_maximum_length": null
        },
        {
            "column_name": "WEBSITE",
            "data_type": "TEXT",
            "is_nulla

In [87]:

from langchain_text_splitters import RecursiveJsonSplitter

# Split the schema description into JSON chunks, asking the splitter for a
# maximum chunk size of 300.
splitter = RecursiveJsonSplitter(max_chunk_size=300)

# The splitter is fed parsed JSON, so the serialized metadata is decoded first.
schema_as_dict = json.loads(json_data)
texts = splitter.split_text(json_data=schema_as_dict)

In [88]:
# Spot check: show the second chunk produced by the splitter.
print(texts[1])

{"DAILYSALES": [{"column_name": "SALINVMODE", "data_type": "TEXT", "is_nullable": "YES"}, {"column_name": "SALGROSSAMT", "data_type": "NUMBER", "is_nullable": "YES"}, {"column_name": "MODIFIEDDATE", "data_type": "TIMESTAMP_NTZ", "is_nullable": "YES"}, {"column_name": "SALDLVDATE", "data_type": "TIMESTAMP_NTZ", "is_nullable": "YES"}, {"column_name": "VCPSCHEMEAMOUNT", "data_type": "NUMBER", "is_nullable": "YES"}, {"column_name": "SALINVLINECOUNT", "data_type": "NUMBER", "is_nullable": "YES"}, {"column_name": "RUN_ID", "data_type": "NUMBER", "is_nullable": "YES"}, {"column_name": "SALDBDISCAMT", "data_type": "NUMBER", "is_nullable": "YES"}, {"column_name": "CREATEDDATE", "data_type": "TIMESTAMP_NTZ", "is_nullable": "YES"}, {"column_name": "SALINVNO", "data_type": "TEXT", "is_nullable": "YES"}, {"column_name": "SALESROUTENAME", "data_type": "TEXT", "is_nullable": "YES"}, {"column_name": "SALINVTYPE", "data_type": "TEXT", "is_nullable": "YES"}, {"column_name": "SALFREEQTYVALUE", "data_type

In [97]:
from langchain_core.prompts import PromptTemplate
from  datetime import datetime
# Upper bound on how many metadata chunks are sent to the model in one run.
max_chunks = 10

# The prompt text and its template are identical for every chunk, so they are
# built once instead of once per iteration.
prompt_template = """
    Generate 1 row of good and bad-quality data for each table based on the given metadata. 
    All data should strictly comply with constraints and data types of respective tables, 
    while bad data should simulate realistic yet invalid scenarios violating constraints like:
    1. Negative or illogical values (e.g., negative age or weight).
    2. Invalid or out-of-range dates (e.g., February 30, year > 9999).
    3. Nullability violations (e.g., null in non-nullable fields).
    4. Duplicate primary keys.
    5. Logical inconsistencies (e.g., start date after end date).
    6. missing values.
    
    
    Output format: 
    please DO NOT give invalid timestamp data.
    strictly Provide a JSON array format containing serializable data for each table.
    Provide the output in pure json JSON array format which I can parse as a json data to various platforms.
    Generate json serializable data.
    Dont provide any comments in between and any description.
    dont repeat table name inside the json data.
    Only json format data is allowed without any // comment in it.
    This is required format in which we require generated data.

    here is the input table Metadata: 
    {metadata}

    """
prompt = PromptTemplate(input_variables=["metadata"], template=prompt_template)

# Generated rows per table: {table name: parsed JSON returned by the model}.
syn_data = {}

# Slicing (instead of range(10)) keeps the loop safe when fewer than
# `max_chunks` chunks exist.
for chunk in texts[:max_chunks]:
    # Each chunk is a JSON object keyed by table name; the first key names the
    # table this chunk describes.
    table_name = next(iter(json.loads(chunk)))

    formatted_prompt = prompt.format(metadata=chunk)
    # `invoke` is the supported entry point; calling the model object directly
    # is deprecated in LangChain.
    response = model.invoke(formatted_prompt)

    # Drop Markdown code fences the model may wrap around its JSON answer.
    synthetic_data = response.content.replace("```json", "").replace("```", "").strip()

    try:
        final_data = json.loads(synthetic_data)
    except json.JSONDecodeError as exc:
        # One malformed reply must not abort the run or leave a raw string in
        # syn_data; report it and move on to the next table.
        print(f"Skipping {table_name}: model output is not valid JSON ({exc})")
        continue

    syn_data[table_name] = final_data
    print("success")
    print(datetime.now())

print(syn_data)
    

    


success
2024-12-30 17:57:48.574459
success
2024-12-30 17:58:06.141178
success
2024-12-30 17:58:10.690342
success
2024-12-30 17:58:13.250333
success
2024-12-30 17:58:18.269916
success
2024-12-30 17:58:24.409621
success
2024-12-30 17:58:27.893018
success
2024-12-30 17:58:38.441486
success
2024-12-30 17:58:42.642038
success
2024-12-30 17:58:47.969766
{'CUST_CUSTOMER': [{'CUSTOMERNAME': 'John Doe', 'STORETYPE': 'Retail', 'POSTCODE': '12345', 'CUSTOMERID': 'CUST001', 'FETCHEDSEQUENCE': 1, 'AZUREDATETIME': '2023-09-15T12:34:56', 'REMOTEKEY': 'RK123', 'MARKETINGPERMISSION': 'Yes', 'STREETNUMBER': '123', 'AZUREFILE': 'file123.txt', 'COUNTY': 'CountyName', 'REGION': 'RegionName', 'CDL_DATETIME': '2023-09-15T12:34:56', 'WEBSITE': 'https://example.com', 'PHONENUMBER': '+1234567890', 'ECOMMERCEFLAG': 'Y', 'DISTRICT': 'DistrictName', 'SECONDARYTRADENAME': 'TradeName', 'FETCHEDDATETIME': '2023-09-15T12:34:56', 'STREETNAME': 'Main Street', 'STOREREFERENCE': 'StoreRef001', 'SECONDARYTRADECODE': 'TCode

In [98]:
# Serialize the generated rows to a JSON string for the CSV export step.
# json.dumps signals unserializable input with TypeError/ValueError (it never
# raises JSONDecodeError, which only occurs while parsing).
try:
    data1 = json.dumps(syn_data)
except (TypeError, ValueError) as exc:
    print(f"Error: Failed to serialize generated data to JSON: {exc}")
    # Fall back to an empty JSON object so a later json.loads(data1) still works.
    data1 = "{}"

data1

'{"CUST_CUSTOMER": [{"CUSTOMERNAME": "John Doe", "STORETYPE": "Retail", "POSTCODE": "12345", "CUSTOMERID": "CUST001", "FETCHEDSEQUENCE": 1, "AZUREDATETIME": "2023-09-15T12:34:56", "REMOTEKEY": "RK123", "MARKETINGPERMISSION": "Yes", "STREETNUMBER": "123", "AZUREFILE": "file123.txt", "COUNTY": "CountyName", "REGION": "RegionName", "CDL_DATETIME": "2023-09-15T12:34:56", "WEBSITE": "https://example.com", "PHONENUMBER": "+1234567890", "ECOMMERCEFLAG": "Y", "DISTRICT": "DistrictName", "SECONDARYTRADENAME": "TradeName", "FETCHEDDATETIME": "2023-09-15T12:34:56", "STREETNAME": "Main Street", "STOREREFERENCE": "StoreRef001", "SECONDARYTRADECODE": "TCode01", "COUNTRY": "CountryName", "EMAIL": "email@example.com", "SOLDTOPARTY": "Party001", "CHANNEL": "Online", "CITY": "CityName", "LOAD_KEY": "LoadKey001", "SALESGROUP": "SalesGroup01", "CDL_SOURCE_FILE": "source_file.txt"}, {"CUSTOMERNAME": "", "STORETYPE": null, "POSTCODE": "123456789012345", "CUSTOMERID": "CUST001", "FETCHEDSEQUENCE": -5, "AZURE

In [99]:
import csv

# Parse the serialized rows back into {table name: [row dict, ...]}.
data = json.loads(data1)

output_dir = "test_output"
os.makedirs(output_dir, exist_ok=True)

# Write one CSV file per table.
for table_name, rows in data.items():
    if not rows:
        print(f"Table '{table_name}' is empty. No file created.")
        continue

    # The rows come from a language model, so verify their shape before use.
    if not isinstance(rows, list) or not all(isinstance(row, dict) for row in rows):
        print(f"Table '{table_name}' is not a list of row objects. No file created.")
        continue

    output_csv_file = os.path.join(output_dir, f"{table_name}.csv")

    # Header = union of the keys of all rows, in first-seen order. Taking only
    # the first row's keys makes DictWriter raise ValueError as soon as a later
    # row carries an extra key; rows lacking a key get an empty cell.
    column_names = list(dict.fromkeys(key for row in rows for key in row))

    with open(output_csv_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=column_names)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Table '{table_name}' saved to {output_csv_file}")

Table 'CUST_CUSTOMER' saved to test_output\CUST_CUSTOMER.csv
Table 'DAILYSALES' saved to test_output\DAILYSALES.csv
Table 'ITG_SFA_PM' saved to test_output\ITG_SFA_PM.csv
Table 'KPI2DATA_MAPPING' saved to test_output\KPI2DATA_MAPPING.csv
Table 'MRCHR_MERCHANDISINGRESPONSE' saved to test_output\MRCHR_MERCHANDISINGRESPONSE.csv
Table 'MRCHR_RESPONSES' saved to test_output\MRCHR_RESPONSES.csv
Table 'MS_MASTERSURVEY' saved to test_output\MS_MASTERSURVEY.csv
Table 'PRODBU_PRODUCTBUSINESSUNIT' saved to test_output\PRODBU_PRODUCTBUSINESSUNIT.csv
Table 'PRODTR_PRODUCTTRANSLATION' saved to test_output\PRODTR_PRODUCTTRANSLATION.csv
Table 'PROD_PRODUCT' saved to test_output\PROD_PRODUCT.csv


In [20]:
import os
import pandas as pd
from snowflake.connector.pandas_tools import write_pandas

# Project root that contains data/csv_output. Set the GENAI_DE_BASE_DIR
# environment variable to relocate it; the fallback is the location this
# notebook was originally written against.
base_dir = os.environ.get("GENAI_DE_BASE_DIR") or r"C:\Users\kanil01\genai_de"
output_dir = os.path.join(base_dir, "data", "csv_output")
print(output_dir)

# Reuse the existing Snowflake session; open a new one only if it was closed.
# Reconnecting unconditionally would leak the previous connection.
if conn.is_closed():
    conn = snowflake.connector.connect(
        user=env_vars.get("SNOWFLAKE_USER"),
        password=env_vars.get("SNOWFLAKE_PASSWORD"),
        account=env_vars.get("SNOWFLAKE_ACCOUNT"),
        warehouse=env_vars.get("SNOWFLAKE_WAREHOUSE"),
        database=env_vars.get("SNOWFLAKE_DATABASE"),
        schema=env_vars.get("SNOWFLAKE_SCHEMA"),
    )

# Tables for which no CSV file exists; reported once after the loop.
missing_csv = []

for table_name in table_names:
    csv_file = os.path.join(output_dir, f"{table_name}.csv")

    if not os.path.exists(csv_file):
        missing_csv.append(table_name)
        continue

    try:
        # Reading happens inside the try block so that one unreadable file
        # does not stop the remaining tables from loading.
        df = pd.read_csv(csv_file)
        success, nchunks, nrows, _ = write_pandas(
            conn=conn,
            df=df,
            table_name=table_name
        )
    except Exception as e:
        print(f"Failed to insert data into table: {table_name}")
        print(f"Error: {str(e)}")
        continue

    # write_pandas reports the outcome through its first return value.
    if success:
        print(f"Loaded {nrows} rows into {table_name}")
    else:
        print(f"Failed to insert data into table: {table_name}")

if missing_csv:
    print(f"No CSV file found for {len(missing_csv)} table(s): {', '.join(missing_csv)}")
        

C:\Users\kanil01\genai_de\data\csv_output
Loaded 5 rows into CUST_CUSTOMER
Loaded 5 rows into DAILYSALES
Loaded 5 rows into ITG_SFA_PM
Loaded 5 rows into KPI2DATA_MAPPING
Loaded 5 rows into MRCHR_MERCHANDISINGRESPONSE
Loaded 5 rows into MRCHR_RESPONSES
Loaded 5 rows into MS_MASTERSURVEY
Loaded 5 rows into PRODBU_PRODUCTBUSINESSUNIT
Loaded 5 rows into PRODTR_PRODUCTTRANSLATION
Loaded 5 rows into SDL_ALL_DISTRIBUTOR_SELLOUT_SALES_FACT
Loaded 5 rows into SDL_AU_DSTR_CHS_HEADER
Loaded 5 rows into SDL_AU_DSTR_SIGMA_HEADER
Loaded 5 rows into SDL_CBD_GT_SALES_REPORT_FACT
Loaded 5 rows into SDL_CCR_PRODUCT_MAPPING
Loaded 5 rows into SDL_CHS_DSTR
Loaded 5 rows into SDL_CLAVIS_GB_PRODUCTS
Loaded 5 rows into SDL_AU_DSTR_SYMBION_HEADER
Loaded 5 rows into SDL_API_DSTR
