In [178]:

from snowflake.connector.pandas_tools import write_pandas
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter

import snowflake.connector
import pandas as pd
import json
import csv
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import Dict, List, Any
# from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO

load_dotenv()


env_vars = {
        "SNOWFLAKE_USER": os.environ.get("SNOWFLAKE_USER"),
        "SNOWFLAKE_PASSWORD": os.environ.get("SNOWFLAKE_PASSWORD"),
        "SNOWFLAKE_ACCOUNT": os.environ.get("SNOWFLAKE_ACCOUNT"),
        "SNOWFLAKE_WAREHOUSE": os.environ.get("SNOWFLAKE_WAREHOUSE"),
        "SNOWFLAKE_DATABASE": os.environ.get("SNOWFLAKE_DATABASE"),
        "SNOWFLAKE_SCHEMA": os.environ.get("SNOWFLAKE_SCHEMA"),
        "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
        "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
        "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
        "connection_string":os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
    }

In [179]:
import openai
import numpy as np
import snowflake.connector
from sklearn.ensemble import IsolationForest
import pinecone
from sqlalchemy import create_engine

# Setup OpenAI API
# openai.api_key = 'your-openai-api-key'

# Snowflake connection details
def connect_to_snowflake():
        connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
        engine = create_engine(connection_string)
        return engine

In [180]:
engine = connect_to_snowflake()

In [181]:
model = AzureChatOpenAI(
        azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
        azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
        openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
        openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
    )


In [182]:
from langchain_openai import AzureOpenAIEmbeddings

def get_embeddings(texts):
    embeddings_model = AzureOpenAIEmbeddings(
    deployment=os.environ["AZURE_OPENAI_EMBED_DEPLOYMENT_NAME"],
    model=os.environ["AZURE_OPENAI_EMBED_MODEL_NAME"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    openai_api_type=os.environ["AZURE_OPENAI_API_TYPE"]
    )
    response = embeddings_model.embed_query(texts)
    return np.array(response)

In [254]:
query = "SELECT distinct TABLE_NAME FROM INFORMATION_SCHEMA.tables where TABLE_SCHEMA = 'TEST3' "
tables = pd.read_sql(query, engine)

tables = tables['table_name'].to_numpy()

tables[0]


'SALES_DATA'

In [199]:
def get_anomaly_explanation(anomaly_data):
    prompt = f"Explain why the following data points are unusual or anomalous:\n{anomaly_data}"
    response = model(prompt)
    explanation = response
    return explanation

In [None]:
for table in tables:
    table_name = table
    print(f"Processing table: {table_name}")
    
    # Query to fetch all data from the table
    data_query = f"SELECT * FROM {table_name}"  # No LIMIT applied here
    data = pd.read_sql(query,engine)

    # Convert to NumPy array (assuming your data is tabular)
    data_array = np.array(data)

    # Vectorize the data (Assuming text data in the first column)
    text_data = str(data_array[:, 0])  # Assuming the first column is text-based data

    # Generate embeddings using GPT model
    embeddings = get_embeddings(text_data).reshape(-1,1)

    # Process embeddings (e.g., anomaly detection or storage in Pinecone)
    print(f"Generated embeddings for table {table_name}")
    
    iso_forest = IsolationForest(contamination= 0.01, 
                            max_features= 0.5,
                            max_samples = 0.5,
                            n_estimators =50,
                            random_state = 42 )
    
    print(type(embeddings))
    y_pred = iso_forest.fit_predict(embeddings)
    
    print(y_pred)
    anomalies = np.where(y_pred == -1)[0]
    print(f"Anomalies detected in table {table_name} at indices: {anomalies}")
    
    indexex = []
    for j in anomalies:
        indexex.append(j)
        anomalous_rows = data.iloc[j]
        print(anomalous_rows)
    
    print(indexex)
    # anomalous_rows = data.iloc[indexex]
    # print(anomalous_rows)
    

print("DONE")
print(embeddings)

Processing table: SALES_DATA
Generated embeddings for table SALES_DATA
<class 'numpy.ndarray'>
[1 1 1 ... 1 1 1]
Anomalies detected in table SALES_DATA at indices: [  87  194  228  233  303  541  548  846  954 1055 1120 1246 1335 1348
 1386 1487]
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_name]
Index: []
Empty DataFrame
Columns: [table_n

In [98]:
!pip install sklearn 
!pip install shap 
!pip install matplotlib 
!pip install seaborn 
!pip install sentence-transformers 
!pip install pinecone-client


Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

Collecting shap
  Downloading shap-0.46.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba (from shap)
  Downloading numba-0.60.0-cp312-cp312-win_amd64.whl.metadata (2.8 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->shap)
  Downloading llvmlite-0.43.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Downloading shap-0.46.0-cp312-cp312-win_amd64.whl (456 kB)
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)
Downloading numba-0.60.0-cp312-cp312-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   --------------- ------------------------ 1.0/2.7 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 2.7/2.7 MB 6.5 MB/s eta 0:00:00
Downloading llvmlite-0.43.0-cp312-cp31

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pinecone 5.4.2 requires pinecone-plugin-inference<4.0.0,>=2.0.0, but you have pinecone-plugin-inference 1.1.0 which is incompatible.


In [96]:

# pinecone.init(api_key="pcsk_gLrQj_4psa9Mz21uBCPyi4FkoTFQpM5vR3cdSFqA6pcskrzQTsF3w3EiaN8WmejJbxYey", environment="us-west1-gcp")

In [97]:


# # Initialize Pinecone (replace with your actual API key)
# pinecone.init(api_key="your-api-key", environment="us-west1-gcp")

# # Create a cursor object to execute SQL queries
# cursor = conn.cursor()

# # Get list of all tables in the schema
# cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'your_schema'")
# tables = cursor.fetchall()

# # Iterate over each table to fetch data and process it
# for table in tables:
#     table_name = table[0]
#     print(f"Processing table: {table_name}")
    
#     # Query to fetch all data from the table
#     cursor.execute(f"SELECT * FROM {table_name}")  # No LIMIT applied here
#     data = cursor.fetchall()

#     # Convert to NumPy array (assuming your data is tabular)
#     data_array = np.array(data)

#     # Vectorize the data (Assuming text data in the first column)
#     text_data = data_array[:, 0]  # Assuming the first column is text-based data

#     # Generate embeddings using GPT model
#     embeddings = get_embeddings(text_data)

#     # Apply Isolation Forest for anomaly detection
#     iso_forest = IsolationForest(contamination=0.05)
#     y_pred = iso_forest.fit_predict(embeddings)

#     # Anomalies are labeled as -1, normal points as 1
#     anomalies = np.where(y_pred == -1)[0]
#     print(f"Anomalies detected in table {table_name} at indices: {anomalies}")

#     # Get explanation for the anomaly
#     anomalous_data = [text_data[i] for i in anomalies]
#     explanation = get_anomaly_explanation(anomalous_data)
#     print(f"Explanation for anomalies: {explanation}")

#     # Insert data into Pinecone for the current table
#     index_name = f"{table_name}-vector-index"
#     if index_name not in pinecone.list_indexes():
#         pinecone.create_index(index_name, dimension=embeddings.shape[1], metric="cosine")
#     index = pinecone.Index(index_name)

#     # Prepare data for Pinecone insertion
#     pinecone_data = [
#         {"id": f"{table_name}_{i}", "values": embeddings[i].tolist()} 
#         for i in range(embeddings.shape[0])
#     ]

#     # Insert data into Pinecone for the current table
#     index.upsert(vectors=pinecone_data)

# # Close the cursor and connection
# cursor.close()
# conn.close()


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [99]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import shap

In [100]:
def connect_to_snowflake():
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    return engine

In [101]:
engine = connect_to_snowflake()

In [228]:
query = f"SELECT * FROM RAW.TEST3.sales_data"
df = pd.read_sql(query, engine)
df

Unnamed: 0,sls_doc_typ,billing_type,cust_no,fisc_yr,fisc_mo,cal_day,fisc_wk_num,sls_ofc_cv_cd,sls_ofc_cv,sls_grp_cv_cd,...,country,edw_cust_nm,currency,from_crncy,to_crncy,ex_rt_typ,ex_rt,country_cd,company_nm,current_fisc_per
0,ZC2K,ZC2K,111602,2024,7,2024-07-01,1,3220,Neighborhood Channel,K71,...,South Korea,GS RETAIL- CVS,KRW,KRW,KRW,BWAR,1.000000,KR,J&J Korea S&D LLC,2018012
1,ZC2K,ZC2K,111602,2024,7,2024-07-01,1,3220,Neighborhood Channel,K71,...,South Korea,GS RETAIL- CVS,KRW,KRW,USD,BWAR,0.000755,KR,J&J Korea S&D LLC,2018012
2,ZC2K,ZC2K,111602,2024,7,2024-07-01,1,3220,Neighborhood Channel,K71,...,South Korea,GS RETAIL- CVS,KRW,KRW,SGD,BWAR,0.001023,KR,J&J Korea S&D LLC,2018012
3,ZC2K,ZC2K,111602,2024,7,2024-07-01,1,3220,Neighborhood Channel,K71,...,South Korea,GS RETAIL- CVS,KRW,KRW,KRW,BWAR,1.000000,KR,J&J Korea S&D LLC,2018012
4,ZC2K,ZC2K,111602,2024,7,2024-07-01,1,3220,Neighborhood Channel,K71,...,South Korea,GS RETAIL- CVS,KRW,KRW,USD,BWAR,0.000755,KR,J&J Korea S&D LLC,2018012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66607,ZSMX,ZSMX,110616,2024,7,2024-07-24,4,1170,OTHERS,H76,...,Hong Kong,MARKETING,HKD,HKD,USD,BWAR,0.127490,HK,J&J Hong Kong,2018012
66608,ZSMX,ZSMX,110616,2024,7,2024-07-24,4,1170,OTHERS,H76,...,Hong Kong,MARKETING,HKD,HKD,USD,BWAR,0.127490,HK,J&J Hong Kong,2018012
66609,ZSMX,ZSMX,110616,2024,7,2024-07-24,4,1170,OTHERS,H76,...,Hong Kong,MARKETING,HKD,HKD,SGD,BWAR,0.172890,HK,J&J Hong Kong,2018012
66610,ZSMX,ZSMX,110616,2024,7,2024-07-24,4,1170,OTHERS,H76,...,Hong Kong,MARKETING,HKD,HKD,USD,BWAR,0.127490,HK,J&J Hong Kong,2018012


In [216]:
# numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
# text_columns = df.select_dtypes(include=[object]).columns.tolist()

# # Print out detected column types
# print(f"Numeric Columns: {numeric_columns}")
# print(f"Text Columns: {text_columns}")


numeric_columns = []
text_columns = []

# Iterate through the columns to check if they are numeric or non-numeric
for column in df.columns:
    if pd.to_numeric(df[column], errors='coerce').notna().all():  # Check if all values can be converted to numeric
        numeric_columns.append(column)
    else:
        text_columns.append(column)

# Output the lists
final_columns = numeric_columns+(text_columns)
print(final_columns)
for c in final_columns:
    print(c)

['orderid', 'backlog', 'qty', 'subamt1', 'discount', 'subamt2', 'discountbtline', 'totalbeforevat', 'total', 'no', 'avgdiscount', 'crt_dttm', 'saleunit', 'orderdate', 'customer_id', 'customer_name', 'city', 'region', 'saledistrict', 'saleoffice', 'salegroup', 'customertype', 'storetype', 'saletype', 'salesemployee', 'salename', 'productid', 'productname', 'megabrand', 'brand', 'baseproduct', 'variant', 'putup', 'priceref', 'canceled', 'documentid', 'return_reason', 'promotioncode', 'promotioncode1', 'promotioncode2', 'promotioncode3', 'promotioncode4', 'promotioncode5', 'promotion_code', 'promotion_code2', 'promotion_code3', 'ordertype', 'approverstatus', 'pricelevel', 'optional3', 'deliverydate', 'ordertime', 'shipto', 'billto', 'deliveryrouteid', 'approved_date', 'approved_time', 'ref_15', 'paymenttype', 'filename', 'run_id']
orderid
backlog
qty
subamt1
discount
subamt2
discountbtline
totalbeforevat
total
no
avgdiscount
crt_dttm
saleunit
orderdate
customer_id
customer_name
city
regio

In [203]:
def get_embeddings(texts):
    embeddings_model = AzureOpenAIEmbeddings(
    deployment=os.environ["AZURE_OPENAI_EMBED_DEPLOYMENT_NAME"],
    model=os.environ["AZURE_OPENAI_EMBED_MODEL_NAME"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    openai_api_type=os.environ["AZURE_OPENAI_API_TYPE"]
    )
    response = embeddings_model.embed_query(texts)
    return np.array(response)

data_array = np.array(df)

    # Vectorize the data (Assuming text data in the first column)
text_data = str(data_array[:])  # Assuming the first column is text-based data
    # Generate embeddings using GPT model
final_data = get_embeddings(text_data).reshape(-1,1)
# Assume we are processing the first text column

print(final_data)
iso_forest = IsolationForest(contamination= 0.01, 
                            max_features= 0.5,
                            max_samples = 0.5,
                            n_estimators =50,
                            random_state = 42 )

print(type(final_data))
y_pred = iso_forest.fit_predict(final_data)

print(y_pred)
anomalies = np.where(y_pred == -1)[0]
print(f"Anomalies detected in table {table_name} at indices: {anomalies}")


[[-0.01863261]
 [-0.01264146]
 [ 0.0001371 ]
 ...
 [-0.02907683]
 [-0.02515109]
 [-0.03123012]]
<class 'numpy.ndarray'>
[1 1 1 ... 1 1 1]
Anomalies detected in table SDL_LA_GT_SALES_ORDER_FACT at indices: [   9  121  194  430  498  702  704  757  820  954 1120 1246 1348 1432
 1487 1500]


In [229]:
import snowflake.connector
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import numpy as np
from sklearn.ensemble import IsolationForest
import pinecone

In [230]:


def preprocess_and_embed(data, embedding_model):
    model = SentenceTransformer(embedding_model)
    embeddings = model.encode(data.astype(str).values.tolist())
    return embeddings

# embedding_model = 'all-MiniLM-L6-v2'
def get_embeddings(texts):
    embeddings_model = AzureOpenAIEmbeddings(
    deployment=os.environ["AZURE_OPENAI_EMBED_DEPLOYMENT_NAME"],
    model=os.environ["AZURE_OPENAI_EMBED_MODEL_NAME"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    openai_api_type=os.environ["AZURE_OPENAI_API_TYPE"]
    )
    response = embeddings_model.embed_query(texts)
    return np.array(response)

# data_embeddings = preprocess_and_embed(df, embedding_model)

data_array = np.array(df)

    # Vectorize the data (Assuming text data in the first column)
text_data = str(data_array[:])  # Assuming the first column is text-based data
    # Generate embeddings using GPT model
data_embeddings = get_embeddings(text_data).reshape(-1,1)

data_embeddings

array([[-0.03059798],
       [ 0.01390478],
       [ 0.00912571],
       ...,
       [-0.02475275],
       [-0.0205925 ],
       [-0.01756551]])

In [234]:
def reduce_dimensions(embeddings, n_components=1):
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings)
    return reduced_embeddings

reduced_embeddings = reduce_dimensions(data_embeddings)

reduced_embeddings

array([[-0.02983477],
       [ 0.014668  ],
       [ 0.00988893],
       ...,
       [-0.02398954],
       [-0.01982928],
       [-0.01680229]])

In [243]:
def detect_anomalies(embeddings, data_length):
    model = IsolationForest(contamination=0.01, random_state=42)
    model.fit(embeddings)
    scores = model.decision_function(embeddings)
    anomalies = model.predict(embeddings)
    if len(scores) != data_length or len(anomalies) != data_length:
        raise ValueError("Mismatch between data length and anomaly detection output length.")
    return scores, anomalies

scores, anomalies = detect_anomalies(reduced_embeddings, len(data))
# data['anomaly_score'] = scores
# data['is_anomaly'] = anomalies
print(scores,anomalies)


ValueError: Mismatch between data length and anomaly detection output length.

In [253]:
import openai
import snowflake.connector
import pandas as pd
import pinecone
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from openai import OpenAI
import os
from pinecone import Pinecone, ServerlessSpec
load_dotenv()

def connect_to_snowflake():
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    return engine

engine = connect_to_snowflake()
# Pinecone setup (make sure to initialize with your API key)


pc = Pinecone(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Now do stuff
index_name = "quickstart"

pc.create_index(
    name=index_name,
    dimension=2, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)
 

# OpenAI setup
# openai.api_key = '<your_openai_api_key>'

from openai import OpenAI
client = OpenAI()

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '99ba62beaa013211895e848671940930', 'Date': 'Wed, 08 Jan 2025 15:06:01 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [None]:
import openai
import snowflake.connector
import pandas as pd
import pinecone
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from openai import OpenAI

def connect_to_snowflake():
    connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
    engine = create_engine(connection_string)
    return engine

engine = connect_to_snowflake()
# Pinecone setup (make sure to initialize with your API key)
pinecone.init(api_key='pcsk_gLrQj_4psa9Mz21uBCPyi4FkoTFQpM5vR3cdSFqA6pcskrzQTsF3w3EiaN8WmejJbxYey', environment='us-west1-gcp')
index_name = 'Sales_data_index'
index = pinecone.Index(index_name)

# OpenAI setup
openai.api_key = '<your_openai_api_key>'

from openai import OpenAI
client = OpenAI()

# Function to extract data from Snowflake
def fetch_data_from_snowflake(query):
    query = "Select 8 from RAW.TEST3.SALES_DATA"
    df = connect_to_snowflake(query, query)
    return df

# Function to generate embeddings from the model
def generate_embeddings(texts, model="text-embedding-ada-002"):
    response = openai.Embedding.create(
        input=texts,
        model=model
    )
    embeddings = [embedding['embedding'] for embedding in response['data']]
    return embeddings

# Function to preprocess data and generate embeddings in chunks
def process_and_embed_data(df, chunk_size=1000):
    embeddings_list = []
    # Loop over the data in chunks
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start+chunk_size]
        # Preprocess data (e.g., combine columns into one text string for embedding)
        texts = chunk.apply(lambda row: ' '.join(row.astype(str)), axis=1).tolist()
        
        # Generate embeddings for this chunk
        embeddings = generate_embeddings(texts)
        embeddings_list.extend(embeddings)
    return embeddings_list

# Function to store embeddings in Pinecone vector database
def store_embeddings_in_pinecone(embeddings_list, ids):
    # Preparing the data for Pinecone
    vectors = [(ids[i], embeddings_list[i]) for i in range(len(ids))]
    
    # Upsert to Pinecone
    index.upsert(vectors)

# Function to reduce dimensionality if needed (PCA)
def reduce_dimensions(embeddings_list, n_components=512):
    # Normalize the embeddings
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings_list)
    
    # Apply PCA
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings_scaled)
    return reduced_embeddings

# Main function to run the end-to-end process
def main():
    # Step 1: Extract data from Snowflake
    query = "SELECT * FROM your_table LIMIT 5000000"  # Adjust to your table
    df = fetch_data_from_snowflake(query)
    
    # Step 2: Process and generate embeddings in chunks
    embeddings_list = process_and_embed_data(df, chunk_size=1000)
    
    # Step 3: Optionally reduce dimensionality (PCA)
    reduced_embeddings = reduce_dimensions(embeddings_list)
    
    # Step 4: Create unique IDs for each embedding (e.g., based on row indices)
    ids = [str(i) for i in range(len(df))]
    
    # Step 5: Store embeddings in Pinecone
    store_embeddings_in_pinecone(reduced_embeddings, ids)
    
    print("Embedding generation and storage complete.")

if __name__ == "__main__":
    main()


In [282]:
query = "SELECT distinct TABLE_NAME FROM INFORMATION_SCHEMA.tables where TABLE_SCHEMA = 'TEST' "
tables = pd.read_sql(query, engine)

tables = tables['table_name'].to_numpy()

tables = tables[0:11]

tables

array(['SDL_ECOM_SHOPEE_COMPENSATION', 'SDL_MDS_LOG',
       'SDL_MDS_MY_PS_TARGETS', 'SDL_POP6_SG_PLANNED_VISITS',
       'SDL_LA_GT_SCHEDULE', 'SDL_MDS_MY_ECOM_PRODUCT',
       'SDL_ECOMMERCE_OFFTAKE_AMAZON', 'SDL_MDS_SG_PRODUCT_EXCEPTIONS',
       'SDL_MDS_VN_PS_WEIGHTS', 'SDL_KR_COUPANG_PRODUCT_MASTER',
       'SDL_ID_POS_IDM_SELLOUT'], dtype=object)

In [286]:
import faiss

for table in tables:
    table_name = table
    print(f"Processing table: {table_name}")
    
    # Query to fetch all data from the table
    data_query = f"SELECT * FROM {table_name}"  # No LIMIT applied here
    data = pd.read_sql(query,engine)

    # Convert to NumPy array (assuming your data is tabular)
    data_array = np.array(data)

    # Vectorize the data (Assuming text data in the first column)
    text_data = str(data_array[:, 0])  # Assuming the first column is text-based data

    # Generate embeddings using GPT model
    embeddings = get_embeddings(text_data).reshape(-1,1)

    # Process embeddings (e.g., anomaly detection or storage in Pinecone)
    print(f"Generated embeddings for table {table_name}")
    
    dimension = embeddings.shape[1]  # This should be the size of the embedding vectors (e.g., 1536 for 'text-embedding-ada-002')
    index = faiss.IndexFlatL2(dimension)  # Use L2 distance for similarity

    # Add embeddings to FAISS index
    embeddings = embeddings.astype(np.float32)  # FAISS requires float32 type
    index.add(embeddings)
    
    distance_threshold = 0.9
    k=4
    
    distances, indices = index.search(embeddings, k)  # k nearest neighbors
    
    anomalies = []
    
    # Iterate over each data point
    for i, dist in enumerate(distances):
        # Calculate the average distance to the k-nearest neighbors (excluding the point itself)
        avg_distance = np.mean(dist[1:])  # Exclude the first distance (which is the point itself)
        
        # If the average distance exceeds the threshold, mark it as an anomaly
        if avg_distance > distance_threshold:
            anomalies.append(i)
    

print("DONE")
print(anomalies)

Processing table: SDL_ECOM_SHOPEE_COMPENSATION
Generated embeddings for table SDL_ECOM_SHOPEE_COMPENSATION
Processing table: SDL_MDS_LOG
Generated embeddings for table SDL_MDS_LOG
Processing table: SDL_MDS_MY_PS_TARGETS
Generated embeddings for table SDL_MDS_MY_PS_TARGETS
Processing table: SDL_POP6_SG_PLANNED_VISITS
Generated embeddings for table SDL_POP6_SG_PLANNED_VISITS
Processing table: SDL_LA_GT_SCHEDULE
Generated embeddings for table SDL_LA_GT_SCHEDULE
Processing table: SDL_MDS_MY_ECOM_PRODUCT
Generated embeddings for table SDL_MDS_MY_ECOM_PRODUCT
Processing table: SDL_ECOMMERCE_OFFTAKE_AMAZON
Generated embeddings for table SDL_ECOMMERCE_OFFTAKE_AMAZON
Processing table: SDL_MDS_SG_PRODUCT_EXCEPTIONS
Generated embeddings for table SDL_MDS_SG_PRODUCT_EXCEPTIONS
Processing table: SDL_MDS_VN_PS_WEIGHTS
Generated embeddings for table SDL_MDS_VN_PS_WEIGHTS
Processing table: SDL_KR_COUPANG_PRODUCT_MASTER
Generated embeddings for table SDL_KR_COUPANG_PRODUCT_MASTER
Processing table: SDL_

In [277]:
!pip install faiss-cpu



In [284]:
import faiss

# Initialize the FAISS index
dimension = embeddings.shape[1]  # This should be the size of the embedding vectors (e.g., 1536 for 'text-embedding-ada-002')
index = faiss.IndexFlatL2(dimension)  # Use L2 distance for similarity

# Add embeddings to FAISS index
embeddings = embeddings.astype(np.float32)  # FAISS requires float32 type
index.add(embeddings)

In [285]:
def detect_anomalies(index, embeddings, k=2, distance_threshold=0.9):
    # Perform k-nearest neighbors search for each embedding
    distances, indices = index.search(embeddings, k)  # k nearest neighbors
    
    anomalies = []
    
    # Iterate over each data point
    for i, dist in enumerate(distances):
        # Calculate the average distance to the k-nearest neighbors (excluding the point itself)
        avg_distance = np.mean(dist[1:])  # Exclude the first distance (which is the point itself)
        
        # If the average distance exceeds the threshold, mark it as an anomaly
        if avg_distance > distance_threshold:
            anomalies.append(i)
    
    return anomalies

# Detect anomalies
anomalies = detect_anomalies(index, embeddings, k=5, distance_threshold=0.5)
print(f"Anomalies found at indices: {anomalies}")


Anomalies found at indices: []


In [297]:
import openai
import pandas as pd
import snowflake.connector
import time




def connect_to_snowflake():
        connection_string = f"snowflake://{env_vars.get('SNOWFLAKE_USER')}:{env_vars.get('SNOWFLAKE_PASSWORD')}@{env_vars.get('SNOWFLAKE_ACCOUNT')}/{env_vars.get('SNOWFLAKE_DATABASE')}/{env_vars.get('SNOWFLAKE_SCHEMA')}"
        engine = create_engine(connection_string)
        return engine
# Function to extract data from Snowflake


def fetch_data_from_snowflake(query):
    engine = connect_to_snowflake()
    
    df = pd.read_sql(query,engine)
    return df




# Function to generate anomaly scores or detection results from Gen-AI
def detect_anomalies_with_gen_ai(data_batch):
    
    model = AzureChatOpenAI(
            azure_endpoint=env_vars.get("AZURE_OPENAI_ENDPOINT"),
            azure_deployment=env_vars.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
            openai_api_version=env_vars.get("AZURE_OPENAI_API_VERSION"),
            openai_api_key=env_vars.get("AZURE_OPENAI_API_KEY"),
            )
        
    
    prompt = f'''Identify anomalies in the following data:
    {data_batch} Please list any rows or patterns that seem unusual or inconsistent.
    Dont provide any extra content as total token limit is less.
    only provide :
    table name: <table name>
    solution: <solution>'''
    
    response = model(prompt)
    
    return response.content

# Function to process data in chunks and detect anomalies
def process_data_and_detect_anomalies(df, chunk_size=500):
    anomalies = []
    
    # Loop over the data in manageable chunks
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        
        # Convert the chunk into a readable string format for the Gen-AI model
        chunk_str = chunk.to_string(index=False)
        
        # Pass the chunk to the Gen-AI model for anomaly detection
        print(f"Processing rows {start} to {start + chunk_size}")
        anomaly_result = detect_anomalies_with_gen_ai(chunk_str)
        
        # Store the result (anomalies detected) for further processing
        anomalies.append((start, start + chunk_size, anomaly_result))
        
        # Optional: Add a sleep to avoid hitting API rate limits
        time.sleep(2)
    
    return anomalies

# Main function to run the end-to-end process
def main():
    # Step 1: Extract data from Snowflake
    query = "SELECT * FROM RAW.TEST.SDL_JNJ_CONSUMERREACH_CVS"  # Adjust as necessary
    df = fetch_data_from_snowflake(query)
    
    # Step 2: Process and detect anomalies in chunks
    anomalies = process_data_and_detect_anomalies(df, chunk_size=500)
    
    # Step 3: Output or further process the detected anomalies
    for start, end, anomaly in anomalies:
        print(f"Rows {start}-{end}:")
        print(anomaly)
        print("-" * 50)

if __name__ == "__main__":
    main()


Processing rows 0 to 500
Rows 0-500:
```
table name: anomalies
solution: 
1. Rows with 'None' in critical fields:
   - 'retail': Rows 1, 7, 14, 19, 25, 30, 35, 39, 43, 46, 50
   - 'retailname': Rows 2, 20
   - 'retailbranch': Rows 8, 22, 27, 38, 49
   - 'retailprovince': Rows 3, 15, 23, 28, 33, 44
   - 'jjskubarcode': Rows 4, 11, 32, 45, 49
   - 'jjskuname': Rows 5, 16, 28, 38, 49
   - 'jjcore': Rows 3, 9, 13, 17, 21, 29, 40, 45, 49
   - 'distribution': Rows 1, 6, 10, 17, 30, 35, 40, 49
   - 'file_name': Rows 3, 12, 22, 34, 47
2. Negative 'run_id': Rows 2, 20, 48
3. Row 49 has 'None' across multiple critical fields.
```
--------------------------------------------------
