In [24]:
import os
import boto3
import pandas as pd
import polars as pl
from io import BytesIO, StringIO

def load_file(path, use_polars, is_s3=False, s3_client=None):
    """
    Load a single CSV or Parquet file from a local path or from S3.

    Parameters:
    - path: For local files, the filesystem path (str or os.PathLike).
      For S3, a (bucket_name, key) pair.
    - use_polars: Whether to use Polars instead of Pandas.
    - is_s3: If True, load the file from S3 through ``s3_client``.
    - s3_client: The Boto3 S3 client, required if is_s3 is True.

    Returns:
    - A DataFrame (Polars or Pandas) loaded from the file.

    Raises:
    - ValueError: if is_s3 is True but no client was given, or if the file
      name does not end in .csv or .parquet.
    """
    if is_s3:
        # Fail loudly instead of falling through to the local branch with a
        # (bucket, key) tuple, which used to surface as an AttributeError.
        if s3_client is None:
            raise ValueError("s3_client is required when is_s3 is True.")
        bucket, key = path[0], path[1]
        name = key
    else:
        name = os.fspath(path)

    # Pick the reader before touching S3 so an unsupported extension does not
    # cost a download.
    if name.endswith('.csv'):
        reader = pl.read_csv if use_polars else pd.read_csv
    elif name.endswith('.parquet'):
        reader = pl.read_parquet if use_polars else pd.read_parquet
    else:
        raise ValueError("File extension not supported. Please use .csv or .parquet.")

    if not is_s3:
        return reader(path)

    response = s3_client.get_object(Bucket=bucket, Key=key)
    # Both libraries accept a binary buffer, so the payload is handed over
    # as-is rather than decoded to text and re-encoded.
    return reader(BytesIO(response['Body'].read()))

def load_s3(bucket_name: str,
            s3_directory: str = '',
            file_name: str = '',
            aws_region: str = 'us-west-2',
            use_polars: bool = False,
            load_all: bool = False,
            selected_files: list = None):
    """
    Loads files from an S3 directory into Pandas or Polars DataFrames. Supports CSV and Parquet formats.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - s3_directory: Key prefix ("directory") inside the bucket; a trailing
      slash is optional. Empty means the bucket root.
    - file_name: File to load when load_all is False.
    - aws_region: Region used to create the Boto3 client.
    - use_polars: Whether to use Polars instead of Pandas.
    - load_all: If True, load every .csv/.parquet object under s3_directory
      (including nested prefixes) instead of a single file.
    - selected_files: Optional file names (or sub-paths) restricting which
      objects are loaded when load_all is True.

    Returns:
    - A single DataFrame when load_all is False, otherwise a dict mapping
      each object's base name to its DataFrame. Objects that share a base
      name under different prefixes overwrite each other (a warning is
      printed).

    Raises:
    - ValueError: if load_all is False and file_name is empty.
    """
    s3_client = boto3.client('s3', region_name=aws_region)
    # Normalise so 'data' and 'data/' behave the same and no 'data//x.csv'
    # key is ever built.
    directory = (s3_directory or '').rstrip('/')

    if not load_all:
        if not file_name:
            raise ValueError("file_name is required when load_all is False.")
        s3_path = f"{directory}/{file_name}" if directory else file_name
        df = load_file((bucket_name, s3_path), use_polars, is_s3=True, s3_client=s3_client)
        print(f"DataFrame loaded from S3://{bucket_name}/{s3_path}")
        return df

    # The trailing slash keeps sibling prefixes such as 'data_backup/' out of
    # a listing for 'data'.
    prefix = f"{directory}/" if directory else ''
    # list_objects_v2 returns at most 1,000 keys per call; the paginator
    # follows continuation tokens so large directories are listed in full.
    paginator = s3_client.get_paginator('list_objects_v2')
    all_files = [
        obj['Key']
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
        for obj in page.get('Contents', [])
        if obj['Key'].endswith(('.csv', '.parquet'))
    ]

    def _matches(key, wanted):
        # Match on a whole path component so 'hhh.csv' does not select 'ahhh.csv'.
        return key == wanted or key.endswith('/' + wanted)

    if selected_files:
        files_to_load = [
            key for key in all_files
            if any(_matches(key, wanted) for wanted in selected_files)
        ]
        missing = [
            wanted for wanted in selected_files
            if not any(_matches(key, wanted) for key in all_files)
        ]
        if missing:
            print(f"Warning: selected files not found under S3://{bucket_name}/{prefix}: {missing}")
    else:
        files_to_load = all_files

    dataframes = {}
    for s3_path in files_to_load:
        df = load_file((bucket_name, s3_path), use_polars, is_s3=True, s3_client=s3_client)
        file_key = os.path.basename(s3_path)
        if file_key in dataframes:
            print(f"Warning: duplicate file name '{file_key}'; keeping the one from {s3_path}")
        dataframes[file_key] = df
        print(f"DataFrame loaded from S3://{bucket_name}/{s3_path}")
    return dataframes

def load_local(data_folder: str,
               file_name: str = '',
               use_polars: bool = False,
               load_all: bool = False,
               selected_files: list = None):
    """
    Loads files from a local directory into Pandas or Polars DataFrames. Supports CSV and Parquet formats.

    Parameters:
    - data_folder: Directory that contains the data files.
    - file_name: File to load when load_all is False (ignored otherwise).
    - use_polars: Whether to use Polars instead of Pandas.
    - load_all: If True, load every .csv/.parquet file directly inside
      data_folder instead of a single file.
    - selected_files: Optional file names restricting which files are loaded
      when load_all is True.

    Returns:
    - A single DataFrame when load_all is False, otherwise a dict mapping
      file name to DataFrame, in sorted file-name order.

    Raises:
    - ValueError: if load_all is False and file_name is empty.
    """
    if not load_all:
        if not file_name:
            raise ValueError("file_name is required when load_all is False.")
        local_path = os.path.join(data_folder, file_name)
        df = load_file(local_path, use_polars)
        print(f"DataFrame loaded from {local_path}")
        return df

    # os.listdir returns entries in arbitrary order and includes directories;
    # sort for reproducible output and keep regular files only.
    all_files = sorted(
        entry for entry in os.listdir(data_folder)
        if entry.endswith(('.csv', '.parquet'))
        and os.path.isfile(os.path.join(data_folder, entry))
    )

    if selected_files:
        wanted = set(selected_files)
        files_to_load = [entry for entry in all_files if entry in wanted]
        missing = sorted(wanted.difference(all_files))
        if missing:
            # Surface typos instead of silently returning fewer frames.
            print(f"Warning: selected files not found in {data_folder}: {missing}")
    else:
        files_to_load = all_files

    dataframes = {}
    for file in files_to_load:
        local_path = os.path.join(data_folder, file)
        df = load_file(local_path, use_polars)
        dataframes[file] = df
        print(f"DataFrame loaded from {local_path}")
    return dataframes

# Example S3 usage (left commented out):
# df = load_s3('jtrade1-dir', 'data', 'hhh.csv', use_polars=False)  # single file
# df_dict = load_s3('jtrade1-dir', 'data', use_polars=True, load_all=True)  # every file in the directory
# df_dict = load_s3('jtrade1-dir', 'data', use_polars=False, load_all=True,
#                   selected_files=['hhh.csv', 'eeeee.parquet'])  # chosen files only

# load_all=True ignores file_name, so it is not passed here; only the files
# named in selected_files are loaded.
load_local(data_folder='data',
           use_polars=True,
           load_all=True,
           selected_files=['products.csv', 'users.csv'])

DataFrame loaded from data/products.csv
DataFrame loaded from data/users.csv


{'products.csv': shape: (3, 3)
 ┌─────┬────────────┬───────┐
 │ id  ┆ name       ┆ price │
 │ --- ┆ ---        ┆ ---   │
 │ i64 ┆ str        ┆ i64   │
 ╞═════╪════════════╪═══════╡
 │ 1   ┆ Laptop     ┆ 1000  │
 │ 2   ┆ Smartphone ┆ 800   │
 │ 3   ┆ Tablet     ┆ 600   │
 └─────┴────────────┴───────┘,
 'users.csv': shape: (3, 3)
 ┌─────┬───────────────┬───────────────────────────┐
 │ id  ┆ name          ┆ email                     │
 │ --- ┆ ---           ┆ ---                       │
 │ i64 ┆ str           ┆ str                       │
 ╞═════╪═══════════════╪═══════════════════════════╡
 │ 1   ┆ John Doe      ┆ john.doe@example.com      │
 │ 2   ┆ Jane Smith    ┆ jane.smith@example.com    │
 │ 3   ┆ Emily Johnson ┆ emily.johnson@example.com │
 └─────┴───────────────┴───────────────────────────┘}

In [None]:
# NOTE(review): no active code in the visible cells assigns `df` (the load_s3
# call that would create it is commented out), so this presumably relies on
# leftover kernel state — confirm where `df` should come from.
# NOTE(review): if `df` is a pandas DataFrame, to_csv with default arguments
# also writes the index column; the following cell passes index=False.
df.to_csv('hah.csv')

In [None]:
# Upload the CSV text of `df` through dm.to_s3.
# NOTE(review): neither `dm` nor `df` is defined in the visible cells — confirm
# which import/cell provides them.
# NOTE(review): bucket_name is an "s3://bucket/prefix/" URI rather than a bare
# bucket name; verify that dm.to_s3 expects this form.
dm.to_s3(bucket_name = 's3://jtrade1-dir/data/',
        object_name='hah.csv',
        data=df.to_csv(index=False))

In [None]:

# Load data locally
# NOTE(review): `data_master` and `df_example` are not defined in the visible
# cells — confirm which import/cell provides them before running.
df_loaded = data_master.load_from_local('data/2024/07/25/example.csv', 'csv')
print(df_loaded)

# Save data to S3
# 'your-bucket-name' is a placeholder value; replace it with a real bucket.
data_master.save_to_s3('your-bucket-name', 'example.csv', df_example.to_csv(index=False))


### Embedding

In [None]:
import json

import boto3

# Bedrock runtime client used to invoke the embedding model.
bedrock = boto3.client(service_name='bedrock-runtime', region_name='us-west-2')

# Request settings: model to call and the MIME types for request and response.
model_id = 'amazon.titan-embed-text-v1'
accept = 'application/json'
content_type = 'application/json'

# The text to embed, wrapped in the JSON payload sent to the model.
prompt = 'canberra is capital of australia'
body = json.dumps({"inputText": prompt})

response = bedrock.invoke_model(
    body=body,
    modelId=model_id,
    accept=accept,
    contentType=content_type,
)

# The response body is a stream; read it once and parse the JSON document.
response_body = json.loads(response['body'].read())
# .get() yields None when the response carries no 'embedding' key.
embedding = response_body.get('embedding')
print(embedding)

In [None]:
# Create the agent instance that the prompt cells below call via process_prompt.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# notebook's import cell.
from text_to_sql_agent import TextToSQLAgent

agent = TextToSQLAgent()

In [None]:
# Send a data question to the agent and print whatever process_prompt returns.
user_prompt = "what are the customer emails"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Follow-up with no subject of its own — presumably relies on the agent
# remembering the previous question (TODO confirm).
user_prompt = "provide me the sql only"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# NOTE(review): "some" looks like a typo for "sum" and "sells price" for
# "sell/sales price". The string is sent to the agent verbatim, so correct it
# only if the misspelling is unintended.
user_prompt = "some all the ids then minus the sells price"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Presumably checks that the agent keeps conversation history across
# process_prompt calls — TODO confirm against TextToSQLAgent.
user_prompt = "what was my first question"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Ask the agent for a sample pandas DataFrame rather than a SQL query.
user_prompt = "generate me a pandas dataframe sample table"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Refers to "the second column" without naming a table — presumably the
# sample table requested in the previous prompt (TODO confirm).
user_prompt = "select the second column"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Another context-dependent follow-up: asks for one cell of a table the
# prompt itself does not identify.
user_prompt = "what is the value of the 3rd column, second value"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# NOTE(review): "columsn" looks like a typo for "columns". The string is sent
# to the agent verbatim, so correct it only if the misspelling is unintended.
user_prompt = "rename the columsn to city related"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Prompt unrelated to SQL or data — presumably probes how the agent handles
# off-topic requests (TODO confirm intent).
user_prompt = "tell me a story about dragon"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Follow-up whose "it" is only meaningful if the agent recalls the previous
# prompt.
user_prompt = "tell me more about it"
result = agent.process_prompt(user_prompt)
print(result)


In [None]:
# Print every item returned by AWSBedrock.get_active_models(), one per line.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# notebook's import cell.
from TAI.genai import AWSBedrock
ab = AWSBedrock()
for i in ab.get_active_models():
    print (i)
# Alternative that prints a single field — assumes each item is a mapping with
# a 'modelId' key (TODO confirm):
#     print (i['modelId'])