In [1]:
import requests
import json
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
import pandas as pd
import polars as pl
import boto3
import logging

# Read the local .env file so its entries become environment variables
load_dotenv()

# AWS credentials come from the environment (not from SSM); they are used to
# build the boto3 session and, from it, the SSM client for ap-southeast-2
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
ssm_client = session.client('ssm', region_name='ap-southeast-2')

# Smoke-test the SSM connection by listing the Dinlr parameters once.
# NOTE(review): `response` holds decrypted values and is not read again in the
# visible cells (get_secrets() issues its own request).
response = ssm_client.get_parameters_by_path(Path='/tug-dinlr/api/', Recursive=True, WithDecryption=True)

# Function to get secrets from AWS SSM
def get_secrets():
    """Return every SSM parameter stored under ``/tug-dinlr/api/`` as a dict.

    The dictionary key is the last path segment of the parameter name
    (e.g. ``/tug-dinlr/api/ACCESS_TOKEN`` -> ``ACCESS_TOKEN``) and the value
    is the decrypted parameter value.

    ``get_parameters_by_path`` returns results in pages, so a single call can
    silently omit parameters. The paginator follows ``NextToken`` until every
    page has been read.

    Returns:
        dict[str, str]: parameter short name -> value.
    """
    paginator = ssm_client.get_paginator('get_parameters_by_path')
    pages = paginator.paginate(
        Path='/tug-dinlr/api/',
        Recursive=True,
        WithDecryption=True
    )

    secrets = {}
    for page in pages:
        for param in page['Parameters']:
            secrets[param['Name'].split('/')[-1]] = param['Value']

    return secrets

# Load the Dinlr settings (tokens, ids) stored in SSM
params = get_secrets()

# Dinlr API root used by the request helpers below
base_url = "https://api.dinlr.com/v1"

# Headers for authenticated API calls: bearer token taken from the SSM parameters
aheaders = {"Authorization": "Bearer {}".format(params['ACCESS_TOKEN'])}

# Headers for form-encoded requests (sent with the OAuth token refresh)
rheaders = {'Content-Type': 'application/x-www-form-urlencoded'}

def get_locations(restaurant_id, headers):
    """Fetch the restaurant's online-order locations from the Dinlr API.

    Args:
        restaurant_id: Restaurant identifier inserted into the request path.
        headers: HTTP headers sent with the request (e.g. the bearer token).

    Returns:
        list[tuple]: one ``(id, name)`` tuple per entry in the response's
        ``"data"`` list.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status. Without
            this check an error payload would surface as ``KeyError: 'data'``.
    """
    response = requests.get(f"{base_url}/{restaurant_id}/onlineorder/locations", headers=headers)
    # Fail loudly on HTTP errors, like the other request helpers in this notebook
    response.raise_for_status()
    data = response.json()

    # Keep only the 'id' and 'name' of each location
    return [(location['id'], location['name']) for location in data['data']]

# (id, name) pairs for every online-order location of the configured restaurant
locations = get_locations(restaurant_id=params['RESTAURANT_ID'], headers=aheaders)

def convert_to_datetime(date_string):
    """Parse a ``YYYY-MM-DDTHH:MM:SS+08:00`` string into a naive ``datetime``.

    The ``+08:00`` suffix is matched as literal text, so the result carries no
    tzinfo and any other offset raises ``ValueError``.
    """
    fixed_offset_format = "%Y-%m-%dT%H:%M:%S+08:00"
    parsed = datetime.strptime(date_string, fixed_offset_format)
    return parsed

def convert_to_datetime_timezone(date_string):
    """Parse an ISO-8601 style timestamp with a UTC offset into an aware ``datetime``.

    The offset is read through ``%z``, so the returned value keeps the offset
    given in ``date_string``.
    """
    offset_aware_format = "%Y-%m-%dT%H:%M:%S%z"
    parsed = datetime.strptime(date_string, offset_aware_format)
    return parsed

def is_token_expired(expiry_date_str):
    """Check if the access token has expired.

    Args:
        expiry_date_str: Expiry timestamp with a UTC offset, in the format
            accepted by ``convert_to_datetime_timezone``
            (e.g. ``"2024-02-25T02:00:15+08:00"``).

    Returns:
        bool: ``True`` when the current time is at or past the expiry.
    """
    expiry_date = convert_to_datetime_timezone(expiry_date_str)
    # Timezone-aware datetimes compare by absolute instant, so "now" in UTC
    # gives the same answer as "now" in UTC+8 and does not rely on a
    # module-level `utc_plus_8` (which is not defined in this notebook).
    return datetime.now(timezone.utc) >= expiry_date

def refresh_access_token():
    """Request a new access token using the refresh token and persist it to SSM.

    Posts a ``refresh_token`` grant to the Dinlr OAuth endpoint, then writes
    the new ``ACCESS_TOKEN``, ``REFRESH_TOKEN``, ``EXPIRES_AT`` and
    ``EXPIRES_IN`` values under ``/tug-dinlr/api/`` in SSM.

    The module-level ``params`` and ``aheaders`` are NOT modified here;
    callers that keep using them must apply the returned values themselves.

    Returns:
        dict[str, str]: the four values written to SSM.

    Raises:
        requests.HTTPError: if the token endpoint answers with a 4xx/5xx status.
    """
    # The notebook formats timestamps with a fixed +08:00 offset; define the
    # matching timezone locally so the function does not depend on a global
    # `utc_plus_8` (which is not defined anywhere in this notebook).
    utc_plus_8 = timezone(timedelta(hours=8))

    parameters = {
        "refresh_token": params['REFRESH_TOKEN'],
        "client_id": params['CLIENT_ID'],
        "client_secret": params['CLIENT_SECRET'],
        "grant_type": "refresh_token"
    }

    response = requests.post(f"{base_url}/{params['RESTAURANT_ID']}/oauth/token", data=parameters, headers=rheaders)
    response.raise_for_status()  # Ensure we raise an error for bad responses
    data = response.json()

    # Absolute expiry = now (UTC+8) + the lifetime reported by the API, in seconds
    expires_at = datetime.now(utc_plus_8) + timedelta(seconds=int(data["expires_in"]))

    new_params = {
        'ACCESS_TOKEN': data["access_token"],
        'REFRESH_TOKEN': data["refresh_token"],
        'EXPIRES_AT': expires_at.strftime("%Y-%m-%dT%H:%M:%S+08:00"),
        'EXPIRES_IN': str(data["expires_in"])
    }

    # NOTE(review): tokens are stored with Type='String' (plaintext) although they
    # are read back WithDecryption=True; confirm whether 'SecureString' is intended.
    for key, value in new_params.items():
        ssm_client.put_parameter(Name=f'/tug-dinlr/api/{key}', Value=value, Type='String', Overwrite=True)

    return new_params


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Define the functions that fetch orders and order details and upload the results to S3.

In [2]:
# Function to get all orders
def get_all_orders(location_id, all=True, update_at_min=None, create_at_min=None, create_at_max=None):
    """Fetch every online order for a location, following pagination.

    Pages are requested starting at 1 until the API returns an empty
    ``"data"`` list.

    Args:
        location_id: Location whose orders are requested.
        all: Unused; kept so existing callers keep working.
        update_at_min: Optional timestamp string sent as ``update_at_min``
            (its ``+`` is percent-encoded).
        create_at_min: Optional ``YYYY-MM-DDTHH:MM:SS+08:00`` string sent as
            ``create_at_min``.
        create_at_max: Optional upper bound in the same format. It is only
            sent when ``create_at_min`` is also given.

    Returns:
        list: the concatenated ``"data"`` entries of all pages.

    Raises:
        requests.HTTPError: if any page request answers with a 4xx/5xx status.
        ValueError: if ``create_at_min``/``create_at_max`` do not match the
            expected format.
    """
    def _encode_created(timestamp):
        # Validate/normalise through convert_to_datetime, then escape the '+'
        # so it is not decoded as a space in the query string.
        normalised = convert_to_datetime(timestamp).strftime("%Y-%m-%dT%H:%M:%S+08:00")
        return normalised.replace("+", "%2B")

    # Build the filter part of the query string ONCE, before paging. Doing it
    # inside the loop re-parsed the already-encoded value ("%2B08:00") on the
    # second page and raised ValueError.
    filters = ""
    if update_at_min:
        filters += "&update_at_min=" + update_at_min.replace("+", "%2B")
    if create_at_min:
        filters += "&create_at_min=" + _encode_created(create_at_min)
        if create_at_max:
            filters += "&create_at_max=" + _encode_created(create_at_max)

    orders = []
    page = 1
    while True:
        url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/orders?location_id={location_id}&page={page}{filters}"

        response = requests.get(url, headers=aheaders)
        response.raise_for_status()
        data = response.json()["data"]

        # An empty page marks the end of the result set
        if not data:
            break

        orders.extend(data)
        page += 1

    return orders

# Function to get order details and add 'location' key
def get_order_details(order_id, location='tug'):
    """Fetch one order's details and tag them with a ``'location'`` label.

    Args:
        order_id: Identifier of the order to request.
        location: Label stored under the ``'location'`` key of the result.

    Returns:
        dict: the response's ``"data"`` object with ``'location'`` added.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    restaurant_id = params['RESTAURANT_ID']
    resp = requests.get(
        f"{base_url}/{restaurant_id}/onlineorder/orders/{order_id}",
        headers=aheaders,
    )
    resp.raise_for_status()

    details = resp.json()["data"]
    details['location'] = location
    return details

# Function to upload to S3
def upload_data_to_s3(data, bucket_name, prefix, date_format="%Y-%m-%d", s3_resource=None):
    """Upload ``data`` as an indented JSON object to S3.

    The object key is ``{prefix}_{today}.json`` where ``today`` is the current
    local date formatted with ``date_format``.

    Args:
        data: List of order dicts. The last element's ``'created_at'`` is used
            to compute the return value.
            NOTE(review): this presumes the list is ordered by creation time;
            confirm against the caller.
        bucket_name: Target S3 bucket.
        prefix: Key prefix; also used in log messages.
        date_format: ``strftime`` format for the date part of the key.
        s3_resource: Optional boto3 S3 resource. When omitted, one is created
            from the notebook's ``session``. The previous code referenced a
            global ``s3`` that is not defined in this notebook, so every upload
            failed with a swallowed ``NameError``.

    Returns:
        str | None: ``created_at`` of the last element plus one second,
        formatted as ``YYYY-MM-DDTHH:MM:SS+08:00``; ``None`` when there is
        nothing to upload or the upload fails (the error is logged).
    """
    if not data:
        logging.info(f"No data to upload for {prefix}.")
        return None

    try:
        if s3_resource is None:
            s3_resource = session.resource('s3')

        # One second past the newest order, so the value can be used as the
        # next exclusive lower bound
        last_created = convert_to_datetime(data[-1]['created_at']) + timedelta(seconds=1)
        last_created_str = last_created.strftime("%Y-%m-%dT%H:%M:%S+08:00")
        file_key = f"{prefix}_{datetime.now().strftime(date_format)}.json"

        body = json.dumps(data, indent=4).encode('UTF-8')
        s3_resource.Object(bucket_name, file_key).put(Body=body)
        logging.info(f"Successfully uploaded {prefix} data to S3.")
        return last_created_str
    except Exception as e:
        # Best-effort upload: report the failure and signal it with None
        logging.error(f"Failed to upload {prefix} data: {e}")
        return None

In [3]:
# # Read last item of json data into a pandas dataframe
# master_TUG = pd.read_json('TUG_orders_migration.json')
# master_EVENT = pd.read_json('EVENT_orders_migration.json')

In [4]:
# # get last 'created_at' date
# last_created_TUG = master_TUG['created_at'].max() + pd.Timedelta(seconds=1)
# last_created_EVENT = master_EVENT['created_at'].max() + pd.Timedelta(seconds=1)

# # convert last_created to string in ISO 8601 format: "2024-02-25T02:00:15+08:00"
# # last_created_TUG = last_created_TUG.strftime("%Y-%m-%dT%H:%M:%S%z")
# # last_created_EVENT = last_created_EVENT.strftime("%Y-%m-%dT%H:%M:%S%z")
# # print(f"Last created TUG: {last_created_TUG}")
# # print(f"Last created EVENT: {last_created_EVENT}")

In [5]:
# # get order for TUG at last updated date using the API
# TUG_orders = get_all_orders(lTUG_ID, all=False, create_at_min=last_created_TUG)
# EVENT_orders = get_all_orders(lEVENT_ID, all=False, create_at_min=last_created_EVENT)


In [9]:
# from datetime import datetime
# # Fetch all orders and their details
# TUG_orders = get_all_orders(lTUG_ID)
# EVENT_orders = get_all_orders(lEVENT_ID)

# TUG_all_order_details = [get_order_details(order["id"], location="tug") for order in TUG_orders]
# EVENT_all_order_details = [get_order_details(order["id"], location="event") for order in EVENT_orders]

# # Get today's date
# today = datetime.now().strftime("%Y-%m-%d")

# # Dump details to json with today's date as suffix in the filename
# with open(f'TUG_orders_{today}.json', 'w') as f:
#     json.dump(TUG_all_order_details, f)

# with open(f'EVENT_orders_{today}.json', 'w') as f:
#     json.dump(EVENT_all_order_details, f)

In [None]:
# # Convert to DataFrame
# TUG_df_orders = pd.DataFrame(TUG_all_order_details)
# EVENT_df_orders = pd.DataFrame(EVENT_all_order_details)

# parsed = json.loads(TUG_df_orders.to_json(orient="records"))
# with open("TUG_orders.json", "w") as json_file:
#     json.dump(parsed, json_file, indent=4)

# parsed = json.loads(EVENT_df_orders.to_json(orient="records"))
# with open("EVENT_orders.json", "w") as json_file:
#     json.dump(parsed, json_file, indent=4)

# # Curating data...
# # Define a function to unnest and create separate tables
# def unnest_json(df, field_name):
#     return df.select(pl.col(field_name).arr.flatten().alias(field_name)).explode(field_name)

# # Define a function to obtain json keys that has nested arrays
# def get_nested_keys(json_data):
#     nested_keys = []
#     for key, value in json_data.items():
#         if isinstance(value, list):
#             nested_keys.append(key)
#     return nested_keys

# def check_nested_keys(json_data, parent_key=None):
#     nested_keys = []
#     for key, value in json_data.items():
#         if isinstance(value, list):
#             nested_keys.append((parent_key, key))
#             for item in value:
#                 nested_keys.extend(check_nested_keys(item, key))
#         elif isinstance(value, dict):
#             nested_keys.append((parent_key, key))
#             nested_keys.extend(check_nested_keys(value, key))
#     return nested_keys

# nested_keys = check_nested_keys(json_data)

# def get_order_schema(all_order_details):
#     # Load the JSON data into Polars
#     df = pl.DataFrame(all_order_details)

#     # Get the nested keys
#     nested_keys = get_nested_keys(all_order_details[0])

#     # Create separate tables for each nested keys
#     tables = {}
#     for key in nested_keys:
#         tables[key] = unnest_json(df, key)

#     return tables



# # Call the function with TUG_all_order_details
# process_order_details(TUG_all_order_details)

In [37]:
# TUG_df_orders_items = pd.json_normalize(TUG_df_orders['items'].explode())

Save to parquet / json

In [None]:
# # Save to Parquet with GZIP compression
# parquet_file = '/mnt/data/orders.parquet.gzip'
# df_orders.to_parquet(parquet_file, compression='gzip')