In [1]:
import requests
import json
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
import pandas as pd
import polars as pl
import boto3
import logging

# Load the environment variables from the .env file
load_dotenv()

# Provide the access key from AWS SSM
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
ssm_client = session.client('ssm', region_name='ap-southeast-2')
s3 = session.resource('s3')
utc_plus_8 = timezone(timedelta(hours=8))

# Test the connection to AWS SSM
response = ssm_client.get_parameters_by_path(
    Path='/tug-dinlr/api/',
    Recursive=True,
    WithDecryption=True
)

# Function to get secrets from AWS SSM
def get_secrets(path='/tug-dinlr/api/'):
    secrets = {}
    next_token = None
    
    while True:
        params = ssm_client.get_parameters_by_path(
            Path=path,
            Recursive=True,
            WithDecryption=True,
            NextToken=str(next_token) if next_token is not None else ''
        )
        
        for param in params['Parameters']:
            param_name = param['Name'].split('/')[-1]
            param_value = param['Value']
            secrets[param_name] = param_value
        
        if 'NextToken' in params:
            next_token = params['NextToken']
        else:
            break
    
    return secrets

# Get secrets
params = get_secrets()

# Set API details
# Use the access token to interact with the Dinlr API
base_url = "https://api.dinlr.com/v1"
aheaders = {
    "Authorization": f"Bearer {params['ACCESS_TOKEN']}"
}
rheaders = {'Content-Type': 'application/x-www-form-urlencoded'}


def get_locations(restaurant_id, headers):
    """Fetch locations from the API and return an iterable of (id, name)."""
    response = requests.get(f"https://api.dinlr.com/v1/{restaurant_id}/onlineorder/locations", headers=headers)
    data = response.json()
    print(data)
    
    # Extract the 'id' and 'name' from each location and store them in a list of tuples
    locations = [(location['id'], location['name']) for location in data['data']]
    return locations

def convert_to_datetime(date_string):
    return datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S+08:00")

def convert_to_datetime_timezone(date_string):
    return datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S%z")

def is_token_expired(expiry_date_str):
    """Check if the access token has expired."""
    expiry_date = convert_to_datetime_timezone(expiry_date_str)
    return datetime.now(utc_plus_8) >= expiry_date

def refresh_access_token():
    """Request a new access token using the refresh token."""
    parameters = {
        "refresh_token": params['REFRESH_TOKEN'],
        "client_id": params['CLIENT_ID'],
        "client_secret": params['CLIENT_SECRET'],
        "grant_type": "refresh_token"
    }

    response = requests.post(f"{base_url}/{params['RESTAURANT_ID']}/oauth/token", data=parameters, headers=rheaders)
    response.raise_for_status()  # Ensure we raise an error for bad responses
    data = response.json()

    new_params = {
        'ACCESS_TOKEN': data["access_token"],
        'REFRESH_TOKEN': data["refresh_token"],
        'EXPIRES_AT': (datetime.now(utc_plus_8) + timedelta(seconds=int(data["expires_in"]))).strftime("%Y-%m-%dT%H:%M:%S+08:00"),
        'EXPIRES_IN': str(data["expires_in"])
    }

    for key, value in new_params.items():
        ssm_client.put_parameter(Name=f'/tug-dinlr/api/{key}', Value=value, Type='String', Overwrite=True)

    return new_params


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Defining the functions to get order and order details.

In [2]:
# Function to get all orders
# Function to get all orders
def get_all_orders(location_id, all=True, update_at_min=None, create_at_min=None, create_at_max=None, page=1):
    page = 1
    orders = []
    
    # If no update_at_min is provided, get all orders
    if all:
        try:
            while True:
                url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/orders?location_id={location_id}&page={page}"
                response = requests.get(url, headers=aheaders)
                data = response.json()["data"]
                
                if not data:
                    break
                
                orders.extend(data)
                page += 1
                
        except Exception as e:
            print(f"An error occurred: {str(e)}")

    # If update_at_min is provided, get orders updated after the specified time
    # Update + sign with %2B for update_at_min
    if update_at_min:
        update_at_min = update_at_min.replace("+", "%2B")
        while True:
            url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/orders?location_id={location_id}&update_at_min={update_at_min}&page={page}"
            response = requests.get(url, headers=aheaders)
            data = response.json()["data"]
            
            if not data:
                break
            
            orders.extend(data)
            page += 1

    # If create_at_min is provided, get orders created after the specified time along with create_at_max
    if create_at_min:
        # add 32 days to create_at_min
        if not create_at_max:
            create_at_min = convert_to_datetime(create_at_min)
            create_at_max = create_at_min + timedelta(days=31)
        else:
            create_at_min = convert_to_datetime(create_at_min)
            create_at_max = convert_to_datetime(create_at_max)
            
        create_at_min = create_at_min.strftime("%Y-%m-%dT%H:%M:%S+08:00")
        create_at_max = create_at_max.strftime("%Y-%m-%dT%H:%M:%S+08:00")
        create_at_max = create_at_max.replace("+", "%2B")
        create_at_min = create_at_min.replace("+", "%2B")
        while True:
            url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/orders?location_id={location_id}&create_at_min={create_at_min}&create_at_max={create_at_max}&page={page}"
            response = requests.get(url, headers=aheaders)
            data = response.json()["data"]
            
            if not data:
                break
            
            orders.extend(data)
            page += 1
    
    return orders

# Function to get order details and add 'location' key
def get_order_details(order_id, location='tug'):
    url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/orders/{order_id}"
    response = requests.get(url, headers=aheaders)
    response.raise_for_status()
    order_details = response.json()["data"]
    order_details['location'] = location
    return order_details

# Function to upload to S3
def upload_data_to_s3(data, bucket_name, prefix, date_format="%Y-%m-%d"):
    if not data:
        logging.info(f"No data to upload for {prefix}.")
        return None

    last_created = convert_to_datetime(data[-1]['created_at']) + timedelta(seconds=1)
    file_key = f"{prefix}_{last_created.strftime(date_format)}.json"
    
    try:
        # Check if the file already exists in S3
        obj = s3.Object(bucket_name, file_key)
        try:
            existing_data = json.loads(obj.get()['Body'].read().decode('utf-8'))
            logging.info(f"Existing data found for {file_key}.")
        except obj.meta.client.exceptions.NoSuchKey:
            existing_data = []
            logging.info(f"No existing data found for {file_key}. Creating new file.")

        # Combine and deduplicate data using a dictionary keyed by a unique identifier
        combined_data_dict = {item['id']: item for item in existing_data}
        combined_data_dict.update({item['id']: item for item in data})
        
        # Convert back to list to maintain the order
        combined_data = list(combined_data_dict.values())

        last_created = convert_to_datetime(combined_data[-1]['created_at']) + timedelta(seconds=1)
        last_created_str = last_created.strftime("%Y-%m-%dT%H:%M:%S+08:00")
        
        # Upload combined data back to S3
        obj.put(Body=(bytes(json.dumps(combined_data, indent=4).encode('UTF-8'))))
        logging.info(f"Successfully uploaded {prefix} data to S3.")
        
    except Exception as e:
        logging.error(f"Failed to upload {prefix} data: {e}")

    return last_created_str


In [3]:
if is_token_expired(params['EXPIRES_AT']):
    try:
        new_params = refresh_access_token()
        params.update(new_params)
    except Exception as e:
        logging.error(f"Failed to refresh token: {e}")

locations = get_locations(params['RESTAURANT_ID'], aheaders)

{'data': [{'id': 'a7e56eeb-98db-402d-bea1-1ee35d54a4fd', 'name': 'tug GELATO', 'updated_at': '2024-05-20T14:43:01+08:00'}, {'id': 'dd6564a8-1d55-4f2b-a73f-23f77aa59141', 'name': 'tug @ Bangsar', 'updated_at': '2024-05-20T14:44:56+08:00'}, {'id': '6bf38c18-a852-439b-aa54-007547f1bb6a', 'name': 'tug @ Event', 'updated_at': '2024-06-04T10:47:12+08:00'}]}


In [4]:
def get_items_dim(location_id):
    url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/items?location_id={location_id}"
    response = requests.get(url, headers=aheaders)
    items = response.json()["data"]
    return items

def get_discounts_dim(location_id):
    url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/discounts?location_id={location_id}"
    response = requests.get(url, headers=aheaders)
    discounts = response.json()["data"]
    return discounts

def get_promotions_dim(location_id):
    url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/promotions?location_id={location_id}"
    response = requests.get(url, headers=aheaders)
    promotions = response.json()["data"]
    return promotions

def get_customers_dim(location_id):
    url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/customers"
    response = requests.get(url, headers=aheaders)
    customers = response.json()["data"]
    return customers

def get_vouchers_dim():
    url = f"{base_url}/{params['RESTAURANT_ID']}/onlineorder/vouchers"
    response = requests.get(url, headers=aheaders)
    vouchers = response.json()["data"]
    return vouchers

In [7]:
get_promotions_dim('dd6564a8-1d55-4f2b-a73f-23f77aa59141')

[{'id': '3c9fc489-e7a4-4bb5-9bfc-3c4b28ad613d',
  'name': 'DOUBLE',
  'summary': None,
  'type': 'auto',
  'start_date': '2023-09-14T00:00:00+08:00',
  'end_date': '1970-01-01T07:30:00+07:30',
  'updated_at': '2024-05-22T12:52:08+08:00'},
 {'id': '3e0982af-2d3a-4d35-b48f-b7fb0578a6d9',
  'name': 'Buy 10 Scoop Free 1 Scoop',
  'summary': None,
  'type': 'auto',
  'start_date': '2023-10-01T00:00:00+08:00',
  'end_date': '1970-01-01T07:30:00+07:30',
  'updated_at': '2024-05-22T12:52:16+08:00'},
 {'id': 'a2fc4bcf-03f8-414d-a0f3-34a88461d6ee',
  'name': 'PINT Buy 3 Free 1',
  'summary': None,
  'type': 'auto',
  'start_date': '2024-02-01T00:00:00+08:00',
  'end_date': '1970-01-01T07:30:00+07:30',
  'updated_at': '2024-05-22T12:52:36+08:00'}]

In [8]:
get_discounts_dim('dd6564a8-1d55-4f2b-a73f-23f77aa59141')

[{'id': '47410580-1dd8-4087-9e1a-557a5191ff98',
  'name': 'Staff Discount',
  'type': 'percent',
  'stackable': True,
  'manufacturer_discount': False,
  'value': 30,
  'max_value': 0,
  'updated_at': '2024-05-17T11:36:36+08:00'},
 {'id': '80389f24-b26c-4689-be19-72e48d0fc810',
  'name': 'Manual Discount %',
  'type': 'percent',
  'stackable': True,
  'manufacturer_discount': False,
  'value': 0,
  'max_value': 0,
  'updated_at': '2024-05-17T11:36:44+08:00'},
 {'id': '88bb89ab-bba2-4c05-ba2e-a66d6846bc0f',
  'name': 'FREE Sample Scoop',
  'type': 'price',
  'stackable': True,
  'manufacturer_discount': False,
  'value': 1,
  'max_value': 0,
  'updated_at': '2024-05-17T11:37:05+08:00'},
 {'id': 'f2f0a41b-3c85-45a3-876e-aa8ea18f8f7c',
  'name': 'FREE Cooler Bag',
  'type': 'price',
  'stackable': True,
  'manufacturer_discount': False,
  'value': 10,
  'max_value': 0,
  'updated_at': '2024-05-18T10:17:42+08:00'},
 {'id': '42aa2ba6-a176-4672-b6b0-8d78e46cb32b',
  'name': 'Manual Discount 

In [7]:
get_secrets()

{'ACCESS_TOKEN': '4baf9557fce32792069f828b522653d9',
 'CLIENT_ID': 'OZZYNBPLGYFJZXAOQNBGWMVCZWFNKXSK',
 'CLIENT_SECRET': 'IAQZCPQOCKNSGECJOCMVVHVHPNQFNRBH',
 'EXPIRES_AT': '2024-06-20T12:41:43+08:00',
 'EXPIRES_IN': '1209600',
 'LAST_CREATED_BANGSAR': '2024-05-25T16:54:58+08:00',
 'LAST_CREATED_EVENT': '2024-05-25T16:54:58+08:00',
 'LAST_CREATED_TUG': '2024-05-25T21:39:16+08:00',
 'REDIRECT_URI': 'https://fkulhnwllu5wnmapjqes3lj36q0slety.lambda-url.ap-southeast-2.on.aws/',
 'RESTAURANT_ID': '0c39f1fa-9a87-4471-b7d4-78eaff9f7f1c',
 'BANGSAR': 'dd6564a8-1d55-4f2b-a73f-23f77aa59141',
 'EVENT': '6bf38c18-a852-439b-aa54-007547f1bb6a',
 'TUG': 'a7e56eeb-98db-402d-bea1-1ee35d54a4fd',
 'REFRESH_TOKEN': 'fb1666bc0899b20d61b43543d5c813c5'}

In [15]:
locations = {
        'EVENT': params['EVENT'],
        'TUG': params['TUG']
    }

None


In [18]:
last_created_TUG = params.get('LAST_CREATED_TUG')
last_created_BANGSAR = params.get('LAST_CREATED_BANGSAR')
last_created_EVENT = params.get('LAST_CREATED_EVENT')

bucket_name = 'tug-dinlr'
all_order_details = []

orders = get_all_orders(locations[2][0], all=False, create_at_min=last_created_BANGSAR)
# order_details = [get_order_details(order["id"], location="tug_bangsar") for order in orders]
# last_created_BANGSAR = upload_data_to_s3(order_details, bucket_name, 'raw/TUG_Bangsar_orders')

In [35]:
orders = get_all_orders(locations[2][0], all=True, create_at_min="2024-03-18T20:43:16+08:00")
orders

[{'id': 'B39E5A4D-A521-4625-AA3C-94AB4FCB2FAF',
  'customer': None,
  'order_no': '240522DXU7T74',
  'order_ticket': 'BO1',
  'dining_option': '49cb7341-bffb-4a06-ac7c-2e137bdbab3d',
  'subtotal': 48,
  'total': 48,
  'rounding': 0,
  'paid': 48,
  'status': 'closed',
  'financial_status': 'paid',
  'kitchen_status': None,
  'expedite_status': None,
  'updated_at': '2024-05-22T10:43:56+08:00',
  'created_at': '2024-05-22T10:43:42+08:00',
  'objects': []},
 {'id': 'EC7C18A4-8CB1-4723-A401-22026E3F8487',
  'customer': None,
  'order_no': '240522DXUEW3T',
  'order_ticket': 'BO2',
  'dining_option': '49cb7341-bffb-4a06-ac7c-2e137bdbab3d',
  'subtotal': 28,
  'total': 28,
  'rounding': 0,
  'paid': 28,
  'status': 'closed',
  'financial_status': 'paid',
  'kitchen_status': None,
  'expedite_status': None,
  'updated_at': '2024-05-22T11:08:56+08:00',
  'created_at': '2024-05-22T11:08:31+08:00',
  'objects': []},
 {'id': '4BD64CE2-B054-4A1F-B910-2D47359CABD0',
  'customer': None,
  'order_no'

In [49]:
if is_token_expired(params['EXPIRES_AT']):
    try:
        new_params = refresh_access_token()
        params.update(new_params)
    except Exception as e:
        logging.error(f"Failed to refresh token: {e}")

bucket_name = 'tug-dinlr'
all_order_details = []

for location_id, location_name in locations:
    # if "tug gelato" in location_name.lower():
    #     orders = get_all_orders(location_id, all=False, create_at_min="2024-05-19T00:43:16+08:00", create_at_max="2024-05-19T23:55:16+08:00")
    #     order_details = [get_order_details(order["id"], location="tug") for order in orders]
    #     last_created_TUG = upload_data_to_s3(order_details, bucket_name, 'raw/TUG_orders')
    
    if "bangsar" in location_name.lower():
        orders = get_all_orders(location_id, all=False, create_at_min="2024-05-24T00:43:16+08:00", create_at_max="2024-05-24T23:55:16+08:00")
        order_details = [get_order_details(order["id"], location="tug_bangsar") for order in orders]
        last_created_BANGSAR = upload_data_to_s3(order_details, bucket_name, 'raw/TUG_Bangsar_orders')
    # else:
    #     # If there are other locations that need to be processed differently, handle them here
    #     logging.info(f"Unknown location type: {location_name}")

# Update SSM with the new last created timestamps
# if last_created_TUG:
#     ssm_client.put_parameter(Name='/tug-dinlr/api/LAST_CREATED_TUG', Value=last_created_TUG, Type='String', Overwrite=True)
# if last_created_BANGSAR:
#     ssm_client.put_parameter(Name='/tug-dinlr/api/LAST_CREATED_BANGSAR', Value=last_created_EVENT, Type='String', Overwrite=True)
# if last_created_EVENT:
#     ssm_client.put_parameter(Name='/tug-dinlr/api/LAST_CREATED_EVENT', Value=last_created_EVENT, Type='String', Overwrite=True)

In [3]:
# # Read last item of json data into a pandas dataframe
# master_TUG = pd.read_json('TUG_orders_migration.json')
# master_EVENT = pd.read_json('EVENT_orders_migration.json')

In [4]:
# # get last 'created_at' date
# last_created_TUG = master_TUG['created_at'].max() + pd.Timedelta(seconds=1)
# last_created_EVENT = master_EVENT['created_at'].max() + pd.Timedelta(seconds=1)

# # convert last_created to string in ISO 8601 format: "2024-02-25T02:00:15+08:00"
# # last_created_TUG = last_created_TUG.strftime("%Y-%m-%dT%H:%M:%S%z")
# # last_created_EVENT = last_created_EVENT.strftime("%Y-%m-%dT%H:%M:%S%z")
# # print(f"Last created TUG: {last_created_TUG}")
# # print(f"Last created EVENT: {last_created_EVENT}")

In [5]:
# # get order for TUG at last updated date using the API
# TUG_orders = get_all_orders(lTUG_ID, all=False, create_at_min=last_created_TUG)
# EVENT_orders = get_all_orders(lEVENT_ID, all=False, create_at_min=last_created_EVENT)


In [9]:
# from datetime import datetime
# # Fetch all orders and their details
# TUG_orders = get_all_orders(lTUG_ID)
# EVENT_orders = get_all_orders(lEVENT_ID)

# TUG_all_order_details = [get_order_details(order["id"], location="tug") for order in TUG_orders]
# EVENT_all_order_details = [get_order_details(order["id"], location="event") for order in EVENT_orders]

# # Get today's date
# today = datetime.now().strftime("%Y-%m-%d")

# # Dump details to json with today's date as suffix in the filename
# with open(f'TUG_orders_{today}.json', 'w') as f:
#     json.dump(TUG_all_order_details, f)

# with open(f'EVENT_orders_{today}.json', 'w') as f:
#     json.dump(EVENT_all_order_details, f)

In [None]:
# # Convert to DataFrame
# TUG_df_orders = pd.DataFrame(TUG_all_order_details)
# EVENT_df_orders = pd.DataFrame(EVENT_all_order_details)

# parsed = json.loads(TUG_df_orders.to_json(orient="records"))
# with open("TUG_orders.json", "w") as json_file:
#     json.dump(parsed, json_file, indent=4)

# parsed = json.loads(EVENT_df_orders.to_json(orient="records"))
# with open("EVENT_orders.json", "w") as json_file:
#     json.dump(parsed, json_file, indent=4)

# # Curating data...
# # Define a function to unnest and create separate tables
# def unnest_json(df, field_name):
#     return df.select(pl.col(field_name).arr.flatten().alias(field_name)).explode(field_name)

# # Define a function to obtain json keys that has nested arrays
# def get_nested_keys(json_data):
#     nested_keys = []
#     for key, value in json_data.items():
#         if isinstance(value, list):
#             nested_keys.append(key)
#     return nested_keys

# def check_nested_keys(json_data, parent_key=None):
#     nested_keys = []
#     for key, value in json_data.items():
#         if isinstance(value, list):
#             nested_keys.append((parent_key, key))
#             for item in value:
#                 nested_keys.extend(check_nested_keys(item, key))
#         elif isinstance(value, dict):
#             nested_keys.append((parent_key, key))
#             nested_keys.extend(check_nested_keys(value, key))
#     return nested_keys

# nested_keys = check_nested_keys(json_data)

# def get_order_schema(all_order_details):
#     # Load the JSON data into Polars
#     df = pl.DataFrame(all_order_details)

#     # Get the nested keys
#     nested_keys = get_nested_keys(all_order_details[0])

#     # Create separate tables for each nested keys
#     tables = {}
#     for key in nested_keys:
#         tables[key] = unnest_json(df, key)

#     return tables



# # Call the function with TUG_all_order_details
# process_order_details(TUG_all_order_details)

In [37]:
# TUG_df_orders_items = pd.json_normalize(TUG_df_orders['items'].explode())

Save to parquet / json

In [None]:
# # Save to Parquet with GZIP compression
# parquet_file = '/mnt/data/orders.parquet.gzip'
# df_orders.to_parquet(parquet_file, compression='gzip')