In [7]:
import sagemaker
from sagemaker import image_uris
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.parameters import ParameterString, ParameterFloat
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.huggingface import HuggingFace
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.functions import JsonGet
from sagemaker.processing import ScriptProcessor
from sagemaker.workflow.properties import PropertyFile


In [32]:
# AWS setup: resolve the notebook's execution role (used by the processor and the
# estimator below) and open a default SageMaker session.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

In [64]:
# --- Storage layout -------------------------------------------------------
# The bucket is pinned explicitly; sagemaker_session.default_bucket() would be
# the alternative if the account default bucket should be used instead.
s3_bucket = "aamlops2024"
s3_prefix = "transaction-HuggingFace-pipeline-v01"
s3_base_url = f"s3://{s3_bucket}/{s3_prefix}"

# --- Sessions and role ----------------------------------------------------
# sagemaker_session is given to the Pipeline object; pipeline_session is given
# to the Hugging Face estimator; sagemaker_role is used for pipeline.upsert().
sagemaker_session = sagemaker.Session()
pipeline_session = PipelineSession()
sagemaker_role = sagemaker.get_execution_role()
aws_region = sagemaker_session.boto_session.region_name

# Echo the resolved settings as the cell output.
(s3_bucket, s3_prefix, s3_base_url, aws_region)

('aamlops2024',
 'transaction-HuggingFace-pipeline-v01',
 's3://aamlops2024/transaction-HuggingFace-pipeline-v01',
 'us-east-1')

In [92]:
# Pipeline parameters (these are the ones registered on the Pipeline object).
# Defaults are derived from s3_base_url so the bucket/prefix live in one place.
input_data = ParameterString(
    name="InputData",
    default_value=f"{s3_base_url}/data/ccdata.parquet",
)
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)
accuracy_threshold = ParameterFloat(name="AccuracyThreshold", default_value=0.85)

# Training output location, as a plain string rather than a pipeline parameter.
# A ParameterString named "OutputPath" used to be assigned to this name and was
# overwritten by the string on the very next line; it was never registered in
# the pipeline's parameter list, so only the effective value is kept here.
output_path = f"{s3_base_url}/archives"

In [93]:
# Step caching: enabled, with cached results expiring after 30 days
# ("P30D" is an ISO 8601 duration).
cache_config = CacheConfig(enable_caching=True, expire_after="P30D")

In [94]:
# Scikit-learn processor used by the preprocessing step:
# one ml.m5.large instance, scikit-learn framework version 1.2-1.
sklearn_processor = SKLearnProcessor(
    base_job_name="preprocessing",
    framework_version="1.2-1",
    instance_count=1,
    instance_type="ml.m5.large",
    role=role,
)

In [95]:
# Preprocessing step: runs ./code/preprocessing.py on the InputData object and
# publishes three named outputs (train / validation / test).
#
# Each output is uploaded to its OWN S3 sub-prefix. Previously all three shared
# the same ".../processing" destination, so the splits were written into one
# prefix and the "train" and "validation" channels read by the training step
# resolved to the same location (every split visible in every channel).
processing_step = ProcessingStep(
    name="PreprocessData",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input")
    ],
    outputs=[
        ProcessingOutput(
            output_name=split,
            source=f"/opt/ml/processing/{split}",
            destination=f"s3://{s3_bucket}/{s3_prefix}/processing/{split}",
        )
        for split in ("train", "validation", "test")
    ],
    code="./code/preprocessing.py",
    cache_config=cache_config,  # reuse results of an identical earlier run
)

### Training

In [101]:
# Hugging Face estimator for the training job (single ml.p3.2xlarge instance).
huggingface_estimator = HuggingFace(
    base_job_name="hf-train",
    entry_point="./code/train.py",  # no source_dir is set
    role=role,
    sagemaker_session=pipeline_session,
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    # Framework container versions
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
    # Where the training job writes its output
    output_path=output_path,
    # Hyperparameters for the entry-point script
    hyperparameters=dict(
        epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        model_name="bert-base-uncased",
    ),
)

In [102]:
# Training step: each input channel is wired to the S3 URI property of the
# preprocessing output with the same name, which also makes this step depend
# on the preprocessing step.
training_step = TrainingStep(
    name="TrainModel",
    estimator=huggingface_estimator,
    inputs={
        channel: processing_step.properties.ProcessingOutputConfig.Outputs[channel].S3Output.S3Uri
        for channel in ("train", "validation")
    },
    # Step caching is left disabled for training.
)

## 6. Pipeline

In [103]:
# Assemble the pipeline: preprocessing followed by training.
# The evaluation and condition steps are not part of the pipeline yet.
pipeline = Pipeline(
    name="CreditCardTransactionCategorization",
    parameters=[input_data, model_approval_status, accuracy_threshold],
    steps=[processing_step, training_step],
    sagemaker_session=sagemaker_session,
)

In [104]:
# Create the pipeline definition, or update it if it already exists, using the
# notebook's execution role. The response (shown below) contains the pipeline ARN.
pipeline.upsert(role_arn=sagemaker_role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:047922237497:pipeline/CreditCardTransactionCategorization',
 'ResponseMetadata': {'RequestId': '3f07b451-fdab-4d2e-bf8b-8315c45c9e13',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3f07b451-fdab-4d2e-bf8b-8315c45c9e13',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '103',
   'date': 'Mon, 21 Apr 2025 01:32:43 GMT'},
  'RetryAttempts': 0}}

In [105]:
# Start an execution. No parameter overrides are passed, so the defaults
# declared on the pipeline parameters apply.
pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:047922237497:pipeline/CreditCardTransactionCategorization/execution/24ze6btqfxn1', sagemaker_session=<sagemaker.session.Session object at 0x7fb42feed0d0>)

In [13]:
# Upload the raw CSV under the pipeline prefix.
# NOTE(review): the InputData pipeline parameter defaults to
# .../data/ccdata.parquet, not to this CSV key - confirm the parquet file has
# been produced and uploaded before starting the pipeline.
!aws s3 cp ./data/ccdata.csv s3://aamlops2024/transaction-HuggingFace-pipeline-v01/ccdata.csv

upload: data/ccdata.csv to s3://aamlops2024/transaction-HuggingFace-pipeline-v01/ccdata.csv


In [60]:
import pandas as pd
# Load the raw transactions CSV from the local data directory for inspection.
data = pd.read_csv("./data/ccdata.csv")

In [61]:
data

Unnamed: 0,amount,merchant,location,datetime,description,category
0,13.50,Starbucks,"New York, NY",4/16/2025 8:15,Coffee,Food
1,251.00,Delta Airlines,"Atlanta, GA",4/15/2025 17:45,Flight to Chicago,Travel
2,90.99,Amazon,"Seattle, WA",4/14/2025 12:22,Electronics purchase,Shopping
3,44.20,Whole Foods,"Boston, MA",4/13/2025 14:33,Grocery,Food
4,121.00,Shell,"Dallas, TX",4/12/2025 10:00,Gas refill,Utilities
...,...,...,...,...,...,...
95,89.90,Walgreens,"Houston, TX",4/1/2025 12:12,Prescription refill,Health
96,13.39,Panera Bread,"Nashville, TN",3/31/2025 9:30,Lunch combo,Food
97,397.90,American Airlines,"Miami, FL",3/30/2025 18:20,Round-trip ticket,Travel
98,97.85,Home Depot,"Sacramento, CA",3/29/2025 10:05,Home improvement tools,Utilities


In [6]:
# data.to_parquet("./data/ccdata.parquet", index=False)

In [7]:
# !aws s3 cp ./data/ccdata.parquet s3://aamlops2024/transaction-HuggingFace-pipeline-v01/data/ccdata.parquet

### Step 1: Python Variable Reformulation

In [1]:
import random
import pandas as pd
from datetime import datetime, timedelta

# Bounds handed to random.uniform() when drawing a transaction amount
# (the drawn value is rounded to 2 decimals by the generator).
AMOUNT_MIN = 5
AMOUNT_MAX = 200

# Time window for generated transactions: the generator adds a random number of
# seconds in [0, END_DATE - START_DATE] to START_DATE.
START_DATE = datetime(2025, 1, 1, 0, 10)
END_DATE = datetime(2025, 3, 31, 12, 50)

# Candidate locations; one is picked per record, independently of the merchant.
LOCATIONS = ["New York, NY", "Atlanta, GA", "Boston, MA", "Miami, FL", "Online"]

# (merchant, description, category) triples; one triple is picked per record,
# so description and category always match the merchant.
# Keep the order stable: it determines what random.choice() returns for a
# given random state.
MERCHANT_INFO = [
    ("Delta Airlines", "Flight to destination", "Travel"),
    ("Starbucks", "Coffee", "Food"),
    ("Amazon", "Online purchase", "Shopping"),
    ("Shell", "Gas refill", "Utilities"),
    ("CVS Pharmacy", "Medicine purchase", "Health"),
    ("Best Buy", "Electronics purchase", "Shopping"),
    ("Uber", "Ride to location", "Travel"),
    ("Apple Store", "Apple product purchase", "Shopping"),
    ("Marriott Hotel", "Hotel booking", "Travel"),
    ("Macy's", "Clothing", "Shopping"),
    ("Walgreens", "Prescription refill", "Health"),
    ("Panera Bread", "Meal", "Food")
]


In [2]:
def generate_credit_card_transactions(num_records: int = 10, seed=None) -> pd.DataFrame:
    """Generate synthetic credit-card transactions.

    Each record combines a random (merchant, description, category) triple from
    ``MERCHANT_INFO``, an amount drawn uniformly from ``[AMOUNT_MIN, AMOUNT_MAX]``
    and rounded to 2 decimals, a random entry of ``LOCATIONS``, and a timestamp
    drawn at one-second resolution between ``START_DATE`` and ``END_DATE``
    (both inclusive), formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Args:
        num_records: Number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries the expected columns.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global ``random`` state
            is left untouched. When ``None`` (default) the module-level
            ``random`` generator is used, exactly as before.

    Returns:
        DataFrame with columns ``amount``, ``datetime``, ``location``,
        ``merchant``, ``description``, ``category`` (in that order).
    """
    columns = ["amount", "datetime", "location", "merchant", "description", "category"]

    # A dedicated generator keeps seeded runs isolated from the global state.
    rng = random if seed is None else random.Random(seed)

    # Loop-invariant: width of the sampling window in whole seconds.
    total_seconds = int((END_DATE - START_DATE).total_seconds())

    records = []
    for _ in range(num_records):
        # Draw order (merchant, amount, location, time offset) is kept fixed so
        # results for a given random state do not change.
        merchant, description, category = rng.choice(MERCHANT_INFO)
        amount = round(rng.uniform(AMOUNT_MIN, AMOUNT_MAX), 2)
        location = rng.choice(LOCATIONS)
        offset_seconds = rng.randint(0, total_seconds)
        transaction_time = START_DATE + timedelta(seconds=offset_seconds)

        records.append({
            "amount": amount,
            "datetime": transaction_time.strftime("%Y-%m-%d %H:%M:%S"),
            "location": location,
            "merchant": merchant,
            "description": description,
            "category": category,
        })

    # Passing columns explicitly guarantees the schema even when no rows exist.
    return pd.DataFrame(records, columns=columns)

In [3]:
# Generate a 20-row sample of synthetic transactions.
df = generate_credit_card_transactions(20)

In [5]:
# df