In [1]:
# CELL 1: Import và setup
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import joblib
import os
import json


# SageMaker setup
role = get_execution_role()
sess = sagemaker.Session()
s3 = boto3.client('s3')

bucket = 'fraud-detection-bucket-76'


print(f"Role: {role}")
print(f"Bucket: {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Role: arn:aws:iam::308755113402:role/service-role/AmazonSageMaker-ExecutionRole-20250713T212889
Bucket: fraud-detection-bucket-76


In [2]:
# CELL 2: Data processing 
# Load data
bucket_data = 'fraud-detection-bucket-76'
key = 'raw-data/credit_card_fraud_transactions.csv'
s3 = boto3.client('s3')

obj = s3.get_object(Bucket=bucket_data, Key=key)
df = pd.read_csv(obj['Body'])

def vectorized_date_format(series):
    parts = series.str.split('/', expand=True)
    m = parts[0]
    d = parts[1]
    y = parts[2]
    full_year = y.where(y.astype('int') <= 20, '19' + y.astype(str)).where(y.astype('int') > 20, '20' + y.astype(str))
    formatted_str = d + '/' + m + '/' + full_year
    return pd.to_datetime(formatted_str, format='%d/%m/%Y')

def data_correction(df: pd.DataFrame):
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format="%m/%d/%y %H:%M")
    df['dob'] = vectorized_date_format(df['dob'])
    return df

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2.0)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Apply transformations
new_df = data_correction(df.copy())
new_df.head(3)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0


In [3]:
# CELL 3: Feature Engineering
new_df = new_df.drop(columns=[
    'Unnamed: 0', 'first', 'last', 'street', 'trans_num', 'merchant', 'job'
])

# Encoding features
category_fraud_rate = new_df.groupby('category')['is_fraud'].mean()
new_df['category_target_enc'] = new_df['category'].map(category_fraud_rate)

state_fraud_rate = new_df.groupby('state')['is_fraud'].mean()
new_df['state_target_enc'] = new_df['state'].map(state_fraud_rate)

new_df['gender'] = new_df['gender'].map({'F': 0, 'M': 1})

# Time-based features
new_df['transaction_hour'] = new_df['trans_date_trans_time'].dt.hour
new_df['is_night'] = new_df['transaction_hour'].apply(lambda x: 1 if (x >= 22 or x < 6) else 0)
new_df['transaction_dayofweek'] = new_df['trans_date_trans_time'].dt.dayofweek

# Age feature
current_date = new_df['trans_date_trans_time'].max()
new_df['age'] = (current_date - new_df['dob']).dt.days // 365

# Distance feature
new_df['distance_to_merchant'] = haversine(
    new_df['lat'], new_df['long'], new_df['merch_lat'], new_df['merch_long']
)

# Drop original columns
new_df = new_df.drop(columns=['category', 'state', 'city', 'trans_date_trans_time', 'dob'])
new_df

Unnamed: 0,cc_num,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,category_target_enc,state_target_enc,transaction_hour,is_night,transaction_dayofweek,age,distance_to_merchant
0,2.703190e+15,4.97,0,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,0.014526,0.004778,0,1,1,32,78.597568
1,6.304230e+11,107.23,0,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,0.013973,0.004524,0,1,1,41,30.212176
2,3.885950e+13,220.11,1,83252,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,0,0.002435,0.002440,0,1,1,58,108.206083
3,3.534090e+15,45.00,1,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,0.004679,0.003385,0,1,1,53,95.673231
4,3.755340e+14,41.96,1,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,0.003008,0.006178,0,1,1,33,77.556744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,6.011980e+15,77.00,0,21405,39.0305,-76.5515,92106,1362931649,38.779464,-76.317042,0,0.001499,0.005658,16,0,1,76,34.507680
1048571,4.839040e+15,116.94,0,52563,41.1826,-92.3097,1583,1362931670,41.400318,-92.726724,0,0.003008,0.005488,16,0,1,20,42.426383
1048572,5.718440e+11,21.27,0,40202,38.2507,-85.7476,736284,1362931711,37.293339,-84.798122,0,0.001536,0.006129,16,0,1,67,135.264282
1048573,4.646850e+18,9.52,0,11796,40.7320,-73.1000,4056,1362931718,39.773077,-72.213209,0,0.001499,0.007170,16,0,1,29,130.508798


# Training set, Validating set, Testing set

In [7]:
# Split and storing data processed
X = new_df.drop(columns=['is_fraud'])
y = new_df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### SMOTE - Balancing data

In [5]:
!pip install -q imbalanced-learn

In [6]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(
    random_state=42,           # For reproducibility
    k_neighbors=5,             # Number of nearest neighbors
    sampling_strategy='auto'    # Balance to 1:1 ratio
)

# Apply SMOTE to training data
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Check the balanced class distribution
print("\nAfter SMOTE balancing:")
print(f"Class distribution: {Counter(y_train_balanced)}")
print(f"Fraud rate: {y_train_balanced.mean():.4f} ({y_train_balanced.sum()} fraud cases out of {len(y_train_balanced)} total)")


After SMOTE balancing:
Class distribution: Counter({0: 834055, 1: 834055})
Fraud rate: 0.5000 (834055 fraud cases out of 1668110 total)


In [8]:
# Create csv files
training_data = pd.concat([y_train_balanced, X_train_balanced], axis=1)
validation_data = pd.concat([y_test, X_test], axis=1)
feature_names = pd.DataFrame(list(X.columns))

print("Training shape", training_data.shape)
print("Validation shape", validation_data.shape)
print("Features:", feature_names)

training_data.to_csv('train.csv', index=False, header=False)
validation_data.to_csv('validation.csv', index=False, header=False)
feature_names.to_csv('test_features.csv')

Training shape (1668110, 18)
Validation shape (209715, 18)
Features:                         0
0                  cc_num
1                     amt
2                  gender
3                     zip
4                     lat
5                    long
6                city_pop
7               unix_time
8               merch_lat
9              merch_long
10    category_target_enc
11       state_target_enc
12       transaction_hour
13               is_night
14  transaction_dayofweek
15                    age
16   distance_to_merchant


# Storing processed train-test-validate set to S3 buckets

In [9]:
# Storing to S3 bucket
train_prefix = 'training-data'
val_prefix = 'validation-data'

# Upload train.csv
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(train_prefix, 'train.csv')
).upload_file('train.csv')

# Upload validation.csv
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(val_prefix, 'validation.csv')
).upload_file('validation.csv')

# Upload test.csv
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(val_prefix, 'test.csv')
).upload_file('test.csv')

# Upload feature names
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(val_prefix, 'test_features.csv')
).upload_file('test_features.csv')

print("Uploaded to S3 successfully")

Uploaded to S3 successfully


# Starting training and deploying steps

In [11]:
from sagemaker import image_uris

container = image_uris.retrieve('xgboost', region='ap-southeast-2', version='latest')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/'.format(bucket, train_prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/'.format(bucket, val_prefix), content_type='csv')

In [12]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://{}/model-artifacts/'.format(bucket),
    sagemaker_session=sess
)

xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=1,
    objective='binary:logistic',
    num_round=100,
    eval_metric='error'
)

# Khai báo không gian hyperparameters
hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 10),
    'eta': ContinuousParameter(0.01, 0.3),
    'gamma': ContinuousParameter(0, 5),            # gamma tuning
    'min_child_weight': IntegerParameter(1, 10),   # min_child_weight tuning
    'subsample': ContinuousParameter(0.6, 1.0),    # subsample tuning
    'colsample_bytree': ContinuousParameter(0.6, 1.0)  # feature sampling
}

# Tạo Tuner
# For fraud detection, use AUC as the metric
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:auc',
    objective_type='Maximize',
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=5
)

# Chạy tuning
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................................................................................................................................................................................................................................!


In [13]:
tuner.describe()

{'HyperParameterTuningJobName': 'xgboost-250717-0852',
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:ap-southeast-2:308755113402:hyper-parameter-tuning-job/xgboost-250717-0852',
 'HyperParameterTuningJobConfig': {'Strategy': 'Bayesian',
  'HyperParameterTuningJobObjective': {'Type': 'Maximize',
   'MetricName': 'validation:auc'},
  'ResourceLimits': {'MaxNumberOfTrainingJobs': 20,
   'MaxParallelTrainingJobs': 5},
  'ParameterRanges': {'IntegerParameterRanges': [{'Name': 'max_depth',
     'MinValue': '3',
     'MaxValue': '10',
     'ScalingType': 'Auto'},
    {'Name': 'min_child_weight',
     'MinValue': '1',
     'MaxValue': '10',
     'ScalingType': 'Auto'}],
   'ContinuousParameterRanges': [{'Name': 'eta',
     'MinValue': '0.01',
     'MaxValue': '0.3',
     'ScalingType': 'Auto'},
    {'Name': 'gamma', 'MinValue': '0', 'MaxValue': '5', 'ScalingType': 'Auto'},
    {'Name': 'subsample',
     'MinValue': '0.6',
     'MaxValue': '1.0',
     'ScalingType': 'Auto'},
    {'Name': 'c

# Deploying

In [22]:
from sagemaker.estimator import Estimator

best_job = tuner.best_training_job()
print("Best training job name:", best_job)

best_estimator = Estimator.attach(best_job)

# Deploy with one line
model_predictor = best_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

Best training job name: xgboost-250717-0852-012-809e6788

2025-07-17 09:06:30 Starting - Found matching resource for reuse
2025-07-17 09:06:30 Downloading - Downloading the training image
2025-07-17 09:06:30 Training - Training image download completed. Training in progress.
2025-07-17 09:06:30 Uploading - Uploading generated training model
2025-07-17 09:06:30 Completed - Resource reused by training job: xgboost-250717-0852-018-e76af136
-----!

# Inferencing model

In [24]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = Predictor(
    endpoint_name='xgboost-2025-07-17-09-24-53-394',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)


# Preprocessed input (match your training features exactly)
sample = X_test.iloc[32]   # This must be fully numeric and processed
csv_row = ",".join([str(val) for val in sample.values])

print(csv_row)
# e.g. "0.4,0.3,1,25,1000,0,1,36,1.3,..."  # no headers, no index

# Predict
prediction = predictor.predict(csv_row)
print('actual:', y_test.iloc[32])
print(prediction)

213196000000000.0,14.17,1.0,52576.0,41.2001,-92.1354,568.0,1334357517.0,41.17294,-92.824567,0.0024348192311235703,0.0054884235771607785,22.0,1.0,5.0,50.0,57.74970782049139
actual: 0
0.0011404537362977862


In [48]:
# CELL 10: Cleanup (chạy khi không dùng nữa)
model_predictor.delete_endpoint()
print("Endpoint deleted")

Endpoint deleted
