In [None]:
# Credit Card Fraud Detection - EDA & Preprocessing

# Instructions:
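# 1) Set REGION/BUCKET/PREFIX in the configuration cell below
# 2) Place creditcard.csv at ../data/creditcard.csv (Studio file system) or at
#    s3://<BUCKET>/<PREFIX>/data/raw/creditcard.csv, then run the cells in order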


In [None]:
# Notebook configuration — replace the placeholder values before running the cells below.
# AWS region passed to the boto3 session.
REGION = "<YOUR_AWS_REGION>"
# S3 bucket the raw CSV is read from (when no local copy exists) and the splits are uploaded to.
BUCKET = "<YOUR_S3_BUCKET_NAME>"
# Key prefix for this project's objects: {PREFIX}/data/raw/... and {PREFIX}/data/processed/...
PREFIX = "fraud"


In [None]:
import boto3, os, io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# One boto3 session pinned to the configured region; the S3 client is derived from it
# and reused by the load and upload cells.
session = boto3.Session(region_name=REGION)
s3 = session.client(service_name="s3")


In [None]:
# Load the dataset: use a copy uploaded to the Studio file system when one exists,
# otherwise download the raw object from S3.
local_path = "../data/creditcard.csv"
if not os.path.exists(local_path):
    # Fallback: fetch the raw CSV from S3 and read it from an in-memory buffer
    response = s3.get_object(Bucket=BUCKET, Key=f"{PREFIX}/data/raw/creditcard.csv")
    csv_source = io.BytesIO(response["Body"].read())
else:
    csv_source = local_path
df = pd.read_csv(csv_source)
df.head()


In [None]:
# Sanity checks: dataset dimensions, total missing cells, and target class balance
print(df.shape)
n_missing = df.isna().sum().sum()
print(n_missing, "missing values total")

# Count each label once; reuse the counts for both the printout and the bar chart
class_counts = df['Class'].value_counts()
print(class_counts)

ax = class_counts.plot(kind='bar')
ax.set_title('Class Distribution (0=non-fraud, 1=fraud)')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Split into train/val/test (80/10/10, stratified on Class), then scale Amount and Time.
# PCA features V1..V28 are kept as-is.
#
# The scaler is fit on the TRAINING split only and then applied to the hold-out data.
# Fitting it on the full dataset before splitting would leak validation/test statistics
# into preprocessing. `df` itself is left untouched, so re-running this cell is safe
# (the previous in-place scaling rescaled already-scaled columns on a second run).
scale_cols = ['Time', 'Amount']

X = df.drop(columns=['Class'])
y = df['Class']

Xtr, Xtmp, ytr, ytmp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Work on explicit copies so the column assignments below never write through to `df`/`X`
Xtr, Xtmp = Xtr.copy(), Xtmp.copy()

scaler = RobustScaler()
Xtr[scale_cols] = scaler.fit_transform(Xtr[scale_cols])   # learn median/IQR from train only
Xtmp[scale_cols] = scaler.transform(Xtmp[scale_cols])     # reuse train statistics on hold-out

# Halve the (already scaled) hold-out set into validation and test
Xval, Xte, yval, yte = train_test_split(Xtmp, ytmp, test_size=0.5, stratify=ytmp, random_state=42)

train = pd.concat([Xtr, ytr], axis=1)
val   = pd.concat([Xval, yval], axis=1)
test  = pd.concat([Xte, yte], axis=1)

# The three splits must partition the dataset exactly
assert len(train) + len(val) + len(test) == len(df), "splits do not cover the dataset"

# Written to the notebook's working directory; the upload cell reads them from there
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
test.to_csv("test.csv", index=False)


In [None]:
# Push each split file from the working directory to {PREFIX}/data/processed/ in S3
processed_prefix = f"{PREFIX}/data/processed"
for split_file in ("train.csv", "val.csv", "test.csv"):
    s3.upload_file(
        Filename=split_file,
        Bucket=BUCKET,
        Key=f"{processed_prefix}/{split_file}",
    )
print("Uploaded splits to S3")
