In [3]:
import pandas as pd
import random
from faker import Faker
import uuid
from datetime import datetime, timedelta

# Seed every random source used by this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates the same values. Seeding the
# stdlib `random` module alone is not enough: Faker draws from its own
# generator, which has to be seeded through Faker.seed().
fake = Faker()
Faker.seed(42)
random.seed(42)

# Constants
total_records = 5000      # number of synthetic transactions to generate
fraud_percentage = 0.35   # fraction of the records labelled as fraud
# round() instead of int(): int() truncates, so a float product such as
# 100 * 0.29 == 28.999999999999996 would silently drop one fraud record.
fraud_count = round(total_records * fraud_percentage)
non_fraud_count = total_records - fraud_count

# Lookup pools that the record generator samples from.

# Spending categories a transaction can be assigned to.
categories = [
    'shopping_net',
    'entertainment',
    'misc_net',
    'kids_pets',
    'grocery_pos',
    'gas_transport',
    'health_fitness',
    'shopping_pos',
    'misc_pos',
    'travel',
    'personal_care',
    'home',
    'grocery_net',
    'food_dining',
]

# A fixed pool of 200 Faker-generated company names; generated once so that
# the same merchants are reused across many transactions.
merchant_names = [fake.company() for _ in range(200)]

# Job titles a customer can be assigned.
jobs = [
    'Film/video editor',
    'Materials engineer',
    'Immunologist',
    'Financial adviser',
    'Mechanical engineer',
    'Quantity surveyor',
    'Trading standards officer',
    'Naval architect',
    'Mining engineer',
    'Scientist, audiological',
    'Scientist, biomedical',
    'Surveyor, land/geomatics',
    'Podiatrist',
    'Television production assistant',
    'Librarian, public',
    'Systems developer',
    'Prison officer',
    'Further education lecturer',
    'Clothing/textile technologist',
    'Financial trader',
    'Physiotherapist',
    'Historic buildings inspector/conservation officer',
    'Dealer',
    'Race relations officer',
    'Early years teacher',
    'Geoscientist',
    'Exhibition designer',
    'Chartered public finance accountant',
    'Teacher, special educational needs',
    'Chief Executive Officer',
    'Psychologist, forensic',
    'Electrical engineer',
    'Audiological scientist',
    'Petroleum engineer',
    'Science writer',
    'Agricultural consultant',
    'Radiographer, diagnostic',
    'Barrister',
    'Secondary school teacher',
    'Accounting technician',
    'Magazine features editor',
    'Environmental consultant',
    'Sub',
    'Systems analyst',
    'Paramedic',
    'Energy engineer',
]

# generating single record
def generate_record(fraudulent=False):
    """Build one synthetic transaction row.

    Args:
        fraudulent: When truthy the row is labelled as fraud (indicator 1),
            otherwise 0. Only the label depends on this flag; every other
            field is drawn the same way for both classes.

    Returns:
        list: 14 values in this order: transaction id, category, amount,
        card number, timestamp, merchant name, location, customer id,
        customer name, gender, date of birth, job, address, fraud indicator.
    """
    # The 8-hex-character ids come from the stdlib `random` generator rather
    # than uuid.uuid4(): uuid4 reads OS entropy and ignores random.seed(), so
    # the ids would otherwise differ on every run. The format is unchanged
    # (the first 8 characters of a uuid4 string are 8 random hex digits).
    transaction_id = format(random.getrandbits(32), "08x")
    category = random.choice(categories)
    transaction_amount = round(random.uniform(1, 5000), 2)
    cc_num = fake.credit_card_number(card_type=None)
    timestamp = fake.date_time_between(
        start_date=datetime(2019, 1, 1),
        end_date=datetime(2020, 6, 20),
    ).strftime("%Y-%m-%d %H:%M:%S")
    merchant_name = random.choice(merchant_names)
    # Faker addresses are multi-line; flatten them so each stays in one CSV cell.
    location = fake.address().replace("\n", "- ")
    customer_id = format(random.getrandbits(32), "08x")
    customer_name = fake.name()
    gender = random.choice(['M', 'F'])
    dob = fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d")
    job = random.choice(jobs)
    address = fake.address().replace("\n", "- ")
    fraud_indicator = 1 if fraudulent else 0

    return [
        transaction_id, category, transaction_amount, cc_num, timestamp,
        merchant_name, location, customer_id, customer_name, gender, dob,
        job, address, fraud_indicator
    ]

# Generate the data: all non-fraud rows first, then the fraud rows.
data = [generate_record(fraudulent=False) for _ in range(non_fraud_count)]
data += [generate_record(fraudulent=True) for _ in range(fraud_count)]

# Convert to DataFrame. The column order must match the order of the list
# returned by generate_record().
columns = [
    "TransactionID", "Category", "TransactionAmount", "cc_num", "Timestamp",
    "MerchantName", "Location", "CustomerID", "CustomerName", "gender",
    "dob", "job", "Address", "FraudIndicator"
]
df = pd.DataFrame(data, columns=columns)

# Shuffle the data to mix fraud and non-fraud records. random_state is pinned
# so the row order is the same on every run; without it pandas picks a fresh
# ordering each time.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to CSV
df.to_csv('../ETL/bank_fraud_data.csv', index=False)


In [4]:
# Load the raw credit-card transactions. The rows are split below on the
# `is_fraud` column (1 = fraud, 0 = not fraud).
df = pd.read_csv("credit_card_raw.csv")

# Target sample size and class mix: 5000 rows, 38% of them fraud.
total_records = 5000
fraud = int(total_records * 0.38)
non_fraud = total_records - fraud

# Draw each class separately so the output has exactly the requested counts.
fraud_flag = df['is_fraud']
df_fraud = df[fraud_flag == 1].sample(n=fraud, random_state=42)
df_non_fraud = df[fraud_flag == 0].sample(n=non_fraud, random_state=42)

# Stack the two samples, then shuffle so the classes are interleaved and
# renumber the index from zero.
sample_df = (
    pd.concat([df_fraud, df_non_fraud])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

sample_df.to_csv('../ETL/credit_card_fraud_data.csv', index=False)