<a href="https://colab.research.google.com/github/khp53/Credit-Card-Fraud-Detection-SecureBoost/blob/main/Credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.17.2-py3-none-any.whl.metadata (13 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.35.68-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.35.68-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.0 (from sdv)
  Downloading copulas-0.12.0-py3-none-any.whl.metadata (9.1 kB)
Collecting ctgan>=0.10.2 (from sdv)
  Downloading ctgan-0.10.2-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.6.1 (from sdv)
  Downloading deepecho-0.6.1-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.13.1 (from sdv)
  Downloading rdt-1.13.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.17.0 (from sdv)
  Downloading sdmetrics-0.17.0-py3-none-any.whl.metadata (8.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [2]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic
import numpy as np
import datetime as dt


# Load or create a dataset
real_data = pd.read_csv('creditcard.csv')

# We will need additional metadata as this dataset does not include much valuable metadata
# For PCA complience all of them are encrypted.

# External Metadata
merchant_category = ["Grocery", "Electronics", "Clothing", "Travel", "Restaurants"]
real_data["Merchant Category"] = np.random.choice(merchant_category, size=len(real_data))

device_type = ["Mobile", "Desktop", "Tablet"]
real_data["Device Type"] = np.random.choice(device_type, size=len(real_data))

locations = ["City A", "City B", "City C"]
real_data["Location"] = np.random.choice(locations, size=len(real_data))

# HEre we are using the existing time values from our real_data to calculate a time stamp for metadata
timestamp_start = dt.datetime(2022, 11, 14)
real_data["Transaction Timestamp"] = real_data["Time"].apply(lambda x: timestamp_start + dt.timedelta(days=x))

payment_methods = ["Credit Card", "Debit Card", "Apple Pay", "Google Pay", "Interact"]
real_data["Payment Method"] = np.random.choice(payment_methods, size=len(real_data))

channel = ["Online", "In-Store", "POS"]
real_data["Channel"] = np.random.choice(channel, size=len(real_data))

season = ["Winter", "Spring", "Summer", "Holiday"]
real_data["Season"] = np.random.choice(season, size=len(real_data))

card_types = ["Visa", "MasterCard", "American Express", "Discover"]
real_data["Card Type"] = np.random.choice(card_types, size=len(real_data))

real_data["Credit Limit"] = np.random.uniform(1000, 10000, size=len(real_data))
real_data["Cardholder Risk Score"] = np.random.uniform(300, 850, size=len(real_data))
real_data["Number of Active Cards"] = np.random.randint(1, 5, size=len(real_data))
real_data["Loyalty Points Available"] = np.random.randint(0, 5000, size=len(real_data))

In [3]:
def generate_card_id(card_type):
    if card_type == 'Visa':
        return '4' + ''.join(np.random.choice(list('0123456789'), size=15))
    elif card_type == 'MasterCard':
        prefix = np.random.choice(['51', '52', '53', '54', '55'] + [str(i) for i in range(2221, 2721)])
        return prefix + ''.join(np.random.choice(list('0123456789'), size=16-len(prefix)))
    elif card_type == 'American Express':
        prefix = np.random.choice(['34', '37'])
        return prefix + ''.join(np.random.choice(list('0123456789'), size=13))
    elif card_type == 'Discover':
        prefix = np.random.choice(['6011'] + [str(i) for i in range(622126, 622926)] + ['644', '645', '646', '647', '648', '649', '65'])
        return prefix + ''.join(np.random.choice(list('0123456789'), size=16-len(prefix)))
    else:
        return None

In [4]:
real_data['Card ID'] = real_data['Card Type'].apply(generate_card_id)

In [5]:
real_data.to_csv("augmented_creditcard.csv", index=False)

In [6]:
from sdv.metadata import SingleTableMetadata

augmented_data = pd.read_csv("augmented_creditcard.csv")

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(augmented_data)

print(metadata.to_dict())

# Save metadata to json for later reading
metadata.save_to_json("creditcard_metadata.json")
print("Metadata saved!")

{'columns': {'Time': {'sdtype': 'numerical'}, 'V1': {'sdtype': 'numerical'}, 'V2': {'sdtype': 'numerical'}, 'V3': {'sdtype': 'numerical'}, 'V4': {'sdtype': 'numerical'}, 'V5': {'sdtype': 'numerical'}, 'V6': {'sdtype': 'numerical'}, 'V7': {'sdtype': 'numerical'}, 'V8': {'sdtype': 'numerical'}, 'V9': {'sdtype': 'numerical'}, 'V10': {'sdtype': 'numerical'}, 'V11': {'sdtype': 'numerical'}, 'V12': {'sdtype': 'numerical'}, 'V13': {'sdtype': 'numerical'}, 'V14': {'sdtype': 'numerical'}, 'V15': {'sdtype': 'numerical'}, 'V16': {'sdtype': 'numerical'}, 'V17': {'sdtype': 'numerical'}, 'V18': {'sdtype': 'numerical'}, 'V19': {'sdtype': 'numerical'}, 'V20': {'sdtype': 'numerical'}, 'V21': {'sdtype': 'numerical'}, 'V22': {'sdtype': 'numerical'}, 'V23': {'sdtype': 'numerical'}, 'V24': {'sdtype': 'numerical'}, 'V25': {'sdtype': 'numerical'}, 'V26': {'sdtype': 'numerical'}, 'V27': {'sdtype': 'numerical'}, 'V28': {'sdtype': 'numerical'}, 'Amount': {'sdtype': 'numerical'}, 'Class': {'sdtype': 'categorical

In [7]:
json_metadata = SingleTableMetadata.load_from_json("creditcard_metadata.json")

synthesizer = CTGANSynthesizer(json_metadata, epochs=100,
    verbose=True)
reduced_data = augmented_data.sample(frac=0.3, random_state=56)
synthesizer.fit(reduced_data)

synthetic_data = synthesizer.sample(num_rows=len(reduced_data))
print(synthetic_data.head(5))

synthetic_data.to_csv("synthetic_creditcard.csv", index=False)

Gen. (-6.12) | Discrim. (-0.04): 100%|██████████| 100/100 [1:14:48<00:00, 44.89s/it]


      Time        V1        V2        V3        V4        V5        V6  \
0  68183.0 -2.555683  1.046685  1.020982  1.566225  3.022399  1.321870   
1  71429.0  1.315486 -0.371440 -0.070024  0.281124  0.334142  0.887258   
2  51993.0 -0.947109  1.290269  0.083987 -0.652643  1.661304 -0.653085   
3  64155.0 -0.302848  1.081142  0.568993  2.189436  1.593832  0.074635   
4  74694.0  2.004417 -0.660734 -0.561478 -0.188918  0.360716 -0.743623   

         V7        V8        V9  ...  Transaction Timestamp  Payment Method  \
0  1.108944 -0.072699 -0.509197  ...          sdv-pii-24p3x      Debit Card   
1  0.183357  0.269768  1.117159  ...          sdv-pii-ngu20      Debit Card   
2  0.879837 -0.047507 -0.120699  ...          sdv-pii-kp3sm      Google Pay   
3  0.624375  0.803518 -0.288782  ...          sdv-pii-u659p       Apple Pay   
4  0.224457  0.000284 -0.476166  ...          sdv-pii-ltoow       Apple Pay   

    Channel  Season         Card Type  Credit Limit  Cardholder Risk Score  \
0 