In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, recall_score, f1_score, auc
import matplotlib
import matplotlib.pyplot as plt
from ludwig.api import LudwigModel
import requests
import yaml
import json
from pathlib import Path
from typing import Dict, Any, Tuple
import plotly.graph_objects as go
import os
import shutil
import glob
from sdv.metadata import SingleTableMetadata

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_latest_experiment_dir(base_dir):
        # Get a list of all items in the base directory
        items = os.listdir(base_dir)

        # Filter out items that are not directories
        dirs = [item for item in items if os.path.isdir(os.path.join(base_dir, item))]

        # Filter out directories that do not start with 'experiment_run'
        experiment_dirs = [dir for dir in dirs if dir.startswith('experiment_run')]

        # Sort the directories by the experiment number (the part after 'experiment_run')
        sorted_experiment_dirs = sorted(experiment_dirs, key=lambda dir: int(dir.split('_')[-1]) if dir.split('_')[-1].isdigit() else -1)

        # Return the last directory in the sorted list
        latest_dir = sorted_experiment_dirs[-1] if sorted_experiment_dirs else None
        
        # If latest_dir is None, raise an error
        if latest_dir is None:
            raise ValueError(f"No directories starting with 'experiment_run' found in {base_dir}")
        
        # Return the full path of the latest directory
        return os.path.join(base_dir, latest_dir).replace('\\', '/') if latest_dir else None
    
import glob

def delete_file():
    # Get a list of all HDF5 files in the current directory
    hdf5_files = glob.glob('*.hdf5')

    # Print each HDF5 file name
    for file in hdf5_files:
        os.remove(file)

    # Get a list of all JSON files in the current directory
    json_files = glob.glob('*.json')

    # Print each JSON file name
    for file in json_files:
        os.remove(file)

In [3]:
def read_data(data_location) -> pd.DataFrame:
    creditcard = pd.read_csv(data_location)
    return creditcard

In [4]:
# read data and isolate fraud transactions
df = read_data("G:/My Drive/Data-Centric Solutions/07. Blog Posts/kedro/data/creditcard.csv")

In [5]:
# add transaction ID
df['transaction_id'] = df.index

In [6]:
df_fraud = df.loc[df["Class"] == 1]
df_fraud.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,transaction_id
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1,541
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1,623
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1,4920
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,1,6108
6329,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1,6329


In [7]:
# detect metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_fraud)

metadata.update_column(
    column_name='transaction_id',
    sdtype='id')

# set primary key 
metadata.set_primary_key(column_name='transaction_id')
metadata.validate()

Detected metadata:
{
    "columns": {
        "Time": {
            "sdtype": "numerical"
        },
        "V1": {
            "sdtype": "numerical"
        },
        "V2": {
            "sdtype": "numerical"
        },
        "V3": {
            "sdtype": "numerical"
        },
        "V4": {
            "sdtype": "numerical"
        },
        "V5": {
            "sdtype": "numerical"
        },
        "V6": {
            "sdtype": "numerical"
        },
        "V7": {
            "sdtype": "numerical"
        },
        "V8": {
            "sdtype": "numerical"
        },
        "V9": {
            "sdtype": "numerical"
        },
        "V10": {
            "sdtype": "numerical"
        },
        "V11": {
            "sdtype": "numerical"
        },
        "V12": {
            "sdtype": "numerical"
        },
        "V13": {
            "sdtype": "numerical"
        },
        "V14": {
            "sdtype": "numerical"
        },
        "V15": {
            "sdtype": "

In [8]:
from sdv.single_table import TVAESynthesizer
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

synthesizer = TVAESynthesizer(
    metadata, # required
    enforce_min_max_values=True,
    enforce_rounding=False,
    epochs=1000
)
synthesizer.fit(df_fraud)

synthetic_data = synthesizer.sample(num_rows=10000)
synthetic_data.head()

Fitting table None metadata
Fitting formatters for table None
Fitting constraints for table None
Setting the configuration for the ``HyperTransformer`` for table None
Fitting HyperTransformer for table None
Guidance: There are no missing values in column Time. Extra column not created.
Guidance: There are no missing values in column V1. Extra column not created.
Guidance: There are no missing values in column V2. Extra column not created.
Guidance: There are no missing values in column V3. Extra column not created.
Guidance: There are no missing values in column V4. Extra column not created.
Guidance: There are no missing values in column V5. Extra column not created.
Guidance: There are no missing values in column V6. Extra column not created.
Guidance: There are no missing values in column V7. Extra column not created.
Guidance: There are no missing values in column V8. Extra column not created.
Guidance: There are no missing values in column V9. Extra column not created.
Guidance: T

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,transaction_id
0,28586.81752,-11.155812,2.60931,-18.672437,5.28218,-3.625366,-2.576886,-11.495101,1.750511,-4.164061,...,0.74193,-0.084119,0.118208,-0.85313,0.560663,-5.07906,-0.316627,81.708826,1,0
1,49902.060498,-12.208513,17.193707,-26.169159,5.403107,-15.138295,-3.524899,-15.447762,13.129597,-4.137356,...,-1.03514,-0.037801,-0.371156,1.348949,0.019584,1.525381,0.240119,83.149205,1,1
2,155215.609146,-3.537087,3.888565,-6.428892,4.581233,-2.212659,-2.134853,-7.999193,0.085123,-1.679858,...,1.11722,0.047816,-0.192173,0.818457,0.627725,0.185452,0.246259,0.0,1,2
3,143479.211246,-2.663597,1.830799,-5.556075,5.672864,-0.872294,-1.846483,-2.072803,0.567094,-1.611487,...,-1.010366,-0.210342,-0.530721,0.210808,-0.144099,0.503443,0.219241,24.787739,1,3
4,82895.669343,-25.523103,8.479862,-16.36683,7.305313,-12.199212,-2.636703,-15.10226,4.294263,-7.171735,...,0.295297,-0.389192,0.560779,-0.839927,0.360991,-3.142498,-0.782078,0.0,1,4


In [9]:
# Pipeline
def split_data(df: pd.DataFrame, synthetic_data: pd.DataFrame, synth: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:

    train, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
    
    if synth is True:
        train_df = pd.concat([train, synthetic_data], ignore_index=True)
    else:
        train_df = train
    
    return train_df, holdout_df


def run_experiment(train_df: pd.DataFrame, model_yaml, output_dir) -> None:

    # Send a GET request to the URL
    response = requests.get(model_yaml)

    # Raise an exception if the request was unsuccessful
    response.raise_for_status()

    # Load the YAML data from the response text
    config = yaml.safe_load(response.text)

    # Set up your experiment
    model = LudwigModel(config=config)
    experiment_results = model.experiment(
      dataset=train_df,
      output_directory=output_dir
    )
    
    df = pd.DataFrame()
    
    delete_file()

    return None


def run_predictions(holdout_df: pd.DataFrame, output_dir) -> pd.DataFrame:
    
    # dummpy input varibale
    df = exp_run
    
    latest_experiment_dir = get_latest_experiment_dir(output_dir)
    model_path = Path(latest_experiment_dir) / 'model'

    # Load the model
    model = LudwigModel.load(model_path)
    
    # run predictions on holdout
    predictions, _ = model.predict(dataset=holdout_df)
    
    full_predictions = predictions.merge(right=holdout_df,   left_index=True, right_index=True)
    full_predictions['Class_predictions'] = full_predictions['Class_predictions'].map({True: 1, False: 0})
    
    return full_predictions


def model_training_diagnostics(full_predictions: pd.DataFrame, output_dir) -> Tuple[matplotlib.figure.Figure, go.Figure]:
        
    # plot roc curve 
    fpr, tpr, thresholds = roc_curve(full_predictions['Class'], full_predictions['Class_predictions'])
    roc_auc = auc(fpr, tpr)

    # Create the base figure
    fig = go.Figure()

    # Add the ROC curve
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (area = {roc_auc:.2f})'))

    # Add the random guess line
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Guess', line=dict(dash='dash')))

    # Update the layout
    fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=700,
        title='Receiver Operating Characteristic'
    )

    roc_curve_plot = fig
    fig.show()
        
    # plot loss curve
    latest_experiment_dir = get_latest_experiment_dir(output_dir)

    json_path = latest_experiment_dir + "/training_statistics.json"

    # Load the JSON file
    with open(json_path, 'r') as f:
        train_stats = json.load(f)

    train_loss = train_stats['training']['Class']['loss']
    validation_loss = train_stats['validation']['Class']['loss']
    test_loss = train_stats['test']['Class']['loss']

    # Create list of epochs
    epochs = list(range(1, len(train_loss) + 1))

    # Create the plot
    fig = go.Figure()

    # Add traces
    fig.add_trace(go.Scatter(x=epochs, y=train_loss, mode='lines', name='Training loss'))
    fig.add_trace(go.Scatter(x=epochs, y=validation_loss, mode='lines', name='Validation loss'))
    fig.add_trace(go.Scatter(x=epochs, y=test_loss, mode='lines', name='Test loss'))

    # Add details
    fig.update_layout(title='Training, Validation and Test Loss', xaxis_title='Epochs', yaxis_title='Loss')
    
    fig.show()
    
    loss_plot = fig
    
    return loss_plot, roc_curve_plot

In [10]:
model_yaml = "https://raw.githubusercontent.com/john-adeojo/Credit-Card-Fraud-Model-Registry/main/model%20yaml%20files/model_1a.yaml"
output_dir = "../models/"
df = df
synthetic_data = synthetic_data
synth = True

In [11]:
# Pipeline with synthetic data
train_df, holdout_df = split_data(df=df, synthetic_data=synthetic_data, synth=synth)
run_experiment(train_df, model_yaml, output_dir)
full_predictions = run_predictions(holdout_df, output_dir)
model_training_diagnostics(full_predictions, output_dir)

NameError: name 'exp_run' is not defined

In [None]:
# pipeline without synthetic data 
synth = False
train_df, holdout_df = split_data(df=df, synthetic_data=synthetic_data, synth=synth)
run_experiment(train_df)
full_predictions = run_predictions(holdout_df, output_dir)
model_training_diagnostics(full_predictions, output_dir)