In [1]:
from kfp import compiler
import google.cloud.aiplatform as aip
from kfp.dsl import component, Output, Input, Dataset, pipeline

In [4]:
project_id = "jesusarguelles-sandbox"
pipeline_root_path = "gs://jesusarguelles-staging/"
bucket_id = "jesusarguelles-datasets-public"
bucket_folder_name = "money_laundering_detection"
raw_file_name = "paysim_dataset.csv"
raw_data_full_path = f"gs://{bucket_id}/{bucket_folder_name}/{raw_file_name}"
filtered_data_1 = "filtered_data_1.csv"
filtered_data_2 = "filtered_data_2.csv"
filtered_data_3 = "filtered_data_3.csv"

In [5]:
@component(
    base_image="python:3.10",
    packages_to_install=["google-cloud-storage", "gcsfs", "pandas"]
)
def data_preprocess_stage_1(
        raw_dataset: str,
        output_dataset_one : Output[Dataset],
        output_dataset_two : Output[Dataset]
):
    import os
    import csv
    import logging
    import numpy as np
    import pandas as pd
    from random import randint


    logging.warning("DATA PREPROCESSING 1 STAGE")
    logging.warning("Reading Dataset...")

    X = pd.read_csv(raw_dataset)
    X = X.to_numpy()

    logging.warning("Read Dataset")

    nameOrigCol = 3
    nameDestCol = 6
    nameOrig = []
    nameDest = []
    nameCount = {}
    namesWithMoreThanOneOccurrence = []

    logging.warning("Checking Each Person's Transactions Count...")

    for name in X[:, nameOrigCol] :
        if nameCount.get(name, -1) == -1 :
            nameOrig.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    for name in X[:, nameDestCol] :
        if nameCount.get(name, -1) == -1 :
            nameDest.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    logging.warning("Count Identification Done")

    logging.warning("Calculating Median ...")

    countArr = []
    count = 0
    for attr, value in nameCount.items() :
        if value > 40 :
            countArr.append(value)
            count += 1
    median = np.median(countArr)

    logging.warning(f"Median : {median}")

    logging.warning("Filtering Data Based on Transactions Count...")
    csv_golden_data = []

    for i in range(X.shape[0]) :
        if nameCount.get(X[i, 3], -1) > 40 or nameCount.get(X[i, 6], -1) > 40 :
            csv_golden_data.append(X[i, :])

    logging.warning("Filtering Done")

    logging.warning("Storing Filtered Data in data_processed folder...")

    new_file_name = "filtered_data.csv"

    with open(output_dataset_one.path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(csv_golden_data)

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 2 STAGE")

    logging.warning("Reading Preprocessed 1 dataset...")

    X = pd.DataFrame(csv_golden_data)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 1 dataset...")

    csv_dataset_primary = []
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 6
    oldbalanceDest = 7
    accountType = 8
    isFraud = 9
    isFlaggedFraud = 10

    logging.warning("Changing Labels of Type Column ...")

    transfer = ["WIRE_IN", "WIRE_OUT"]
    for i in range(X.shape[0]):
        arr = []
        arr.append(X[i,step])
        if X[i,trans_type] =="PAYMENT":
            arr.append("CREDIT")
        elif X[i,trans_type] =="TRANSFER":
            arr.append(transfer[randint(0,1)])
        else:
            arr.append(X[i,trans_type])
        arr.append(X[i,amount])
        arr.append(X[i,nameOrig])
        arr.append(X[i,oldbalanceOrg])
        arr.append(X[i,nameDest])
        arr.append(X[i,oldbalanceDest])
        if X[i,trans_type] == "TRANSFER":
            arr.append("FOREIGN")
        else:
            arr.append("DOMESTIC")

        arr.append(X[i,isFraud])
        arr.append(X[i,isFlaggedFraud])

        csv_dataset_primary.append(arr)

    logging.warning("Changing Labels Done")
    logging.warning("Storing Data in Data_processed Folder...")


    columns=['step','trans_type','amount','nameOrig','oldbalanceOrg',
             'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_primary = pd.DataFrame(csv_dataset_primary, columns=columns)

    data_primary.to_csv(output_dataset_two.path, index=False)

    logging.warning("Storing Data Done")

In [6]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs"]
)
def data_preprocess_stage_2(
        input_dataset: Input[Dataset],
        output_dataset_three: Output[Dataset]
):
    import logging
    import pandas as pd
    # data_path = f'gs://{bucket_id}/{folder_id}/filtered_data_2.csv'

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 3 STAGE")

    logging.warning("Reading Preprocessed 2 dataset...")

    X = pd.read_csv(input_dataset.path)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 2 dataset")

    #col
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 5
    oldbalanceDest = 6
    accountType = 7
    isFraud = 8

    #col
    entity = 0
    incommingDomestic30 = 1
    incommingDomestic60 = 2
    incommingDomestic90 = 3
    outgoingDomestic30 = 4
    outgoingDomestic60 = 5
    outgoingDomestic90 = 6
    incommingForeign30 = 7
    incommingForeign60 = 8
    incommingForeign90 = 9
    outgoingForeign30 = 10
    outgoingForeign60 = 11
    outgoingForeign90 = 12
    incoming_domestic_count_30 = 13
    incoming_domestic_count_60 = 14
    incoming_domestic_count_90 = 15
    outgoing_domestic_count_30 = 16
    outgoing_domestic_count_60 = 17
    outgoing_domestic_count_90 = 18
    incoming_foreign_count_30 = 19
    incoming_foreign_count_60 = 20
    incoming_foreign_count_90 = 21
    outgoing_foreign_count_30 = 22
    outgoing_foreign_count_60 = 23
    outgoing_foreign_count_90 = 24
    balance_difference_30 = 25
    balance_difference_60 = 26
    balance_difference_90 = 27
    isFraudSec = 28

    csv_dataset_secondary = []
    entities_pos = {}
    enititesDict = {}

    logging.warning("Creating New Features Using Transaction History...")

    def getSecRow(entity):
        return [entity,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

    for i in range(X.shape[0]):
        source_entity = X[i,nameOrig]
        dest_entity = X[i,nameDest]

        source_pos = entities_pos.get(source_entity,-1)
        if source_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[source_entity] = pos
            source_pos = pos

            row = getSecRow(source_entity)

            csv_dataset_secondary.append(row)

        dest_pos = entities_pos.get(dest_entity,-1)
        if dest_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[dest_entity] = pos
            dest_pos = pos

            row = getSecRow(dest_entity)

            csv_dataset_secondary.append(row)

        transferAmountSource = 0
        transferAmountDest = 0

        if X[i,trans_type] == "CASH_IN" or X[i,trans_type] == "CREDIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "CASH_OUT" or X[i,trans_type] == "DEBIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if X[i,trans_type] == "WIRE_IN":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingForeign90] += X[i,amount]
                # print(dest_pos,outgoingForeign90,i,amount)
                csv_dataset_secondary[dest_pos][outgoingForeign90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "WIRE_OUT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingForeign90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if enititesDict.get(source_entity,-1) == -1:
            enititesDict[source_entity] = {
                'day1Bal': X[i,oldbalanceOrg],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        if enititesDict.get(dest_entity,-1) == -1:
            enititesDict[dest_entity] = {
                'day1Bal': X[i,oldbalanceDest],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        incomingForSource = ["CASH_IN","CREDIT","WIRE_IN"]
        incomingForDest = ["CASH_OUT","DEBIT","WIRE_OUT"]
        outgoingForDest = incomingForSource
        outgoingForSource = incomingForDest

        if X[i,step]<=30:
            enititesDict[source_entity]['day30Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day30Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic30'] += 1


            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign30'] += 1
                else:

                    enititesDict[dest_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic30'] += 1

        if X[i,step]<=60:
            enititesDict[source_entity]['day60Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day60Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic60'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic60'] += 1

        if X[i,step]<=90:
            enititesDict[source_entity]['day90Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day90Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic90'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic90'] += 1


        csv_dataset_secondary[source_pos][balance_difference_30] = enititesDict[source_entity]['day30Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_60] = enititesDict[source_entity]['day60Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_90] = enititesDict[source_entity]['day90Bal'] - enititesDict[source_entity]['day1Bal']



        csv_dataset_secondary[source_pos][incoming_domestic_count_30] = enititesDict[source_entity]['countIncomingDomestic30']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_30] = enititesDict[source_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[source_pos][incoming_domestic_count_60] = enititesDict[source_entity]['countIncomingDomestic60']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_60] = enititesDict[source_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[source_pos][incoming_domestic_count_90] = enititesDict[source_entity]['countIncomingDomestic90']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_90] = enititesDict[source_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[source_pos][incoming_foreign_count_30] = enititesDict[source_entity]['countIncomingForeign30']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_30] = enititesDict[source_entity]['countOutgoingForeign30']
        csv_dataset_secondary[source_pos][incoming_foreign_count_60] = enititesDict[source_entity]['countIncomingForeign60']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_60] = enititesDict[source_entity]['countOutgoingForeign60']
        csv_dataset_secondary[source_pos][incoming_foreign_count_90] = enititesDict[source_entity]['countIncomingForeign90']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_90] = enititesDict[source_entity]['countOutgoingForeign90']

        csv_dataset_secondary[source_pos][isFraudSec] = csv_dataset_secondary[source_pos][isFraudSec] or X[i,isFraud]

        csv_dataset_secondary[dest_pos][incoming_domestic_count_30] = enititesDict[dest_entity]['countIncomingDomestic30']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_30] = enititesDict[dest_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_60] = enititesDict[dest_entity]['countIncomingDomestic60']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_60] = enititesDict[dest_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_90] = enititesDict[dest_entity]['countIncomingDomestic90']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_90] = enititesDict[dest_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_30] = enititesDict[dest_entity]['countIncomingForeign30']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_30] = enititesDict[dest_entity]['countOutgoingForeign30']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_60] = enititesDict[dest_entity]['countIncomingForeign60']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_60] = enititesDict[dest_entity]['countOutgoingForeign60']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_90] = enititesDict[dest_entity]['countIncomingForeign90']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_90] = enititesDict[dest_entity]['countOutgoingForeign90']


    columns = ['entity','incoming_domestic_amount_30','incoming_domestic_amount_60','incoming_domestic_amount_90',
               'outgoing_domestic_amount_30','outgoing_domestic_amount_60','outgoing_domestic_amount_90',
               'incoming_foreign_amount_30','incoming_foreign_amount_60','incoming_foreign_amount_90',
               'outgoing_foreign_amount_30','outgoing_foreign_amount_60','outgoing_foreign_amount_90',
               'incoming_domestic_count_30','incoming_domestic_count_60','incoming_domestic_count_90',
               'outgoing_domestic_count_30','outgoing_domestic_count_60','outgoing_domestic_count_90',
               'incoming_foreign_count_30','incoming_foreign_count_60','incoming_foreign_count_90',
               'outgoing_foreign_count_30','outgoing_foreign_count_60','outgoing_foreign_count_90',
               'balance_difference_30','balance_difference_60','balance_difference_90','isFraud']

    logging.warning("Creating New Features Done")

    logging.warning("Storing Data in Data_processed Folder...")

    # filtered_data_3.csv
    data_secondary = pd.DataFrame(csv_dataset_secondary, columns=columns)
    data_secondary.to_csv(output_dataset_three.path,index=False)
    logging.warning("Storing Data Done")

In [7]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def feature_selection(
        input_dataset : Input[Dataset],
        output_dataset : Output[Dataset]
):
    """

    :param input_dataset_1: filtered_data_3.csv from data-preprocess-stage-2
    :param output_dataset: feature_importances.csv
    """
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier


    logging.warning("----------")
    logging.warning("FEATURE SELECTION STAGE")

    logging.warning("Reading Filtered Data 3 ...")

    dataframeX = pd.read_csv(input_dataset.path)
    col_names = list(dataframeX.columns.values)
    dataMat = dataframeX.to_numpy()

    logging.warning("Read Filtered Data 3")

    logging.warning("Creating X and Y Variables...")

    X = dataMat[:,1:-2]
    Y = dataMat[:,-1]

    logging.warning(f"Shape of X: {X.shape} and Shape of Y: {Y.shape}")

    logging.warning("Instiantiating Random Forest Model...")

    model = RandomForestClassifier(random_state=42)

    logging.warning("Fitting Data...")

    model.fit(X, Y.astype(int))

    logging.warning("Checking Feature Importances...")

    feature_imp = model.feature_importances_

    sorted_feature_vals = np.sort(feature_imp)
    sorted_feature_indexes = np.argsort(feature_imp)

    logging.warning("Significant Features in decreasing order of importance: ")

    logging.warning("Storing Feature Importances in reports...")

    fea_imp = [[col_names[i+2], feature_imp[i]] for i in reversed(sorted_feature_indexes)]
    features = pd.DataFrame(fea_imp, columns=["features", "importance_score"])
    # feature_importances.csv
    features.to_csv(output_dataset.path, index=False)

    logging.warning("Storing Feature Importances Done")

In [8]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def segment_generation(
        input_dataset_1 : Input[Dataset],
        input_dataset_2 : Input[Dataset],
        input_dataset_3 : Input[Dataset],
        silhoutte_scores : Output[Dataset],
        final_dataset_output: Output[Dataset]
):
    """

    :param input_dataset_1:  filtered_data_2.csv from data-preprocess-stage-1
    :param input_dataset_2:  filtered_data_3.csv from data-preprocess-stage-2
    :param input_dataset_3:  feature_importances.csv from features_election
    :param silhoutte_scores: Metrics
    :param final_dataset_output: final_dataset
    :return:
    """
    # Segment Generation
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("SEGMENT GENERATOR STAGE")

    def getClusterPredictions(data, true_k):
        model = KMeans(n_clusters=true_k)
        model.fit(data)
        prediction = model.predict(data)

        return prediction

    def getBestCluster(X,_min=2,_max=10):
        selected_cluster = 0
        previous_sil_coeff = 0.001 #some random small number not 0
        sc_vals = []
        for n_cluster in range(_min, _max):
            kmeans = KMeans(n_clusters=n_cluster).fit(X)
            label = kmeans.labels_

            sil_coeff = silhouette_score(X, label, metric='euclidean', sample_size=1000)
            sc_vals.append(sil_coeff)
            # print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

            percent_change = (sil_coeff-previous_sil_coeff)*100/previous_sil_coeff

            # return when below a threshold of 1%
            if percent_change<1:
                selected_cluster = n_cluster-1

            previous_sil_coeff = sil_coeff

        return selected_cluster or _max, sc_vals

    logging.warning("Reading Filtered Data 3 ...")

    X_dataframe = pd.read_csv(input_dataset_2.path)
    X = X_dataframe.to_numpy()

    logging.warning("Read Filtered Data 3")

    col_names = list(X_dataframe.columns.values)

    X_trimmed_features = np.zeros((X.shape[0],1))

    logging.warning("Importing Feature Importances...")

    #feature_path = f'gs://{bucket_id}/{folder_id}/feature_importances.csv'
    features = pd.read_csv(input_dataset_3.path)

    logging.warning("Selecting Top 13 Features for CLustering...")

    top_13 = features.iloc[:13, 0].tolist()

    logging.warning("Top 13 Features stored in List")

    for feature in top_13:
        X_trimmed_features = np.concatenate((X_trimmed_features,np.expand_dims(X_dataframe[feature],axis=1)),axis=1)
    X_trimmed_features = X_trimmed_features[:,1:]

    logging.warning("Choosing Best Number Of Clusters...")

    min_value = 2
    max_value = 10
    true_k, sc_vals = getBestCluster(X_trimmed_features,_min=min_value,_max=max_value)
    true_k = 5

    logging.warning("Storing Silhoutte Scores...")


    sil_score = [[i, sc_vals[i-min_value]] for i in range(min_value, max_value)]
    sil = pd.DataFrame(sil_score, columns=["no_of_clusters", "silhoutte_score"])
    sil.to_csv(silhoutte_scores.path, index=False)
    #sil.to_csv(f'gs://{bucket_id}/{folder_id}/silhoutte_scores.csv', index=False)

    logging.warning("Storing Silhoutte Scores Done")

    logging.warning("Creating Clusters with Best No Of Clusters...")

    prediction = getClusterPredictions(X_trimmed_features, true_k)
    seg_dict = {}
    for i in range(X.shape[0]):
        seg_dict[X[i,0]] = prediction[i]

    logging.warning("Inputing Filtered Data 2 Dataset...")

    X_dataframe_pri = pd.read_csv(input_dataset_1.path)
    X_pri = X_dataframe_pri.to_numpy()
    col_names = list(X_dataframe_pri.columns.values)

    logging.warning("Read Filtered 2 Data")

    logging.warning("Creating Final Dataset with segments...")

    X_with_segments = []
    for i in range(X_pri.shape[0]):
        X_with_segments.append(np.concatenate(([[seg_dict[X_pri[i,3]]]],np.expand_dims(X_pri[i,:],axis=0)),axis=1)[0])

    segmented_columns = ['segment','step','trans_type','amount','nameOrig','oldbalanceOrg',
                         'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_segmented = pd.DataFrame(X_with_segments, columns = segmented_columns)
    data_segmented = data_segmented.drop('isFlaggedFraud', axis=1)
    data_segmented.to_csv(final_dataset_output.path, index=False)

    logging.warning("Storing Final Dataset Done")

In [15]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn", "catboost"]
)
def training(
        project_id: str,
        bucket_id: str,
        bucket_folder: str,
        input_dataset : Input[Dataset],
        model_path: Output[Dataset]
):
    """

    :param input_dataset: fina_dataset.csv from segment-generation
    """
    import json
    import pickle
    import logging
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from catboost import CatBoostClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

    logging.warning("----------")
    logging.warning("MODEL CREATION STAGE")

    logging.warning("Reading Final Dataset...")

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_id)

    dataMat = pd.read_csv(input_dataset.path)
    data = dataMat.to_numpy()

    logging.warning("Read Final Dataset")

    logging.warning("Checking Categorical Features...")

    cat_feat = [i for i in dataMat.columns if dataMat[i].dtypes == 'O']

    logging.warning("Checking Missing Values...")

    a = dict(dataMat.isnull().sum())
    b = [[i, a[i]] for i in a.keys()]
    missing = pd.DataFrame(b, columns=['features', 'null_values_count'])

    logging.warning("Storing Missing Values...")

    missing.to_csv("missing_values.csv", index=False)

    logging.warning("Storing Missing Values Done")

    logging.warning("Encoding Categorical Features...")

    encoder = LabelEncoder()
    print("------")
    print(cat_feat)
    print("------")

    label_encoders = {}
    label_mappings = {}

    for i in cat_feat:
        encoder.fit(dataMat[i])
        dataMat[i] = encoder.transform(dataMat[i])

        label_encoders[i] = encoder
        label_mappings[i] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

    blob = bucket.blob(f"{bucket_folder}/label_encoder.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(encoder, f)

    logging.warning("Features Encoding Done")

    logging.warning("Creating X and y variables ...")

    X = dataMat.iloc[:, :-1]
    y = dataMat['isFraud']

    logging.warning(f"Shape of X: {X.shape} and Shape of y: {y.shape}")

    logging.warning("Splitting Dataset...")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logging.warning("Instantiating Model...")

    model = CatBoostClassifier(random_state=42, class_weights={0:1, 1:12}, silent=True)

    logging.warning("Fitting Model...")

    model.fit(X_train, y_train)
    y_pred_cat = model.predict(X_test)

    logging.warning("Saving Model...")

    #model_path = "model.pkl"
    blob = bucket.blob(f"{bucket_folder}/model.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(model, f)

    f.close()

    logging.warning("Saving Model Metrics...")

    metric_file_path = "performance.json"
    # with open(metric_file_path, "r") as f:
    #     data = json.load(f)

    model_metric = {
        "time_stamp": datetime.now().strftime("%d-%m-%Y_%H:%M:%S"),
        "confusion_matrix": confusion_matrix(y_test, y_pred_cat).tolist(),
        "precision": precision_score(y_test, y_pred_cat),
        "recall": recall_score(y_test, y_pred_cat),
        "f1_score": f1_score(y_test, y_pred_cat)
    }

    # data['model_metric'].append(model_metric)
    # with open(metric_file_path, "w") as f:
    #     json.dump(data, f, indent=4)

    logging.warning("Model Metrics Stored")

In [16]:
from kfp.dsl import pipeline

@pipeline(name="money_laundering_detection")
def pipeline(
        project_id: str,
        bucket_id: str,
        bucket_folder: str
):
    preproces_job_1 = data_preprocess_stage_1(raw_dataset=raw_data_full_path)
    preprocess_job_2 = data_preprocess_stage_2(input_dataset=preproces_job_1.outputs["output_dataset_two"])
    feature_selection_job = feature_selection(input_dataset=preprocess_job_2.outputs["output_dataset_three"])
    segment_generation_job = segment_generation(
        input_dataset_1=preproces_job_1.outputs["output_dataset_two"],
        input_dataset_2=preprocess_job_2.outputs["output_dataset_three"],
        input_dataset_3=feature_selection_job.outputs["output_dataset"]
    )
    training_job = training(
        project_id = project_id,
        bucket_id = bucket_id,
        bucket_folder = bucket_folder,
        input_dataset=segment_generation_job.outputs["final_dataset_output"])

In [17]:
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='money_laundering_detection.yaml'
)

In [18]:

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=project_id,
    location="us-central1",
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="money_laundering_detection",
    template_path="money_laundering_detection.yaml",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'bucket_id': bucket_id,
        'bucket_folder': bucket_folder_name
    }
)

job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/390227712642/locations/us-central1/pipelineJobs/money-laundering-detection-20240507145523
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/390227712642/locations/us-central1/pipelineJobs/money-laundering-detection-20240507145523')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/money-laundering-detection-20240507145523?project=390227712642


## Testing the Model

In [13]:
import pickle
from google.cloud import storage

bucket = storage.Client(project="jesusarguelles-sandbox").bucket("jesusarguelles-datasets-public")

blob = bucket.blob("money_laundering_detection/model.pkl")

with blob.open("rb") as f:
    model = pickle.load(f)
f.close()

blob = bucket.blob("money_laundering_detection/label_encoder.pkl")

with blob.open("rb") as f:
    encoder = pickle.load(f)

In [14]:
predict = {
    "segment": [0],
    "step": [1],
    "trans_type": ["DEBIT"],
    "amount": [181.00],
    "nameOrig": ["C1900366749"],
    "oldbalanceOrg": [4465.0],
    "nameDest": ["C997608398"],
    "oldbalanceDest": ["10845.0"],
    "accountType": ["DOMESTIC"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [54]:
df[df.isFraud == 1]

Unnamed: 0,segment,step,trans_type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,accountType,isFraud
0,0,1,CASH_OUT,181.00,C840083671,181.00,C38997010,21182.00,DOMESTIC,1
350,0,1,CASH_OUT,416001.33,C749981943,0.00,C667346055,102.00,DOMESTIC,1
448,0,1,CASH_OUT,1277212.77,C467632528,1277212.77,C716083600,0.00,DOMESTIC,1
480,0,1,CASH_OUT,35063.63,C1635772897,35063.63,C1983025922,31140.00,DOMESTIC,1
794,0,1,CASH_OUT,132842.64,C13692003,4499.08,C297927961,0.00,DOMESTIC,1
...,...,...,...,...,...,...,...,...,...,...
211794,0,551,CASH_OUT,813992.49,C990823587,813992.49,C1506986844,24794625.20,DOMESTIC,1
211914,0,567,CASH_OUT,175203.45,C1731825076,175203.45,C1046160944,2735776.45,DOMESTIC,1
212005,2,572,CASH_OUT,2000718.20,C1426906570,2000718.20,C154319946,10984451.39,DOMESTIC,1
212451,0,600,CASH_OUT,612229.86,C199984853,612229.86,C1428539340,2275776.03,DOMESTIC,1


In [61]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}

to_predict_df = pd.DataFrame(predict)
to_predict_df = df[df.isFraud == 1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
A value is trying to be set 

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [58]:
df[df.isFraud == 1].iloc[7,:]

segment                     0
step                        2
trans_type           WIRE_OUT
amount               18627.02
nameOrig          C1375503918
oldbalanceOrg        18627.02
nameDest           C234430897
oldbalanceDest            0.0
accountType           FOREIGN
isFraud                     1
Name: 1423, dtype: object

In [63]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

array([0])

In [85]:
to_predict_df = df[df.isFraud == 1].iloc[:,:-1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']
for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
model.predict(to_predict_df)

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [83]:
to_predict_df.reset_index(inplace=True)

In [137]:
to_predict_df = df[df.isFraud == 1].iloc[:,:-1]
to_predict_df.reset_index(drop=True, inplace=True)

In [138]:
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']
print(to_predict_df.head(9))
for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
print(model.predict(to_predict_df))

   segment  step trans_type      amount     nameOrig  oldbalanceOrg  \
0        0     1   CASH_OUT      181.00   C840083671         181.00   
1        0     1   CASH_OUT   416001.33   C749981943           0.00   
2        0     1   CASH_OUT  1277212.77   C467632528     1277212.77   
3        0     1   CASH_OUT    35063.63  C1635772897       35063.63   
4        0     1   CASH_OUT   132842.64    C13692003        4499.08   
5        0     2   CASH_OUT  1096187.24    C77163673     1096187.24   
6        0     2   CASH_OUT   963532.14   C430329518      963532.14   
7        0     2   WIRE_OUT    18627.02  C1375503918       18627.02   
8        0     3    WIRE_IN    10539.37  C1134864869       10539.37   

      nameDest  oldbalanceDest accountType  
0    C38997010        21182.00    DOMESTIC  
1   C667346055          102.00    DOMESTIC  
2   C716083600            0.00    DOMESTIC  
3  C1983025922        31140.00    DOMESTIC  
4   C297927961            0.00    DOMESTIC  
5   C644345897     

In [129]:
to_predict_df

Unnamed: 0,segment,step,trans_type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,accountType
0,0,1,0,181.00,109,181.00,69,21182.00,0
1,0,1,0,416001.33,102,0.00,89,102.00,0
2,0,1,0,1277212.77,87,1277212.77,92,0.00,0
3,0,1,0,35063.63,44,35063.63,47,31140.00,0
4,0,1,0,132842.64,19,4499.08,64,0.00,0
...,...,...,...,...,...,...,...,...,...
117,0,551,0,813992.49,121,813992.49,19,24794625.20,0
118,0,567,0,175203.45,53,175203.45,1,2735776.45,0
119,2,572,0,2000718.20,27,2000718.20,21,10984451.39,0
120,0,600,0,612229.86,68,612229.86,14,2275776.03,0


In [125]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

In [None]:
to_predict_df 0     2   WIRE_OUT    18627.02  C1375503918       18627.02#%%
from kfp import compiler
import google.cloud.aiplatform as aip
from kfp.dsl import component, Output, Input, Dataset, pipeline

In [None]:
project_id = "jesusarguelles-sandbox"
pipeline_root_path = "gs://jesusarguelles-staging/"
bucket_id = "jesusarguelles-datasets-public"
bucket_folder_name = "money_laundering_detection"
raw_file_name = "paysim_dataset.csv"
raw_data_full_path = f"gs://{bucket_id}/{bucket_folder_name}/{raw_file_name}"
filtered_data_1 = "filtered_data_1.csv"
filtered_data_2 = "filtered_data_2.csv"
filtered_data_3 = "filtered_data_3.csv"

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["google-cloud-storage", "gcsfs", "pandas"]
)
def data_preprocess_stage_1(
        raw_dataset: str,
        output_dataset_one : Output[Dataset],
        output_dataset_two : Output[Dataset]
):
    import os
    import csv
    import logging
    import numpy as np
    import pandas as pd
    from random import randint


    logging.warning("DATA PREPROCESSING 1 STAGE")
    logging.warning("Reading Dataset...")

    X = pd.read_csv(raw_dataset)
    X = X.to_numpy()

    logging.warning("Read Dataset")

    nameOrigCol = 3
    nameDestCol = 6
    nameOrig = []
    nameDest = []
    nameCount = {}
    namesWithMoreThanOneOccurrence = []

    logging.warning("Checking Each Person's Transactions Count...")

    for name in X[:, nameOrigCol] :
        if nameCount.get(name, -1) == -1 :
            nameOrig.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    for name in X[:, nameDestCol] :
        if nameCount.get(name, -1) == -1 :
            nameDest.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    logging.warning("Count Identification Done")

    logging.warning("Calculating Median ...")

    countArr = []
    count = 0
    for attr, value in nameCount.items() :
        if value > 40 :
            countArr.append(value)
            count += 1
    median = np.median(countArr)

    logging.warning(f"Median : {median}")

    logging.warning("Filtering Data Based on Transactions Count...")
    csv_golden_data = []

    for i in range(X.shape[0]) :
        if nameCount.get(X[i, 3], -1) > 40 or nameCount.get(X[i, 6], -1) > 40 :
            csv_golden_data.append(X[i, :])

    logging.warning("Filtering Done")

    logging.warning("Storing Filtered Data in data_processed folder...")

    new_file_name = "filtered_data.csv"

    with open(output_dataset_one.path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(csv_golden_data)

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 2 STAGE")

    logging.warning("Reading Preprocessed 1 dataset...")

    X = pd.DataFrame(csv_golden_data)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 1 dataset...")

    csv_dataset_primary = []
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 6
    oldbalanceDest = 7
    accountType = 8
    isFraud = 9
    isFlaggedFraud = 10

    logging.warning("Changing Labels of Type Column ...")

    transfer = ["WIRE_IN", "WIRE_OUT"]
    for i in range(X.shape[0]):
        arr = []
        arr.append(X[i,step])
        if X[i,trans_type] =="PAYMENT":
            arr.append("CREDIT")
        elif X[i,trans_type] =="TRANSFER":
            arr.append(transfer[randint(0,1)])
        else:
            arr.append(X[i,trans_type])
        arr.append(X[i,amount])
        arr.append(X[i,nameOrig])
        arr.append(X[i,oldbalanceOrg])
        arr.append(X[i,nameDest])
        arr.append(X[i,oldbalanceDest])
        if X[i,trans_type] == "TRANSFER":
            arr.append("FOREIGN")
        else:
            arr.append("DOMESTIC")

        arr.append(X[i,isFraud])
        arr.append(X[i,isFlaggedFraud])

        csv_dataset_primary.append(arr)

    logging.warning("Changing Labels Done")
    logging.warning("Storing Data in Data_processed Folder...")


    columns=['step','trans_type','amount','nameOrig','oldbalanceOrg',
             'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_primary = pd.DataFrame(csv_dataset_primary, columns=columns)

    data_primary.to_csv(output_dataset_two.path, index=False)

    logging.warning("Storing Data Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs"]
)
def data_preprocess_stage_2(
        input_dataset: Input[Dataset],
        output_dataset_three: Output[Dataset]
):
    import logging
    import pandas as pd
    # data_path = f'gs://{bucket_id}/{folder_id}/filtered_data_2.csv'

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 3 STAGE")

    logging.warning("Reading Preprocessed 2 dataset...")

    X = pd.read_csv(input_dataset.path)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 2 dataset")

    #col
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 5
    oldbalanceDest = 6
    accountType = 7
    isFraud = 8

    #col
    entity = 0
    incommingDomestic30 = 1
    incommingDomestic60 = 2
    incommingDomestic90 = 3
    outgoingDomestic30 = 4
    outgoingDomestic60 = 5
    outgoingDomestic90 = 6
    incommingForeign30 = 7
    incommingForeign60 = 8
    incommingForeign90 = 9
    outgoingForeign30 = 10
    outgoingForeign60 = 11
    outgoingForeign90 = 12
    incoming_domestic_count_30 = 13
    incoming_domestic_count_60 = 14
    incoming_domestic_count_90 = 15
    outgoing_domestic_count_30 = 16
    outgoing_domestic_count_60 = 17
    outgoing_domestic_count_90 = 18
    incoming_foreign_count_30 = 19
    incoming_foreign_count_60 = 20
    incoming_foreign_count_90 = 21
    outgoing_foreign_count_30 = 22
    outgoing_foreign_count_60 = 23
    outgoing_foreign_count_90 = 24
    balance_difference_30 = 25
    balance_difference_60 = 26
    balance_difference_90 = 27
    isFraudSec = 28

    csv_dataset_secondary = []
    entities_pos = {}
    enititesDict = {}

    logging.warning("Creating New Features Using Transaction History...")

    def getSecRow(entity):
        return [entity,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

    for i in range(X.shape[0]):
        source_entity = X[i,nameOrig]
        dest_entity = X[i,nameDest]

        source_pos = entities_pos.get(source_entity,-1)
        if source_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[source_entity] = pos
            source_pos = pos

            row = getSecRow(source_entity)

            csv_dataset_secondary.append(row)

        dest_pos = entities_pos.get(dest_entity,-1)
        if dest_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[dest_entity] = pos
            dest_pos = pos

            row = getSecRow(dest_entity)

            csv_dataset_secondary.append(row)

        transferAmountSource = 0
        transferAmountDest = 0

        if X[i,trans_type] == "CASH_IN" or X[i,trans_type] == "CREDIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "CASH_OUT" or X[i,trans_type] == "DEBIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if X[i,trans_type] == "WIRE_IN":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingForeign90] += X[i,amount]
                # print(dest_pos,outgoingForeign90,i,amount)
                csv_dataset_secondary[dest_pos][outgoingForeign90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "WIRE_OUT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingForeign90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if enititesDict.get(source_entity,-1) == -1:
            enititesDict[source_entity] = {
                'day1Bal': X[i,oldbalanceOrg],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        if enititesDict.get(dest_entity,-1) == -1:
            enititesDict[dest_entity] = {
                'day1Bal': X[i,oldbalanceDest],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        incomingForSource = ["CASH_IN","CREDIT","WIRE_IN"]
        incomingForDest = ["CASH_OUT","DEBIT","WIRE_OUT"]
        outgoingForDest = incomingForSource
        outgoingForSource = incomingForDest

        if X[i,step]<=30:
            enititesDict[source_entity]['day30Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day30Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic30'] += 1


            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign30'] += 1
                else:

                    enititesDict[dest_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic30'] += 1

        if X[i,step]<=60:
            enititesDict[source_entity]['day60Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day60Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic60'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic60'] += 1

        if X[i,step]<=90:
            enititesDict[source_entity]['day90Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day90Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic90'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic90'] += 1


        csv_dataset_secondary[source_pos][balance_difference_30] = enititesDict[source_entity]['day30Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_60] = enititesDict[source_entity]['day60Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_90] = enititesDict[source_entity]['day90Bal'] - enititesDict[source_entity]['day1Bal']



        csv_dataset_secondary[source_pos][incoming_domestic_count_30] = enititesDict[source_entity]['countIncomingDomestic30']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_30] = enititesDict[source_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[source_pos][incoming_domestic_count_60] = enititesDict[source_entity]['countIncomingDomestic60']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_60] = enititesDict[source_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[source_pos][incoming_domestic_count_90] = enititesDict[source_entity]['countIncomingDomestic90']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_90] = enititesDict[source_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[source_pos][incoming_foreign_count_30] = enititesDict[source_entity]['countIncomingForeign30']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_30] = enititesDict[source_entity]['countOutgoingForeign30']
        csv_dataset_secondary[source_pos][incoming_foreign_count_60] = enititesDict[source_entity]['countIncomingForeign60']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_60] = enititesDict[source_entity]['countOutgoingForeign60']
        csv_dataset_secondary[source_pos][incoming_foreign_count_90] = enititesDict[source_entity]['countIncomingForeign90']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_90] = enititesDict[source_entity]['countOutgoingForeign90']

        csv_dataset_secondary[source_pos][isFraudSec] = csv_dataset_secondary[source_pos][isFraudSec] or X[i,isFraud]

        csv_dataset_secondary[dest_pos][incoming_domestic_count_30] = enititesDict[dest_entity]['countIncomingDomestic30']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_30] = enititesDict[dest_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_60] = enititesDict[dest_entity]['countIncomingDomestic60']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_60] = enititesDict[dest_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_90] = enititesDict[dest_entity]['countIncomingDomestic90']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_90] = enititesDict[dest_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_30] = enititesDict[dest_entity]['countIncomingForeign30']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_30] = enititesDict[dest_entity]['countOutgoingForeign30']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_60] = enititesDict[dest_entity]['countIncomingForeign60']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_60] = enititesDict[dest_entity]['countOutgoingForeign60']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_90] = enititesDict[dest_entity]['countIncomingForeign90']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_90] = enititesDict[dest_entity]['countOutgoingForeign90']


    columns = ['entity','incoming_domestic_amount_30','incoming_domestic_amount_60','incoming_domestic_amount_90',
               'outgoing_domestic_amount_30','outgoing_domestic_amount_60','outgoing_domestic_amount_90',
               'incoming_foreign_amount_30','incoming_foreign_amount_60','incoming_foreign_amount_90',
               'outgoing_foreign_amount_30','outgoing_foreign_amount_60','outgoing_foreign_amount_90',
               'incoming_domestic_count_30','incoming_domestic_count_60','incoming_domestic_count_90',
               'outgoing_domestic_count_30','outgoing_domestic_count_60','outgoing_domestic_count_90',
               'incoming_foreign_count_30','incoming_foreign_count_60','incoming_foreign_count_90',
               'outgoing_foreign_count_30','outgoing_foreign_count_60','outgoing_foreign_count_90',
               'balance_difference_30','balance_difference_60','balance_difference_90','isFraud']

    logging.warning("Creating New Features Done")

    logging.warning("Storing Data in Data_processed Folder...")

    # filtered_data_3.csv
    data_secondary = pd.DataFrame(csv_dataset_secondary, columns=columns)
    data_secondary.to_csv(output_dataset_three.path,index=False)
    logging.warning("Storing Data Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def feature_selection(
        input_dataset : Input[Dataset],
        output_dataset : Output[Dataset]
):
    """

    :param input_dataset_1: filtered_data_3.csv from data-preprocess-stage-2
    :param output_dataset: feature_importances.csv
    """
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier


    logging.warning("----------")
    logging.warning("FEATURE SELECTION STAGE")

    logging.warning("Reading Filtered Data 3 ...")

    dataframeX = pd.read_csv(input_dataset.path)
    col_names = list(dataframeX.columns.values)
    dataMat = dataframeX.to_numpy()

    logging.warning("Read Filtered Data 3")

    logging.warning("Creating X and Y Variables...")

    X = dataMat[:,1:-2]
    Y = dataMat[:,-1]

    logging.warning(f"Shape of X: {X.shape} and Shape of Y: {Y.shape}")

    logging.warning("Instiantiating Random Forest Model...")

    model = RandomForestClassifier(random_state=42)

    logging.warning("Fitting Data...")

    model.fit(X, Y.astype(int))

    logging.warning("Checking Feature Importances...")

    feature_imp = model.feature_importances_

    sorted_feature_vals = np.sort(feature_imp)
    sorted_feature_indexes = np.argsort(feature_imp)

    logging.warning("Significant Features in decreasing order of importance: ")

    logging.warning("Storing Feature Importances in reports...")

    fea_imp = [[col_names[i+2], feature_imp[i]] for i in reversed(sorted_feature_indexes)]
    features = pd.DataFrame(fea_imp, columns=["features", "importance_score"])
    # feature_importances.csv
    features.to_csv(output_dataset.path, index=False)

    logging.warning("Storing Feature Importances Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def segment_generation(
        input_dataset_1 : Input[Dataset],
        input_dataset_2 : Input[Dataset],
        input_dataset_3 : Input[Dataset],
        silhoutte_scores : Output[Dataset],
        final_dataset_output: Output[Dataset]
):
    """

    :param input_dataset_1:  filtered_data_2.csv from data-preprocess-stage-1
    :param input_dataset_2:  filtered_data_3.csv from data-preprocess-stage-2
    :param input_dataset_3:  feature_importances.csv from features_election
    :param silhoutte_scores: Metrics
    :param final_dataset_output: final_dataset
    :return:
    """
    # Segment Generation
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("SEGMENT GENERATOR STAGE")

    def getClusterPredictions(data, true_k):
        model = KMeans(n_clusters=true_k)
        model.fit(data)
        prediction = model.predict(data)

        return prediction

    def getBestCluster(X,_min=2,_max=10):
        selected_cluster = 0
        previous_sil_coeff = 0.001 #some random small number not 0
        sc_vals = []
        for n_cluster in range(_min, _max):
            kmeans = KMeans(n_clusters=n_cluster).fit(X)
            label = kmeans.labels_

            sil_coeff = silhouette_score(X, label, metric='euclidean', sample_size=1000)
            sc_vals.append(sil_coeff)
            # print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

            percent_change = (sil_coeff-previous_sil_coeff)*100/previous_sil_coeff

            # return when below a threshold of 1%
            if percent_change<1:
                selected_cluster = n_cluster-1

            previous_sil_coeff = sil_coeff

        return selected_cluster or _max, sc_vals

    logging.warning("Reading Filtered Data 3 ...")

    X_dataframe = pd.read_csv(input_dataset_2.path)
    X = X_dataframe.to_numpy()

    logging.warning("Read Filtered Data 3")

    col_names = list(X_dataframe.columns.values)

    X_trimmed_features = np.zeros((X.shape[0],1))

    logging.warning("Importing Feature Importances...")

    #feature_path = f'gs://{bucket_id}/{folder_id}/feature_importances.csv'
    features = pd.read_csv(input_dataset_3.path)

    logging.warning("Selecting Top 13 Features for CLustering...")

    top_13 = features.iloc[:13, 0].tolist()

    logging.warning("Top 13 Features stored in List")

    for feature in top_13:
        X_trimmed_features = np.concatenate((X_trimmed_features,np.expand_dims(X_dataframe[feature],axis=1)),axis=1)
    X_trimmed_features = X_trimmed_features[:,1:]

    logging.warning("Choosing Best Number Of Clusters...")

    min_value = 2
    max_value = 10
    true_k, sc_vals = getBestCluster(X_trimmed_features,_min=min_value,_max=max_value)
    true_k = 5

    logging.warning("Storing Silhoutte Scores...")


    sil_score = [[i, sc_vals[i-min_value]] for i in range(min_value, max_value)]
    sil = pd.DataFrame(sil_score, columns=["no_of_clusters", "silhoutte_score"])
    sil.to_csv(silhoutte_scores.path, index=False)
    #sil.to_csv(f'gs://{bucket_id}/{folder_id}/silhoutte_scores.csv', index=False)

    logging.warning("Storing Silhoutte Scores Done")

    logging.warning("Creating Clusters with Best No Of Clusters...")

    prediction = getClusterPredictions(X_trimmed_features, true_k)
    seg_dict = {}
    for i in range(X.shape[0]):
        seg_dict[X[i,0]] = prediction[i]

    logging.warning("Inputing Filtered Data 2 Dataset...")

    X_dataframe_pri = pd.read_csv(input_dataset_1.path)
    X_pri = X_dataframe_pri.to_numpy()
    col_names = list(X_dataframe_pri.columns.values)

    logging.warning("Read Filtered 2 Data")

    logging.warning("Creating Final Dataset with segments...")

    X_with_segments = []
    for i in range(X_pri.shape[0]):
        X_with_segments.append(np.concatenate(([[seg_dict[X_pri[i,3]]]],np.expand_dims(X_pri[i,:],axis=0)),axis=1)[0])

    segmented_columns = ['segment','step','trans_type','amount','nameOrig','oldbalanceOrg',
                         'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_segmented = pd.DataFrame(X_with_segments, columns = segmented_columns)
    data_segmented = data_segmented.drop('isFlaggedFraud', axis=1)
    data_segmented.to_csv(final_dataset_output.path, index=False)

    logging.warning("Storing Final Dataset Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn", "catboost"]
)
def training(
        project_id: str,
        bucket_id: str,
        bucket_folder: str,
        input_dataset : Input[Dataset],
        model_path: Output[Dataset]
):
    """

    :param input_dataset: fina_dataset.csv from segment-generation
    """
    import json
    import pickle
    import logging
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from catboost import CatBoostClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

    logging.warning("----------")
    logging.warning("MODEL CREATION STAGE")

    logging.warning("Reading Final Dataset...")

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_id)

    dataMat = pd.read_csv(input_dataset.path)
    data = dataMat.to_numpy()

    logging.warning("Read Final Dataset")

    logging.warning("Checking Categorical Features...")

    cat_feat = [i for i in dataMat.columns if dataMat[i].dtypes == 'O']

    logging.warning("Checking Missing Values...")

    a = dict(dataMat.isnull().sum())
    b = [[i, a[i]] for i in a.keys()]
    missing = pd.DataFrame(b, columns=['features', 'null_values_count'])

    logging.warning("Storing Missing Values...")

    missing.to_csv("missing_values.csv", index=False)

    logging.warning("Storing Missing Values Done")

    logging.warning("Encoding Categorical Features...")

    encoder = LabelEncoder()
    for i in cat_feat:
        dataMat[i] = encoder.fit_transform(dataMat[i])

    blob = bucket.blob(f"{bucket_folder}/label_encoder.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(encoder, f)

    logging.warning("Features Encoding Done")

    logging.warning("Creating X and y variables ...")

    X = dataMat.iloc[:, :-1]
    y = dataMat['isFraud']

    logging.warning(f"Shape of X: {X.shape} and Shape of y: {y.shape}")

    logging.warning("Splitting Dataset...")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logging.warning("Instantiating Model...")

    model = CatBoostClassifier(random_state=42, class_weights={0:1, 1:12}, silent=True)

    logging.warning("Fitting Model...")

    model.fit(X_train, y_train)
    y_pred_cat = model.predict(X_test)

    logging.warning("Saving Model...")

    #model_path = "model.pkl"
    blob = bucket.blob(f"{bucket_folder}/model.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(model, f)

    f.close()

    logging.warning("Saving Model Metrics...")

    metric_file_path = "performance.json"
    # with open(metric_file_path, "r") as f:
    #     data = json.load(f)

    model_metric = {
        "time_stamp": datetime.now().strftime("%d-%m-%Y_%H:%M:%S"),
        "confusion_matrix": confusion_matrix(y_test, y_pred_cat).tolist(),
        "precision": precision_score(y_test, y_pred_cat),
        "recall": recall_score(y_test, y_pred_cat),
        "f1_score": f1_score(y_test, y_pred_cat)
    }

    # data['model_metric'].append(model_metric)
    # with open(metric_file_path, "w") as f:
    #     json.dump(data, f, indent=4)

    logging.warning("Model Metrics Stored")

In [None]:
from kfp.dsl import pipeline

@pipeline(name="money_laundering_detection")
def pipeline(
        project_id: str,
        bucket_id: str,
        bucket_folder: str
):
    preproces_job_1 = data_preprocess_stage_1(raw_dataset=raw_data_full_path)
    preprocess_job_2 = data_preprocess_stage_2(input_dataset=preproces_job_1.outputs["output_dataset_two"])
    feature_selection_job = feature_selection(input_dataset=preprocess_job_2.outputs["output_dataset_three"])
    segment_generation_job = segment_generation(
        input_dataset_1=preproces_job_1.outputs["output_dataset_two"],
        input_dataset_2=preprocess_job_2.outputs["output_dataset_three"],
        input_dataset_3=feature_selection_job.outputs["output_dataset"]
    )
    training_job = training(
        project_id = project_id,
        bucket_id = bucket_id,
        bucket_folder = bucket_folder,
        input_dataset=segment_generation_job.outputs["final_dataset_output"])

In [None]:
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='money_laundering_detection.yaml'
)

In [None]:

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=project_id,
    location="us-central1",
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="money_laundering_detection",
    template_path="money_laundering_detection.yaml",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'bucket_id': bucket_id,
        'bucket_folder': bucket_folder_name
    }
)

job.submit()

## Testing the Model

In [None]:
import pickle
from google.cloud import storage

bucket = storage.Client(project="jesusarguelles-sandbox").bucket("jesusarguelles-datasets-public")

blob = bucket.blob("money_laundering_detection/model.pkl")

with blob.open("rb") as f:
    model = pickle.load(f)
f.close()

blob = bucket.blob("money_laundering_detection/label_encoder.pkl")

with blob.open("rb") as f:
    encoder = pickle.load(f)

In [None]:
predict = {
    "segment": [0],
    "step": [1],
    "trans_type": ["DEBIT"],
    "amount": [181.00],
    "nameOrig": ["C1900366749"],
    "oldbalanceOrg": [4465.0],
    "nameDest": ["C997608398"],
    "oldbalanceDest": ["10845.0"],
    "accountType": ["DOMESTIC"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [None]:
df[df.isFraud == 1]

In [None]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}

to_predict_df = pd.DataFrame(predict)
to_predict_df = df[df.isFraud == 1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [None]:
df[df.isFraud == 1].iloc[7,:]

In [None]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [None]:
to_predict_df = df[df.isFraud == 1].iloc[:,:-1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']
for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
model.predict(to_predict_df)

In [None]:
to_predict_df.reset_index(inplace=True)

In [None]:
to_predict_df = df[df.isFraud == 1].iloc[:,:-1]
to_predict_df.reset_index(drop=True, inplace=True)

In [None]:
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']
print(to_predict_df.head(9))
for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
print(model.predict(to_predict_df))

In [None]:
to_predict_df

In [None]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

In [None]:
 0     2   WIRE_OUT    18627.02  C1375503918       18627.02#%%
from kfp import compiler
import google.cloud.aiplatform as aip
from kfp.dsl import component, Output, Input, Dataset, pipeline

In [None]:
project_id = "jesusarguelles-sandbox"
pipeline_root_path = "gs://jesusarguelles-staging/"
bucket_id = "jesusarguelles-datasets-public"
bucket_folder_name = "money_laundering_detection"
raw_file_name = "paysim_dataset.csv"
raw_data_full_path = f"gs://{bucket_id}/{bucket_folder_name}/{raw_file_name}"
filtered_data_1 = "filtered_data_1.csv"
filtered_data_2 = "filtered_data_2.csv"
filtered_data_3 = "filtered_data_3.csv"

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["google-cloud-storage", "gcsfs", "pandas"]
)
def data_preprocess_stage_1(
        raw_dataset: str,
        output_dataset_one : Output[Dataset],
        output_dataset_two : Output[Dataset]
):
    import os
    import csv
    import logging
    import numpy as np
    import pandas as pd
    from random import randint


    logging.warning("DATA PREPROCESSING 1 STAGE")
    logging.warning("Reading Dataset...")

    X = pd.read_csv(raw_dataset)
    X = X.to_numpy()

    logging.warning("Read Dataset")

    nameOrigCol = 3
    nameDestCol = 6
    nameOrig = []
    nameDest = []
    nameCount = {}
    namesWithMoreThanOneOccurrence = []

    logging.warning("Checking Each Person's Transactions Count...")

    for name in X[:, nameOrigCol] :
        if nameCount.get(name, -1) == -1 :
            nameOrig.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    for name in X[:, nameDestCol] :
        if nameCount.get(name, -1) == -1 :
            nameDest.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    logging.warning("Count Identification Done")

    logging.warning("Calculating Median ...")

    countArr = []
    count = 0
    for attr, value in nameCount.items() :
        if value > 40 :
            countArr.append(value)
            count += 1
    median = np.median(countArr)

    logging.warning(f"Median : {median}")

    logging.warning("Filtering Data Based on Transactions Count...")
    csv_golden_data = []

    for i in range(X.shape[0]) :
        if nameCount.get(X[i, 3], -1) > 40 or nameCount.get(X[i, 6], -1) > 40 :
            csv_golden_data.append(X[i, :])

    logging.warning("Filtering Done")

    logging.warning("Storing Filtered Data in data_processed folder...")

    new_file_name = "filtered_data.csv"

    with open(output_dataset_one.path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(csv_golden_data)

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 2 STAGE")

    logging.warning("Reading Preprocessed 1 dataset...")

    X = pd.DataFrame(csv_golden_data)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 1 dataset...")

    csv_dataset_primary = []
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 6
    oldbalanceDest = 7
    accountType = 8
    isFraud = 9
    isFlaggedFraud = 10

    logging.warning("Changing Labels of Type Column ...")

    transfer = ["WIRE_IN", "WIRE_OUT"]
    for i in range(X.shape[0]):
        arr = []
        arr.append(X[i,step])
        if X[i,trans_type] =="PAYMENT":
            arr.append("CREDIT")
        elif X[i,trans_type] =="TRANSFER":
            arr.append(transfer[randint(0,1)])
        else:
            arr.append(X[i,trans_type])
        arr.append(X[i,amount])
        arr.append(X[i,nameOrig])
        arr.append(X[i,oldbalanceOrg])
        arr.append(X[i,nameDest])
        arr.append(X[i,oldbalanceDest])
        if X[i,trans_type] == "TRANSFER":
            arr.append("FOREIGN")
        else:
            arr.append("DOMESTIC")

        arr.append(X[i,isFraud])
        arr.append(X[i,isFlaggedFraud])

        csv_dataset_primary.append(arr)

    logging.warning("Changing Labels Done")
    logging.warning("Storing Data in Data_processed Folder...")


    columns=['step','trans_type','amount','nameOrig','oldbalanceOrg',
             'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_primary = pd.DataFrame(csv_dataset_primary, columns=columns)

    data_primary.to_csv(output_dataset_two.path, index=False)

    logging.warning("Storing Data Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs"]
)
def data_preprocess_stage_2(
        input_dataset: Input[Dataset],
        output_dataset_three: Output[Dataset]
):
    import logging
    import pandas as pd
    # data_path = f'gs://{bucket_id}/{folder_id}/filtered_data_2.csv'

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 3 STAGE")

    logging.warning("Reading Preprocessed 2 dataset...")

    X = pd.read_csv(input_dataset.path)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 2 dataset")

    #col
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 5
    oldbalanceDest = 6
    accountType = 7
    isFraud = 8

    #col
    entity = 0
    incommingDomestic30 = 1
    incommingDomestic60 = 2
    incommingDomestic90 = 3
    outgoingDomestic30 = 4
    outgoingDomestic60 = 5
    outgoingDomestic90 = 6
    incommingForeign30 = 7
    incommingForeign60 = 8
    incommingForeign90 = 9
    outgoingForeign30 = 10
    outgoingForeign60 = 11
    outgoingForeign90 = 12
    incoming_domestic_count_30 = 13
    incoming_domestic_count_60 = 14
    incoming_domestic_count_90 = 15
    outgoing_domestic_count_30 = 16
    outgoing_domestic_count_60 = 17
    outgoing_domestic_count_90 = 18
    incoming_foreign_count_30 = 19
    incoming_foreign_count_60 = 20
    incoming_foreign_count_90 = 21
    outgoing_foreign_count_30 = 22
    outgoing_foreign_count_60 = 23
    outgoing_foreign_count_90 = 24
    balance_difference_30 = 25
    balance_difference_60 = 26
    balance_difference_90 = 27
    isFraudSec = 28

    csv_dataset_secondary = []
    entities_pos = {}
    enititesDict = {}

    logging.warning("Creating New Features Using Transaction History...")

    def getSecRow(entity):
        return [entity,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

    for i in range(X.shape[0]):
        source_entity = X[i,nameOrig]
        dest_entity = X[i,nameDest]

        source_pos = entities_pos.get(source_entity,-1)
        if source_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[source_entity] = pos
            source_pos = pos

            row = getSecRow(source_entity)

            csv_dataset_secondary.append(row)

        dest_pos = entities_pos.get(dest_entity,-1)
        if dest_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[dest_entity] = pos
            dest_pos = pos

            row = getSecRow(dest_entity)

            csv_dataset_secondary.append(row)

        transferAmountSource = 0
        transferAmountDest = 0

        if X[i,trans_type] == "CASH_IN" or X[i,trans_type] == "CREDIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "CASH_OUT" or X[i,trans_type] == "DEBIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if X[i,trans_type] == "WIRE_IN":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingForeign90] += X[i,amount]
                # print(dest_pos,outgoingForeign90,i,amount)
                csv_dataset_secondary[dest_pos][outgoingForeign90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "WIRE_OUT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingForeign90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if enititesDict.get(source_entity,-1) == -1:
            enititesDict[source_entity] = {
                'day1Bal': X[i,oldbalanceOrg],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        if enititesDict.get(dest_entity,-1) == -1:
            enititesDict[dest_entity] = {
                'day1Bal': X[i,oldbalanceDest],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        incomingForSource = ["CASH_IN","CREDIT","WIRE_IN"]
        incomingForDest = ["CASH_OUT","DEBIT","WIRE_OUT"]
        outgoingForDest = incomingForSource
        outgoingForSource = incomingForDest

        if X[i,step]<=30:
            enititesDict[source_entity]['day30Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day30Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic30'] += 1


            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign30'] += 1
                else:

                    enititesDict[dest_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic30'] += 1

        if X[i,step]<=60:
            enititesDict[source_entity]['day60Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day60Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic60'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic60'] += 1

        if X[i,step]<=90:
            enititesDict[source_entity]['day90Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day90Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic90'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic90'] += 1


        csv_dataset_secondary[source_pos][balance_difference_30] = enititesDict[source_entity]['day30Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_60] = enititesDict[source_entity]['day60Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_90] = enititesDict[source_entity]['day90Bal'] - enititesDict[source_entity]['day1Bal']



        csv_dataset_secondary[source_pos][incoming_domestic_count_30] = enititesDict[source_entity]['countIncomingDomestic30']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_30] = enititesDict[source_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[source_pos][incoming_domestic_count_60] = enititesDict[source_entity]['countIncomingDomestic60']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_60] = enititesDict[source_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[source_pos][incoming_domestic_count_90] = enititesDict[source_entity]['countIncomingDomestic90']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_90] = enititesDict[source_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[source_pos][incoming_foreign_count_30] = enititesDict[source_entity]['countIncomingForeign30']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_30] = enititesDict[source_entity]['countOutgoingForeign30']
        csv_dataset_secondary[source_pos][incoming_foreign_count_60] = enititesDict[source_entity]['countIncomingForeign60']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_60] = enititesDict[source_entity]['countOutgoingForeign60']
        csv_dataset_secondary[source_pos][incoming_foreign_count_90] = enititesDict[source_entity]['countIncomingForeign90']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_90] = enititesDict[source_entity]['countOutgoingForeign90']

        csv_dataset_secondary[source_pos][isFraudSec] = csv_dataset_secondary[source_pos][isFraudSec] or X[i,isFraud]

        csv_dataset_secondary[dest_pos][incoming_domestic_count_30] = enititesDict[dest_entity]['countIncomingDomestic30']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_30] = enititesDict[dest_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_60] = enititesDict[dest_entity]['countIncomingDomestic60']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_60] = enititesDict[dest_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_90] = enititesDict[dest_entity]['countIncomingDomestic90']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_90] = enititesDict[dest_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_30] = enititesDict[dest_entity]['countIncomingForeign30']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_30] = enititesDict[dest_entity]['countOutgoingForeign30']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_60] = enititesDict[dest_entity]['countIncomingForeign60']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_60] = enititesDict[dest_entity]['countOutgoingForeign60']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_90] = enititesDict[dest_entity]['countIncomingForeign90']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_90] = enititesDict[dest_entity]['countOutgoingForeign90']


    columns = ['entity','incoming_domestic_amount_30','incoming_domestic_amount_60','incoming_domestic_amount_90',
               'outgoing_domestic_amount_30','outgoing_domestic_amount_60','outgoing_domestic_amount_90',
               'incoming_foreign_amount_30','incoming_foreign_amount_60','incoming_foreign_amount_90',
               'outgoing_foreign_amount_30','outgoing_foreign_amount_60','outgoing_foreign_amount_90',
               'incoming_domestic_count_30','incoming_domestic_count_60','incoming_domestic_count_90',
               'outgoing_domestic_count_30','outgoing_domestic_count_60','outgoing_domestic_count_90',
               'incoming_foreign_count_30','incoming_foreign_count_60','incoming_foreign_count_90',
               'outgoing_foreign_count_30','outgoing_foreign_count_60','outgoing_foreign_count_90',
               'balance_difference_30','balance_difference_60','balance_difference_90','isFraud']

    logging.warning("Creating New Features Done")

    logging.warning("Storing Data in Data_processed Folder...")

    # filtered_data_3.csv
    data_secondary = pd.DataFrame(csv_dataset_secondary, columns=columns)
    data_secondary.to_csv(output_dataset_three.path,index=False)
    logging.warning("Storing Data Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def feature_selection(
        input_dataset : Input[Dataset],
        output_dataset : Output[Dataset]
):
    """

    :param input_dataset_1: filtered_data_3.csv from data-preprocess-stage-2
    :param output_dataset: feature_importances.csv
    """
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier


    logging.warning("----------")
    logging.warning("FEATURE SELECTION STAGE")

    logging.warning("Reading Filtered Data 3 ...")

    dataframeX = pd.read_csv(input_dataset.path)
    col_names = list(dataframeX.columns.values)
    dataMat = dataframeX.to_numpy()

    logging.warning("Read Filtered Data 3")

    logging.warning("Creating X and Y Variables...")

    X = dataMat[:,1:-2]
    Y = dataMat[:,-1]

    logging.warning(f"Shape of X: {X.shape} and Shape of Y: {Y.shape}")

    logging.warning("Instiantiating Random Forest Model...")

    model = RandomForestClassifier(random_state=42)

    logging.warning("Fitting Data...")

    model.fit(X, Y.astype(int))

    logging.warning("Checking Feature Importances...")

    feature_imp = model.feature_importances_

    sorted_feature_vals = np.sort(feature_imp)
    sorted_feature_indexes = np.argsort(feature_imp)

    logging.warning("Significant Features in decreasing order of importance: ")

    logging.warning("Storing Feature Importances in reports...")

    fea_imp = [[col_names[i+2], feature_imp[i]] for i in reversed(sorted_feature_indexes)]
    features = pd.DataFrame(fea_imp, columns=["features", "importance_score"])
    # feature_importances.csv
    features.to_csv(output_dataset.path, index=False)

    logging.warning("Storing Feature Importances Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def segment_generation(
        input_dataset_1 : Input[Dataset],
        input_dataset_2 : Input[Dataset],
        input_dataset_3 : Input[Dataset],
        silhoutte_scores : Output[Dataset],
        final_dataset_output: Output[Dataset]
):
    """

    :param input_dataset_1:  filtered_data_2.csv from data-preprocess-stage-1
    :param input_dataset_2:  filtered_data_3.csv from data-preprocess-stage-2
    :param input_dataset_3:  feature_importances.csv from features_election
    :param silhoutte_scores: Metrics
    :param final_dataset_output: final_dataset
    :return:
    """
    # Segment Generation
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("SEGMENT GENERATOR STAGE")

    def getClusterPredictions(data, true_k):
        model = KMeans(n_clusters=true_k)
        model.fit(data)
        prediction = model.predict(data)

        return prediction

    def getBestCluster(X,_min=2,_max=10):
        selected_cluster = 0
        previous_sil_coeff = 0.001 #some random small number not 0
        sc_vals = []
        for n_cluster in range(_min, _max):
            kmeans = KMeans(n_clusters=n_cluster).fit(X)
            label = kmeans.labels_

            sil_coeff = silhouette_score(X, label, metric='euclidean', sample_size=1000)
            sc_vals.append(sil_coeff)
            # print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

            percent_change = (sil_coeff-previous_sil_coeff)*100/previous_sil_coeff

            # return when below a threshold of 1%
            if percent_change<1:
                selected_cluster = n_cluster-1

            previous_sil_coeff = sil_coeff

        return selected_cluster or _max, sc_vals

    logging.warning("Reading Filtered Data 3 ...")

    X_dataframe = pd.read_csv(input_dataset_2.path)
    X = X_dataframe.to_numpy()

    logging.warning("Read Filtered Data 3")

    col_names = list(X_dataframe.columns.values)

    X_trimmed_features = np.zeros((X.shape[0],1))

    logging.warning("Importing Feature Importances...")

    #feature_path = f'gs://{bucket_id}/{folder_id}/feature_importances.csv'
    features = pd.read_csv(input_dataset_3.path)

    logging.warning("Selecting Top 13 Features for CLustering...")

    top_13 = features.iloc[:13, 0].tolist()

    logging.warning("Top 13 Features stored in List")

    for feature in top_13:
        X_trimmed_features = np.concatenate((X_trimmed_features,np.expand_dims(X_dataframe[feature],axis=1)),axis=1)
    X_trimmed_features = X_trimmed_features[:,1:]

    logging.warning("Choosing Best Number Of Clusters...")

    min_value = 2
    max_value = 10
    true_k, sc_vals = getBestCluster(X_trimmed_features,_min=min_value,_max=max_value)
    true_k = 5

    logging.warning("Storing Silhoutte Scores...")


    sil_score = [[i, sc_vals[i-min_value]] for i in range(min_value, max_value)]
    sil = pd.DataFrame(sil_score, columns=["no_of_clusters", "silhoutte_score"])
    sil.to_csv(silhoutte_scores.path, index=False)
    #sil.to_csv(f'gs://{bucket_id}/{folder_id}/silhoutte_scores.csv', index=False)

    logging.warning("Storing Silhoutte Scores Done")

    logging.warning("Creating Clusters with Best No Of Clusters...")

    prediction = getClusterPredictions(X_trimmed_features, true_k)
    seg_dict = {}
    for i in range(X.shape[0]):
        seg_dict[X[i,0]] = prediction[i]

    logging.warning("Inputing Filtered Data 2 Dataset...")

    X_dataframe_pri = pd.read_csv(input_dataset_1.path)
    X_pri = X_dataframe_pri.to_numpy()
    col_names = list(X_dataframe_pri.columns.values)

    logging.warning("Read Filtered 2 Data")

    logging.warning("Creating Final Dataset with segments...")

    X_with_segments = []
    for i in range(X_pri.shape[0]):
        X_with_segments.append(np.concatenate(([[seg_dict[X_pri[i,3]]]],np.expand_dims(X_pri[i,:],axis=0)),axis=1)[0])

    segmented_columns = ['segment','step','trans_type','amount','nameOrig','oldbalanceOrg',
                         'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_segmented = pd.DataFrame(X_with_segments, columns = segmented_columns)
    data_segmented = data_segmented.drop('isFlaggedFraud', axis=1)
    data_segmented.to_csv(final_dataset_output.path, index=False)

    logging.warning("Storing Final Dataset Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn", "catboost"]
)
def training(
        project_id: str,
        bucket_id: str,
        bucket_folder: str,
        input_dataset : Input[Dataset],
        model_path: Output[Dataset]
):
    """

    :param input_dataset: fina_dataset.csv from segment-generation
    """
    import json
    import pickle
    import logging
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from catboost import CatBoostClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

    logging.warning("----------")
    logging.warning("MODEL CREATION STAGE")

    logging.warning("Reading Final Dataset...")

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_id)

    dataMat = pd.read_csv(input_dataset.path)
    data = dataMat.to_numpy()

    logging.warning("Read Final Dataset")

    logging.warning("Checking Categorical Features...")

    cat_feat = [i for i in dataMat.columns if dataMat[i].dtypes == 'O']

    logging.warning("Checking Missing Values...")

    a = dict(dataMat.isnull().sum())
    b = [[i, a[i]] for i in a.keys()]
    missing = pd.DataFrame(b, columns=['features', 'null_values_count'])

    logging.warning("Storing Missing Values...")

    missing.to_csv("missing_values.csv", index=False)

    logging.warning("Storing Missing Values Done")

    logging.warning("Encoding Categorical Features...")

    encoder = LabelEncoder()
    for i in cat_feat:
        dataMat[i] = encoder.fit_transform(dataMat[i])

    blob = bucket.blob(f"{bucket_folder}/label_encoder.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(encoder, f)

    logging.warning("Features Encoding Done")

    logging.warning("Creating X and y variables ...")

    X = dataMat.iloc[:, :-1]
    y = dataMat['isFraud']

    logging.warning(f"Shape of X: {X.shape} and Shape of y: {y.shape}")

    logging.warning("Splitting Dataset...")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logging.warning("Instantiating Model...")

    model = CatBoostClassifier(random_state=42, class_weights={0:1, 1:12}, silent=True)

    logging.warning("Fitting Model...")

    model.fit(X_train, y_train)
    y_pred_cat = model.predict(X_test)

    logging.warning("Saving Model...")

    #model_path = "model.pkl"
    blob = bucket.blob(f"{bucket_folder}/model.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(model, f)

    f.close()

    logging.warning("Saving Model Metrics...")

    metric_file_path = "performance.json"
    # with open(metric_file_path, "r") as f:
    #     data = json.load(f)

    model_metric = {
        "time_stamp": datetime.now().strftime("%d-%m-%Y_%H:%M:%S"),
        "confusion_matrix": confusion_matrix(y_test, y_pred_cat).tolist(),
        "precision": precision_score(y_test, y_pred_cat),
        "recall": recall_score(y_test, y_pred_cat),
        "f1_score": f1_score(y_test, y_pred_cat)
    }

    # data['model_metric'].append(model_metric)
    # with open(metric_file_path, "w") as f:
    #     json.dump(data, f, indent=4)

    logging.warning("Model Metrics Stored")

In [None]:
from kfp.dsl import pipeline

@pipeline(name="money_laundering_detection")
def pipeline(
        project_id: str,
        bucket_id: str,
        bucket_folder: str
):
    preproces_job_1 = data_preprocess_stage_1(raw_dataset=raw_data_full_path)
    preprocess_job_2 = data_preprocess_stage_2(input_dataset=preproces_job_1.outputs["output_dataset_two"])
    feature_selection_job = feature_selection(input_dataset=preprocess_job_2.outputs["output_dataset_three"])
    segment_generation_job = segment_generation(
        input_dataset_1=preproces_job_1.outputs["output_dataset_two"],
        input_dataset_2=preprocess_job_2.outputs["output_dataset_three"],
        input_dataset_3=feature_selection_job.outputs["output_dataset"]
    )
    training_job = training(
        project_id = project_id,
        bucket_id = bucket_id,
        bucket_folder = bucket_folder,
        input_dataset=segment_generation_job.outputs["final_dataset_output"])

In [None]:
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='money_laundering_detection.yaml'
)

In [None]:

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=project_id,
    location="us-central1",
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="money_laundering_detection",
    template_path="money_laundering_detection.yaml",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'bucket_id': bucket_id,
        'bucket_folder': bucket_folder_name
    }
)

job.submit()

## Testing the Model

In [None]:
import pickle
from google.cloud import storage

bucket = storage.Client(project="jesusarguelles-sandbox").bucket("jesusarguelles-datasets-public")

blob = bucket.blob("money_laundering_detection/model.pkl")

with blob.open("rb") as f:
    model = pickle.load(f)
f.close()

blob = bucket.blob("money_laundering_detection/label_encoder.pkl")

with blob.open("rb") as f:
    encoder = pickle.load(f)

In [None]:
predict = {
    "segment": [0],
    "step": [1],
    "trans_type": ["DEBIT"],
    "amount": [181.00],
    "nameOrig": ["C1900366749"],
    "oldbalanceOrg": [4465.0],
    "nameDest": ["C997608398"],
    "oldbalanceDest": ["10845.0"],
    "accountType": ["DOMESTIC"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [None]:
df[df.isFraud == 1]

In [None]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}

to_predict_df = pd.DataFrame(predict)
to_predict_df = df[df.isFraud == 1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [None]:
df[df.isFraud == 1].iloc[7,:]

In [None]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

model.predict(to_predict_df)

In [None]:
to_predict_df = df[df.isFraud == 1].iloc[:,:-1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']
for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
model.predict(to_predict_df)

In [None]:
to_predict_df.reset_index(inplace=True)

In [None]:
to_predict_df = df[df.isFraud == 1].iloc[:,:-1]
to_predict_df.reset_index(drop=True, inplace=True)

In [None]:
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']
print(to_predict_df.head(9))
for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])
print(model.predict(to_predict_df))

In [None]:
to_predict_df

In [None]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.fit_transform(to_predict_df[i])

In [None]:
to_predict_df 0     2   WIRE_OUT    18627.02  C1375503918       18627.02#%%
from kfp import compiler
import google.cloud.aiplatform as aip
from kfp.dsl import component, Output, Input, Dataset, pipeline

In [None]:
project_id = "jesusarguelles-sandbox"
pipeline_root_path = "gs://jesusarguelles-staging/"
bucket_id = "jesusarguelles-datasets-public"
bucket_folder_name = "money_laundering_detection"
raw_file_name = "paysim_dataset.csv"
raw_data_full_path = f"gs://{bucket_id}/{bucket_folder_name}/{raw_file_name}"
filtered_data_1 = "filtered_data_1.csv"
filtered_data_2 = "filtered_data_2.csv"
filtered_data_3 = "filtered_data_3.csv"

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["google-cloud-storage", "gcsfs", "pandas"]
)
def data_preprocess_stage_1(
        raw_dataset: str,
        output_dataset_one : Output[Dataset],
        output_dataset_two : Output[Dataset]
):
    import os
    import csv
    import logging
    import numpy as np
    import pandas as pd
    from random import randint


    logging.warning("DATA PREPROCESSING 1 STAGE")
    logging.warning("Reading Dataset...")

    X = pd.read_csv(raw_dataset)
    X = X.to_numpy()

    logging.warning("Read Dataset")

    nameOrigCol = 3
    nameDestCol = 6
    nameOrig = []
    nameDest = []
    nameCount = {}
    namesWithMoreThanOneOccurrence = []

    logging.warning("Checking Each Person's Transactions Count...")

    for name in X[:, nameOrigCol] :
        if nameCount.get(name, -1) == -1 :
            nameOrig.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    for name in X[:, nameDestCol] :
        if nameCount.get(name, -1) == -1 :
            nameDest.append(name)

            nameCount[name] = 1

        else :
            nameCount[name] += 1
            namesWithMoreThanOneOccurrence.append(name)

    logging.warning("Count Identification Done")

    logging.warning("Calculating Median ...")

    countArr = []
    count = 0
    for attr, value in nameCount.items() :
        if value > 40 :
            countArr.append(value)
            count += 1
    median = np.median(countArr)

    logging.warning(f"Median : {median}")

    logging.warning("Filtering Data Based on Transactions Count...")
    csv_golden_data = []

    for i in range(X.shape[0]) :
        if nameCount.get(X[i, 3], -1) > 40 or nameCount.get(X[i, 6], -1) > 40 :
            csv_golden_data.append(X[i, :])

    logging.warning("Filtering Done")

    logging.warning("Storing Filtered Data in data_processed folder...")

    new_file_name = "filtered_data.csv"

    with open(output_dataset_one.path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(csv_golden_data)

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 2 STAGE")

    logging.warning("Reading Preprocessed 1 dataset...")

    X = pd.DataFrame(csv_golden_data)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 1 dataset...")

    csv_dataset_primary = []
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 6
    oldbalanceDest = 7
    accountType = 8
    isFraud = 9
    isFlaggedFraud = 10

    logging.warning("Changing Labels of Type Column ...")

    transfer = ["WIRE_IN", "WIRE_OUT"]
    for i in range(X.shape[0]):
        arr = []
        arr.append(X[i,step])
        if X[i,trans_type] =="PAYMENT":
            arr.append("CREDIT")
        elif X[i,trans_type] =="TRANSFER":
            arr.append(transfer[randint(0,1)])
        else:
            arr.append(X[i,trans_type])
        arr.append(X[i,amount])
        arr.append(X[i,nameOrig])
        arr.append(X[i,oldbalanceOrg])
        arr.append(X[i,nameDest])
        arr.append(X[i,oldbalanceDest])
        if X[i,trans_type] == "TRANSFER":
            arr.append("FOREIGN")
        else:
            arr.append("DOMESTIC")

        arr.append(X[i,isFraud])
        arr.append(X[i,isFlaggedFraud])

        csv_dataset_primary.append(arr)

    logging.warning("Changing Labels Done")
    logging.warning("Storing Data in Data_processed Folder...")


    columns=['step','trans_type','amount','nameOrig','oldbalanceOrg',
             'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_primary = pd.DataFrame(csv_dataset_primary, columns=columns)

    data_primary.to_csv(output_dataset_two.path, index=False)

    logging.warning("Storing Data Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs"]
)
def data_preprocess_stage_2(
        input_dataset: Input[Dataset],
        output_dataset_three: Output[Dataset]
):
    import logging
    import pandas as pd
    # data_path = f'gs://{bucket_id}/{folder_id}/filtered_data_2.csv'

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("DATA PREPROCESSING 3 STAGE")

    logging.warning("Reading Preprocessed 2 dataset...")

    X = pd.read_csv(input_dataset.path)
    X = X.to_numpy()

    logging.warning("Read Preprocessed 2 dataset")

    #col
    step = 0
    trans_type = 1
    amount = 2
    nameOrig = 3
    oldbalanceOrg = 4
    nameDest = 5
    oldbalanceDest = 6
    accountType = 7
    isFraud = 8

    #col
    entity = 0
    incommingDomestic30 = 1
    incommingDomestic60 = 2
    incommingDomestic90 = 3
    outgoingDomestic30 = 4
    outgoingDomestic60 = 5
    outgoingDomestic90 = 6
    incommingForeign30 = 7
    incommingForeign60 = 8
    incommingForeign90 = 9
    outgoingForeign30 = 10
    outgoingForeign60 = 11
    outgoingForeign90 = 12
    incoming_domestic_count_30 = 13
    incoming_domestic_count_60 = 14
    incoming_domestic_count_90 = 15
    outgoing_domestic_count_30 = 16
    outgoing_domestic_count_60 = 17
    outgoing_domestic_count_90 = 18
    incoming_foreign_count_30 = 19
    incoming_foreign_count_60 = 20
    incoming_foreign_count_90 = 21
    outgoing_foreign_count_30 = 22
    outgoing_foreign_count_60 = 23
    outgoing_foreign_count_90 = 24
    balance_difference_30 = 25
    balance_difference_60 = 26
    balance_difference_90 = 27
    isFraudSec = 28

    csv_dataset_secondary = []
    entities_pos = {}
    enititesDict = {}

    logging.warning("Creating New Features Using Transaction History...")

    def getSecRow(entity):
        return [entity,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

    for i in range(X.shape[0]):
        source_entity = X[i,nameOrig]
        dest_entity = X[i,nameDest]

        source_pos = entities_pos.get(source_entity,-1)
        if source_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[source_entity] = pos
            source_pos = pos

            row = getSecRow(source_entity)

            csv_dataset_secondary.append(row)

        dest_pos = entities_pos.get(dest_entity,-1)
        if dest_pos == -1:
            pos = len(csv_dataset_secondary)
            entities_pos[dest_entity] = pos
            dest_pos = pos

            row = getSecRow(dest_entity)

            csv_dataset_secondary.append(row)

        transferAmountSource = 0
        transferAmountDest = 0

        if X[i,trans_type] == "CASH_IN" or X[i,trans_type] == "CREDIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingDomestic90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "CASH_OUT" or X[i,trans_type] == "DEBIT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingDomestic30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingDomestic60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingDomestic90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingDomestic90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if X[i,trans_type] == "WIRE_IN":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][incommingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][incommingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][outgoingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][incommingForeign90] += X[i,amount]
                # print(dest_pos,outgoingForeign90,i,amount)
                csv_dataset_secondary[dest_pos][outgoingForeign90] += X[i,amount]

            transferAmountSource = X[i,amount]
            transferAmountDest = -1*X[i,amount]

        if X[i,trans_type] == "WIRE_OUT":
            if X[i,step] <=30:
                csv_dataset_secondary[source_pos][outgoingForeign30] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign30] += X[i,amount]
            if X[i,step] <=60:
                csv_dataset_secondary[source_pos][outgoingForeign60] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign60] += X[i,amount]
            if X[i,step] <=90:
                csv_dataset_secondary[source_pos][outgoingForeign90] += X[i,amount]
                csv_dataset_secondary[dest_pos][incommingForeign90] += X[i,amount]

            transferAmountSource = -1*X[i,amount]
            transferAmountDest = X[i,amount]

        if enititesDict.get(source_entity,-1) == -1:
            enititesDict[source_entity] = {
                'day1Bal': X[i,oldbalanceOrg],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        if enititesDict.get(dest_entity,-1) == -1:
            enititesDict[dest_entity] = {
                'day1Bal': X[i,oldbalanceDest],
                'day30Bal': 0,
                'day60Bal': 0,
                'day90Bal': 0,
                'countIncomingDomestic30': 0,
                'countOutgoingDomestic30': 0,
                'countIncomingDomestic60': 0,
                'countOutgoingDomestic60': 0,
                'countIncomingDomestic90': 0,
                'countOutgoingDomestic90': 0,
                'countIncomingForeign30': 0,
                'countOutgoingForeign30': 0,
                'countIncomingForeign60': 0,
                'countOutgoingForeign60': 0,
                'countIncomingForeign90': 0,
                'countOutgoingForeign90': 0
            }

        incomingForSource = ["CASH_IN","CREDIT","WIRE_IN"]
        incomingForDest = ["CASH_OUT","DEBIT","WIRE_OUT"]
        outgoingForDest = incomingForSource
        outgoingForSource = incomingForDest

        if X[i,step]<=30:
            enititesDict[source_entity]['day30Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day30Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic30'] += 1


            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign30'] += 1
                else:

                    enititesDict[dest_entity]['countOutgoingForeign30'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic30'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic30'] += 1

        if X[i,step]<=60:
            enititesDict[source_entity]['day60Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day60Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic60'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign60'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic60'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic60'] += 1

        if X[i,step]<=90:
            enititesDict[source_entity]['day90Bal'] = transferAmountSource+X[i,oldbalanceOrg]
            enititesDict[dest_entity]['day90Bal'] = transferAmountDest+X[i,oldbalanceDest]
            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForSource:
                    enititesDict[source_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[source_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[source_entity]['countOutgoingDomestic90'] += 1

            if X[i,accountType] == "FOREIGN":
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingForeign90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingForeign90'] += 1
            else:
                if X[i,trans_type] in incomingForDest:
                    enititesDict[dest_entity]['countIncomingDomestic90'] += 1
                else:
                    enititesDict[dest_entity]['countOutgoingDomestic90'] += 1


        csv_dataset_secondary[source_pos][balance_difference_30] = enititesDict[source_entity]['day30Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_60] = enititesDict[source_entity]['day60Bal'] - enititesDict[source_entity]['day1Bal']
        csv_dataset_secondary[source_pos][balance_difference_90] = enititesDict[source_entity]['day90Bal'] - enititesDict[source_entity]['day1Bal']



        csv_dataset_secondary[source_pos][incoming_domestic_count_30] = enititesDict[source_entity]['countIncomingDomestic30']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_30] = enititesDict[source_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[source_pos][incoming_domestic_count_60] = enititesDict[source_entity]['countIncomingDomestic60']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_60] = enititesDict[source_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[source_pos][incoming_domestic_count_90] = enititesDict[source_entity]['countIncomingDomestic90']
        csv_dataset_secondary[source_pos][outgoing_domestic_count_90] = enititesDict[source_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[source_pos][incoming_foreign_count_30] = enititesDict[source_entity]['countIncomingForeign30']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_30] = enititesDict[source_entity]['countOutgoingForeign30']
        csv_dataset_secondary[source_pos][incoming_foreign_count_60] = enititesDict[source_entity]['countIncomingForeign60']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_60] = enititesDict[source_entity]['countOutgoingForeign60']
        csv_dataset_secondary[source_pos][incoming_foreign_count_90] = enititesDict[source_entity]['countIncomingForeign90']
        csv_dataset_secondary[source_pos][outgoing_foreign_count_90] = enititesDict[source_entity]['countOutgoingForeign90']

        csv_dataset_secondary[source_pos][isFraudSec] = csv_dataset_secondary[source_pos][isFraudSec] or X[i,isFraud]

        csv_dataset_secondary[dest_pos][incoming_domestic_count_30] = enititesDict[dest_entity]['countIncomingDomestic30']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_30] = enititesDict[dest_entity]['countOutgoingDomestic30']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_60] = enititesDict[dest_entity]['countIncomingDomestic60']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_60] = enititesDict[dest_entity]['countOutgoingDomestic60']
        csv_dataset_secondary[dest_pos][incoming_domestic_count_90] = enititesDict[dest_entity]['countIncomingDomestic90']
        csv_dataset_secondary[dest_pos][outgoing_domestic_count_90] = enititesDict[dest_entity]['countOutgoingDomestic90']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_30] = enititesDict[dest_entity]['countIncomingForeign30']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_30] = enititesDict[dest_entity]['countOutgoingForeign30']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_60] = enititesDict[dest_entity]['countIncomingForeign60']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_60] = enititesDict[dest_entity]['countOutgoingForeign60']
        csv_dataset_secondary[dest_pos][incoming_foreign_count_90] = enititesDict[dest_entity]['countIncomingForeign90']
        csv_dataset_secondary[dest_pos][outgoing_foreign_count_90] = enititesDict[dest_entity]['countOutgoingForeign90']


    columns = ['entity','incoming_domestic_amount_30','incoming_domestic_amount_60','incoming_domestic_amount_90',
               'outgoing_domestic_amount_30','outgoing_domestic_amount_60','outgoing_domestic_amount_90',
               'incoming_foreign_amount_30','incoming_foreign_amount_60','incoming_foreign_amount_90',
               'outgoing_foreign_amount_30','outgoing_foreign_amount_60','outgoing_foreign_amount_90',
               'incoming_domestic_count_30','incoming_domestic_count_60','incoming_domestic_count_90',
               'outgoing_domestic_count_30','outgoing_domestic_count_60','outgoing_domestic_count_90',
               'incoming_foreign_count_30','incoming_foreign_count_60','incoming_foreign_count_90',
               'outgoing_foreign_count_30','outgoing_foreign_count_60','outgoing_foreign_count_90',
               'balance_difference_30','balance_difference_60','balance_difference_90','isFraud']

    logging.warning("Creating New Features Done")

    logging.warning("Storing Data in Data_processed Folder...")

    # filtered_data_3.csv
    data_secondary = pd.DataFrame(csv_dataset_secondary, columns=columns)
    data_secondary.to_csv(output_dataset_three.path,index=False)
    logging.warning("Storing Data Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def feature_selection(
        input_dataset : Input[Dataset],
        output_dataset : Output[Dataset]
):
    """

    :param input_dataset_1: filtered_data_3.csv from data-preprocess-stage-2
    :param output_dataset: feature_importances.csv
    """
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier


    logging.warning("----------")
    logging.warning("FEATURE SELECTION STAGE")

    logging.warning("Reading Filtered Data 3 ...")

    dataframeX = pd.read_csv(input_dataset.path)
    col_names = list(dataframeX.columns.values)
    dataMat = dataframeX.to_numpy()

    logging.warning("Read Filtered Data 3")

    logging.warning("Creating X and Y Variables...")

    X = dataMat[:,1:-2]
    Y = dataMat[:,-1]

    logging.warning(f"Shape of X: {X.shape} and Shape of Y: {Y.shape}")

    logging.warning("Instiantiating Random Forest Model...")

    model = RandomForestClassifier(random_state=42)

    logging.warning("Fitting Data...")

    model.fit(X, Y.astype(int))

    logging.warning("Checking Feature Importances...")

    feature_imp = model.feature_importances_

    sorted_feature_vals = np.sort(feature_imp)
    sorted_feature_indexes = np.argsort(feature_imp)

    logging.warning("Significant Features in decreasing order of importance: ")

    logging.warning("Storing Feature Importances in reports...")

    fea_imp = [[col_names[i+2], feature_imp[i]] for i in reversed(sorted_feature_indexes)]
    features = pd.DataFrame(fea_imp, columns=["features", "importance_score"])
    # feature_importances.csv
    features.to_csv(output_dataset.path, index=False)

    logging.warning("Storing Feature Importances Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn"]
)
def segment_generation(
        input_dataset_1 : Input[Dataset],
        input_dataset_2 : Input[Dataset],
        input_dataset_3 : Input[Dataset],
        silhoutte_scores : Output[Dataset],
        final_dataset_output: Output[Dataset]
):
    """

    :param input_dataset_1:  filtered_data_2.csv from data-preprocess-stage-1
    :param input_dataset_2:  filtered_data_3.csv from data-preprocess-stage-2
    :param input_dataset_3:  feature_importances.csv from features_election
    :param silhoutte_scores: Metrics
    :param final_dataset_output: final_dataset
    :return:
    """
    # Segment Generation
    import logging
    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    logging.basicConfig(filename='logs/model_development.txt',
                        filemode='a',
                        format='%(asctime)s %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    logging.warning("----------")
    logging.warning("SEGMENT GENERATOR STAGE")

    def getClusterPredictions(data, true_k):
        model = KMeans(n_clusters=true_k)
        model.fit(data)
        prediction = model.predict(data)

        return prediction

    def getBestCluster(X,_min=2,_max=10):
        selected_cluster = 0
        previous_sil_coeff = 0.001 #some random small number not 0
        sc_vals = []
        for n_cluster in range(_min, _max):
            kmeans = KMeans(n_clusters=n_cluster).fit(X)
            label = kmeans.labels_

            sil_coeff = silhouette_score(X, label, metric='euclidean', sample_size=1000)
            sc_vals.append(sil_coeff)
            # print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

            percent_change = (sil_coeff-previous_sil_coeff)*100/previous_sil_coeff

            # return when below a threshold of 1%
            if percent_change<1:
                selected_cluster = n_cluster-1

            previous_sil_coeff = sil_coeff

        return selected_cluster or _max, sc_vals

    logging.warning("Reading Filtered Data 3 ...")

    X_dataframe = pd.read_csv(input_dataset_2.path)
    X = X_dataframe.to_numpy()

    logging.warning("Read Filtered Data 3")

    col_names = list(X_dataframe.columns.values)

    X_trimmed_features = np.zeros((X.shape[0],1))

    logging.warning("Importing Feature Importances...")

    #feature_path = f'gs://{bucket_id}/{folder_id}/feature_importances.csv'
    features = pd.read_csv(input_dataset_3.path)

    logging.warning("Selecting Top 13 Features for CLustering...")

    top_13 = features.iloc[:13, 0].tolist()

    logging.warning("Top 13 Features stored in List")

    for feature in top_13:
        X_trimmed_features = np.concatenate((X_trimmed_features,np.expand_dims(X_dataframe[feature],axis=1)),axis=1)
    X_trimmed_features = X_trimmed_features[:,1:]

    logging.warning("Choosing Best Number Of Clusters...")

    min_value = 2
    max_value = 10
    true_k, sc_vals = getBestCluster(X_trimmed_features,_min=min_value,_max=max_value)
    true_k = 5

    logging.warning("Storing Silhoutte Scores...")


    sil_score = [[i, sc_vals[i-min_value]] for i in range(min_value, max_value)]
    sil = pd.DataFrame(sil_score, columns=["no_of_clusters", "silhoutte_score"])
    sil.to_csv(silhoutte_scores.path, index=False)
    #sil.to_csv(f'gs://{bucket_id}/{folder_id}/silhoutte_scores.csv', index=False)

    logging.warning("Storing Silhoutte Scores Done")

    logging.warning("Creating Clusters with Best No Of Clusters...")

    prediction = getClusterPredictions(X_trimmed_features, true_k)
    seg_dict = {}
    for i in range(X.shape[0]):
        seg_dict[X[i,0]] = prediction[i]

    logging.warning("Inputing Filtered Data 2 Dataset...")

    X_dataframe_pri = pd.read_csv(input_dataset_1.path)
    X_pri = X_dataframe_pri.to_numpy()
    col_names = list(X_dataframe_pri.columns.values)

    logging.warning("Read Filtered 2 Data")

    logging.warning("Creating Final Dataset with segments...")

    X_with_segments = []
    for i in range(X_pri.shape[0]):
        X_with_segments.append(np.concatenate(([[seg_dict[X_pri[i,3]]]],np.expand_dims(X_pri[i,:],axis=0)),axis=1)[0])

    segmented_columns = ['segment','step','trans_type','amount','nameOrig','oldbalanceOrg',
                         'nameDest','oldbalanceDest','accountType','isFraud','isFlaggedFraud']

    data_segmented = pd.DataFrame(X_with_segments, columns = segmented_columns)
    data_segmented = data_segmented.drop('isFlaggedFraud', axis=1)
    data_segmented.to_csv(final_dataset_output.path, index=False)

    logging.warning("Storing Final Dataset Done")

In [None]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "gcsfs", "scikit-learn", "catboost"]
)
def training(
        project_id: str,
        bucket_id: str,
        bucket_folder: str,
        input_dataset : Input[Dataset],
        model_path: Output[Dataset]
):
    """

    :param input_dataset: fina_dataset.csv from segment-generation
    """
    import json
    import pickle
    import logging
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from catboost import CatBoostClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

    logging.warning("----------")
    logging.warning("MODEL CREATION STAGE")

    logging.warning("Reading Final Dataset...")

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_id)

    dataMat = pd.read_csv(input_dataset.path)
    data = dataMat.to_numpy()

    logging.warning("Read Final Dataset")

    logging.warning("Checking Categorical Features...")

    cat_feat = [i for i in dataMat.columns if dataMat[i].dtypes == 'O']

    logging.warning("Checking Missing Values...")

    a = dict(dataMat.isnull().sum())
    b = [[i, a[i]] for i in a.keys()]
    missing = pd.DataFrame(b, columns=['features', 'null_values_count'])

    logging.warning("Storing Missing Values...")

    missing.to_csv("missing_values.csv", index=False)

    logging.warning("Storing Missing Values Done")

    logging.warning("Encoding Categorical Features...")

    encoder = LabelEncoder()
    for i in cat_feat:
        dataMat[i] = encoder.fit_transform(dataMat[i])

    blob = bucket.blob(f"{bucket_folder}/label_encoder.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(encoder, f)

    logging.warning("Features Encoding Done")

    logging.warning("Creating X and y variables ...")

    X = dataMat.iloc[:, :-1]
    y = dataMat['isFraud']

    logging.warning(f"Shape of X: {X.shape} and Shape of y: {y.shape}")

    logging.warning("Splitting Dataset...")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logging.warning("Instantiating Model...")

    model = CatBoostClassifier(random_state=42, class_weights={0:1, 1:12}, silent=True)

    logging.warning("Fitting Model...")

    model.fit(X_train, y_train)
    y_pred_cat = model.predict(X_test)

    logging.warning("Saving Model...")

    #model_path = "model.pkl"
    blob = bucket.blob(f"{bucket_folder}/model.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(model, f)

    f.close()

    logging.warning("Saving Model Metrics...")

    metric_file_path = "performance.json"
    # with open(metric_file_path, "r") as f:
    #     data = json.load(f)

    model_metric = {
        "time_stamp": datetime.now().strftime("%d-%m-%Y_%H:%M:%S"),
        "confusion_matrix": confusion_matrix(y_test, y_pred_cat).tolist(),
        "precision": precision_score(y_test, y_pred_cat),
        "recall": recall_score(y_test, y_pred_cat),
        "f1_score": f1_score(y_test, y_pred_cat)
    }

    # data['model_metric'].append(model_metric)
    # with open(metric_file_path, "w") as f:
    #     json.dump(data, f, indent=4)

    logging.warning("Model Metrics Stored")

In [None]:
from kfp.dsl import pipeline

@pipeline(name="money_laundering_detection")
def pipeline(
        project_id: str,
        bucket_id: str,
        bucket_folder: str
):
    preproces_job_1 = data_preprocess_stage_1(raw_dataset=raw_data_full_path)
    preprocess_job_2 = data_preprocess_stage_2(input_dataset=preproces_job_1.outputs["output_dataset_two"])
    feature_selection_job = feature_selection(input_dataset=preprocess_job_2.outputs["output_dataset_three"])
    segment_generation_job = segment_generation(
        input_dataset_1=preproces_job_1.outputs["output_dataset_two"],
        input_dataset_2=preprocess_job_2.outputs["output_dataset_three"],
        input_dataset_3=feature_selection_job.outputs["output_dataset"]
    )
    training_job = training(
        project_id = project_id,
        bucket_id = bucket_id,
        bucket_folder = bucket_folder,
        input_dataset=segment_generation_job.outputs["final_dataset_output"])

In [None]:
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='money_laundering_detection.yaml'
)

In [None]:

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=project_id,
    location="us-central1",
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="money_laundering_detection",
    template_path="money_laundering_detection.yaml",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'bucket_id': bucket_id,
        'bucket_folder': bucket_folder_name
    }
)

job.submit()

## Testing the Model

In [190]:
import pickle
from google.cloud import storage

bucket = storage.Client(project="jesusarguelles-sandbox").bucket("jesusarguelles-datasets-public")

blob = bucket.blob("money_laundering_detection/model.pkl")

with blob.open("rb") as f:
    model = pickle.load(f)
f.close()

blob = bucket.blob("money_laundering_detection/label_encoder.pkl")

with blob.open("rb") as f:
    encoder = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
predict = {
    "segment": [0],
    "step": [1],
    "trans_type": ["DEBIT"],
    "amount": [181.00],
    "nameOrig": ["C1900366749"],
    "oldbalanceOrg": [4465.0],
    "nameDest": ["C997608398"],
    "oldbalanceDest": ["10845.0"],
    "accountType": ["DOMESTIC"]
}
to_predict_df = pd.DataFrame(predict)
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.transform(to_predict_df[i])

model.predict(to_predict_df)

In [20]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": ["0.0"],
    "accountType": ["FOREIGN"]
}

to_predict_df = pd.DataFrame(predict)
#to_predict_df = df[df.isFraud == 1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = encoder.transform(to_predict_df[i])

model.predict(to_predict_df)

{'trans_type': {'CASH_IN': 0,
  'CASH_OUT': 1,
  'CREDIT': 2,
  'DEBIT': 3,
  'WIRE_IN': 4,
  'WIRE_OUT': 5},
 'nameOrig': {'C1000008975': 0,
  'C1000012640': 1,
  'C1000028246': 2,
  'C1000044196': 3,
  'C1000053329': 4,
  'C1000073191': 5,
  'C1000078727': 6,
  'C1000079132': 7,
  'C1000093176': 8,
  'C1000094896': 9,
  'C1000097327': 10,
  'C1000103904': 11,
  'C1000109302': 12,
  'C1000111897': 13,
  'C1000119553': 14,
  'C1000119807': 15,
  'C1000121623': 16,
  'C1000123015': 17,
  'C1000148923': 18,
  'C1000160958': 19,
  'C1000169021': 20,
  'C10001825': 21,
  'C1000185314': 22,
  'C1000198697': 23,
  'C1000202702': 24,
  'C1000222753': 25,
  'C100022590': 26,
  'C1000229380': 27,
  'C1000231597': 28,
  'C1000232082': 29,
  'C1000236204': 30,
  'C1000264406': 31,
  'C1000264972': 32,
  'C1000275423': 33,
  'C100028125': 34,
  'C1000286893': 35,
  'C1000292364': 36,
  'C1000340261': 37,
  'C1000372884': 38,
  'C1000398935': 39,
  'C1000411139': 40,
  'C1000413708': 41,
  'C100042

In [7]:
pre_df = df[df.isFraud == 1]
#to_predict_df = df[df.isFraud == 1]
cat_feat = [i for i in pre_df.columns if pre_df[i].dtypes == 'O']

for i in cat_feat:
    pre_df[i] = encoder.transform(pre_df[i])

model.predict(pre_df)

array([1])

In [4]:
import json
import pickle
import logging
import pandas as pd
from datetime import datetime
from google.cloud import storage
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

logging.warning("----------")
logging.warning("MODEL CREATION STAGE")

logging.warning("Reading Final Dataset...")

storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_id)

dataMat = pd.read_csv("gs://jesusarguelles-datasets-public/money_laundering_detection/final_data.csv")
data = dataMat.to_numpy()

logging.warning("Read Final Dataset")

logging.warning("Checking Categorical Features...")

cat_feat = [i for i in dataMat.columns if dataMat[i].dtypes == 'O']

logging.warning("Checking Missing Values...")

a = dict(dataMat.isnull().sum())
b = [[i, a[i]] for i in a.keys()]
missing = pd.DataFrame(b, columns=['features', 'null_values_count'])

logging.warning("Storing Missing Values...")

missing.to_csv("missing_values.csv", index=False)

logging.warning("Storing Missing Values Done")

logging.warning("Encoding Categorical Features...")

encoder = LabelEncoder()
print("------")
print(cat_feat)
print("------")

label_encoders = {}
label_mappings = {}

for i in cat_feat:
    encoder.fit(dataMat[i])
    dataMat[i] = encoder.transform(dataMat[i])

    label_encoders[i] = encoder
    label_mappings[i] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

logging.warning("Features Encoding Done")

logging.warning("Creating X and y variables ...")

X = dataMat.iloc[:, :-1]
y = dataMat['isFraud']

logging.warning(f"Shape of X: {X.shape} and Shape of y: {y.shape}")

logging.warning("Splitting Dataset...")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

logging.warning("Instantiating Model...")

model = CatBoostClassifier(random_state=42, class_weights={0:1, 1:12}, silent=True)

logging.warning("Fitting Model...")

model.fit(X_train, y_train)
y_pred_cat = model.predict(X_test)


logging.warning("Model Metrics Stored")

array([1])

In [6]:
predict = {
    "segment": [0],
    "step": [2],
    "trans_type": ["WIRE_OUT"],
    "amount": [18627.02],
    "nameOrig": ["C1375503918"],
    "oldbalanceOrg": [18627.02],
    "nameDest": ["C234430897"],
    "oldbalanceDest": [0.0],
    "accountType": ["FOREIGN"]
}

to_predict_df = pd.DataFrame(predict)
#to_predict_df = df[df.isFraud == 1]
cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

for i in cat_feat:
    to_predict_df[i] = to_predict_df[i].map(label_mappings[i])
    #to_predict_df[i] = encoder.transform(to_predict_df[i])

model.predict(to_predict_df)

array([1])