# TPOT - Credit Card Fraud

> ### Install TPOT

In [None]:
# Install the TPOT AutoML library from PyPI into the notebook environment.
# NOTE(review): no version is pinned. The cells below rely on `verbosity`,
# `use_dask`, `export` and `evaluated_individuals_` - confirm the installed
# TPOT release still provides them.
!pip install tpot

> ### Import Libraries

In [None]:
# import libraries
import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from dask.distributed import Client, LocalCluster

> ### Import Dataset

In [None]:
# Upload the dataset from your local drive. When prompted, select:
#   ~/Workshop--Data-Integration/Labs/Module 6 - Machine-Learning/01 Credit Card/AutoML/data/TPOT.csv
from google.colab import files

uploaded = files.upload()

# Report the name and size of every file that was uploaded.
upload_message = 'User uploaded file "{name}" with length {length} bytes'
for filename, content in uploaded.items():
    print(upload_message.format(name=filename, length=len(content)))

In [None]:
# Load the TPOT.csv dataset. The file is semicolon-delimited and is read
# without a header row (header=None).
dataset = pd.read_csv('TPOT.csv', sep=';', header=None)

# Features are every column except the last; the target is the last column.
# The target is selected with -1 rather than the fixed position 8 so that both
# slices stay consistent with each other if the column count ever changes.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
# backup option: pulls the dataset from the GitHub repository.
# Uses the raw-content URL; the repository's "blob" page URL returns HTML, not CSV.
# url = 'https://raw.githubusercontent.com/jporeilly/How-To--Machine-Learning/main/data/TPOT.csv'
# dataset = pd.read_csv(url, sep= ';', header=None)
# x = dataset.iloc[:, :-1].values
# y = dataset.iloc[:, 8].values

> #### Data Exploration

In [None]:
# Preview the first rows of the raw dataset. The next two cells print the
# independent variables (x) and the dependent variable (y).
dataset.head()

In [None]:
# Show the feature matrix (every column except the last).
print(x)

In [None]:
# Show the target vector.
print(y)

> ### Add Column Headers

In [None]:
# Name the nine columns: eight predictor features followed by the
# historic fraud flag, which is the column used as the target.
column_names = [
    'first_time_customer',
    'order_dollar_amount',
    'num_items',
    'age',
    'web_order',
    'total_transactions_to_date',
    'hour_of_day',
    'billing_shipping_zip_equal',
    'reported_as_fraud_historic',
]
dataset.columns = column_names

> #### Check Dataset

In [None]:
# Display the first rows again to verify the column headers were applied.
dataset.head()

> ### Convert Dataset to NumPy Arrays and Scale Features (required: defines `X` and `y` for the split below)

In [None]:
# Build the model inputs: rescale every feature column with MinMaxScaler and
# take the last column as the target.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows influence the scaling - confirm this is acceptable.
min_max_scaler = preprocessing.MinMaxScaler()
x = dataset.iloc[:, :-1].values
x_scaled = min_max_scaler.fit_transform(x)

# X (scaled features) and y (target) are the arrays consumed by the split cell.
X = np.asarray(x_scaled)
y = np.asarray(dataset.iloc[:, -1])

> #### Check Dataset

In [None]:
# Re-display the first rows of the DataFrame after the conversion step.
dataset.head()

> ### Splitting the Dataset: Train / Test

In [None]:
# Hold out 75% of the rows as the test set; the remaining 25% train the model.
# random_state fixes the shuffle so the split is reproducible.
# NOTE(review): this is an unusually large hold-out - confirm it is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.75,
    random_state=42,
)

> ### TPOT Classifier - Option 1: With Dask (Recommended for Large Datasets)

In [None]:
# TPOT Classifier with Dask distributed computing.
# Set up a local Dask cluster, run the TPOT search on it, and always shut the
# cluster down afterwards.
cluster = LocalCluster(
    n_workers=2,           # Number of worker processes
    threads_per_worker=2,  # Number of threads per worker
    memory_limit='2GB'     # Memory limit per worker
)

try:
    # The client is opened inside the try block as a context manager: it is
    # closed before the cluster, and the cluster is still shut down if the
    # client itself fails to start (previously that case leaked the workers).
    with Client(cluster) as client:
        # Initialize TPOT with Dask support
        tpot = TPOTClassifier(
            generations=5,          # Number of iterations to run
            population_size=20,     # Number of models to evaluate per generation
            use_dask=True,          # Enable Dask distributed computing
            verbosity=2,            # Display progress information
            random_state=42,        # For reproducibility
            cv=5,                   # Cross-validation folds
            n_jobs=1                # Jobs per worker (use 1 with Dask)
        )

        # Fit the model
        print("Training TPOT model...")
        tpot.fit(X_train, y_train)

        # Evaluate on test set
        test_score = tpot.score(X_test, y_test)
        print(f"\nTest Set Accuracy: {test_score:.4f}")
        print(f"\nBest Pipeline:\n{tpot.fitted_pipeline_}")

finally:
    # Clean up Dask resources (the client has already been closed by `with`)
    cluster.close()
    print("\nDask cluster closed.")

> ### TPOT Classifier - Option 2: Without Dask (Simpler, Good for Smaller Datasets)

In [None]:
# TPOT Classifier without Dask - simpler approach.
# Everything runs in the local process pool, which suits single machines and
# smaller datasets.

# Search settings, gathered in one place so they are easy to tweak.
search_settings = dict(
    generations=5,          # Number of iterations to run
    population_size=20,     # Number of models to evaluate per generation
    cv=5,                   # Cross-validation folds
    n_jobs=-1,              # Use all available CPU cores
    random_state=42,        # For reproducibility
    verbosity=2,            # Display progress information
)
tpot_simple = TPOTClassifier(**search_settings)

# Run the pipeline search on the training split.
print("Training TPOT model...")
tpot_simple.fit(X_train, y_train)

# Score the best pipeline on the held-out split and report it.
test_score = tpot_simple.score(X_test, y_test)
print(f"\nTest Set Accuracy: {test_score:.4f}")
print(f"\nBest Pipeline:\n{tpot_simple.fitted_pipeline_}")

> #### Export Pipeline as Python script

In [None]:
# Export the best pipeline as a Python script and download it from Colab.
from google.colab import files

# Use the Option 1 (Dask) model when it exists, otherwise fall back to the
# Option 2 model, so this cell works whichever option was run. If neither
# option was run, the NameError for `tpot_simple` is raised as before.
try:
    best_model = tpot
except NameError:
    best_model = tpot_simple

pipeline_script = 'tpot_exported_credit_card_pipeline.py'
best_model.export(pipeline_script)
files.download(pipeline_script)

> #### TPOT Evaluated Pipelines

In [None]:
# Print every pipeline TPOT evaluated during the search.
# Use the Option 1 (Dask) model when it exists, otherwise fall back to the
# Option 2 model, so this cell works whichever option was run. If neither
# option was run, the NameError for `tpot_simple` is raised as before.
try:
    best_model = tpot
except NameError:
    best_model = tpot_simple

print(best_model.evaluated_individuals_)