# TPOT - Credit Card Fraud

> ### Install TPOT

In [1]:
# Installs TPOT libraries.
!pip install tpot



> ### Import Libraries


In [18]:
# import libraries
import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from dask.distributed import Client, LocalCluster

> ### Import Dataset

In [3]:
# Open the Colab upload dialog and pick the dataset from your local drive:
#   ~/Workshop--Data-Integration/Labs/Module 6 - Machine-Learning/01 Credit Card/AutoML/data/TPOT.csv
from google.colab import files

uploaded = files.upload()
# Report the name and length of every uploaded file.
for fn, payload in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload))
    print(message)

Saving TPOT.csv to TPOT.csv
User uploaded file "TPOT.csv" with length 266498 bytes


In [4]:
# Load the TPOT.csv dataset. The file is ';'-separated and is read with header=None,
# so pandas labels the columns by integer position.
dataset = pd.read_csv('TPOT.csv', sep=';', header=None)
# Features: every column except the last one.
x = dataset.iloc[:, :-1].values
# Target: the last column. It is selected relative to the end (not by the hard-coded
# position 8) so that x and y always partition the columns between them.
y = dataset.iloc[:, -1].values

In [5]:
# backup option pulls dataset from GitHub repository (disabled; uncomment to use instead of the upload).
# NOTE(review): this is a GitHub "blob" page URL, which presumably serves an HTML page rather than
# the CSV content - confirm, and switch to the raw file URL (e.g. append '?raw=true') before enabling.
# url = 'https://github.com/jporeilly/How-To--Machine-Learning/blob/main/data/TPOT.csv'
# dataset = pd.read_csv(url, sep= ';', header=None)
# x = dataset.iloc[:, :-1].values
# y = dataset.iloc[:, 8].values

> #### Data Exploration

In [6]:
# preview the first rows of the raw dataset; the columns are still labelled by integer position.
# (the feature array x and target array y are printed in the following cells)
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,964.7,19,104.7,1,40,7,0,0
1,0,524.1,15,24.2,0,55,4,1,0
2,0,234.2,18,13.8,1,66,15,0,0
3,1,207.7,8,56.3,0,45,23,1,0
4,1,910.6,11,59.1,0,31,23,1,1


In [7]:
# show the feature array (every column except the last)
print(x)

[[  0.  964.7  19.  ...  40.    7.    0. ]
 [  0.  524.1  15.  ...  55.    4.    1. ]
 [  0.  234.2  18.  ...  66.   15.    0. ]
 ...
 [  0.    4.9   6.  ...   5.   12.    0. ]
 [  0.   30.1  20.  ...  21.    7.    1. ]
 [  1.   81.6   1.  ...  50.   21.    1. ]]


In [8]:
# show the target array (the last column)
print(y)

[0 0 0 ... 0 1 0]


> ### Add Column Headers

In [9]:
# Replace the positional integer labels with descriptive column names.
# The order must match the column order of the file; the last name labels the target column.
column_names = [
    'first_time_customer',
    'order_dollar_amount',
    'num_items',
    'age',
    'web_order',
    'total_transactions_to_date',
    'hour_of_day',
    'billing_shipping_zip_equal',
    'reported_as_fraud_historic',
]
dataset.columns = column_names

> #### Check Dataset

In [10]:
# check column headers: preview the first rows to confirm the new names were applied
dataset.head()

Unnamed: 0,first_time_customer,order_dollar_amount,num_items,age,web_order,total_transactions_to_date,hour_of_day,billing_shipping_zip_equal,reported_as_fraud_historic
0,0,964.7,19,104.7,1,40,7,0,0
1,0,524.1,15,24.2,0,55,4,1,0
2,0,234.2,18,13.8,1,66,15,0,0
3,1,207.7,8,56.3,0,45,23,1,0
4,1,910.6,11,59.1,0,31,23,1,1


> ### Convert Dataset to NumPy Arrays and Scale Features (optional)

In [11]:
# Build the model inputs as NumPy arrays.
# Features are all columns except the last, rescaled with min-max scaling.
# NOTE(review): the scaler is fit on every row before the train/test split below,
# so test rows influence the scaling parameters - consider fitting on the training rows only.
x = dataset.iloc[:, :-1].to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
X = np.asarray(x_scaled)
# Target is the last column, left unscaled.
y = dataset.iloc[:, -1].to_numpy()

> #### Check Dataset

In [12]:
# preview the DataFrame again; the scaling step wrote its result to new arrays (x_scaled / X),
# so `dataset` itself still shows the original, unscaled values
dataset.head()

Unnamed: 0,first_time_customer,order_dollar_amount,num_items,age,web_order,total_transactions_to_date,hour_of_day,billing_shipping_zip_equal,reported_as_fraud_historic
0,0,964.7,19,104.7,1,40,7,0,0
1,0,524.1,15,24.2,0,55,4,1,0
2,0,234.2,18,13.8,1,66,15,0,0
3,1,207.7,8,56.3,0,45,23,1,0
4,1,910.6,11,59.1,0,31,23,1,1


> ### Splitting the Dataset: Train / Test

In [13]:
# Split the dataset: 75% of the rows are held out for testing, the remaining 25% are used for training.
# A fixed random_state makes the split reproducible across runs, and stratifying on y
# keeps the class proportions of the target the same in both partitions.
# NOTE(review): a 75% test share is unusually large - confirm it is intended rather than train_size=0.75.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.75, random_state=42, stratify=y)

> ### TPOT Classifier

In [None]:
# TPOT Classifier
# Set up a local Dask cluster and client. Both are opened as context managers so they are
# closed on exit even if a later step raises (including a failure while creating the client).
# Using 1 worker with 1 thread to avoid resource exhaustion.
# NOTE(review): the client is created but never handed to TPOTClassifier; newer TPOT releases
# appear to accept a `client=` argument - confirm against the installed version.
with LocalCluster(n_workers=1, threads_per_worker=1) as cluster, Client(cluster) as client:
    # Set n_jobs to 1 to run TPOT in a single process, avoiding Dask worker resource issues.
    # Reduced population_size to reduce memory consumption.
    # random_state is fixed so the search is repeatable between runs.
    tpot = TPOTClassifier(generations=1, population_size=20, n_jobs=1, random_state=42)
    tpot.fit(X_train, y_train)
    # Score on the held-out rows; kept as a string in output_score, as before.
    output_score = str(tpot.score(X_test, y_test))
    print(tpot.fitted_pipeline_)
    # Display the score, which was previously computed but never shown.
    print("Test score:", output_score)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37897 instead
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:46659
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:37897/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37823'
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:34099'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:41961'. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:43013'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:40205 name: 0
INFO:distributed.scheduler:Starting worker 

> #### Export Pipeline as Python script

In [None]:
# Export the fitted TPOT object's pipeline to a Python script, then download that
# script from the Colab runtime to the local machine.
export_path = 'tpot_exported_credit_card_pipeline.py'
tpot.export(export_path)
from google.colab import files
files.download(export_path)

> #### TPOT Evaluated Pipelines

In [None]:
# print TPOT's `evaluated_individuals_` attribute (per the original note, one entry per pipeline).
# Note: print() emits the object's plain-text representation, not serialized JSON.
print(tpot.evaluated_individuals_)