# TPOT - Credit Card Fraud

> ### Install TPOT

In [None]:
# Installs TPOT libraries.
!pip install tpot

> ### Import Libraries


In [2]:
# import libraries
import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

> ### Import Dataset

In [3]:
# pulls dataset from github with no header
url = 'https://raw.githubusercontent.com/jporeilly/Machine--Learning/master/01_Credit_Card/Lab_01_AutoML/data/TPOT.csv' 
dataset = pd.read_csv(url, sep= ';', header=None)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

> #### Data Exploration

In [4]:
# displays dataset and outputs independent x variables and dependent y variable
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,964.7,19,104.7,1,40,7,0,0
1,0,524.1,15,24.2,0,55,4,1,0
2,0,234.2,18,13.8,1,66,15,0,0
3,1,207.7,8,56.3,0,45,23,1,0
4,1,910.6,11,59.1,0,31,23,1,1


In [5]:
print (x)

[[  0.  964.7  19.  ...  40.    7.    0. ]
 [  0.  524.1  15.  ...  55.    4.    1. ]
 [  0.  234.2  18.  ...  66.   15.    0. ]
 ...
 [  0.    4.9   6.  ...   5.   12.    0. ]
 [  0.   30.1  20.  ...  21.    7.    1. ]
 [  1.   81.6   1.  ...  50.   21.    1. ]]


In [6]:
print (y)

[0 0 0 ... 0 1 0]


> ### Add Column Headers

In [7]:
# adds column headers
dataset.columns = ['first_time_customer','order_dollar_amount','num_items','age','web_order','total_transactions_to_date','hour_of_day','billing_shipping_zip_equal','reported_as_fraud_historic']

> #### Check Dataset

In [8]:
# check column headers
dataset.head()

Unnamed: 0,first_time_customer,order_dollar_amount,num_items,age,web_order,total_transactions_to_date,hour_of_day,billing_shipping_zip_equal,reported_as_fraud_historic
0,0,964.7,19,104.7,1,40,7,0,0
1,0,524.1,15,24.2,0,55,4,1,0
2,0,234.2,18,13.8,1,66,15,0,0
3,1,207.7,8,56.3,0,45,23,1,0
4,1,910.6,11,59.1,0,31,23,1,1


> ### Convert Dataset to Numpy Array and Fit (optional)

In [9]:
# convert to numpy array and fit data
x = dataset.iloc[:,0:-1].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
X=np.asarray(x_scaled)
y=np.asarray(dataset.iloc[:,-1])

> #### Check Dataset

In [10]:
dataset.head()

Unnamed: 0,first_time_customer,order_dollar_amount,num_items,age,web_order,total_transactions_to_date,hour_of_day,billing_shipping_zip_equal,reported_as_fraud_historic
0,0,964.7,19,104.7,1,40,7,0,0
1,0,524.1,15,24.2,0,55,4,1,0
2,0,234.2,18,13.8,1,66,15,0,0
3,1,207.7,8,56.3,0,45,23,1,0
4,1,910.6,11,59.1,0,31,23,1,1


> ### Splitting the Dataset: Train / Test

In [11]:
# split the dataset 75% used for test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=None)

> ### TPOT Classifier

In [12]:
# TPOT Classifier
tpot = TPOTClassifier(generations=1, verbosity=2, population_size=100, scoring='accuracy', n_jobs = -1, config_dict='TPOT light')
tpot.fit(X_train, y_train)
output_score=str(tpot.score(X_test, y_test))
print(tpot.fitted_pipeline_)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=200.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8807486973947896
Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=7, p=1, weights=distance)
Pipeline(memory=None,
         steps=[('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=7, p=1,
                                      weights='distance'))],
         verbose=False)


> #### Export Pipeline as Python script

In [13]:
# export results to python
tpot.export('tpot_exported_credit_card_pipeline.py')
from google.colab import files
files.download('tpot_exported_credit_card_pipeline.py')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

>#### TPOT Evaluated Pipelines

In [None]:
# output as JSON each pipeline
print(tpot.evaluated_individuals_)