## Phase 1: Load Dataset

In [1]:
import pandas as pd
from sklearn.datasets import load_diabetes

# Data Source: https://www.kaggle.com/mlg-ulb/creditcardfraud/downloads/creditcardfraud.zip/3
# The CSV is read via a relative path, i.e. from the notebook's working directory.
DATASET_PATH = 'creditcard.csv'
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
# Preview the first five rows as a quick check that the file loaded.
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Phase 2: Slice the Dataset

In [2]:
# Make every "V*" column non-negative by subtracting that column's own minimum,
# so each shifted column starts at 0 (presumably needed by the chi2 scorer used
# in the pipeline further down -- TODO confirm).
# NOTE(review): this shift is applied to `df` directly and is not one of the
# pipeline steps, so raw new data would need the same offsets -- confirm.
v_columns = [name for name in df.columns if name.startswith("V")]
column_minimums = {name: df[name].min() for name in v_columns}
for name, minimum in column_minimums.items():
    df[name] = df[name] - minimum

In [3]:
# Uncommenting the line below ranks every column by its correlation with `Class`; the ten columns named in its trailing comment had the strongest coefficients:
#df.corr()['Class'].sort_values()  # V4, V11, V17, V14, V12, V10, V16, V3, V7, V18

In [4]:
# Split the frame into the model inputs X (training_instances) and the
# target labels Y (class_labels).
feature_columns = ['V4', 'V11', 'V17', 'V14', 'V12', 'V10', 'V16', 'V3', 'V7', 'V18']
training_instances = df[feature_columns]
class_labels = df['Class']

## Phase 3: Create the Pipeline

In [5]:
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Build the Scikit-Learn pipeline: keep the 2 best features according to chi2,
# standardise them, then classify with a CART decision tree.
# NOTE(review): the tree is created without a random_state, so results may
# differ slightly between runs -- confirm whether a fixed seed is wanted.
cart_model = tree.DecisionTreeClassifier()
pipeline_steps = [
    ("feature_selection", SelectKBest(score_func=chi2, k=2)),
    ("scale", preprocessing.StandardScaler()),
    ("CART", cart_model),
]
pipe = pipeline.Pipeline(steps=pipeline_steps)
# Cell output: the settings of a freshly constructed StandardScaler. This is a
# separate throwaway instance, not the scaler stored inside `pipe`.
preprocessing.StandardScaler()

StandardScaler(copy=True, with_mean=True, with_std=True)

## Phase 4: Train

In [6]:
# Train every pipeline step on the raw array behind the selected feature
# columns; fit() returns the pipeline itself, which is shown as the cell output.
feature_matrix = training_instances.values
pipe.fit(feature_matrix, y=class_labels)

Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectKBest(k=2, score_func=<function chi2 at 0x125857268>)),
                ('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('CART',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=False, random_state=None,
                                        splitter='best'))],
         verbose=False)

## Phase 5: Evaluation

In [7]:
from sklearn.model_selection import cross_val_score

# Evaluate the pipeline with 5-fold cross-validation; the result is an array
# holding one score per fold.
# NOTE(review): accuracy can look very high when one class dominates the
# labels -- confirm the class balance before relying on this number.
scores = cross_val_score(estimator=pipe, X=training_instances, y=class_labels, cv=5)
# The average of the per-fold scores serves as the quality metric.
meanCvAccuracy = scores.mean()
print("Mean CV accuracy= %f" % meanCvAccuracy)

Mean CV accuracy= 0.998090


## Phase 6: Save the Pipeline

In [8]:
import joblib

# File the pipeline is serialised to; the same path is used to load it back.
PIPELINEPATH = "creditcard.pipeline"
# Write the pipeline with compression level 1; the list of written file names
# is displayed as the cell output.
joblib.dump(value=pipe, filename=PIPELINEPATH, compress=1)

['creditcard.pipeline']

## Phase 7: Load the Pipeline

In [9]:
# Restore the pipeline from the file written in the previous phase.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
pipeline_loaded = joblib.load(filename=PIPELINEPATH)

## Phase 8: Classify New Instances

In [10]:
import random

# Example of a new instance: pick one existing record uniformly at random.
# randrange(n) yields 0..n-1, so every row (including the first) can be chosen
# and the position can never run past the end of the frame. The previous
# randint(1, n) included n itself, which is out of bounds for iloc, and
# could never select row 0.
sample_index = random.randrange(len(training_instances))

# predict() needs a 2-D input: select the row as a one-row frame and pass its
# underlying array, the same kind of input the pipeline was fitted on.
sample_features = training_instances.iloc[[sample_index]].values

# Make a prediction
result = pipeline_loaded.predict(sample_features)
# `result` is a one-element array; format its single entry explicitly instead
# of relying on implicit array-to-scalar conversion.
print("Prediction: Class label is %i" % result[0])
print("Actual Label:", class_labels.iloc[sample_index])

Prediction: Class label is 0
Actual Label: 0
