# Machine Learning Pipeline Project (07-30-19)

In [1]:
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
import pandas as pd
import joblib
import random
import os

## The importation of the data (https://www.kaggle.com/mlg-ulb/creditcardfraud/downloads/creditcardfraud.zip/3)

In [2]:
credit = pd.read_csv('./DATA/creditcard.csv')
credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Declaring the X and Y values with the strongest correlated columns

In [3]:
training_instances = credit[['V1', 'V2', 'V3', 'V4', 'V5',
                             'V6', 'V7', 'V8', 'V9', 'V10',
                             'V11', 'V12', 'V13', 'V14']] #X (Strongest values)

class_labels = credit['Class'] #Y

## Declaring the pipeline and fitting it the X values

In [4]:
cart_model = tree.DecisionTreeClassifier()

pipe1 = pipeline.Pipeline(steps=[
    ("feature_selection", SelectKBest()),
    ("scale", preprocessing.StandardScaler()),  
    ("CART", cart_model)])

preprocessing.StandardScaler()

pipe1.fit(training_instances, y=class_labels)

Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectKBest(k=10,
                             score_func=<function f_classif at 0x11ae54620>)),
                ('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('CART',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=False, random_state=None,
                                        splitter='best'))],
         verbose=False)

## Displaying the Mean of the CV's Accuracy

In [6]:
scores1 = cross_val_score(pipe1, training_instances, class_labels, cv=5)

CV_Accuracy_Mean = scores1.mean()
print("Mean CV accuracy: {0}".format(CV_Accuracy_Mean))

Mean CV accuracy: 0.9981496199520569


## Creation of the pipeline in its proper location 

In [7]:
if not os.path.exists("./PIPELINES/"):
    os.makedirs("./PIPELINES/")
    print("PIPELINES FOLDER CREATED.\n")
    
PIPELINEPATH = "./PIPELINES/creditcard.pipeline"
joblib.dump(pipe1, PIPELINEPATH, compress = 1)

PIPELINES FOLDER CREATED.



['./PIPELINES/creditcard.pipeline']

## Loading the pipeline 

In [8]:
pipeline_loaded = joblib.load(PIPELINEPATH)

## Creating a random sample to test the trained pipeline

In [9]:
sample_index = random.randint(1,credit.shape[0])
result = pipeline_loaded.predict([training_instances.iloc[sample_index],])

print("Prediction: Class label is {0}".format(result))
print("Actual Label: {0}".format(class_labels.iloc[sample_index]))

Prediction: Class label is [0]
Actual Label: 0
