In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from scipy import interp
from sklearn.model_selection import KFold
from pylab import rcParams
import os
from sklearn.metrics import confusion_matrix


## 1. Prepare Data

In [2]:
# Unit cost charged for each confusion-matrix outcome.
cost_fn = 88  # average amount of money withdrawn from a fraudulent transaction
cost_fp = 10  # administration costs
cost_tp = 10  # administration costs
cost_tn = 0   # no cost

+ 88: average amount of money withdrawn from a fraudulent transaction (`cost_fn`)
+ 10: administration costs (`cost_tp` and `cost_fp`)
+ 0: no cost (`cost_tn`)

In [3]:
# Load the per-model results table; the first line of res.csv is the header row.
res = pd.read_csv(
    'res.csv',
    header=0,
)

In [4]:
# Display the loaded results table (one row per model).
# The earlier `res.loc[:'tp']` sliced *rows* up to the label 'tp', but 'tp' is a
# column name and the row index holds integers; recent pandas rejects that slice,
# and the recorded output is simply the whole frame, so show the frame directly.
res

Unnamed: 0,Model,tp,fp,fn,tn,f1
0,Random,56721,154,17,70,0.387387
1,AdaBoost,56542,333,16,71,0.28629
2,XGBoost,56603,272,14,73,0.337962
3,Ensemble,56678,197,16,71,0.404558


In [5]:
# Set the column labels of res explicitly: the model name, the four
# confusion-matrix counts and the F1 score. The cells below look columns up by these names.
res.columns = ['Model', 'tp', 'fp', 'fn', 'tn', 'f1']

##  2. Calculate total cost for each model


In [10]:
# Absolute cost per model: each confusion-matrix count is weighted by its unit cost
# and the four products are summed.
unit_costs = {'tp': cost_tp, 'fp': cost_fp, 'tn': cost_tn, 'fn': cost_fn}
for _, model_row in res.iterrows():
    costs = sum(model_row[outcome] * unit_cost for outcome, unit_cost in unit_costs.items())
    print(model_row['Model'], '$', costs)

Random $ 570246
AdaBoost $ 570158
XGBoost $ 569982
Ensemble $ 570158


## 3. Misclassification rate

In [7]:
# Misclassification rate per model: wrong predictions (fp + fn) as a
# percentage of all predictions (tp + tn + fp + fn).
for _, counts in res.iterrows():
    wrong = counts['fp'] + counts['fn']
    total = counts['tp'] + counts['tn'] + counts['fp'] + counts['fn']
    error = (wrong / total) * 100
    print('The model', counts['Model'], 'has misclassification rate equal to:', '{0:0.2f}%'.format(error))

The model Random has misclassification rate equal to: 0.30%
The model AdaBoost has misclassification rate equal to: 0.61%
The model XGBoost has misclassification rate equal to: 0.50%
The model Ensemble has misclassification rate equal to: 0.37%


In [13]:
# Smaller of the two unit costs cost_tp and cost_fp; subtracted from cost_fp
# in the relative-cost loop further down.
# NOTE(review): the name implies the 'tp' and 'fp' columns both describe
# non-fraudulent transactions — confirm against how res.csv labels its columns.
min_notfraud = min(cost_tp, cost_fp)
min_notfraud

10

In [14]:
# Smaller of the two unit costs cost_tn and cost_fn; subtracted from cost_fn
# in the relative-cost loop further down.
# NOTE(review): the name implies the 'tn' and 'fn' columns both describe
# fraudulent transactions — confirm against how res.csv labels its columns.
min_fraud = min(cost_tn, cost_fn)
min_fraud

0

In [15]:
# Relative cost per model: only fp and fn are charged, each by how much its
# unit cost exceeds the baseline computed above (min_notfraud / min_fraud).
extra_cost_fp = cost_fp - min_notfraud
extra_cost_fn = cost_fn - min_fraud
for _, model_row in res.iterrows():
    costs = model_row['fp'] * extra_cost_fp + model_row['fn'] * extra_cost_fn
    print(model_row['Model'], costs)

Random 1496
AdaBoost 1408
XGBoost 1232
Ensemble 1408


In [16]:
# Load the per-model cost summary, using the first CSV column as the row index.
# NOTE(review): no visible cell writes cost.csv; its Cost values equal the
# relative costs printed by the loop above — confirm how the file is produced.
cost = pd.read_csv(
    'cost.csv',
    index_col=0,
)

In [17]:
# Inspect which columns cost.csv provided.
cost.columns

Index(['Cost', 'F1'], dtype='object')

In [18]:
# Display the cost summary table, indexed by its first CSV column.
cost

Unnamed: 0_level_0,Cost,F1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Random,1496,0.387387
AdaBoost,1408,0.28629
XGBoost,1232,0.337962
Ensemble,1408,0.404558


In [19]:
# Keep only the 'Cost' column, still as a one-column DataFrame (not a Series).
df1 = cost.loc[:, ['Cost']]

In [20]:
# Display the one-column cost frame.
df1

Unnamed: 0_level_0,Cost
Model,Unnamed: 1_level_1
Random,1496
AdaBoost,1408
XGBoost,1232
Ensemble,1408


In [21]:
# Row labels of the cost table as an array; the plot cell below uses them as bar x-positions.
cost.index.values

array(['Random', 'AdaBoost', 'XGBoost', 'Ensemble'], dtype=object)

In [22]:
# Cost values as a 2-D array (one row per model, single column).
df1.values

array([[1496],
       [1408],
       [1232],
       [1408]], dtype=int64)

In [23]:
import plotly
import plotly.graph_objs as go

# Grouped bar chart comparing, per model, the relative misclassification cost
# with the misclassification rate. Bar heights are derived from `cost` and `res`
# instead of being typed in by hand, so the figure cannot drift from the data.
models = cost.index.values

# Align the confusion-matrix counts with the model order used on the x-axis.
counts = res.set_index('Model').reindex(models)
misclassified = counts['fp'] + counts['fn']
total_predictions = counts[['tp', 'tn', 'fp', 'fn']].sum(axis=1)
# Error rate in basis points (percent x 100), rounded to whole numbers, so the
# bars are on a scale comparable to the cost bars. astype(int) fails loudly if
# a model listed in `cost` is missing from `res`.
error_rate_bp = (misclassified / total_predictions * 100 * 100).round().astype(int)

trace1 = go.Bar(
    x=models,
    y=cost['Cost'].tolist(),
    name='Cost Model'
)
trace2 = go.Bar(
    x=models,
    y=error_rate_bp.tolist(),
    name='Error Rate'
)
data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    title='Relative cost vs. misclassification rate per model',
    yaxis=dict(title='Cost ($) / error rate (basis points)')
)

# Writes the figure to an HTML file and opens it in the browser.
plotly.offline.plot({
    "data": data,
    "layout":layout
}, auto_open=True)





'temp-plot.html'