In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
csv = pd.read_csv("creditcard/creditcard.csv", encoding="ISO-8859-1")

In [4]:
prednames = [p for p in csv.columns if p[0] == "V"]

In [5]:
csvpart = csv.sample(frac=0.1)

In [6]:
predictors = csvpart[prednames]

In [7]:
targets = csvpart["Class"]

In [8]:
nofraud_preds = predictors.values[np.where(targets.values == 0)]

In [9]:
nofraud_preds

array([[ 1.15844718e+00,  2.68085852e-01, -5.36739090e-01, ...,
        -2.06841218e-01, -4.04855858e-03,  1.16611811e-02],
       [ 1.44496482e+00, -1.44559467e+00, -2.58941741e-02, ...,
        -2.48478077e-01,  2.20866690e-02,  1.86645237e-02],
       [-5.69290134e+00,  4.42078063e+00, -3.40202700e+00, ...,
        -1.39970204e-01, -1.13214029e+00, -3.57963716e-01],
       ...,
       [ 2.09004320e+00, -7.93351863e-01, -1.19800405e+00, ...,
        -4.88148758e-01,  1.89634138e-02, -6.27804525e-02],
       [-8.99141647e+00, -1.14975488e+01, -1.14169500e+00, ...,
         7.48700831e-01,  1.86738088e-01, -1.42368911e+00],
       [ 1.12096878e+00,  1.04575032e-01,  2.25376261e-01, ...,
        -2.48420161e-01,  2.02455666e-02,  7.89738172e-03]])

In [10]:
fraud_preds = predictors.values[np.where(targets.values == 1)]

In [11]:
fraud_preds.shape

(50, 28)

In [12]:
np.sum(targets)

50

In [13]:
covar_matrix = np.cov(nofraud_preds.transpose())

In [14]:
covar_matrix.shape

(28, 28)

In [15]:
covar_inv = np.linalg.inv(covar_matrix)

In [16]:
distances = []
for i in range(nofraud_preds.shape[0]):
    distances.append(np.matmul(np.matmul(nofraud_preds[i,:], covar_inv), nofraud_preds[i,:]))

In [17]:
distances

[9.753366539556891,
 23.653751322667066,
 102.77555840672014,
 27.011258034321227,
 15.023391101699353,
 19.823229922940193,
 10.610684320000916,
 11.74370309431914,
 17.477419574917754,
 7.055651267902531,
 14.132774902571597,
 404.39006530412155,
 31.0928589588268,
 14.714672966611573,
 17.937122357600785,
 11.440630824992942,
 19.47038973577844,
 15.26052913695146,
 6.6856225319496305,
 30.756165142839205,
 8.19487721945709,
 8.38216329231465,
 11.738303126400334,
 29.8372312922313,
 53.55971024060844,
 17.546837012105932,
 9.606216842111056,
 28.748357121586245,
 26.69756361732643,
 15.080979319764218,
 32.97751286935279,
 71.53140159585976,
 12.775114272975399,
 14.45509739262181,
 11.924932074122863,
 14.677253710789962,
 17.599221427802757,
 14.525908639207245,
 12.10316729052357,
 15.03853390796555,
 17.92673150356171,
 36.126972502567604,
 13.660432307298917,
 12.811830564835542,
 19.214097457882797,
 29.833228775395348,
 13.590155816022234,
 11.3282051802645,
 32.841851291777

In [21]:
fraud_distances = []
for i in range(fraud_preds.shape[0]):
    fraud_distances.append(np.matmul(np.matmul(fraud_preds[i,:], covar_inv), fraud_preds[i,:]))

In [22]:
fraud_distances

[6662.751833123285,
 1528.8455474128466,
 27464.03280068023,
 60.33442263211482,
 5036.527653855904,
 2872.7979135504975,
 1496.990645478739,
 9618.985942159601,
 2122.9564172385135,
 1440.2390037864939,
 1136.8987012306325,
 4284.390797203554,
 17300.068130084856,
 8775.97353724991,
 1508.0436063361676,
 780.6465070614123,
 77.81469233101322,
 17294.45647135125,
 21739.665720699402,
 1479.5491130253747,
 3154.8033008010834,
 1263.0812961066208,
 1125.457642163751,
 4042.0246861160545,
 1166.963119525221,
 2856.461710023672,
 32.69086575108488,
 25019.711355940915,
 4334.182439586994,
 21751.571553248694,
 47.93507470789379,
 21779.617655634927,
 17352.935633687917,
 219.92675331756902,
 2138.6236922852927,
 5386.289881582347,
 8830.282399883463,
 1509.8117047737094,
 2231.865879429287,
 17581.0790294359,
 9943.965953801553,
 3121.740985710615,
 24845.66674067805,
 1655.900689260837,
 24756.561216018363,
 8088.135873438395,
 20.21243929072152,
 1473.5085321047654,
 85.55380337778634,
 

In [23]:
np.sum(np.array(fraud_distances) < 200)

6

In [24]:
len(fraud_distances)

50

In [27]:
nfaults_nofraud = np.sum(np.array(distances) > 200)

In [28]:
nfaults_nofraud

279

In [29]:
nfaults_nofraud / nofraud_preds.shape[0]

0.009813232035454257

In [30]:
nfaults_fraud = np.sum(np.array(fraud_distances) < 200)

In [31]:
nfaults_fraud / len(fraud_distances)

0.12

In [34]:
np.diag(covar_matrix)

array([3.88397546, 2.82149978, 2.10838027, 1.94246483, 1.70575986,
       1.71007254, 1.25847309, 1.22542889, 1.19235997, 1.10886191,
       1.0118799 , 0.88182732, 0.98248592, 0.77769472, 0.83865149,
       0.70194064, 0.53787187, 0.68006807, 0.66173156, 0.64487503,
       0.50499181, 0.51782918, 0.35416696, 0.35960857, 0.2726174 ,
       0.23381342, 0.15822894, 0.10942365])