In [1]:
%matplotlib inline
## util
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## modelos
from sklearn import linear_model, neighbors, svm, naive_bayes

# 
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn import preprocessing

In [2]:
# Load the transactions; the CSV is looked up relative to the notebook's working directory.
df = pd.read_csv('creditcard.csv')
# Keep an independent copy of the raw frame. A plain `df_bkp = df` only binds a second
# name to the same object, so any later in-place edit of `df` (e.g. adding a column)
# would silently change the "backup" as well.
df_bkp = df.copy()

FileNotFoundError: File b'creditcard.csv' does not exist

In [None]:
# Print the (rows, columns) shape, then display the first five rows (head's default).
print(df.shape)
df.head()

In [None]:
# Number of cross-validation folds shared by every experiment below.
nSplits = 10
# Stratified splitter; shuffle is left at its default (off) and spelled out for the reader.
kfold = StratifiedKFold(n_splits=nSplits, shuffle=False)

## Usando todas as features do dataframe

In [None]:
# Convert the frame to a NumPy array once and slice it by position:
# the first 30 columns become the feature matrix, column 30 the target vector.
all_values = df.values
X1 = all_values[:, :30]
y1 = all_values[:, 30]

In [None]:
def _positive_class_scores(model, X_, y_pred):
    """Return continuous scores for ROC AUC, falling back to the hard predictions."""
    if hasattr(model, "decision_function"):
        return model.decision_function(X_)
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the second label in model.classes_,
        # i.e. the positive class of a binary problem.
        return model.predict_proba(X_)[:, 1]
    return y_pred


def classification(model, X_, y_, cv=None):
    """Cross-validate ``model`` and print the fold-averaged metrics.

    For every fold a MinMaxScaler is fitted on the training part only and applied
    to both parts, the model is refitted, and accuracy, recall, F1 and ROC AUC are
    measured on the held-out part. The averages over all folds are printed.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Refitted in place on every fold.
    X_ : array indexable by integer index arrays
        Feature matrix.
    y_ : array-like
        Binary targets; flattened to 1-D, so an ``(n, 1)`` column is accepted.
    cv : splitter exposing ``split(X, y)``, optional
        Defaults to the notebook-level ``kfold``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    splitter = kfold if cv is None else cv
    # The estimators and metrics expect 1-D targets; flatten column-shaped input.
    y_ = np.ravel(y_)

    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    auc = 0.0
    n_folds = 0
    for train_index, test_index in splitter.split(X_, y_):
        X_train = X_[train_index]
        y_train = y_[train_index]
        X_test = X_[test_index]
        y_test = y_[test_index]

        # Fit the scaler on the training fold only, so the test fold stays unseen.
        norm = preprocessing.MinMaxScaler().fit(X_train)
        X_train_norm = norm.transform(X_train)
        X_test_norm = norm.transform(X_test)

        model.fit(X_train_norm, y_train)
        y_pred = model.predict(X_test_norm)
        # ROC AUC ranks samples by a continuous score; hard labels reduce the
        # curve to a single operating point.
        y_score = _positive_class_scores(model, X_test_norm, y_pred)

        accuracy += metrics.accuracy_score(y_test, y_pred)
        recall += metrics.recall_score(y_test, y_pred)
        f1 += metrics.f1_score(y_test, y_pred)
        auc += metrics.roc_auc_score(y_test, y_score)
        n_folds += 1

    # Average over the folds that actually ran rather than a separate global count.
    print("accuracy: " + str((accuracy/n_folds)) + "\nrecall: "+ str((recall/n_folds)) +
          "\nf1: "+ str((f1/n_folds)) + "\nauc: "+ str((auc/n_folds)))

In [None]:
# Logistic regression evaluated on the full feature matrix (X1, y1).
modelLoR = linear_model.LogisticRegression()
classification(model=modelLoR, X_=X1, y_=y1)

In [None]:
# Support-vector classifier with default settings, evaluated on (X1, y1).
modelSvc = svm.SVC()
classification(model=modelSvc, X_=X1, y_=y1)

In [None]:
# Gaussian naive Bayes evaluated on (X1, y1).
modelNB = naive_bayes.GaussianNB()
classification(model=modelNB, X_=X1, y_=y1)

In [None]:
# Rows per class, ordered by class label. Series.value_counts replaces the
# deprecated top-level pd.value_counts helper.
count_classes = df['Class'].value_counts().sort_index()

# Draw on an explicit Axes instead of relying on pyplot's implicit "current" figure.
fig, ax = plt.subplots()
count_classes.plot(kind='bar', ax=ax)
ax.set_title("Histograma das classes")
ax.set_xlabel("Classes")
ax.set_ylabel("Frequencia");

<b> <h2> Como mostra o histograma acima, as classes no dataset estão totalmente desbalanceadas. <br />
    Por isso, a acurácia obtida nos três modelos testados teve bons resultados para a classificação da classe majoritária; porém, para classificar a classe "1", não acertaríamos. </h2> </b>

## Reduzindo features e aplicando padronização (standardization)

In [None]:
# Standardise 'Amount' into a new 'newAmount' column and drop 'Time' and 'Amount'.
# Series.reshape no longer exists in pandas, so reshape the underlying NumPy array.
amount_column = df_bkp['Amount'].values.reshape(-1, 1)
scale = preprocessing.StandardScaler().fit(amount_column)

# Build the reduced frame from the untouched df_bkp without mutating it, so this
# cell gives the same result every time it is re-run.
df = (
    df_bkp
    .drop(['Time', 'Amount'], axis=1)
    .assign(newAmount=scale.transform(amount_column).ravel())
)
df.head(5)

In [None]:
# Split features and target by column name. Selecting the target with a boolean
# column mask yields a 2-D (n, 1) array, whereas the estimators expect a 1-D vector.
X2 = df.drop('Class', axis=1).values
y2 = df['Class'].values

## Aplicar um resample nos dados

In [None]:
# Random under-sampling: keep every class-1 row and draw an equally sized random
# subset of class-0 rows.
# Fixed seed so the sampled subset (and every metric computed from it) is reproducible.
rng = np.random.RandomState(42)

# Positional row numbers are used because X2 / y2 are plain arrays; index labels
# only coincide with positions while the frame keeps its default index.
class_labels = df['Class'].values
fraud_indices = np.flatnonzero(class_labels == 1)
normal_indices = np.flatnonzero(class_labels == 0)
n_classe_1 = len(fraud_indices)

random_normal_indices = rng.choice(normal_indices, n_classe_1, replace=False)

under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

X = X2[under_sample_indices]
y = y2[under_sample_indices]

In [None]:
# Logistic regression evaluated on the under-sampled data (X, y).
modelLoR2 = linear_model.LogisticRegression()
classification(model=modelLoR2, X_=X, y_=y)

In [None]:
# Support-vector classifier evaluated on the under-sampled data (X, y).
modelSvc2 = svm.SVC()
classification(model=modelSvc2, X_=X, y_=y)

In [None]:
# Gaussian naive Bayes evaluated on the under-sampled data (X, y).
modelNB2 = naive_bayes.GaussianNB()
classification(model=modelNB2, X_=X, y_=y)

In [None]:
# 3-nearest-neighbours classifier evaluated on the under-sampled data (X, y).
modelKnn = neighbors.KNeighborsClassifier(n_neighbors=3)
classification(model=modelKnn, X_=X, y_=y)