# Detecção de fraude em pagamentos online


## https://thecleverprogrammer.com/2022/02/22/online-payments-fraud-detection-with-machine-learning/#google_vignette

In [29]:
# importando libs

import pandas as pd
import numpy as np

In [30]:
# lendo arquivo

df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Explorando arquivo

In [45]:
df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [52]:
# verificando correlações entre 'isFraud' e demais colunas

correlacao = df.corr()
print(correlacao["isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [51]:
df.type.value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [54]:
# transformando a coluna em numérica para usar decision tree

df['type'] = df['type'].map({'CASH_OUT': 1, 'PAYMENT': 2, 'CASH_IN': 3, 'TRANSFER': 4, 'DEBIT': 5})
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Começando machine learning

In [70]:
# criando as features e prediction target

y = df.isFraud
features = ['step', 'type', 'amount','oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
X = df[features]

In [71]:
# importando libs para machine learn

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [72]:
# splitando os dados em treino e validação

train_X, valid_X, train_y, valid_y = train_test_split(X, y, random_state=1)

In [73]:
# criando modelo
fraud_model = DecisionTreeClassifier(random_state=1)

# treinando modelo
fraud_model.fit(train_X, train_y)

# predizendo valores (1: é fraude, 2: não é fraude)
val_predictions = fraud_model.predict(X)
val_predictions

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [74]:
# validando modelo

fraud_model.score(valid_X, valid_y)

0.9996825207225954

## Uma forma de testar o modelo

In [79]:
# adicionando dados de teste
# features = ['step', 'type', 'amount','oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
features = np.array([[1, 3, 12000.0, 15000.0, 3000.0, 500.0, 12500.0]])

# predizendo o resultado
fraud_model.predict(features)



array([0], dtype=int64)

## Para os dados de teste a cima, o modelo prediz que a transação não é fraude.