In [1]:
import pandas as pd
import numpy as np
import pickle
import random

In [2]:
with open('models/XGBoost/XGBoost.pkl', 'rb') as f:
    XGBoost = pickle.load(f)
with open('models/LogisticRegression/LogisticRegression.pkl', 'rb') as f:
    LR = pickle.load(f)
with open('models/RandomForest/RandomForest.pkl', 'rb') as f:
    RF = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
full_df = pd.read_csv('../datasets/fraud_new.csv')
df = pd.read_csv('../datasets/fraud.csv')

In [5]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
full_df.head()

Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,isFraud,errorBalanceOrig,errorBalanceDest
0,1,0,181.0,181.0,0.0,0.0,0.0,1,0.0,181.0
1,1,1,181.0,181.0,0.0,21182.0,0.0,1,0.0,21363.0
2,1,1,229133.94,15325.0,0.0,5083.0,51513.44,0,213808.94,182703.5
3,1,0,215310.3,705.0,0.0,22425.0,0.0,0,214605.3,237735.3
4,1,0,311685.89,10835.0,0.0,6267.0,2719172.89,0,300850.89,-2401220.0


In [7]:
X = full_df.drop(['isFraud'], axis=1)

In [8]:
subset = full_df.loc[(X["oldBalanceOrig"] == full_df["amount"]) & (full_df['isFraud'] == 1)]
subset = subset.drop(['isFraud'], axis=1)
example = subset.loc[[0]]
example

Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,errorBalanceOrig,errorBalanceDest
0,1,0,181.0,181.0,0.0,0.0,0.0,0.0,181.0


In [7]:
def check_two_columns_equality_df(df, col1, col2, fraud):
    subset = df.loc[(df[col1] == df[col2]) & (df['isFraud'] == fraud)]
    if subset.empty:
        print("No cases found that violate conditions")
    else:
        print("Found a subset that violates condition")
    return subset

In [8]:
def check_two_columns_equality_model(model,example, col1, col2, fraud):
    tests = []
    for i in range(40):
        x = example.copy()
        x[col1] = example[col1] + i ** 3
        x[col2] = x[col1]
        tests.append(x.copy())
    results = []
    for t in tests:
        r = model.predict_proba(t)
        if (r[0][0]) < (r[0][1]):
            results.append(fraud)
        else:
            results.append(not fraud)
    if (not fraud) in results:
        print("Found cases that violate conditions")
    else:
        print("No cases found that violate conditions")
    return results

    

In [9]:
def check_monotonic(model, col, example, positive = True):
    y = example
    testes = []
    results = []
    for c in example.columns:
        y[c] = X[c].mean()
    y[col] = 0
    for i in range(1,100000, 5000):
        y[col] = i
        testes.append(y.copy())
    for t in testes:
        r = model.predict_proba(t)
        results.append(r[0][1])
    if positive:
        for i in range(1,len(testes)-1):
            if results[i-1] <= results[i]:
                return False
    else:
        for i in range(1,len(testes)-1):
            if results[i-1] > results[i]:
                return False
    return True

In [10]:
print(check_two_columns_equality_model(LR, example, 'amount', 'oldBalanceOrig', 1))


Found cases that violate conditions
[False, False, False, False, False, False, False, False, False, False, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [11]:
print(check_monotonic(LR, 'amount', example))

True


## Regras com dataset

In [12]:
def check_fraud_by_type_df(data, feature, feature_value, fraud):
  check = data.loc[(data[feature] == feature_value) & (data["isFraud"] == fraud)]
  if check.empty:
    print(f"No cases detected from fraud={fraud} with the feature {feature_value}")
  else:
    print(f"Found cases from fraud={fraud} with the feature {feature_value}")
  return check

def check_difference_values_columns_df(data, col1, col2, value_limit, fraud):
  check = data.loc[((data[col1]-data[col2]) > value_limit) & (data["isFraud"] != fraud)]
  if check.empty:
    print(f"No violation of conditions. There is no cases found where {col1}-{col2}>{value_limit} and fraud={not(fraud)}")
  else:
    print(f"Violation of conditions. There is cases found where {col1}-{col2}>{value_limit} and fraud={not(fraud)}")
  return check

Regra de Negócio 2:
- Se a transação for do tipo `PAYMENT`, `DEBIT` ou `CASH-IN`, é esperado que a transação não seja fraudulenta.

In [13]:
check_fraud_by_type_df(df, "type", "PAYMENT", 1)
check_fraud_by_type_df(df, "type", "DEBIT", 1)
check_fraud_by_type_df(df, "type", "CASH-IN", 1)

No cases detected from fraud=1 with the feature PAYMENT
No cases detected from fraud=1 with the feature DEBIT
No cases detected from fraud=1 with the feature CASH-IN


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud


Nessa regra, o dataset parece cumprir corretamente, não tendo nenhum caso de fraude com tais colunas.

Regra de Negócio 6:
- Se a diferença entre `newBalanceDest` e `oldBalanceDest` exceder 1 milhão de moedas, é esperado que seja uma fraude.

In [14]:
check_difference_values_columns_df(full_df, "newBalanceDest", "oldBalanceDest", 1000000, 1)

Violation of conditions. There is cases found where newBalanceDest-oldBalanceDest>1000000 and fraud=False


Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,isFraud,errorBalanceOrig,errorBalanceDest
4,1,0,311685.89,10835.0,0.0,6267.00,2719172.89,0,300850.89,-2.401220e+06
7,1,1,5346.89,0.0,0.0,652637.00,6453430.91,0,5346.89,-5.795447e+06
19,1,0,125872.53,0.0,0.0,348512.00,3420103.09,0,125872.53,-2.945719e+06
20,1,0,379856.23,0.0,0.0,900180.00,19169204.93,0,379856.23,-1.788917e+07
21,1,0,1505626.01,0.0,0.0,29031.00,5515763.34,0,1505626.01,-3.981106e+06
...,...,...,...,...,...,...,...,...,...,...
2768608,714,0,1405708.32,14.0,0.0,967006.58,2372714.91,0,1405694.32,-1.000000e-02
2768941,715,0,1537768.99,30937.0,0.0,1875711.04,3413480.03,0,1506831.99,4.656613e-10
2769136,715,0,1051130.81,51827.0,0.0,0.00,1051130.81,0,999303.81,0.000000e+00
2769447,715,0,1114850.44,1509.0,0.0,0.00,1114850.44,0,1113341.44,0.000000e+00


Ou seja, nota-se que o dataset viola a regra de negócio estabelecida - e em diversas linhas.

## Regra com modelo

In [75]:
def check_fraud_by_type_model(model, x, feature, feature_value, fraud):
    results = model.predict(x)
    results = pd.DataFrame(results)
    results = x.join(results)
    results.rename(columns={0: "isFraud"}, inplace=True)
    
    return check_fraud_by_type_df(results, feature, feature_value, fraud)


def check_difference_values_columns_model(model, x, col1, col2, value_limit, fraud):
    results = model.predict(x)
    results = pd.DataFrame(results)
    results = x.join(results)
    results.rename(columns={0: "isFraud"}, inplace=True)

    return check_difference_values_columns_df(results, col1, col2, value_limit, fraud)

Regra de Negócio 2 - Model

In [73]:
# Regra 2
check_fraud_by_type_model(LR, X, "type", "PAYMENT", 1)
check_fraud_by_type_model(LR, X, "type", "DEBIT", 1)
check_fraud_by_type_model(LR, X, "type", "CASH-IN", 1)

check_fraud_by_type_model(XGBoost, X, "type", "PAYMENT", 1)
check_fraud_by_type_model(XGBoost, X, "type", "DEBIT", 1)
check_fraud_by_type_model(XGBoost, X, "type", "CASH-IN", 1)

No cases detected from fraud=1 with the feature PAYMENT
No cases detected from fraud=1 with the feature DEBIT
No cases detected from fraud=1 with the feature CASH-IN
No cases detected from fraud=1 with the feature PAYMENT
No cases detected from fraud=1 with the feature DEBIT
No cases detected from fraud=1 with the feature CASH-IN


Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,errorBalanceOrig,errorBalanceDest,isFraud


Observa-se que o modelo e suas predições não apresentaram descumprimento a regra de negócio estabelecida, demonstrando que um bug muito provavelmente não esteja nessa feature em específico.

Regra de Negócio 6 - model

In [76]:
# Regra 6
check_difference_values_columns_model(LR, X, "newBalanceDest", "oldBalanceDest", 1000000, 1)

Violation of conditions. There is cases found where newBalanceDest-oldBalanceDest>1000000 and fraud=False


Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,errorBalanceOrig,errorBalanceDest,isFraud
4,1,0,311685.89,10835.00,0.0,6267.00,2719172.89,300850.89,-2401220.00,0
7,1,1,5346.89,0.00,0.0,652637.00,6453430.91,5346.89,-5795447.02,0
19,1,0,125872.53,0.00,0.0,348512.00,3420103.09,125872.53,-2945718.56,0
20,1,0,379856.23,0.00,0.0,900180.00,19169204.93,379856.23,-17889168.70,0
21,1,0,1505626.01,0.00,0.0,29031.00,5515763.34,1505626.01,-3981106.33,0
...,...,...,...,...,...,...,...,...,...,...
2770338,737,1,1281113.84,1281113.84,0.0,0.00,1281113.84,0.00,0.00,0
2770362,740,1,1312694.50,1312694.50,0.0,0.00,1312694.50,0.00,0.00,0
2770364,740,1,1755647.81,1755647.81,0.0,2930014.23,4685662.04,0.00,0.00,0
2770392,742,1,1819543.69,1819543.69,0.0,0.00,1819543.69,0.00,0.00,0


Nota-se que, com essa regra, há muita violação da regra de negócio estabelecida com o resultado do modelo. E acima vimos que também ocorre isso com o dataset. Portanto, é algo muito relevante de se analisar e rever a regra ou o dataset.

### Análise feature | modelo

In [9]:
example

Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,errorBalanceOrig,errorBalanceDest
0,1,0,181.0,181.0,0.0,0.0,0.0,0.0,181.0


In [10]:
LR.predict(example)

array([0], dtype=int64)

In [13]:
example.type = -1000
example

Unnamed: 0,step,type,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,errorBalanceOrig,errorBalanceDest
0,1,-1000,181.0,181.0,0.0,0.0,0.0,0.0,181.0


In [14]:
LR.predict(example)

array([1], dtype=int64)