<a href="https://colab.research.google.com/github/jlsm2/deeplearning/blob/main/cc_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pré-processamento dos dados

### Download de bibliotecas

In [158]:
# Quietly (-q) install the pinned d2l release through the shell.
!pip install d2l==1.0.3 -q

In [159]:
pip install pandas numpy scikit-learn seaborn matplotlib plotly -q

[31mERROR: Operation cancelled by user[0m[31m
[0m

### Importando bibliotecas

In [160]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

### Carregando o dataset

In [161]:
# Read the train and test CSV files from the working directory.
dfTrain, dfTest = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)

dfTrain

KeyboardInterrupt: 

### Removendo colunas irrelevantes

In [None]:
# Print column names, dtypes and non-null counts for both splits.
for frame in (dfTrain, dfTest):
    frame.info()

In [None]:
# The same set of columns is removed from both splits, so it is written once.
unused_columns = [
    "Unnamed: 0", "cc_num", "first", "last", "street",
    "city", "state", "zip", "dob", "trans_num",
]
dfTrain, dfTest = (
    frame.drop(columns=unused_columns) for frame in (dfTrain, dfTest)
)

### Verificando se há valores nulos

In [None]:
# Per-column count of missing values, train first and then test.
for frame in (dfTrain, dfTest):
    print(frame.isna().sum())

### Convertendo colunas de data

In [None]:
def add_datetime_features(frame, column="trans_date_trans_time"):
    """Replace a timestamp column with calendar features derived from it.

    Args:
        frame: DataFrame containing ``column``.
        column: Name of the column to parse with ``pd.to_datetime``.

    Returns:
        A new DataFrame without ``column`` and with ``year``, ``month``,
        ``day``, ``day_of_week`` and ``hour`` appended, in that order.
        The input frame is not modified.
    """
    timestamps = pd.to_datetime(frame[column])
    return frame.drop(columns=[column]).assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        day_of_week=timestamps.dt.dayofweek,
        hour=timestamps.dt.hour,
    )


# Apply the same transformation to both splits instead of duplicating it.
dfTrain = add_datetime_features(dfTrain)
dfTest = add_datetime_features(dfTest)

dfTrain

### Identificando e tratando variáveis categóricas

In [None]:
# Columns whose dtype is still object/category, i.e. not yet numeric.
non_numeric = dfTrain.select_dtypes(include=["object", "category"])
categorical_cols = non_numeric.columns
print(categorical_cols)

In [None]:
# Cast the four text columns to pandas' categorical dtype in both splits.
category_dtypes = {
    name: "category" for name in ("merchant", "category", "gender", "job")
}
dfTrain = dfTrain.astype(category_dtypes)
dfTest = dfTest.astype(category_dtypes)

In [None]:
# Re-check dtypes of both splits after the categorical cast.
for frame in (dfTrain, dfTest):
    frame.info()

In [None]:
# label encoder
# Each encoder is fitted on the TRAINING split only and the resulting
# label -> code mapping is reused for the test split. Refitting on the test
# split (as before) assigns different integers to the same merchant/job.
label_encoders = {}
for col in ["merchant", "job"]:
    le = LabelEncoder()
    dfTrain[col] = le.fit_transform(dfTrain[col])
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Labels that never occurred in training have no code; mark them with -1.
    dfTest[col] = dfTest[col].astype(object).map(code_of).fillna(-1).astype(int)
    label_encoders[col] = le

# one-hot encoding
# Give both splits the category levels observed in training so that
# get_dummies produces the same indicator columns for train and test.
one_hot_cols = ["category", "gender"]
for col in one_hot_cols:
    levels = pd.Categorical(dfTrain[col]).categories
    dfTrain[col] = pd.Categorical(dfTrain[col], categories=levels)
    dfTest[col] = pd.Categorical(dfTest[col], categories=levels)

dfTrain = pd.get_dummies(dfTrain, columns=one_hot_cols, drop_first=True)

dfTest = pd.get_dummies(dfTest, columns=one_hot_cols, drop_first=True)

assert list(dfTrain.columns) == list(dfTest.columns), "train/test feature columns diverged"

In [None]:
# Show the training frame's columns and dtypes after the encoding step.
dfTrain.info()

### Balanceando os casos de fraude

In [None]:
# Histogram of the target column in the training frame.
graph = px.histogram(data_frame=dfTrain, x="is_fraud")
graph.show()

In [None]:
# Oversample the minority class in the TRAINING split only.
X_train_raw = dfTrain.drop(columns="is_fraud")
y_train_raw = dfTrain["is_fraud"]

smote = SMOTE(random_state=42)
XTrain_resampled, ytrain_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# The test split must stay untouched: resampling it would evaluate the models
# on synthetic rows and on a class balance that differs from the real data.
# The `*_resampled` names are kept only because later cells refer to them.
XTest_resampled = dfTest.drop(columns="is_fraud")
ytest_resampled = dfTest["is_fraud"]

In [None]:
# Histogram of the training target after resampling.
graph = px.histogram(data_frame=ytrain_resampled, x="is_fraud")
graph.show()

### Padronizando features numéricas

In [None]:
numeric_features = ["amt", "lat", "long", "city_pop", "unix_time", "year", "month", "day", "day_of_week", "hour"]

scaler = StandardScaler()

# Learn mean/std from the training split only, then apply those same
# statistics to the test split. Refitting on the test split leaks test
# information and puts the two splits on different scales.
XTrain_resampled[numeric_features] = scaler.fit_transform(XTrain_resampled[numeric_features])
XTest_resampled[numeric_features] = scaler.transform(XTest_resampled[numeric_features])

In [None]:
# Display the training feature frame after the standardisation step.
XTrain_resampled

In [None]:
# Display the test feature frame after the standardisation step.
XTest_resampled

### Identificando e tratando outliers


In [None]:
# train

# Fences for `amt`: 1.5 * IQR below the first / above the third quartile.
train_amt = XTrain_resampled["amt"]
Q1, Q3 = train_amt.quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows outside the fences (kept for inspection).
outliers = XTrain_resampled[train_amt.lt(lower_bound) | train_amt.gt(upper_bound)]

# Copy restricted to the rows inside the fences; taken before `amt` is
# modified below, so it holds the unclipped values.
XTrain_resampled_cleaned = XTrain_resampled[train_amt.between(lower_bound, upper_bound)]

# Clamp `amt` to the fences, then apply log(amt + 1).
# NOTE(review): `amt` was standardised in the previous section, so values
# <= -1 would give -inf/NaN here — confirm this cannot happen.
XTrain_resampled["amt"] = np.log(train_amt.clip(lower=lower_bound, upper=upper_bound) + 1)

In [None]:
# Box plot of `amt` for the training rows that fall inside the fences.
sns.boxplot(data=XTrain_resampled_cleaned, x="amt")
plt.show()

In [None]:
# test

# Reuse `lower_bound` / `upper_bound` computed on the TRAINING split in the
# train outlier cell above. Recomputing them from the test data would clip
# the two splits at different thresholds.
test_amt = XTest_resampled["amt"]

# Rows outside the training fences (kept for inspection).
outliers = XTest_resampled[(test_amt < lower_bound) | (test_amt > upper_bound)]

# Copy restricted to the rows inside the fences; taken before `amt` is
# modified below, so it holds the unclipped values.
XTest_resampled_cleaned = XTest_resampled[test_amt.between(lower_bound, upper_bound)]

# Clamp to the training fences, then apply the same log(amt + 1) transform
# that the training split received.
XTest_resampled["amt"] = np.log(test_amt.clip(lower=lower_bound, upper=upper_bound) + 1)

In [None]:
# Box plot of `amt` for the test rows that fall inside the fences.
sns.boxplot(data=XTest_resampled_cleaned, x="amt")
plt.show()

### Divisão dos dados

In [None]:
# Short aliases used by the modelling cells; no copy is made.
X_train, y_train = XTrain_resampled, ytrain_resampled
X_test, y_test = XTest_resampled, ytest_resampled

# Criação e treinamento dos modelos

### Random Forest

In [None]:
# Forest sizes to compare.
estimators = [2, 5, 10, 20, 50]

for n in estimators:
  # n_jobs=-1 builds the trees in parallel; the fixed random_state keeps the
  # fitted model the same.
  rf = RandomForestClassifier(n_estimators=n, random_state=42, n_jobs=-1)
  rf.fit(X_train, y_train)

  y_pred = rf.predict(X_test)

  print(f"Acurácia[{n} estimators]: {accuracy_score(y_test, y_pred)}")
  # Accuracy alone says little when one class is rare, so also report
  # per-class precision, recall and F1.
  print(classification_report(y_test, y_pred, digits=4, zero_division=0))

### LSTM

In [None]:
# reshaping the data
# An explicit float32 dtype is required: a frame mixing bool (one-hot) and
# float columns converts to an object array, which torch cannot turn into a
# tensor. np.asarray also accepts an ndarray, so re-running this cell works.
X_train = np.asarray(X_train, dtype=np.float32)
X_test = np.asarray(X_test, dtype=np.float32)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# (samples, features) -> (samples, 1, features): each row becomes a sequence
# of length 1. The -1 keeps the reshape valid if the array is already 3-D.
X_train = X_train.reshape(X_train.shape[0], 1, -1)
X_test = X_test.reshape(X_test.shape[0], 1, -1)

X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
# Targets get a trailing axis so they match the model's (batch, 1) output.
y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.as_tensor(y_test, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# criando o modelo
class LSTM(nn.Module):
    """Single-layer LSTM followed by dropout and a sigmoid output unit.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state (default 64).
        dropout: Dropout probability applied to the last time step's hidden
            state before the linear layer (default 0.5).

    ``forward`` takes a ``(batch, seq_len, input_size)`` tensor and returns a
    ``(batch, 1)`` tensor of values in (0, 1).
    """

    def __init__(self, input_size, hidden_size=64, dropout=0.5):
        super().__init__()
        # nn.LSTM's own `dropout` argument only acts between stacked layers,
        # so with a single layer it does nothing and triggers a warning.
        # Dropout is applied explicitly on the LSTM output instead.
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm(x)
        # Only the hidden state of the final time step is used.
        last_step = self.dropout(out[:, -1, :])
        return self.sigmoid(self.fc(last_step))

# Feature count is the third axis of the 3-D training array.
input_size = X_train.shape[2]
model = LSTM(input_size=input_size)

In [None]:
# training
# Binary cross-entropy loss, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def calculate_accuracy(outputs, labels):
    """Return the fraction of predictions that match ``labels``.

    ``outputs`` are thresholded at 0.5 (strictly greater counts as 1) and
    compared element-wise with ``labels``. The number of matches is divided
    by the length of the first axis. The result is a 0-dim tensor.
    """
    hard_predictions = outputs.gt(0.5).float()
    matches = hard_predictions.eq(labels).float()
    return matches.sum() / len(matches)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    running_accuracy = 0.0
    seen_samples = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size. A plain mean of per-batch means
        # over-weights a final batch that is smaller than the others.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        running_accuracy += calculate_accuracy(outputs, labels).item() * batch_size
        seen_samples += batch_size

    # Per-sample averages over the whole epoch.
    avg_loss = running_loss / seen_samples
    avg_accuracy = running_accuracy / seen_samples
    print(f'Época [{epoch + 1}/{num_epochs}], Perda: {avg_loss:.4f}, Acurácia: {avg_accuracy}')