# Fraud Detection Neural Network
Questo notebook ha l'obiettivo di sviluppare un modello di machine learning per la rilevazione delle frodi. L'obiettivo principale è identificare transazioni potenzialmente fraudolente all'interno di un dataset, addestrando un modello predittivo che possa distinguere tra transazioni lecite e fraudolente. Utilizzeremo diverse tecniche di machine learning per confrontare le loro prestazioni e scegliere la più efficace.

## Setup delle librerie
In questa sezione vengono importate e installate le librerie necessarie per l'analisi dei dati e la costruzione del modello di machine learning. Le librerie includono strumenti per la manipolazione dei dati (pandas, numpy), la visualizzazione (matplotlib, seaborn, plotly), l'apprendimento automatico (scikit-learn, tensorflow, imblearn) e il salvataggio del modello (pickle).

In [1]:
import subprocess
import sys

# Third-party distributions (pip names) required by the notebook.
# 'pickle' is deliberately not listed: it is part of the Python standard
# library and is not something pip needs to (or can usefully) install.
packages = [
    'pandas',
    'numpy',
    'matplotlib',
    'seaborn',
    'scikit-learn',
    'tensorflow',
    'imblearn',
    'plotly',
    'pydot',
]

def install_packages(packages, import_names=None):
    """Install with pip every distribution in *packages* that cannot be imported.

    Args:
        packages: Iterable of pip distribution names.
        import_names: Optional mapping from a pip distribution name to the
            module name it is imported under, for the cases where the two
            differ. Entries are merged over the built-in defaults.

    Raises:
        subprocess.CalledProcessError: If a ``pip install`` invocation fails.
    """
    import importlib

    # A distribution is not always importable under its pip name: probing
    # 'scikit-learn' directly always fails and would re-run pip on every call.
    module_names = {'scikit-learn': 'sklearn', 'imbalanced-learn': 'imblearn'}
    if import_names:
        module_names.update(import_names)

    for package in packages:
        try:
            importlib.import_module(module_names.get(package, package))
        except ImportError:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

install_packages(packages)

from pathlib import Path
import zipfile
import os
import json
import warnings
from datetime import date, datetime

!pip install -q kaggle

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import plotly.offline as pyo

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.metrics import precision_recall_curve

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

import pickle

%matplotlib inline

2024-08-26 17:38:26.993016: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-26 17:38:27.067099: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-26 17:38:27.161848: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-26 17:38:27.237912: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-26 17:38:27.257657: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-26 17:38:27.405418: I tensorflow/core/platform/cpu_feature_gu

## Dataset Download
In questa sezione viene definito il percorso del dataset e viene scaricato un file ZIP che contiene i dati delle transazioni con carte di credito. Il dataset viene poi estratto e caricato in un DataFrame di pandas. 

In [2]:
# Notebook-wide configuration: warning filter, pandas display options,
# dataset locations and the colormap name stored in CMAP.

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# All paths are relative to the directory the notebook runs from.
BASE_PATH = './'
PROJECT_PATH = os.path.join(BASE_PATH, '')
DATASET_DIR = os.path.join(PROJECT_PATH, 'datasets/')
# Archive fetched from Kaggle and the CSV expected inside it.
ZIP_FILE_PATH = os.path.join(DATASET_DIR, 'credit-card-transaction-records-dataset.zip')
CSV_FILE_PATH = os.path.join(DATASET_DIR, 'credit_card_purchases.csv')

# Colormap name for plots and styled tables.
CMAP = 'Blues'


def setup_kaggle():
    """Install the project's ``kaggle.json`` credentials into ``~/.kaggle``.

    The file is looked up in PROJECT_PATH, copied to ``~/.kaggle/kaggle.json``
    and restricted to owner read/write (mode 600).

    Returns:
        bool: True when the credentials file is in place, False when it is
        missing from PROJECT_PATH or could not be copied.
    """
    import shutil

    kaggle_json_path = os.path.join(PROJECT_PATH, 'kaggle.json')

    if not os.path.exists(kaggle_json_path):
        print("kaggle.json not found.")
        return False

    kaggle_dir = os.path.expanduser("~/.kaggle")
    os.makedirs(kaggle_dir, exist_ok=True)
    target_path = os.path.join(kaggle_dir, 'kaggle.json')

    try:
        # Copy in-process instead of shelling out to cp/chmod: portable, no
        # quoting pitfalls, and failures are reported instead of ignored.
        already_in_place = (
            os.path.exists(target_path)
            and os.path.samefile(kaggle_json_path, target_path)
        )
        if not already_in_place:
            shutil.copyfile(kaggle_json_path, target_path)
        os.chmod(target_path, 0o600)
    except OSError as err:
        print(f"Could not install kaggle.json: {err}")
        return False

    return True

def download_data():
    if not os.path.isfile(ZIP_FILE_PATH):
        Path(DATASET_DIR).mkdir(parents=True, exist_ok=True)

        if not setup_kaggle():
            return None

        !kaggle datasets download -d muhammadehsan000/credit-card-transaction-records-dataset -p "{DATASET_DIR}" --force

        if not os.path.isfile(ZIP_FILE_PATH) or not ZIP_FILE_PATH.endswith('.zip'):
            print("Error: Downloaded file is not a zip file or download failed.")
            return None

        with zipfile.ZipFile(ZIP_FILE_PATH, 'r') as zip_ref:
            zip_ref.extractall(DATASET_DIR)

    if not os.path.isfile(CSV_FILE_PATH):
        return None

    return pd.read_csv(CSV_FILE_PATH)

cc_purchases = download_data()
if cc_purchases is None:
    # Every later cell dereferences cc_purchases; stop here with a clear
    # message instead of failing further down with an AttributeError on None.
    raise RuntimeError("Failed to load credit cards data.")


403 - Forbidden - Permission 'datasets.get' was denied
Error: Downloaded file is not a zip file or download failed.
Failed to load credit cards data.


## Esplorazione e visualizzazione dei dati
Una volta caricato il dataset, si procede con una rapida ispezione dei dati per comprendere la struttura del dataset, verificare la presenza di eventuali valori mancanti e osservare statistiche descrittive.

In [3]:
# Display the loaded DataFrame as the cell output.
cc_purchases

In [4]:
# Plain-text preview of the first rows.
print(cc_purchases.head())

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only adds a stray "None" line to the output.
cc_purchases.info()
print(cc_purchases.describe())

In [None]:
# Pie chart of the class balance of the 'is_fraud' target.
cc_purchases.is_fraud.value_counts().plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Percentuale di Transazioni Fraudolente')
plt.show()

In [None]:
# Histogram (with KDE) of transaction amounts on a logarithmic x axis.
f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)
sns.histplot(cc_purchases['amt'], bins=50, kde=True, log_scale=True, ax=ax)
# Show plain numbers on the log axis rather than powers of ten.
ax.xaxis.set_major_formatter(mpl.ticker.ScalarFormatter())
ax.set_title('Distribuzione degli Importi delle Transazioni')
ax.set_xlabel('Importo ($)')
ax.set_ylabel('Frequenza')
plt.show()

In [None]:
# Scatter plot of the 'long'/'lat' columns, coloured by the fraud label.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='long', y='lat', hue='is_fraud', data=cc_purchases, palette='coolwarm', alpha=0.6)
plt.title('Distribuzione Geografica delle Transazioni (Fraudolente vs Non Fraudolente)')
# Save before show(): with the inline backend show() closes the figure, so
# saving afterwards would write an empty image.
plt.savefig(os.path.join(PROJECT_PATH, '3.png'))
plt.show()

In [None]:
# Transaction counts per 'category', split by the fraud label.
plt.figure(figsize=(12, 8))
sns.countplot(data=cc_purchases, y='category', hue='is_fraud')
plt.gca().set_title('Numero di Transazioni per Categoria (Fraudolente vs Non Fraudolente)')
plt.show()

## Preprocessing dei Dati
In questa sezione, vengono eseguite diverse operazioni di preprocessing per preparare i dati all'addestramento del modello. Questo include la gestione delle variabili temporali, la rimozione di colonne non necessarie, e la trasformazione di variabili categoriali.

In [None]:
def calculate_age(dob, today=None):
    """Return the age in completed years for a date of birth.

    Args:
        dob: Date of birth, either a ``'%Y-%m-%d'`` string or an object
            exposing ``year``, ``month`` and ``day`` attributes.
        today: Reference date the age is measured at. Defaults to
            ``date.today()``; pass a fixed date to make the derived feature
            reproducible across runs.

    Returns:
        int: Completed years between *dob* and *today*.

    Raises:
        ValueError: If *dob* is a string that is not in ``'%Y-%m-%d'`` format.
    """
    if isinstance(dob, str):
        dob = datetime.strptime(dob, '%Y-%m-%d').date()

    if today is None:
        today = date.today()

    # Take one year off when this year's birthday has not happened yet
    # (the tuple comparison yields a bool, i.e. 0 or 1).
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    return today.year - dob.year - birthday_pending

In [None]:
TARGET_COL = 'is_fraud'
# Independent copy of the label column, taken from the raw data.
TARGET = cc_purchases[TARGET_COL].copy()

# Drop the columns that are not used as features. drop() already returns a
# new DataFrame, so cc_purchases is left untouched and no prior full copy of
# the dataset is needed.
cc_purchases_df2 = cc_purchases.drop(columns=['Unnamed: 0','trans_date_trans_time','cc_num','merchant','first','last','street','city','state','trans_num','merch_zipcode'])

# Replace date of birth with age
cc_purchases_df2['age'] = cc_purchases_df2['dob'].apply(calculate_age)
cc_purchases_df2 = cc_purchases_df2.drop(columns=['dob'])

# Order the dataframe (features first, target last)
new_column_order = ['unix_time','lat','long', 'amt', 'category', 'gender', 'age', 'job', 'zip', 'city_pop', 'merch_lat', 'merch_long', 'is_fraud']
cc_purchases_df2 = cc_purchases_df2.reindex(columns=new_column_order)

cc_purchases_df2


In [None]:
def report(dfs, names):
    """Summarise dtypes, missing values and cardinality per column.

    Args:
        dfs: Non-empty sequence of DataFrames. The columns of the first frame
            define the rows of the result and its dtypes fill the 'dtypes'
            column.
        names: One label per frame, used as prefix for that frame's columns.

    Returns:
        pandas.DataFrame indexed by the column names of ``dfs[0]`` with a
        'dtypes' column plus, for every frame, ``<name>_missing#``,
        ``<name>_missing%`` (fraction of rows, 0-1) and ``<name>_uniques``.

    Raises:
        IndexError: If fewer names than frames are supplied.
    """
    if len(names) < len(dfs):
        raise IndexError("report() needs one name per DataFrame")

    rep = pd.DataFrame(dfs[0].dtypes, columns=['dtypes'])

    for name, df in zip(names, dfs):
        missing = df.isna().sum()
        rep[f"{name}_missing#"] = missing
        rep[f"{name}_missing%"] = missing / len(df)
        # Assign the Series itself (not .values) so unique counts are aligned
        # on column name, like the missing-value columns, instead of by
        # position - frames with a different column order stay correct.
        rep[f"{name}_uniques"] = df.nunique()

    return rep
 
# Build the per-column summary for the preprocessed frame and render it with
# a background gradient using the CMAP colormap.
names = ["Credit Cards Dataframe"]
rep = report([cc_purchases_df2], names)
rep.style.background_gradient(cmap=CMAP)

In [None]:
# Label-encode the categorical features on a copy of the preprocessed frame.
# The fitted encoder of every column is kept in `encoders`.
cc_purchases_df3 = cc_purchases_df2.copy()

cat_cols = ['category', 'gender', 'job']

# One fresh encoder per categorical column.
encoders = {col: LabelEncoder() for col in cat_cols}

# Fit each encoder on its column and overwrite the column with integer codes.
for col, encoder in encoders.items():
    cc_purchases_df3[col] = encoder.fit_transform(cc_purchases_df3[col])

In [None]:
# Correlation heatmap of all encoded columns; the upper triangle (diagonal
# included) is masked so each pair is shown once.
corr = cc_purchases_df3.corr()
mask = np.triu(np.ones(corr.shape))
plt.figure(figsize=(15, 9))
sns.heatmap(data=corr, mask=mask, cmap=CMAP, annot=True)
plt.show()

In [None]:
# Remove 'zip', 'merch_long' and 'merch_lat' from the feature set
# (presumably because the heatmap shows them as redundant with other
# location columns - TODO confirm).
cc_purchases_df3 = cc_purchases_df3.drop(['zip','merch_long','merch_lat'], axis=1)

In [None]:
# Same masked correlation heatmap, recomputed on the current columns of
# cc_purchases_df3.
corr = cc_purchases_df3.corr()
mask = np.triu(np.ones(corr.shape))
plt.figure(figsize=(15, 9))
sns.heatmap(data=corr, mask=mask, cmap=CMAP, annot=True)
plt.show()

In [None]:
# Preview the first rows of the final feature table.
cc_purchases_df3.head()

## Network Training

In [None]:
# MODEL selects between training from scratch (None) and loading a
# previously saved model, e.g.:
#MODEL = 'model.pkl'
MODEL = None

# Data preparation without SMOTE
X = cc_purchases_df3.drop('is_fraud', axis=1)
y = cc_purchases_df3['is_fraud']

# Split the data. Stratify on the target so the rare positive class keeps the
# same proportion in the training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the data (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(zip(np.unique(y_train),class_weights))
# Compute sample weights (one weight per training row, from its class)
sample_weights_train = compute_sample_weight(class_weight=class_weights_dict, y=y_train)


print(f"Class Weights: {class_weights_dict}")

In [None]:
def create_model(input_shape, n_out=1):
    """Build and compile the Conv1D + LSTM classifier.

    Args:
        input_shape: Either the number of features (int) or a shape tuple
            without the batch axis. A one-dimensional shape ``(features,)``
            is reshaped to ``(features, 1)`` because Conv1D and LSTM need a
            (steps, channels) input.
        n_out: Number of output units. With 1 (default) the output is a
            single sigmoid probability trained with binary cross-entropy;
            with more units the output is a softmax trained with sparse
            categorical cross-entropy (integer class labels).

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, accuracy
        metric), ready for ``fit``.
    """
    # Use tf.keras explicitly: the notebook imports `tensorflow as tf` but
    # never binds a bare `keras` name.
    layers = tf.keras.layers

    out_classes = n_out
    HIDDEN1 = 64
    HIDDEN2 = 128

    print(f"Making model with input shape {input_shape}, hidden {HIDDEN1}, {HIDDEN2}, out classes {out_classes}")

    # Accept a bare feature count as well as a shape tuple.
    try:
        input_shape = tuple(input_shape)
    except TypeError:
        input_shape = (input_shape,)

    model = tf.keras.Sequential()
    model.add(layers.Input(shape=input_shape))
    if len(input_shape) == 1:
        # Flat feature vector -> sequence of single-channel steps.
        model.add(layers.Reshape((input_shape[0], 1)))

    # First Conv1D layer
    model.add(layers.Conv1D(filters=HIDDEN1, kernel_size=5, padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.ReLU())
    model.add(layers.Dropout(0.2))

    # Second Conv1D layer (optional, can be removed or adjusted)
    model.add(layers.Conv1D(filters=HIDDEN1, kernel_size=3, padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.ReLU())
    model.add(layers.Dropout(0.2))

    # LSTM layer
    model.add(layers.LSTM(HIDDEN2))
    model.add(layers.Dropout(0.2))

    # Output layer: a softmax over a single unit would always emit 1.0, so
    # the one-unit case uses a sigmoid instead.
    if out_classes == 1:
        model.add(layers.Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'
    else:
        model.add(layers.Dense(out_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'

    # fit() requires a compiled model; 'accuracy' is tracked because the
    # training-curve plots read it from the history.
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
# Train a new model when MODEL is None, otherwise load the model stored at
# the path given by MODEL. Either way `model` and `history_dict` are defined
# afterwards.
if MODEL is None:
    input_shape = X_train_scaled.shape[1]
    # Single output unit: binary fraud / non-fraud target.
    model = create_model(input_shape, 1)

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model.fit(
        X_train_scaled, y_train, 
        epochs=100, 
        batch_size=32, 
        validation_data=(X_val_scaled, y_val), 
        sample_weight=sample_weights_train,
        callbacks=[early_stopping]
    )

    # NOTE(review): pickle is kept for compatibility with existing
    # 'model.pkl' files; Keras' own model.save()/load_model() is the
    # documented way to persist models - consider migrating.
    with open('model.pkl', 'wb') as file:
        pickle.dump(model, file)

    # Cast every metric value to a builtin float so json can serialise the
    # history even when the values are NumPy scalars.
    history_dict = {
        metric: [float(value) for value in values]
        for metric, values in history.history.items()
    }
    with open('history.json', 'w') as file:
        json.dump(history_dict, file)
else:
    # Load from the configured path instead of a hard-coded file name.
    # NOTE(review): unpickling executes arbitrary code - only load model
    # files produced by this notebook.
    with open(MODEL, 'rb') as file:
        model = pickle.load(file)

    with open('history.json', 'r') as file:
        history_dict = json.load(file)

In [None]:
# Predict probabilities on the validation set and reduce them to a 1-D vector
# of positive-class scores: the only column of a sigmoid output, or the last
# column of a softmax output.
y_val_prob = np.asarray(model.predict(X_val_scaled))
if y_val_prob.ndim == 2:
    y_val_prob = y_val_prob[:, -1]

# Compute the precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_val, y_val_prob)

# Find the threshold that maximises the F1 score. precision/recall carry one
# trailing point that has no threshold, so it is excluded; points where
# precision + recall == 0 get F1 = 0 instead of NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
best_threshold = thresholds[f1_scores.argmax()]

print(f'Optimal threshold: {best_threshold}')

# Convert probabilities to class labels with the optimal threshold. The curve
# is defined for "score >= threshold", so use the same comparison here.
y_val_pred = (y_val_prob >= best_threshold).astype(int)

# Classification report at the optimal threshold. Stored as `clf_report` so
# the report() helper defined earlier in the notebook is not overwritten.
clf_report = classification_report(y_val, y_val_pred, output_dict=True)
report_df = pd.DataFrame(clf_report).transpose().round(3)

# Render the report as a table on an otherwise empty figure
fig, ax = plt.subplots(figsize=(10, 6))  # Figure size
ax.axis('tight')
ax.axis('off')
the_table = ax.table(cellText=report_df.values, 
                     colLabels=report_df.columns, 
                     rowLabels=report_df.index,
                     cellLoc='center', 
                     loc='center')

# Save the image (optional)
plt.savefig('classification_report.png')

# Show the image
plt.show()

In [None]:
# Plot the model architecture as a top-to-bottom graph with layer names and
# tensor shapes.
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, rankdir='TB', expand_nested=True, dpi=70)

In [None]:
# Plot training & validation accuracy and loss per epoch. The curves are read
# from history_dict, which is defined both after training and after loading a
# saved model (the `history` object only exists after training).
for metric, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    val_metric = f'val_{metric}'
    if metric not in history_dict or val_metric not in history_dict:
        print(f"'{metric}' was not recorded during training; skipping plot.")
        continue

    plt.plot(history_dict[metric])
    plt.plot(history_dict[val_metric])
    plt.title(f'Model {metric}')
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Sorted class labels, passed explicitly to confusion_matrix so that the
# matrix rows/columns and the tick labels are guaranteed to be in the same
# order (Series.unique() returns values in order of appearance instead).
class_labels = np.unique(cc_purchases_df3[TARGET_COL])
cm = confusion_matrix(y_val, y_val_pred, labels=class_labels)

# Plot the confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap=CMAP, fmt='g',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
