# DATA PREPARATION
Load the dataset: read the CSV data file into a pandas DataFrame.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output when a
# version is created with "Save & Run All"; files under /kaggle/temp/ only last for the session.

# Read the raw transactions table into a DataFrame.
df = pd.read_csv(
    "/kaggle/input/financial-transactions-dataset-for-fraud-detection/"
    "financial_fraud_detection_dataset.csv"
)

In [None]:
import seaborn as sns
import polars as pl
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings("ignore") 

# DATA EXPLORATION (EDA)

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

## Data Overview
As we can see, the label of this dataset is the `is_fraud` attribute, a boolean (True/False) that indicates whether a transaction is fraudulent or legitimate.

In [None]:
# Display basic info of the dataset
print("Display Info: \n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

In [None]:
# Check for missing values: dataset shape plus the null count of every column
dataset_shape = df.shape
missing_per_column = df.isnull().sum()
print(f"Shape of dataset: {dataset_shape}")
print(f"The amount of missing value: {missing_per_column}")

In [None]:
# Class distribution
# Despite the "_count" suffix these are boolean row masks; their sum is the class count.
fraud_trans_count = df["is_fraud"] == True
legit_trans_count = df["is_fraud"] == False
n_rows = df.shape[0]
for class_label, class_mask in (("Fraud", fraud_trans_count), ("Legitimate", legit_trans_count)):
    n_in_class = class_mask.sum()
    print(f"{class_label} transactions: {n_in_class} ({n_in_class*100/n_rows:.2f}%)")

In [None]:
# Bar chart of how many transactions carry each is_fraud label.
fig, ax = plt.subplots(figsize=(8, 7))
sns.countplot(x='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title("Fraudulent Financial Transaction Distribution")
ax.set_xlabel("Fraud Check")
ax.set_ylabel("Count")

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=11, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

In [None]:
# Parse the timestamp column; values that cannot be parsed become NaT (errors="coerce").
parsed_ts = pd.to_datetime(df["timestamp"], errors="coerce")
df["timestamp"] = parsed_ts

# Derive calendar parts from the parsed timestamp (weekday: Monday=0 ... Sunday=6).
for new_col, dt_attr in (
    ("hour", "hour"),
    ("day", "day"),
    ("day_of_week", "weekday"),
    ("month", "month"),
):
    df[new_col] = getattr(parsed_ts.dt, dt_attr)

## Convert Boolean labels into Numeric labels (0 and 1)
Models such as SVM and XGBoost work with numeric labels, so the boolean values True/False are converted to 1/0.

In [None]:
# Cast the boolean label to integers (True -> 1, False -> 0), then preview the result.
df = df.astype({"is_fraud": int})
df.head()

In [None]:
# Monthly transaction counts split by fraud label.
# The previous `df_pandas = df.sample(n=df.shape[0], ...)` line was dropped: it built a
# full-size shuffled copy of the frame that was never used.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="month", data=df, palette="coolwarm", hue="is_fraud", ax=ax)
ax.set_title("Fraud Distribution Over Time")
ax.set_xlabel("Month")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Summary statistics of the numeric columns
print("Statistic Describe: \n")
df.describe()

## Statistical Summary
1. *Amount*: range from 0.01 to 3520.57, mean ≈358.9343, highly skewed (≈469.93). The amount of a transaction may be related to how likely it is to be fraudulent  
2. *spending_deviation_score*: range from -5.2 to 5.02, mean ≈0, std ≈1. Like normal distribution, useful for fraud detection  
3. *velocity_score*: range from 1 to 20, mean ≈10.5, std ≈5.77. High value may indicate rapid transactions – sign of fraud  
4. *geo_anomaly_score*: range from 0 to 1, mean ≈0.5, std ≈0.3. Likely a probability-like score, critical for fraud analysis.
5. *time_since_last_transaction*: range from -8777.814 to 8757.758, mean ≈1.53, std ≈3576.569. Highly skewed, so it may not be effective for detecting fraudulent transactions

In [None]:
### Correlation for numeric variables
corr_columns = [
    "timestamp",
    "amount",
    "time_since_last_transaction",
    "spending_deviation_score",
    "velocity_score",
    "geo_anomaly_score",
    "day",
    "month",
    "is_fraud",
]
corr_df = df[corr_columns]
correlation_matrix = corr_df.corr()

# Annotated heatmap, colour scale centred on zero correlation.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True, fmt='.2f',
    cmap='coolwarm', center=0,
    linewidths=0.5, cbar=True,
    ax=ax,
)
ax.set_title("Correlation between numeric variables", fontsize=16)
plt.show()

## Timestamp
Here we only consider month and day for fraud detection

In [None]:
### Month
# Per month: number of transactions, number of frauds and the fraud rate in percent.
fraud_by_month = df.groupby("month")["is_fraud"].agg(total_trans="count", fraud_trans="sum")
fraud_by_month["fraud_rate (%)"] = (
    fraud_by_month["fraud_trans"] * 100 / fraud_by_month["total_trans"]
).round(2)
fraud_by_month.sort_values("fraud_rate (%)", ascending=False).head(10)

In [None]:
# Transactions per month, one bar per is_fraud value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='month', hue='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title('Month for Fraud Detection')
ax.legend(title='is_fraud', loc='center right')
ax.set_xlabel('Month')
ax.set_ylabel('Count')

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

In [None]:
### Day
# Per day of month: number of transactions, number of frauds and the fraud rate in percent.
fraud_by_day = df.groupby("day")["is_fraud"].agg(total_trans="count", fraud_trans="sum")
fraud_by_day["fraud_rate (%)"] = (
    fraud_by_day["fraud_trans"] * 100 / fraud_by_day["total_trans"]
).round(2)
fraud_by_day.sort_values("fraud_rate (%)", ascending=False).head(100)

In [None]:
# Transactions per day of month, one bar per is_fraud value (wide figure to fit all days).
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x='day', hue='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title('Day for Fraud Detection')
ax.set_xlabel('Day in month')
ax.set_ylabel('Count')

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

## Sender/Receiver Account

In [None]:
# Number of distinct sender and receiver accounts
for role in ("sender", "receiver"):
    n_accounts = np.unique(df[f"{role}_account"]).size
    print(f"Number of {role}s: {n_accounts}")

In [None]:
# Top 10 most active senders in the dataset (by number of transactions sent)
sender_activity = df.groupby("sender_account").size().reset_index(name="count")
sender_activity.sort_values("count", ascending=False).head(10)

In [None]:
# Top 10 most active receivers in the dataset (by number of transactions received)
receiver_activity = df.groupby("receiver_account").size().reset_index(name="count")
receiver_activity.sort_values("count", ascending=False).head(10)

In [None]:
# Fraud rate per sender account: transaction count, fraud count and fraud percentage
fraud_by_sender = df.groupby("sender_account")["is_fraud"].agg(
    total_trans="count",
    fraud_trans="sum",
)
fraud_by_sender["fraud_rate (%)"] = (
    fraud_by_sender["fraud_trans"] * 100 / fraud_by_sender["total_trans"]
).round(2)
fraud_by_sender.sort_values("fraud_rate (%)", ascending=False).head(10)

In [None]:
# Fraud rate per receiver account: transaction count, fraud count and fraud percentage
fraud_by_receiver = df.groupby("receiver_account")["is_fraud"].agg(
    total_trans="count",
    fraud_trans="sum",
)
fraud_by_receiver["fraud_rate (%)"] = (
    fraud_by_receiver["fraud_trans"] * 100 / fraud_by_receiver["total_trans"]
).round(2)
fraud_by_receiver.sort_values("fraud_rate (%)", ascending=False).head(10)

In [None]:
#### Top 10 sender/receiver account pairs ranked by number of fraudulent transactions
# NOTE(review): the ranking key is fraud_trans, not total_trans, so this lists the pairs
# with the most frauds rather than the most active pairs — confirm which was intended.

df.groupby(["sender_account","receiver_account"]).agg(
    total_trans=('is_fraud','count'),
    fraud_trans=('is_fraud','sum')
).sort_values("fraud_trans",ascending=False).head(10)

## Amount

In [None]:
# Histogram of transaction amounts on a log-scaled x axis, split by fraud label, with KDE.
ax = sns.histplot(data=df, x='amount', hue='is_fraud', bins=100, kde=True, log_scale=True)
ax.set_xlabel('Amount of a transaction')
ax.set_title("Transaction amount by fraud (log scale)")
plt.show()

## Merchant Category

In [None]:
# Fraud rate per merchant category: transaction count, fraud count and fraud percentage
fraud_by_mc = df.groupby('merchant_category')['is_fraud'].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_mc['fraud_rate (%)'] = (
    fraud_by_mc['fraud_trans'] * 100 / fraud_by_mc['total_trans']
).round(2)
fraud_by_mc.sort_values("fraud_rate (%)", ascending=False).head(10)

In [None]:
# Transactions per merchant category, one bar per is_fraud value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='merchant_category', hue='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title('Merchant Category Fraud Detection')
ax.set_xlabel('Merchant Category')
ax.set_ylabel('Count')

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

## Transaction Type

In [None]:
# Number of transactions per transaction type, most frequent first
type_counts = df.groupby("transaction_type").size().reset_index(name="Count")
type_counts.sort_values("Count", ascending=False).head(10)

In [None]:
# Which transaction types have the highest fraud rate
fraud_by_tp = df.groupby('transaction_type')['is_fraud'].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_tp['fraud_rate (%)'] = (
    fraud_by_tp['fraud_trans'] * 100 / fraud_by_tp['total_trans']
).round(2)
fraud_by_tp.sort_values("fraud_rate (%)", ascending=False).head()

In [None]:
# Transactions per transaction type, one bar per is_fraud value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='transaction_type', hue='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title('Fraud vs Legit transaction type amount')
ax.set_xlabel('Transaction type')
ax.set_ylabel('Count')

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

## Spending Deviation Score

In [None]:

# Fraud statistics per distinct spending_deviation_score value, largest fraud count first
fraud_by_sds = (
    df.groupby("spending_deviation_score")["is_fraud"]
    .agg(total_trans='count', fraud_trans='sum')
    .sort_values('fraud_trans', ascending=False)
)
fraud_by_sds['fraud_rate (%)'] = (
    fraud_by_sds['fraud_trans'] * 100 / fraud_by_sds['total_trans']
).round(2)
fraud_by_sds.head(10)

In [None]:
# Distribution of spending_deviation_score split by fraud label, with KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='spending_deviation_score', hue='is_fraud', bins=100, kde=True, ax=ax)
ax.set_title("Spending Deviation Score Fraud Detection")
plt.show()

## Velocity Score

In [None]:
# Fraud rate per velocity score: transaction count, fraud count and fraud percentage
fraud_by_vs = df.groupby("velocity_score")["is_fraud"].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_vs['fraud_rate (%)'] = (
    fraud_by_vs['fraud_trans'] * 100 / fraud_by_vs['total_trans']
).round(2)
fraud_by_vs.sort_values('fraud_rate (%)', ascending=False).head(10)

In [None]:
# Transactions per velocity score, one bar per is_fraud value.
fig, ax = plt.subplots(figsize=(17, 7))
sns.countplot(data=df, x='velocity_score', hue='is_fraud', palette='coolwarm', ax=ax)
ax.legend(title="is_fraud", loc='center right')
ax.set_title("Velocity Score Fraud Detection")
ax.set_xlabel("Velocity Score")
ax.set_ylabel("Count")

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()


## Geo Anomaly Score

In [None]:
# Fraud rate per distinct geo_anomaly_score value: transaction count, fraud count, fraud %.
fraud_by_gas = (df.groupby("geo_anomaly_score").agg(
    total_trans=('is_fraud','count'),
    fraud_trans=('is_fraud','sum')
))
fraud_by_gas['fraud_rate (%)'] = fraud_by_gas['fraud_trans']*100/fraud_by_gas['total_trans']
fraud_by_gas['fraud_rate (%)'] = fraud_by_gas['fraud_rate (%)'].round(2)
# Sort descending so the table shows the scores with the HIGHEST fraud rate, like every
# other fraud-rate table in this notebook; the default ascending sort listed the lowest.
fraud_by_gas.sort_values('fraud_rate (%)', ascending=False).head(10)

In [None]:
# Distribution of geo_anomaly_score in 20 bins, split by fraud label, with KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='geo_anomaly_score', bins=20, hue='is_fraud', kde=True, ax=ax)
ax.set_title("Geo Anomaly Score Fraud Detection")
ax.set_xlabel("Geo Anomaly Score")
ax.set_ylabel("Count")
plt.show()

## IP Address

In [None]:
# Fraud rate per IP address: transaction count, fraud count and fraud percentage
fraud_by_ip = df.groupby("ip_address")["is_fraud"].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_ip['fraud_rate (%)'] = (
    fraud_by_ip['fraud_trans'] * 100 / fraud_by_ip['total_trans']
).round(2)
fraud_by_ip.sort_values('fraud_rate (%)', ascending=False).head(10)

## Location

In [None]:
# Fraud rate per location: transaction count, fraud count and fraud percentage
fraud_by_loc = df.groupby("location")["is_fraud"].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_loc['fraud_rate (%)'] = (
    fraud_by_loc['fraud_trans'] * 100 / fraud_by_loc['total_trans']
).round(2)
fraud_by_loc.sort_values('fraud_rate (%)', ascending=False).head(10)

In [None]:
# Transactions per location, one bar per is_fraud value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='location', hue='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title('Location Fraud Detection')
ax.set_xlabel('Location')
ax.set_ylabel('Count')
ax.legend(title='is_fraud', loc='center right')

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

## Payment Channel

In [None]:
# Fraud rate per payment channel: transaction count, fraud count and fraud percentage
fraud_by_pc = df.groupby("payment_channel")["is_fraud"].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_pc['fraud_rate (%)'] = (
    fraud_by_pc['fraud_trans'] * 100 / fraud_by_pc['total_trans']
).round(2)
fraud_by_pc.sort_values('fraud_rate (%)', ascending=False).head(10)

In [None]:
# Transactions per payment channel, one bar per is_fraud value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='payment_channel', hue='is_fraud', data=df, palette='coolwarm', ax=ax)
ax.set_title('Payment Channel Fraud Detection')
ax.set_xlabel('Payment Channel')
ax.set_ylabel('Count')
ax.legend(title='is_fraud', loc='center right')

# Write each bar's count centred horizontally, offset 6 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        fontsize=8, color='black',
        xytext=(0, 6), textcoords='offset points',
    )
plt.show()

## Device Hash

In [None]:
# Fraud rate per device hash: transaction count, fraud count and fraud percentage
fraud_by_dh = df.groupby("device_hash")["is_fraud"].agg(
    total_trans='count',
    fraud_trans='sum',
)
fraud_by_dh['fraud_rate (%)'] = (
    fraud_by_dh['fraud_trans'] * 100 / fraud_by_dh['total_trans']
).round(2)
fraud_by_dh.sort_values('fraud_rate (%)', ascending=False).head(10)

# DATA PREPROCESSING
1. Data Cleaning/Preprocessing
   Method:
    - SimpleImputer (Handle missing values)
    - LabelEncoder (Handle Categorical Features)
    - Scaler (Handle Numeric Features)
2. Feature Engineering
3. Feature Selection/Dimension Reduction

In [None]:
## Library for Data Processing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [None]:
#1 Handle missing values
### Numeric feature columns (e.g. time_since_last_transaction) are mean-imputed.
# The target column is_fraud is excluded: imputing the label with a column mean could
# fabricate fractional labels, and fit_transform would also turn the integer label into float.
numeric_feature_cols = df.select_dtypes(include=np.number).columns.drop("is_fraud", errors="ignore")
imputer = SimpleImputer(strategy="mean")
df[numeric_feature_cols] = imputer.fit_transform(df[numeric_feature_cols])
df.head()

In [None]:
### fraud_type
## fraud_type carries no information beyond is_fraud, so the column is dropped.
# errors="ignore" keeps the cell idempotent: re-running it after the column is already
# gone no longer raises a KeyError.
df = df.drop(columns=["fraud_type"], errors="ignore")
df.head()

In [None]:
#2 Handle categorical features
# Every object/category column is cast to string and replaced by integer codes from a
# fresh LabelEncoder fitted on that column.
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
df.head()

# FEATURE ENGINEERING
1. Scaler (scaling data)
2. Data creation

In [None]:
## Amount features
# Mean amount per sender, computed once and reused for the ratio and the aggregation feature.
sender_mean_amount = df.groupby("sender_account")["amount"].transform("mean")
# The +1 presumably guards against a zero velocity score in the denominator.
df["amount_per_velocity"] = df["amount"]/(df["velocity_score"] + 1)
df["amount_log"] = np.log1p(df["amount"])
df["amount_to_avg_ratio"] = df["amount"]/sender_mean_amount

## Frequency features
# NOTE(review): "day" is the day of the month, so this count merges the same day number
# across different months — confirm whether a per-calendar-date count was intended.
df["transaction_per_day"] = df.groupby(["sender_account","day"])["amount"].transform("count")
# Seconds since the sender's previous transaction. Rows are ordered by timestamp before
# diff() so each gap is measured against the chronologically previous transaction; without
# the sort the gap depended on file order. The result is aligned back on the original index.
# The first transaction of each sender has no predecessor and gets 0.
df["transaction_gap"] = (
    df.sort_values("timestamp")
    .groupby("sender_account")["timestamp"]
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

## Risk features
# NOTE(review): hours run 0-23, so this flags 18:00-23:59 only; early-morning hours
# are not treated as night — confirm the intended window.
df["is_night_transaction"] = df["hour"].between(18,24).astype(int)
# Series.dt.weekday uses Monday=0 ... Sunday=6, so the weekend is {5, 6}. The previous
# list [6, 8] never matched Saturday and contained a value (8) that cannot occur.
df["is_weekend"] = df["day_of_week"].isin([5,6]).astype(int)
df["is_self_transfer"] = (df["sender_account"] == df["receiver_account"]).astype(int)

## Network features
# Degree = number of distinct counterparties; total = number of transactions per account.
df["sender_degree"] = df.groupby("sender_account")["receiver_account"].transform("nunique")
df["receiver_degree"] = df.groupby("receiver_account")["sender_account"].transform("nunique")
df["sender_total_transaction"] = df.groupby("sender_account")["amount"].transform("count")
df["receiver_total_transaction"] = df.groupby("receiver_account")["amount"].transform("count")

## Aggregation features
df["sender_avg_amount"] = sender_mean_amount
# std is NaN for senders with a single transaction; those are set to 0.
df["sender_std_amount"] = df.groupby("sender_account")["amount"].transform("std").fillna(0)

## Fraud features
# NOTE(review): these aggregates are built from is_fraud over the whole dataset, before the
# train/test split performed later in the notebook, so each row's own label (and the labels
# of test rows) leak into its features. Consider computing them on the training split only.
df["sender_fraud_transaction"] = df.groupby("sender_account")["is_fraud"].transform("sum")
df["receiver_fraud_transaction"] = df.groupby("receiver_account")["is_fraud"].transform("sum")

df["sender_fraud_percentage (%)"] = (df["sender_fraud_transaction"]*100/df["sender_total_transaction"]).round(2)
df["receiver_fraud_percentage (%)"] = (df["receiver_fraud_transaction"]*100/df["receiver_total_transaction"]).round(2)

fraud_pct_cols = ["sender_fraud_percentage (%)", "receiver_fraud_percentage (%)"]
df[fraud_pct_cols] = df[fraud_pct_cols].fillna(0)

## Others
df["deviation_squared"] = df["spending_deviation_score"] ** 2
df.head(10)

In [None]:
# Correlation heatmap over every column of the engineered frame (cell values not annotated).
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm', center=0,
    linewidths=0.5, cbar=True,
    ax=ax,
)
ax.set_title("Correlation between numeric variables", fontsize=16)
plt.show()

In [None]:
import os

# Persist the engineered frame under <current working directory>/dataset.
dest = f"{os.getcwd()}/dataset"
os.makedirs(dest, exist_ok=True)

# to_csv writes the index as well (default behaviour); it comes back as an unnamed
# first column when the file is read again.
df.to_csv(f"{dest}/financial_fraud_detection_dataset.csv")

## Train, Test, Validation Split

In [None]:
import os
import pandas as pd

# Reload the engineered dataset saved under <current working directory>/dataset.
dest = f"{os.getcwd()}/dataset/financial_fraud_detection_dataset.csv"
df = pd.read_csv(dest)
df.describe()

In [None]:
# Remove the saved index column ('Unnamed: 0') and the raw timestamp column.
df = df.drop(columns=['Unnamed: 0', 'timestamp'])

In [None]:
# Split rows by label: 0 = legitimate (majority), 1 = fraud (minority).
df_minority = df[df['is_fraud'] == 1]
df_majority = df[df['is_fraud'] == 0]

# Keep two randomly sampled legitimate rows per fraud row (fixed seed), then recombine.
n_majority_keep = 2 * len(df_minority)
df_majority_downsampled = df_majority.sample(n=n_majority_keep, random_state=42)
df = pd.concat([df_majority_downsampled, df_minority])
df.shape

In [None]:
from sklearn.model_selection import train_test_split

TARGET_COL = 'is_fraud'
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Step 1: hold out 20% of the rows as the test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Step 2: take 1/8 of the remaining 80% (10% of all rows) as the validation set,
# which leaves a 70/10/20 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=1/8, random_state=36, stratify=y_train_val
)
X_train.head()

In [None]:
# Row counts of the train and test features/labels
tuple(len(part) for part in (X_train, y_train, X_test, y_test))

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train, X_test, X_val = scaler.transform(X_train), scaler.transform(X_test), scaler.transform(X_val)
# X_test[:10]

In [None]:
# Shapes of the train, test and validation feature frames
tuple(part.shape for part in (X_train, X_test, X_val))

## Preparing Machine Learning models

In [None]:
# Number of fraud rows (label 1) in the train, test and validation labels
tuple(labels[labels == 1].count() for labels in (y_train, y_test, y_val))

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc

# XGBoost's scale_pos_weight is the weight applied to the positive class; the documented
# typical value is sum(negative instances) / sum(positive instances). The previous
# positive/negative ratio down-weighted the fraud class instead of up-weighting it.
n_positive = (y_train == 1).sum()
n_negative = (y_train == 0).sum()
scale_pos_weight = n_negative / n_positive

print("\nMACHINE LEARNING MODELS:")
print("-"*40)

# Candidate models, keyed by the display name used throughout the rest of the notebook.
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators = 200, 
        random_state = 42,
        class_weight="balanced",
        max_depth = 8,
        min_samples_split = 20,
        n_jobs = -1
    ),
    'XGBoost': XGBClassifier(
        n_estimators = 200,
        objective = 'binary:logistic',
        # GPU training is selected through `device`; 'gpu_hist' is deprecated since
        # XGBoost 2.0 in favour of tree_method='hist' together with device='cuda'.
        tree_method = 'hist',
        max_depth = 8, 
        learning_rate = 0.1, 
        scale_pos_weight = scale_pos_weight,
        eval_metric = ['aucpr'],
        verbosity = 2,  
        subsample = 0.8,
        random_state = 42,
        device = 'cuda',
        n_jobs = -1,
    ),
    'Isolation Forest': IsolationForest(
        n_estimators = 200,
        max_samples = 128,
        max_features = 3,
        # Expected share of anomalies = share of fraud rows in the training labels.
        contamination = y_train.mean(),
        random_state = 42,
        n_jobs = -1,
        bootstrap = False
    )
}
print(y_train.mean())

# Print the names of the algorithms used
print("Các thuật toán được sử dụng:")
for model_name in models.keys():
    print(f"- {model_name}")

## Training Models

In [None]:
# Test-set accuracy and test-set predictions of every model, keyed by model name.
results = {}
predictions = {}

# Training section
for name, model in models.items():
    print(f"\nTraining {name} model...")

    try:
        if name == 'Isolation Forest':
            # Isolation Forest is unsupervised, so it is fitted without y_train.
            model.fit(X_train)
            raw_pred = model.predict(X_test)
            # Map its output to fraud labels: anomaly (-1) -> fraud (1), normal (1) -> 0.
            test_pred = np.where(raw_pred == -1, 1, 0)
        else:
            model.fit(X_train, y_train)
            test_pred = model.predict(X_test)

        # Evaluate accuracy on the test set and store the results.
        accuracy = accuracy_score(y_test, test_pred)
        results[name] = accuracy
        predictions[name] = test_pred

        print(f"✅ {name} - Accuracy: {accuracy:.4f}")

    except Exception as e:
        # Best effort: a model that fails is reported and recorded with accuracy 0 and
        # all-zero predictions so the following cells can still run.
        print(f"❌ Error training {name}: {str(e)}")
        results[name] = 0
        predictions[name] = np.zeros(len(y_test))

## Classification Report, Confusion Matrix and ROC Curve of the classification models

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
import seaborn as sns
import numpy as np

print("\n📋 Classification Reports:")
print("-" * 40)


def _fraud_scores(model_name, fallback):
    """Return one continuous fraud score per test row for ROC analysis.

    A ROC curve needs a ranking score; hard 0/1 predictions yield a single operating
    point and an AUC that understates how well the model ranks.

    Args:
        model_name: Key of the model in the notebook-level `models` dict.
        fallback: Array returned when the model cannot be scored (for example because
            its training failed and it was never fitted).

    Returns:
        For 'Isolation Forest', the negated `score_samples` output (score_samples is
        lower for more abnormal rows, so the negation is higher for more fraud-like
        rows); for the classifiers, the predicted probability of class 1; `fallback`
        if scoring raises.
    """
    model = models[model_name]
    try:
        if model_name == 'Isolation Forest':
            return -model.score_samples(X_test)
        return model.predict_proba(X_test)[:, 1]
    except Exception:
        # Mirrors the best-effort handling of the training cell: an unfitted or failing
        # model falls back to its stored hard predictions.
        return fallback


# Ground-truth labels as a 0/1 array (identical for every model).
y_test_bin = np.where(y_test == 1, 1, 0)

# Loop over the models from highest to lowest test accuracy.
for model_name, _ in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"\n🔍 {model_name} Classification Report:")
    print("-" * 50)

    # Normalize the stored predictions to binary labels.
    y_pred_bin = np.where(predictions[model_name] == 1, 1, 0)

    # Text report, computed once (the former unused dict version was removed).
    report_text = classification_report(
        y_test_bin,
        y_pred_bin,
        target_names=['Normal', 'Fraud']
    )

    # Create the figure: report | confusion matrix | ROC curve.
    fig, axes = plt.subplots(1, 3, figsize=(24, 6))

    # Left: Classification Report
    axes[0].axis('off')
    axes[0].text(
        0.5, 0.5,
        report_text,
        ha='center', va='center', fontsize=12, family='monospace'
    )
    axes[0].set_title(f'{model_name} - Classification Report', fontsize=14)

    # Middle: Confusion Matrix
    cm = confusion_matrix(y_test_bin, y_pred_bin, labels=[0, 1])
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=['Normal', 'Fraud'],
        yticklabels=['Normal', 'Fraud'],
        ax=axes[1], cbar=False, annot_kws={'size': 12}
    )
    axes[1].set_title(f'{model_name} - Confusion Matrix', fontsize=14)
    axes[1].set_xlabel("Predicted")
    axes[1].set_ylabel("Actual")

    # Right: ROC Curve, built from continuous scores rather than hard labels.
    y_score = _fraud_scores(model_name, y_pred_bin)
    fpr, tpr, _ = roc_curve(y_test_bin, y_score)
    auc_score = roc_auc_score(y_test_bin, y_score)

    axes[2].plot(fpr, tpr, color='blue', label=f'AUC = {auc_score:.4f}')
    axes[2].plot([0, 1], [0, 1], color='gray', linestyle='--')
    axes[2].set_xlim([0, 1])
    axes[2].set_ylim([0, 1])
    axes[2].set_title(f'{model_name} - ROC Curve', fontsize=14)
    axes[2].set_xlabel("False Positive Rate")
    axes[2].set_ylabel("True Positive Rate")
    axes[2].legend(loc="lower right")

    plt.tight_layout()
    plt.show()


## Classification Report comparison across the classification models

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metrics of every trained model on the test set (precision, recall and F1 are
# support-weighted averages over both classes). Relies on `results` and `predictions`
# produced by the training cell.
model_comparison = {
    model_name: {
        'Accuracy': accuracy_score(y_test, predictions[model_name]),
        'Precision': precision_score(y_test, predictions[model_name], average='weighted'),
        'Recall': recall_score(y_test, predictions[model_name], average='weighted'),
        'F1-Score': f1_score(y_test, predictions[model_name], average='weighted'),
    }
    for model_name in results.keys()
}

# One row per model, one column per metric, best F1-Score first.
comparison_df = pd.DataFrame(model_comparison).T
comparison_df = comparison_df.sort_values(by='F1-Score', ascending=False)


def highlight_best_model(row):
    """Return per-cell CSS for a row: red background for the top-ranked model, none otherwise."""
    cell_style = 'background-color: red' if row.name == comparison_df.index[0] else ''
    return [cell_style] * len(row)


# Apply the row-wise style so the best model is highlighted.
styled_df = comparison_df.style.apply(highlight_best_model, axis=1)

# Display the table
styled_df


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Inputs: `results` (accuracy per model) and `predictions` (test predictions per model).
# The stored Isolation Forest predictions were already mapped to 0/1 when they were
# created, so every model is binarised the same way here.
y_test_bin = np.where(y_test == 1, 1, 0)
f1_scores = {
    model_name: f1_score(y_test_bin, np.where(y_pred == 1, 1, 0), zero_division=0)
    for model_name, y_pred in predictions.items()
}

models_names = list(results.keys())
accuracies = [results[m] for m in models_names]
f1s = [f1_scores[m] for m in models_names]

x = np.arange(len(models_names))  # x position of each model group
width = 0.35                      # width of a single bar

fig, ax = plt.subplots(figsize=(10, 6))

# Two bars per model: accuracy on the left, F1-score of the fraud class on the right.
bars1 = ax.bar(x - width/2, accuracies, width, label='Accuracy', color='skyblue')
bars2 = ax.bar(x + width/2, f1s, width, label='F1-score', color='salmon')

# Axis labels and title
ax.set_ylabel('Scores', fontsize=12)
ax.set_title('Model Performance Comparison (Accuracy vs F1-score)', fontsize=14, fontweight='bold', pad=20)
ax.set_xticks(x)
ax.set_xticklabels(models_names, rotation=45, ha='right')
ax.set_ylim(0, 1)
ax.legend()

# Print the value above every bar
for bar in (*bars1, *bars2):
    bar_height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2, bar_height + 0.01,
        f'{bar_height:.3f}',
        ha='center', va='bottom', fontweight='bold',
    )

plt.tight_layout()
plt.show()
