<a href="https://colab.research.google.com/github/hanimatari/VR-Fraud-Detection/blob/main/VR_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install core packages
!pip install --quiet pandas numpy openpyxl scikit-learn xgboost matplotlib seaborn streamlit

from google.colab import files
uploaded = files.upload()  # choose VR_FRAUD_DATASET.xlsx

import pandas as pd

df = pd.read_excel('VR_FRAUD_DATASET.xlsx', engine='openpyxl')
print("Rows × Columns:", df.shape)
display(df.head())
df.info()
df.describe(include='all').T


Saving VR_FRAUD_DATASET.xlsx to VR_FRAUD_DATASET (7).xlsx
Rows × Columns: (1000, 18)


Unnamed: 0,step,type,price_paid,user_id,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,is_suspicious,isFlaggedFraud,market_value,price_difference,is_overpriced,user_transaction_count,is_repeating_user,is_withdrawal,suspicious_withdrawal
0,1,PAYMENT,9839.64,C18,170136.0,160296.36,M1979787155,0.0,0.0,0,0,2732.73,7106.91,True,14,True,False,False
1,1,PAYMENT,1864.28,C25,21249.0,19384.72,M2044282225,0.0,0.0,0,0,3268.64,-1404.36,False,12,True,False,False
2,1,TRANSFER,181.0,C47,181.0,0.0,C553264065,0.0,0.0,1,0,1271.87,-1090.87,False,9,False,False,False
3,1,CASH_OUT,181.0,C51,181.0,0.0,C38997010,21182.0,0.0,1,0,6131.67,-5950.67,False,6,False,True,True
4,1,PAYMENT,11668.14,C95,41554.0,29885.86,M1230701703,0.0,0.0,0,0,3865.09,7803.05,True,14,True,False,False


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   step                    1000 non-null   int64  
 1   type                    1000 non-null   object 
 2   price_paid              1000 non-null   float64
 3   user_id                 1000 non-null   object 
 4   oldbalanceOrg           1000 non-null   float64
 5   newbalanceOrig          1000 non-null   float64
 6   nameDest                1000 non-null   object 
 7   oldbalanceDest          1000 non-null   float64
 8   newbalanceDest          1000 non-null   float64
 9   is_suspicious           1000 non-null   int64  
 10  isFlaggedFraud          1000 non-null   int64  
 11  market_value            1000 non-null   float64
 12  price_difference        1000 non-null   float64
 13  is_overpriced           1000 non-null   bool   
 14  user_transaction_count  1000 non-null   i

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
step,1000.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
type,1000.0,5.0,PAYMENT,437.0,,,,,,,
price_paid,1000.0,,,,118199.52532,248479.886955,8.73,4474.7275,14658.085,142720.3225,2545478.01
user_id,1000.0,100.0,C12,17.0,,,,,,,
oldbalanceOrg,1000.0,,,,864604.28678,2078040.475883,0.0,0.0,14518.0,107909.25,9716462.14
newbalanceOrig,1000.0,,,,884998.83148,2127790.405095,0.0,0.0,5661.83,102104.2325,9987286.56
nameDest,1000.0,597.0,C1590550415,23.0,,,,,,,
oldbalanceDest,1000.0,,,,665776.54357,2061228.443154,0.0,0.0,28755.055,436631.165,17700000.0
newbalanceDest,1000.0,,,,1225619.09462,3441299.188627,0.0,0.0,0.0,730111.2025,19200000.0
is_suspicious,1000.0,,,,0.009,0.094488,0.0,0.0,0.0,0.0,1.0


In [2]:
# -------------------------------
# Step 2: Feature Engineering
# -------------------------------
import numpy as np

# 1. (Re)simulate market_value if not already in your data
if 'market_value' not in df.columns:
    # e.g. assume real market value is within ±20% of paid price
    df['market_value'] = df['price_paid'] * np.random.uniform(0.8, 1.2, size=len(df))

# 2. Price difference
df['price_difference'] = df['price_paid'] - df['market_value']

# 3. Overpriced flag (30% threshold)
df['is_overpriced'] = (df['price_difference'] / df['market_value']) > 0.3

# 4. Count how many transactions each user has made
df['user_transaction_count'] = df.groupby('user_id')['price_paid'].transform('count')

# 5. Repeating‐user flag
df['is_repeating_user'] = df['user_transaction_count'] > 1

# 6. Withdrawal flag (any cash‐out/withdrawal)
df['is_withdrawal'] = df['type'].isin(['CASH_OUT', 'WITHDRAW'])

# 7. Suspicious withdrawal = overpriced AND withdrawal
df['suspicious_withdrawal'] = df['is_withdrawal'] & df['is_overpriced']

# Done engineering
print("✅ Engineered features:",
      ['market_value','price_difference','is_overpriced',
       'user_transaction_count','is_repeating_user',
       'is_withdrawal','suspicious_withdrawal'])


✅ Engineered features: ['market_value', 'price_difference', 'is_overpriced', 'user_transaction_count', 'is_repeating_user', 'is_withdrawal', 'suspicious_withdrawal']


In [3]:
# Sanity check: no missing values & flag distributions
print("Missing values per column:\n", df.isnull().sum(), "\n")

print("Overpriced count:\n", df['is_overpriced'].value_counts(), "\n")
print("Suspicious withdrawal count:\n", df['suspicious_withdrawal'].value_counts(), "\n")


Missing values per column:
 step                      0
type                      0
price_paid                0
user_id                   0
oldbalanceOrg             0
newbalanceOrig            0
nameDest                  0
oldbalanceDest            0
newbalanceDest            0
is_suspicious             0
isFlaggedFraud            0
market_value              0
price_difference          0
is_overpriced             0
user_transaction_count    0
is_repeating_user         0
is_withdrawal             0
suspicious_withdrawal     0
dtype: int64 

Overpriced count:
 is_overpriced
True     802
False    198
Name: count, dtype: int64 

Suspicious withdrawal count:
 suspicious_withdrawal
False    774
True     226
Name: count, dtype: int64 



In [4]:
# -------------------------------
# Step 3: Define Target & Re-balance
# -------------------------------

# 1) Define a combined fraud-like label
df['fraud_label'] = df['is_overpriced'] | df['suspicious_withdrawal']

# 2) Quick check on label distribution
print("Label distribution:\n", df['fraud_label'].value_counts(), "\n")

# 3) Split majority/minority on this new label
df_majority = df[df['fraud_label'] == False]
df_minority = df[df['fraud_label'] == True]

print("Before balancing:",
      "Non-fraud-like =", len(df_majority),
      "Fraud-like =", len(df_minority))

# 4) Upsample the minority (fraud-like) class
df_minority_upsampled = df_minority.sample(
    n=len(df_majority),
    replace=True,
    random_state=42
)

# 5) Combine & shuffle into df_balanced
df_balanced = pd.concat([df_majority, df_minority_upsampled]) \
                .sample(frac=1, random_state=42) \
                .reset_index(drop=True)

print("After balancing:",
      "Total records =", len(df_balanced),
      "\nFraud-like count =", df_balanced['fraud_label'].sum(),
      "Non-fraud-like count =", len(df_balanced) - df_balanced['fraud_label'].sum())


Label distribution:
 fraud_label
True     802
False    198
Name: count, dtype: int64 

Before balancing: Non-fraud-like = 198 Fraud-like = 802
After balancing: Total records = 396 
Fraud-like count = 198 Non-fraud-like count = 198


In [5]:
# -------------------------------
# Step 6a: Preprocessing – Encode Categorical Variables
# -------------------------------

# 1) Drop identifier columns we won’t train on
df_clean = df_balanced.drop(columns=['user_id', 'nameDest'])

# 2) One-hot encode the transaction type
df_clean = pd.get_dummies(df_clean, columns=['type'], drop_first=True)

# 3) Separate features X and target y
X = df_clean.drop(columns=['fraud_label'])
y = df_clean['fraud_label']

print("✔️ After encoding, X has shape:", X.shape)
print("Columns now:", X.columns.tolist())

from sklearn.model_selection import train_test_split

# 1) Split into train / test (80% / 20%), stratified on your fraud_label
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train set:", X_train.shape, "Test set:", X_test.shape)
print("Train label counts:\n", y_train.value_counts())



✔️ After encoding, X has shape: (396, 19)
Columns now: ['step', 'price_paid', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'is_suspicious', 'isFlaggedFraud', 'market_value', 'price_difference', 'is_overpriced', 'user_transaction_count', 'is_repeating_user', 'is_withdrawal', 'suspicious_withdrawal', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
Train set: (316, 19) Test set: (80, 19)
Train label counts:
 fraud_label
True     158
False    158
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

# 1. Separate features X and target y
X = df_balanced.drop(['fraud_label'], axis=1)
y = df_balanced['fraud_label']

# 2. Split 80/20 stratified
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train set size:", X_train.shape, "Test set size:", X_test.shape)
print("Train label distribution:\n", y_train.value_counts())


Train set size: (316, 18) Test set size: (80, 18)
Train label distribution:
 fraud_label
True     158
False    158
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

# 1) Split into train / test (80% / 20%), stratified on your fraud_label
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train set:", X_train.shape, "Test set:", X_test.shape)
print("Train label counts:\n", y_train.value_counts())



Train set: (316, 18) Test set: (80, 18)
Train label counts:
 fraud_label
True     158
False    158
Name: count, dtype: int64


In [8]:
# -------------------------------
# Step 7: Train Full Models & Compare
# -------------------------------

# 0) Imports
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, roc_curve, auc
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1) Train RandomForest on full training set
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred_rf)

# 2) Train XGBoost
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)

# 3) Reports
print("▶️ RandomForest Report")
print(classification_report(y_test, y_pred_rf))
print("▶️ XGBoost Report")
print(classification_report(y_test, y_pred_xgb))

# 4) Confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, ax=axes[0])
axes[0].set_title("RF Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, ax=axes[1])
axes[1].set_title("XGB Confusion Matrix")
plt.show()

# 5) ROC curves & AUC
rf_probs = rf.predict_proba(X_test)[:, 1]
xgb_probs = xgb.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_probs)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_probs)
auc_rf = auc(fpr_rf, tpr_rf)
auc_xgb = auc(fpr_xgb, tpr_xgb)

plt.figure(figsize=(8,6))
plt.plot(fpr_rf, tpr_rf, label=f'RF (AUC = {auc_rf:.3f})')
plt.plot(fpr_xgb, tpr_xgb, label=f'XGB (AUC = {auc_xgb:.3f})')
plt.plot([0,1], [0,1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()
plt.show()

# 6) Top 10 Feature Importances (RF)
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances.nlargest(10).sort_values().plot(kind='barh', figsize=(6,4))
plt.title("Top 10 Feature Importances (RF)")
plt.tight_layout()
plt.show()


ValueError: could not convert string to float: 'PAYMENT'