In [1]:
#Necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("/kaggle/input/payment-fraud-empowering-financial-security/payment_fraud.csv")

# First 5 samples of the data

In [3]:
print(f"Shape : {df.shape[0]:,} rows × {df.shape[1]} columns")
df.head()

Shape : 39,221 rows × 8 columns


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,label
0,29,1,4.745402,paypal,28.204861,shopping,0.0,0
1,725,1,4.742303,storecredit,0.0,electronics,0.0,0
2,845,1,4.921318,creditcard,0.0,food,1.0,0
3,503,1,4.886641,creditcard,0.0,electronics,1.0,0
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39221 entries, 0 to 39220
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   accountAgeDays        39221 non-null  int64  
 1   numItems              39221 non-null  int64  
 2   localTime             39221 non-null  float64
 3   paymentMethod         39221 non-null  object 
 4   paymentMethodAgeDays  39221 non-null  float64
 5   Category              39126 non-null  object 
 6   isWeekend             38661 non-null  float64
 7   label                 39221 non-null  int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 2.4+ MB


# Numerical feature summary

In [5]:
num_feats = df.select_dtypes(include=[np.number]).columns
print("\nNumerical feature summary:")
display(df[num_feats].describe().T)


Numerical feature summary:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
accountAgeDays,39221.0,857.563984,804.788212,1.0,72.0,603.0,1804.0,2000.0
numItems,39221.0,1.084751,0.566899,1.0,1.0,1.0,1.0,29.0
localTime,39221.0,4.748232,0.38936,0.421214,4.742303,4.886641,4.962055,5.040929
paymentMethodAgeDays,39221.0,122.641326,283.569177,0.0,0.0,0.0125,87.510417,1999.580556
isWeekend,38661.0,0.499547,0.500006,0.0,0.0,0.0,1.0,1.0
label,39221.0,0.014278,0.118636,0.0,0.0,0.0,0.0,1.0


# Categorical feature summary

In [6]:
cat_feats = df.select_dtypes(exclude=[np.number]).columns
print("\nCategorical feature summary:")
for c in cat_feats:
    print(f"\n{c} – unique values ({df[c].nunique()}):")
    print(df[c].value_counts(dropna=False).head(10))


Categorical feature summary:

paymentMethod – unique values (3):
paymentMethod
creditcard     28004
paypal          9303
storecredit     1914
Name: count, dtype: int64

Category – unique values (3):
Category
shopping       13328
food           12964
electronics    12834
NaN               95
Name: count, dtype: int64


# Number of NaNs in each colum

In [7]:
print("\nNumber of NaNs in each colum:")
print(df.isna().sum())
total_missing = df.isna().sum().sum()
print("Total missing values:", total_missing)


Number of NaNs in each colum:
accountAgeDays            0
numItems                  0
localTime                 0
paymentMethod             0
paymentMethodAgeDays      0
Category                 95
isWeekend               560
label                     0
dtype: int64
Total missing values: 655


# Filling Missing value with mode

In [8]:
df.fillna({'Category': df['Category'].mode()[0]}, inplace=True)

df.fillna({'isWeekend': df['isWeekend'].mode()[0]}, inplace=True)

# Encode Categorical Features

In [9]:
cat_cols = ['paymentMethod', 'Category']
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Define Features and Target

In [10]:
X = df.drop('label', axis=1)
y = df['label']

# Standardize numerical features

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test Split(80:20)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression

In [13]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))

Logistic Regression:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      7733
           1       0.57      0.04      0.07       112

    accuracy                           0.99      7845
   macro avg       0.78      0.52      0.53      7845
weighted avg       0.98      0.99      0.98      7845



# Decision Tree

In [14]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree:\n", classification_report(y_test, y_pred_dt))

Decision Tree:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7733
           1       1.00      1.00      1.00       112

    accuracy                           1.00      7845
   macro avg       1.00      1.00      1.00      7845
weighted avg       1.00      1.00      1.00      7845



# Random Forest

In [15]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:\n", classification_report(y_test, y_pred_rf))

Random Forest:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7733
           1       1.00      1.00      1.00       112

    accuracy                           1.00      7845
   macro avg       1.00      1.00      1.00      7845
weighted avg       1.00      1.00      1.00      7845

