# Problem Statement

In [None]:
A company wants to catch fake or suspicious transactions before they happen. 
They have a big dataset with more than 6 million records and 10 types of information for each transaction.
my project  is to build a machine learning model that can detect which transactions look like fraud, and give useful 
tips to help the company avoid such frauds in the future.


# Steps for solving problems

In [None]:
1. Data Gethering:-
    Collects the data and import by using pandas.
    Import required libraries and algorithm.

2.Data Cleaning:-
    Missing Values: Used SimpleImputer for numerical features, mode imputation for categorical.

3.Feature Engineering:-
       Created new variables like TransactionAmtLog, TimeSinceLastTxn, and binary flags.
       Encoded categorical variables using target encoding.
            
4.Model Building:-
       Tried various models: Random Forest.
       Used SMOTE for class imbalance (fraudulent transactions are rare)..
    
5.Model Evaluation:-
     Metrics: Precision, Recall, F1-score, ROC-AUC.
                 

# Import Libraries

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


# Data Gethering

In [28]:
df = pd.read_csv("Fraud_dataset.csv")

In [29]:
df = df.head(100000)
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.0,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.0,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,10,PAYMENT,4020.66,C1410794718,159929.0,155908.34,M1257036576,0.00,0.00,0,0
99996,10,PAYMENT,18345.49,C744303677,6206.0,0.00,M1785344556,0.00,0.00,0,0
99997,10,CASH_IN,183774.91,C104331851,39173.0,222947.91,C36392889,54925.05,0.00,0,0
99998,10,CASH_OUT,82237.17,C707662966,6031.0,0.00,C1553004158,592635.66,799140.46,0,0


# Data cleaning and EDA

In [32]:
print(df.head())
print("Information about data::\n ",df.info())
print("Discription about data ::\n",df.describe())
print(" Check sum of null valuese which is contain in data ::\n",df.isnull().sum())
print("value counts in froud coulum ::\n ",df['isFraud'].value_counts())


   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 colum

# Lable Encoding for change data object to numerical

In [34]:
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])  # Convert 'type' to numeric

# Drop unnecessary columns
df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, inplace=True)


# Train test split

In [36]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Model training

In [38]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)


# Model Evoluation

In [39]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9993
Confusion Matrix:
 [[19977     0]
 [   14     9]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19977
           1       1.00      0.39      0.56        23

    accuracy                           1.00     20000
   macro avg       1.00      0.70      0.78     20000
weighted avg       1.00      1.00      1.00     20000



In [40]:
joblib.dump(model, 'fraud_model.pkl')


['fraud_model.pkl']

In [41]:
model_loaded = joblib.load('fraud_model.pkl')

sample = X_test.iloc[0:1]
print("Actual:", y_test.iloc[0])
print("Predicted:", model_loaded.predict(sample))


Actual: 0
Predicted: [0]
