In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
from google.colab import files
uploaded = files.upload()

Saving fraud_dataset_example.csv to fraud_dataset_example (1).csv


In [2]:
df = pd.read_csv('fraud_dataset_example.csv')


In [5]:
print(df.dtypes)
for column in df.columns:
    if df[column].dtype in ['float64', 'int64']:
        df[column].fillna(df[column].median(), inplace=True)
    elif df[column].dtype == 'object':
        df[column].fillna(df[column].mode()[0], inplace=True)

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object


In [7]:
categorical_columns = ['type', 'nameOrig','nameDest']
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])


In [9]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [10]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [11]:
y_pred = rf_model.predict(X_test)


In [12]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Confusion Matrix:
[[30452     0]
 [   20    12]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30452
           1       1.00      0.38      0.55        32

    accuracy                           1.00     30484
   macro avg       1.00      0.69      0.77     30484
weighted avg       1.00      1.00      1.00     30484


Accuracy Score:
0.9993439181209814


In [13]:
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

print("\nFeature Importance:")
for i in range(X.shape[1]):
    print(f"{X.columns[indices[i]]}: {importances[indices[i]]}")


Feature Importance:
amount: 0.20296981640450992
oldbalanceOrg: 0.1962587519007518
oldbalanceDest: 0.15423403220042503
newbalanceDest: 0.13327520419431566
nameDest: 0.09740231848808896
nameOrig: 0.08909262614714776
step: 0.06986050041233195
type: 0.031249270307716827
newbalanceOrig: 0.025657479944712198
isFlaggedFraud: 0.0


In [15]:
joblib.dump(rf_model, 'random_forest_model.pkl')


['random_forest_model.pkl']