### **1. Data Understanding & Preprocessing**

Install the required libraries.

In [62]:
%pip install Pandas NumPy Scikit-learn Streamlit Matplotlib Seaborn



1.1. Load and Inspect the Dataset

Import the necessary libraries.

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib


In [None]:
# Load the dataset into a DataFrame.
# The path is relative, so 'fraud_data.csv' must already be downloaded and
# saved in the notebook's working directory.
df = pd.read_csv('fraud_data.csv')


In [None]:
# Inspect the dataset
print("First 5 rows of the dataset:")
# Leave the frame as the cell's last expression so Jupyter renders it as a
# formatted table instead of the plain-text dump produced by print().
df.head()



In [None]:
# Print a header, then the per-column summary of the frame.
# df.info() writes its report itself, so it is not wrapped in print().
print("\nDataset Information:")
df.info()


In [None]:
# Statistical description of the dataset
df.describe()


In [None]:
# Show the number of rows and columns
df.shape

1.2. Handle Missing Values and Remove Duplicates

Check for missing values and duplicates, and clean the dataset.

In [None]:

# Count the missing values in each column and report them
missing_per_column = df.isnull().sum()
print("\nMissing values count per column:")
print(missing_per_column)


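If there are missing values, a common approach is to either drop the affected rows or impute the missing entries.

For simplicity, we drop any rows that contain missing values below; based on common fraud datasets we assume no critical data is missing, so little or nothing should be lost.

If imputation is needed instead: `df['column'] = df['column'].fillna(df['column'].median())`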


In [None]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Confirm that no missing values remain after cleaning
df.isna().sum()

In [None]:
# Report the duplicate count, drop exact duplicate rows (the first occurrence
# of each is kept), then report the count again as a sanity check.
print(f"\nNumber of duplicate rows before removal: {df.duplicated().sum()}")
df = df.drop_duplicates(keep='first')
print(f"Number of duplicate rows after removal: {df.duplicated().sum()}")



1.3. Encode Categorical Variables.

The type column is categorical and must be converted to a numerical format for the model.

One-Hot Encoding is suitable here, or for simplicity in the prediction app, we can use Label Encoding and retain the mapping.


In [None]:
# One-hot encode the nominal 'type' column.
# drop_first=True omits the first category, which then acts as the implicit
# baseline (all 'type_*' indicator columns are false for it).
type_dummies = pd.get_dummies(df['type'], prefix='type', drop_first=True)
df = pd.concat([df.drop(columns=['type']), type_dummies], axis=1)



In [None]:
# Drop the columns that are not used for modeling.
# errors='ignore' skips any listed column that is not present in the frame.
unused_cols = ['step', 'nameOrig', 'nameDest', 'isFlaggedFraud']
df = df.drop(columns=unused_cols, errors='ignore')

In [None]:
# Capture the feature column order (every column except the target 'isFraud')
# so the Streamlit app can build its input rows in exactly this order.
model_features = df.columns.drop('isFraud').tolist()

# Persist the list to disk: a notebook variable alone is not available to an
# app that runs outside this kernel.
joblib.dump(model_features, 'model_features.pkl')

print("\nFeatures after encoding and dropping columns:")
print(model_features)


1.4. Scale Numerical Features.

Scale numerical features like amount, oldbalanceOrg, and others.

Scaling prevents features with larger values from dominating the model training process.


In [None]:
# Continuous columns to be scaled; the one-hot 'type_*' indicators and the
# target are deliberately left out.
numerical_cols = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]



In [None]:
# Initialize the StandardScaler (created here, fitted in a later cell)
scaler = StandardScaler()


In [None]:
# Fit the scaler on the training rows only, then transform every row.
# Fitting on the full frame would let test-set statistics (mean / std) leak
# into preprocessing. The row selection below deliberately repeats the
# parameters of the later train/test split (test_size=0.2, random_state=42,
# stratified on 'isFraud'); with the same rows, labels and seed it is intended
# to pick the same training rows. Keep the two calls in sync if either changes.
train_rows, _ = train_test_split(
    df.index,
    test_size=0.2,
    random_state=42,
    stratify=df['isFraud'],
)
scaler.fit(df.loc[train_rows, numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])



In [None]:
# Save the fitted scaler to 'scaler.pkl' (relative to the working directory)
# so the Streamlit app can apply the same scaling to new inputs.
joblib.dump(scaler, 'scaler.pkl')
print("\nNumerical features scaled. Scaler saved as 'scaler.pkl'.")



1.5. Explore Fraud Patterns Using Charts.

Visualize the data to understand the distribution of the target variable and fraud patterns.


In [None]:
## Target Variable Distribution (Fraud vs. Not Fraud)
# Draw on an explicit Axes so every label is set on a known object.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='isFraud', data=df, ax=ax)
ax.set_title('Distribution of the Target Variable (isFraud)')
ax.set_xlabel('Is Fraud (0: No Fraud, 1: Fraud)')
ax.set_ylabel('Number of Transactions')
# Replace the raw 0 / 1 tick labels with readable class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Fraud', 'Fraud'])
plt.show()


In [None]:
### Fraud by Transaction Type
# Collect the one-hot encoded transaction-type columns (prefix 'type_').
# NOTE(review): the encoding step uses drop_first=True, so presumably one
# transaction type has no column and is absent from this chart - confirm.
type_cols = list(filter(lambda name: name.startswith('type_'), df.columns))

# Per class (isFraud = 0 / 1), count the transactions of each encoded type
fraud_by_type = df.groupby('isFraud')[type_cols].sum()

# Transpose so transaction types sit on the x-axis with one bar per class
fig, ax = plt.subplots(figsize=(8, 6))
fraud_by_type.T.plot(kind='bar', ax=ax)
ax.set_title('Fraudulent Transactions by Transaction Type')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title='Is Fraud', labels=['No Fraud', 'Fraud'])
fig.tight_layout()
plt.show()




### 2. Machine Learning Model Development



#### 2.1. Select Features and Split Data

Select the feature columns (`X`) and the target column (`y`), then split the data into training and testing sets.


In [None]:
# Target (y) is the 'isFraud' column; features (X) are all remaining columns
y = df['isFraud']
X = df.drop(columns='isFraud')


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


#### 2.2. Train a Random Forest Model
Train the recommended Random Forest Classifier. Given the high class imbalance in typical fraud datasets, adding class_weight='balanced' can often improve performance by penalizing errors in the minority class (Fraud).


In [None]:
# Random Forest hyper-parameters, gathered in one place:
#   class_weight='balanced' re-weights classes to counter the fraud imbalance,
#   n_jobs=-1 trains the trees in parallel, random_state fixes the seed.
rf_params = dict(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# fit() returns the fitted estimator itself, so build and train in one step
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

print("\nRandom Forest Model Training Complete.")


#### 2.3. Evaluate Model Performance
Evaluate the model using accuracy, precision, recall, and F1-score. For fraud detection, Precision and Recall are often more critical than Accuracy due to the extreme class imbalance.


In [None]:
# Predict class labels for the held-out test set; the evaluation cells compare
# these predictions against y_test.
y_pred = model.predict(X_test)


In [None]:
# Compute the four headline metrics in one pass; each scorer is called as
# scorer(y_test, y_pred).
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# With heavily imbalanced classes accuracy alone can look high; read it
# together with precision and recall.
print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


In [None]:
# Visualize the confusion matrix for a complete picture.
# labels=[0, 1] pins the matrix to 2x2 in (Not Fraud, Fraud) order, so it
# always lines up with the hard-coded tick labels below, even when one class
# is missing from both y_test and y_pred.
# Assumes the target is coded 0 / 1, as the tick labels already state.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Fraud (0)', 'Fraud (1)'],
            yticklabels=['Not Fraud (0)', 'Fraud (1)'],
            ax=ax)
# Rows are the actual classes, columns the predicted classes
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()


In [None]:
# Save the trained model to 'random_forest_model.pkl' (relative to the
# working directory) for deployment.
joblib.dump(model, 'random_forest_model.pkl')
print("\nTrained model saved as 'random_forest_model.pkl'.")
