# EDA for Fraud_Data.csv

This notebook performs exploratory data analysis on the e-commerce fraud detection dataset.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the sibling `src` directory importable. The location is derived from
# the current working directory's parent, so the kernel must be started from
# the notebook's own folder for the imports below to resolve.
parent_dir = os.path.dirname(os.getcwd())
src_dir = os.path.join(parent_dir, 'src')
sys.path.append(src_dir)

from data_cleaning import clean_fraud_data, ip_to_integer, merge_ip_to_country
from feature_engineering import create_time_features, create_transaction_frequency_features

# Plot defaults shared by every figure in the notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Read the raw transactions and report their shape and column names
print("Loading Fraud_Data.csv...")
fraud_df = pd.read_csv('../data/raw/Fraud_Data.csv')
raw_shape = fraud_df.shape
raw_columns = fraud_df.columns.tolist()
print(f"Shape: {raw_shape}")
print(f"\nColumns: {raw_columns}")
print("\nFirst few rows:")
fraud_df.head()


## 1. Data Cleaning


In [None]:
# Run the project's cleaning helper and keep the result under its own name
fraud_df_clean = clean_fraud_data(fraud_df)

# Structural overview of the cleaned frame.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit an extra "None".
print("Data Info:")
fraud_df_clean.info()
print("\nData Types:")
print(fraud_df_clean.dtypes)
print("\nBasic Statistics:")
# Last expression of the cell: rendered as a rich table by the notebook
fraud_df_clean.describe()


## 2. Univariate Analysis


In [None]:
# Class balance shown twice: absolute counts (left) and percentage share (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

class_counts = fraud_df_clean['class'].value_counts()
class_pct = fraud_df_clean['class'].value_counts(normalize=True) * 100

# Both panels share everything except the plotted series, y-label and title.
# Bar colours are applied in the order value_counts() returns the classes.
panel_specs = [
    (axes[0], class_counts, 'Count', 'Class Distribution'),
    (axes[1], class_pct, 'Percentage', 'Class Distribution (%)'),
]
for panel, series, y_label, panel_title in panel_specs:
    panel.bar(series.index, series.values, color=['green', 'red'])
    panel.set_xlabel('Class (0=Legitimate, 1=Fraud)')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_xticks([0, 1])

plt.tight_layout()
plt.show()

# Text summary: raw counts plus the legitimate-to-fraud ratio
print("Class distribution:")
print(fraud_df_clean['class'].value_counts())
imbalance_ratio = class_counts[0] / class_counts[1]
print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")


In [None]:
# Histograms of the numeric columns, one panel per column
numerical_cols = ['purchase_value', 'age']
fig, axes = plt.subplots(1, len(numerical_cols), figsize=(15, 5))

for panel, col in zip(axes, numerical_cols):
    # A column absent from the cleaned frame leaves its panel empty
    if col not in fraud_df_clean.columns:
        continue
    observed = fraud_df_clean[col].dropna()
    panel.hist(observed, bins=50, edgecolor='black')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')
    panel.set_title(f'Distribution of {col}')

plt.tight_layout()
plt.show()


In [None]:
# Bar charts of category frequencies, one panel per categorical column
categorical_cols = ['source', 'browser', 'sex']
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(18, 5))

for panel, col in zip(axes, categorical_cols):
    # A column absent from the cleaned frame leaves its panel empty
    if col not in fraud_df_clean.columns:
        continue
    level_counts = fraud_df_clean[col].value_counts()
    positions = range(len(level_counts))
    panel.bar(positions, level_counts.values)
    # Bars sit at integer positions; label them with the category names
    panel.set_xticks(positions)
    panel.set_xticklabels(level_counts.index, rotation=45, ha='right')
    panel.set_ylabel('Count')
    panel.set_title(f'Distribution of {col}')

plt.tight_layout()
plt.show()


## 3. Bivariate Analysis


In [None]:
# Purchase value by class: box plot (left) and violin plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
fraud_df_clean.boxplot(column='purchase_value', by='class', ax=axes[0])
# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by class"
# suptitle that collides with the per-panel titles; clear it.
fig.suptitle('')
axes[0].set_title('Purchase Value by Class')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Purchase Value')

# Violin plot
sns.violinplot(data=fraud_df_clean, x='class', y='purchase_value', ax=axes[1])
axes[1].set_title('Purchase Value Distribution by Class')

plt.tight_layout()
plt.show()

# Per-class descriptive statistics to accompany the plots
print("Purchase Value Statistics by Class:")
print(fraud_df_clean.groupby('class')['purchase_value'].describe())


In [None]:
# Age by class: box plot (left) and violin plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
fraud_df_clean.boxplot(column='age', by='class', ax=axes[0])
# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by class"
# suptitle that collides with the per-panel titles; clear it.
fig.suptitle('')
axes[0].set_title('Age by Class')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Age')

# Violin plot
sns.violinplot(data=fraud_df_clean, x='class', y='age', ax=axes[1])
axes[1].set_title('Age Distribution by Class')

plt.tight_layout()
plt.show()

# Per-class descriptive statistics to accompany the plots
print("Age Statistics by Class:")
print(fraud_df_clean.groupby('class')['age'].describe())


In [None]:
# Fraud rate (%) per category level, with the overall rate as a reference line
categorical_cols = ['source', 'browser', 'sex']
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(18, 5))

# The overall fraud rate is the same for every panel, so compute it once
# instead of on each loop iteration.
overall_fraud_pct = fraud_df_clean['class'].mean() * 100

for i, col in enumerate(categorical_cols):
    # A column absent from the cleaned frame leaves its panel empty
    if col in fraud_df_clean.columns:
        # Mean of the 0/1 class label within each level, scaled to percent
        fraud_rate = fraud_df_clean.groupby(col)['class'].mean() * 100
        axes[i].bar(range(len(fraud_rate)), fraud_rate.values)
        axes[i].set_xticks(range(len(fraud_rate)))
        axes[i].set_xticklabels(fraud_rate.index, rotation=45, ha='right')
        axes[i].set_ylabel('Fraud Rate (%)')
        axes[i].set_title(f'Fraud Rate by {col}')
        axes[i].axhline(y=overall_fraud_pct, color='r', linestyle='--', label='Overall Average')
        axes[i].legend()

plt.tight_layout()
plt.show()


## 4. Geolocation Integration


In [None]:
# Read the IP-range-to-country lookup table and report its shape and columns
print("Loading IpAddress_to_Country.csv...")
ip_country_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')
ip_table_shape = ip_country_df.shape
ip_table_columns = ip_country_df.columns.tolist()
print(f"Shape: {ip_table_shape}")
print(f"\nColumns: {ip_table_columns}")
ip_country_df.head()


In [None]:
# Convert both IP range bounds with the project's ip_to_integer helper.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# feeds already-converted values back into ip_to_integer — confirm the helper
# tolerates that, or re-run the loading cell first.
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip_country_df[bound_col] = ip_country_df[bound_col].apply(ip_to_integer)

# Merge fraud data with country mapping
fraud_df_with_country = merge_ip_to_country(fraud_df_clean, ip_country_df)
print(f"Shape after merge: {fraud_df_with_country.shape}")

# Report how many rows actually received a country. The merged frame's length
# alone cannot show this if the merge keeps unmatched rows, so count non-null
# `country` values when that column exists and fall back to the row count
# otherwise.
# NOTE(review): assumes unmatched rows carry a missing country — verify
# against merge_ip_to_country.
if 'country' in fraud_df_with_country.columns:
    matched_rows = int(fraud_df_with_country['country'].notna().sum())
else:
    matched_rows = len(fraud_df_with_country)
print(f"Rows matched: {matched_rows} / {len(fraud_df_clean)}")

fraud_df_with_country.head()


In [None]:
# Country-level fraud summary; skipped entirely when the merge produced no
# `country` column.
if 'country' in fraud_df_with_country.columns:
    # One row per country: transaction count, fraud count, and fraud rate
    # (mean of the 0/1 class label), ordered by fraud rate descending.
    # NOTE(review): no minimum-volume filter is applied, so countries with
    # very few transactions can top the fraud-rate ranking.
    country_fraud = (
        fraud_df_with_country
        .groupby('country')['class']
        .agg(total_transactions='count', fraud_count='sum', fraud_rate='mean')
        .reset_index()
        .sort_values('fraud_rate', ascending=False)
    )

    # Two top-20 rankings: by fraud rate and by absolute fraud count
    top_countries = country_fraud.head(20)
    top_by_count = country_fraud.sort_values('fraud_count', ascending=False).head(20)

    fig, axes = plt.subplots(1, 2, figsize=(18, 8))

    ranking_panels = [
        (axes[0], top_countries, 'fraud_rate', 'Fraud Rate',
         'Top 20 Countries by Fraud Rate'),
        (axes[1], top_by_count, 'fraud_count', 'Total Fraud Count',
         'Top 20 Countries by Total Fraud Count'),
    ]
    for panel, ranking, value_col, x_label, panel_title in ranking_panels:
        rows = range(len(ranking))
        panel.barh(rows, ranking[value_col].values)
        panel.set_yticks(rows)
        panel.set_yticklabels(ranking['country'])
        panel.set_xlabel(x_label)
        panel.set_title(panel_title)
        # Put the highest-ranked country at the top of the chart
        panel.invert_yaxis()

    plt.tight_layout()
    plt.show()

    print("Top 10 Countries by Fraud Rate:")
    print(country_fraud.head(10))


## 5. Summary and Key Insights

Key findings from the EDA:
1. Class imbalance ratio between legitimate and fraudulent transactions
2. Features that appear important for fraud detection
3. Patterns observed in fraudulent transactions
4. Country-level fraud patterns