# EDA - Fraud Data Analysis

This notebook provides a comprehensive Exploratory Data Analysis of the e-commerce fraud dataset, including univariate and bivariate analysis, and geolocation integration.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
# The helper modules below are imported by bare module name, so the `src`
# directory one level above the current working directory must be on
# sys.path before those imports run.
# NOTE(review): the path is resolved relative to the working directory, so this
# presumably expects the kernel to start in the notebook's own folder — confirm.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

# Project-local helpers (presumably located in ../src — they are not resolvable
# without the sys.path entry added above).
from data_loader import load_data
from preprocessing import clean_fraud_data, clean_ip_data
from feature_engineering import feature_engineer_fraud_data

# Render figures inline in the notebook and apply seaborn's "whitegrid" style
# to every plot that follows.
%matplotlib inline
sns.set(style="whitegrid")

## 1. Data Loading and Cleaning

In [None]:
# Locations of the two raw inputs, relative to the notebook's working directory.
RAW_FRAUD_CSV = '../data/raw/Fraud_Data.csv'
RAW_IP_CSV = '../data/raw/IpAddress_to_Country.csv'

# Read both raw files first, then pass each through its project cleaning helper.
fraud_raw = load_data(RAW_FRAUD_CSV)
ip_raw = load_data(RAW_IP_CSV)

fraud_df = clean_fraud_data(fraud_raw)
ip_df = clean_ip_data(ip_raw)

# Report the size of the cleaned fraud table and show its first rows.
print(f"Fraud Data Shape: {fraud_df.shape}")
fraud_df.head()

## 2. Geolocation Integration & Feature Engineering

In [None]:
# Run the project's feature-engineering helper on the cleaned fraud table,
# handing it the cleaned IP table as the second argument.
# NOTE(review): `fraud_df` is rebound to the helper's output, so re-running this
# cell on its own passes already-engineered data back in — confirm the helper
# tolerates that, or re-run from the loading cell.
fraud_df = feature_engineer_fraud_data(fraud_df, ip_df)
print("Feature Engineering Complete.")

# Columns this cell expects to exist after feature engineering; preview a few rows.
preview_columns = ['user_id', 'country', 'time_since_signup', 'user_freq', 'device_freq']
fraud_df[preview_columns].head()

## 3. Univariate Analysis

### 3.1 Class Distribution

In [None]:
# Tally each label once; the percentage column is that same tally divided by
# its total and scaled to 100.
class_counts = fraud_df['class'].value_counts()
class_share = class_counts / class_counts.sum() * 100
class_summary = pd.concat([class_counts, class_share], axis=1, keys=['Count', 'Percentage'])

print("Class Distribution:")
print(class_summary)

# Bar chart of the raw row count per class label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='class', data=fraud_df, palette='viridis', ax=ax)
ax.set_title('Absolute Class Distribution')
plt.show()

### 3.2 Numerical Features: Purchase Value & Age

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel, left to right: (column, number of bins, bar colour, title).
panels = [
    ('purchase_value', 50, 'skyblue', 'Distribution of Purchase Value'),
    ('age', 30, 'salmon', 'Distribution of Age'),
]

# Draw a histogram with a KDE overlay for each configured column.
for panel_ax, (column, n_bins, colour, title) in zip(axes, panels):
    sns.histplot(fraud_df[column], bins=n_bins, ax=panel_ax, color=colour, kde=True)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

## 4. Bivariate Analysis

### 4.1 Fraud vs Purchase Value

In [None]:
# Box plot of purchase_value split by the `class` label, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='class', y='purchase_value', data=fraud_df, palette='Set2', ax=ax)
ax.set_title('Purchase Value by Class (Fraud vs Non-Fraud)')
plt.show()

### 4.2 Fraud vs Time-based Features

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hour_ax, day_ax = axes

# Left panel: transaction counts per hour of day, split by class label.
sns.countplot(x='hour_of_day', hue='class', data=fraud_df, ax=hour_ax, palette='coolwarm')
hour_ax.set_title('Fraud Distribution by Hour of Day')

# Right panel: transaction counts per day of week, split by class label.
# Labels are looked up from the day values actually present instead of being
# applied positionally, so a weekday missing from the data can no longer shift
# every later label onto the wrong bar (or trigger a tick/label count mismatch).
# NOTE(review): assumes `day_of_week` holds integer codes with 0 = Monday ... 6 = Sunday
# (the pandas `dt.dayofweek` convention) — confirm against the feature engineering code.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_levels = sorted(fraud_df['day_of_week'].dropna().unique())

sns.countplot(x='day_of_week', hue='class', data=fraud_df, order=day_levels,
              ax=day_ax, palette='coolwarm')
day_ax.set_title('Fraud Distribution by Day of Week')
# Fix the tick positions before assigning labels, as matplotlib requires for
# set_xticklabels to be reliable.
day_ax.set_xticks(list(range(len(day_levels))))
day_ax.set_xticklabels([day_names[int(level)] for level in day_levels])

plt.tight_layout()
plt.show()

### 4.3 Fraud vs Country

In [None]:
# Mean of the `class` column per country, sorted highest first; keep the first 15.
# NOTE(review): no minimum row count per country is applied here, so a country
# with very few transactions can rank at the top — keep in mind when reading the chart.
fraud_rate_by_country = fraud_df.groupby('country')['class'].mean()
country_fraud = fraud_rate_by_country.sort_values(ascending=False).head(15)

# Bar chart of the selected countries with x tick labels rotated 45 degrees.
fig, ax = plt.subplots(figsize=(10, 6))
country_fraud.plot(kind='bar', color='darkred', rot=45, ax=ax)
ax.set_title('Top 15 Countries by Fraud Ratio')
ax.set_ylabel('Fraud Ratio')
plt.show()

## 5. Conclusions
- Significant class imbalance (approx 9% fraud).
- Fraud cases don't show a strong bias towards specific purchase values, but they do exhibit patterns in the country-wise distribution and in behavioral features such as `time_since_signup` (not plotted in this notebook; to be explored in the feature engineering script).