# FraudGuard - Data Exploration

**Ziel:** Dataset verstehen, Fraud-Patterns entdecken, Basis für Rules legen

## Inhalt
1. Dataset laden & inspizieren
2. Fraud-Statistiken
3. Feature-Analyse
4. Zeit-Patterns
5. Geo-Patterns
6. Erste Insights für Rules

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Eigene Utils
import sys
sys.path.append('..')
from src.utils import load_data, calculate_haversine_distance, print_fraud_stats

# Suppress every Python warning for the rest of the session so that library
# warnings do not clutter the cell outputs.
# NOTE(review): this is a blanket filter — it also hides warnings that could
# point at real problems in later cells.
warnings.filterwarnings('ignore')

# Plot settings: dark-grid matplotlib style plus seaborn's 'husl' palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

print("✓ Imports successful")

## 1. Dataset laden

**Hinweis:** Für erste Tests nur 100k Zeilen laden (schneller!)

In [None]:
# Full load (1.3M rows) - takes ~30 s
# df = load_data('../data/raw/fraudTrain.csv')

# Alternative: subset for quick testing (recommended during development).
# NOTE(review): presumably nrows keeps only the first 100,000 rows of the
# file, so the subset may not be representative of the full data — confirm
# in src.utils.load_data.
df = load_data('../data/raw/fraudTrain.csv', nrows=100000)

# Report the dimensions and the deep memory footprint (scaled by 1024**2).
print(f"\nShape: {df.shape}")
print(f"Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

In [None]:
# Show the first rows of df as the cell output for a quick look at the
# available columns and typical values.
df.head()

In [None]:
# Column overview: print the summary produced by df.info().
df.info()

In [None]:
# Descriptive statistics of df, shown as the cell output.
df.describe()

## 2. Fraud-Statistiken

In [None]:
# Print the fraud summary produced by the project helper print_fraud_stats.
# NOTE(review): the exact figures it reports are defined in src.utils and are
# not visible from this notebook.
print_fraud_stats(df)

In [None]:
# Visualisation: class balance of legitimate vs. fraudulent transactions.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

class_labels = ['Legitimate', 'Fraud']
class_colors = ['green', 'red']

# value_counts() orders by frequency and omits classes that do not occur.
# Pin the order to [0, 1] so the counts always line up with the labels and
# colours above — also when a small subset contains no fraud at all.
fraud_counts = df['is_fraud'].value_counts().reindex([0, 1], fill_value=0)

# Bar plot: absolute number of transactions per class
axes[0].bar(class_labels, fraud_counts.values, color=class_colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Transaction Distribution')

# Pie chart: share of each class
axes[1].pie(fraud_counts.values,
            labels=class_labels,
            autopct='%1.2f%%',
            colors=class_colors)
axes[1].set_title('Fraud Rate')

plt.tight_layout()
plt.show()

## 3. Feature-Analyse

### 3.1 Transaction Amount

In [None]:
# Descriptive statistics of the transaction amount, split by fraud label.
print("Transaction Amount by Fraud Status:")
amount_by_label = df.groupby('is_fraud')['amt']
print(amount_by_label.describe())

In [None]:
# Amount distribution: legitimate vs. fraudulent transactions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

amount_cap = 500  # both panels focus on amounts below $500

# Histogram. Bin over the displayed range only: without `range`, the 50 bins
# are spread over the full amount range of each class, so after zooming to
# 0-500 the bars can become very wide, and the two classes would use
# different bin edges, which makes the overlay hard to compare.
hist_kwargs = dict(bins=50, range=(0, amount_cap), alpha=0.7)
axes[0].hist(df.loc[df['is_fraud'] == 0, 'amt'],
             label='Legitimate', color='green', **hist_kwargs)
axes[0].hist(df.loc[df['is_fraud'] == 1, 'amt'],
             label='Fraud', color='red', **hist_kwargs)
axes[0].set_xlabel('Amount ($)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Amount Distribution')
axes[0].legend()
axes[0].set_xlim(0, amount_cap)

# Box plot of the amount per fraud label
df.boxplot(column='amt', by='is_fraud', ax=axes[1])
axes[1].set_xlabel('is_fraud')
axes[1].set_ylabel('Amount ($)')
axes[1].set_title('Amount by Fraud Status')
axes[1].set_ylim(0, amount_cap)
# pandas adds a figure-level "Boxplot grouped by ..." title; remove it so it
# does not sit on top of the two panel titles.
fig.suptitle('')

plt.tight_layout()
plt.show()

### 3.2 Kategorien

In [None]:
# Fraud count, transaction count and fraud rate per merchant category,
# ordered from the highest to the lowest fraud rate.
fraud_by_category = (
    df.groupby('category')['is_fraud']
    .agg(Fraud_Count='sum', Total='count', Fraud_Rate='mean')
    .sort_values('Fraud_Rate', ascending=False)
)
fraud_by_category['Fraud_Rate'] *= 100  # express the rate in percent

print("\nFraud Rate by Category (Top 10):")
print(fraud_by_category.head(10))

In [None]:
# Visualisation: fraud rate of the 15 categories with the highest rate.
# DataFrame.plot opens its own figure unless it is given an Axes, which would
# leave the explicitly sized figure empty. Create the Axes first and pass it
# in so the requested figsize is actually used.
fig, ax = plt.subplots(figsize=(12, 6))
fraud_by_category.head(15).plot(kind='barh', y='Fraud_Rate', color='coral', ax=ax)
ax.set_xlabel('Fraud Rate (%)')
ax.set_title('Top 15 Categories by Fraud Rate')
plt.tight_layout()
plt.show()

## 4. Zeit-Patterns

**Hypothese:** Fraud passiert nachts häufiger als tagsüber.

In [None]:
# Derive hour-of-day and weekday features from the raw timestamp column.
timestamps = pd.to_datetime(df['trans_date_trans_time'])

df['trans_datetime'] = timestamps
df['hour'] = timestamps.dt.hour
df['day_of_week'] = timestamps.dt.dayofweek  # 0 = Monday
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)  # 1 for days 5 and 6

print("✓ Time features extracted")

In [None]:
# Fraud count, transaction count and fraud rate (as a fraction) per hour.
fraud_by_hour = (
    df.groupby('hour')['is_fraud']
    .agg(['sum', 'count', 'mean'])
    .rename(columns={'sum': 'Fraud_Count', 'count': 'Total', 'mean': 'Fraud_Rate'})
)

# Line plot of the hourly fraud rate in percent, with the night window shaded.
hours = fraud_by_hour.index
hourly_rate_pct = fraud_by_hour['Fraud_Rate'] * 100

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(hours, hourly_rate_pct, marker='o', linewidth=2)
ax.fill_between(hours, hourly_rate_pct, alpha=0.3)
ax.axvspan(2, 5, alpha=0.2, color='red', label='Night (2-5 AM)')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Fraud Rate (%)')
ax.set_title('Fraud Rate by Hour of Day')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("\nFraud Rate by Hour:")
print(fraud_by_hour)

In [None]:
# Weekday analysis: fraud rate in percent per day of week (0 = Monday).
# Reindex to all seven days so the series always matches the seven bars and
# tick labels below. A small subset that lacks some weekday would otherwise
# make plt.bar fail with a shape mismatch. Missing days stay NaN (no bar)
# rather than being shown as a misleading 0 % rate.
fraud_by_day = (df.groupby('day_of_week')['is_fraud'].mean() * 100).reindex(range(7))

plt.figure(figsize=(10, 5))
# Weekdays in blue, weekend days in red
plt.bar(range(7), fraud_by_day.values, color=['blue']*5 + ['red']*2)
plt.xticks(range(7), ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.xlabel('Day of Week')
plt.ylabel('Fraud Rate (%)')
plt.title('Fraud Rate by Day of Week')
plt.tight_layout()
plt.show()

## 5. Geo-Patterns

**Hypothese:** Eine große Distance zwischen Customer und Merchant ist ein Fraud-Indikator.

In [None]:
# Compute the customer-merchant distance on a sample only, because the
# row-wise apply below takes long on the full data.
# A fixed random_state makes the sample reproducible, and with it every
# statistic, plot and insight derived from sample_df in later cells.
sample_df = df.sample(n=min(10000, len(df)), random_state=42).copy()

print("Calculating distances for sample...")
# NOTE(review): if calculate_haversine_distance accepts array inputs, passing
# the four columns directly would avoid the per-row Python call — confirm in
# src.utils before changing.
sample_df['distance_km'] = sample_df.apply(
    lambda row: calculate_haversine_distance(
        row['lat'], row['long'],
        row['merch_lat'], row['merch_long']
    ),
    axis=1
)

print("✓ Distances calculated")

In [None]:
# Descriptive statistics of the sampled distance, split by fraud label.
print("\nDistance Statistics by Fraud:")
distance_by_label = sample_df.groupby('is_fraud')['distance_km']
print(distance_by_label.describe())

In [None]:
# Distance distribution: histogram (left) and box plot per fraud label (right)
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of the sampled distances for each class
hist_ax.hist(sample_df.loc[sample_df['is_fraud'] == 0, 'distance_km'],
             bins=50, alpha=0.7, label='Legitimate', color='green')
hist_ax.hist(sample_df.loc[sample_df['is_fraud'] == 1, 'distance_km'],
             bins=50, alpha=0.7, label='Fraud', color='red')
hist_ax.set_xlabel('Distance (km)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distance Distribution')
hist_ax.legend()
hist_ax.set_xlim(0, 500)  # focus on the 0-500 km range

# DataFrame.boxplot(by=...) opens a new figure unless it is given an Axes.
# Pass the right-hand Axes explicitly so the box plot lands next to the
# histogram and the label/limit calls below address the correct Axes.
sample_df.boxplot(column='distance_km', by='is_fraud', ax=box_ax)
box_ax.set_ylabel('Distance (km)')
box_ax.set_title('Distance by Fraud Status')
box_ax.set_ylim(0, 500)
fig.suptitle('')  # remove the figure title added by pandas

plt.tight_layout()
plt.show()

## 6. Insights für Rule Engine

### Zusammenfassung der Patterns:

In [None]:
# Summary of the patterns found above. Each conclusion line is chosen from
# the computed numbers instead of being printed unconditionally, so the text
# can never contradict the figures shown right above it (e.g. on a different
# subset of the data).
# Requires the 'hour' column on df and the 'distance_km' column on sample_df,
# both of which are created in earlier cells.
print("="*60)
print("KEY INSIGHTS FÜR RULES")
print("="*60)

# Amount: mean transaction amount per class
fraud_avg_amt = df[df['is_fraud'] == 1]['amt'].mean()
legit_avg_amt = df[df['is_fraud'] == 0]['amt'].mean()
print("\n1. AMOUNT")
print(f"   Fraud Avg:  ${fraud_avg_amt:.2f}")
print(f"   Legit Avg:  ${legit_avg_amt:.2f}")
if fraud_avg_amt > legit_avg_amt:
    print("   → Fraud-Transaktionen sind im Schnitt HÖHER")
else:
    print("   → Kein höherer Durchschnittsbetrag bei Fraud (Hypothese nicht bestätigt)")

# Time: fraud rate for hours 2-4 vs. hours 9-16
night_fraud_rate = df[(df['hour'] >= 2) & (df['hour'] < 5)]['is_fraud'].mean()
day_fraud_rate = df[(df['hour'] >= 9) & (df['hour'] < 17)]['is_fraud'].mean()
print("\n2. TIME")
print(f"   Night (2-5 AM) Fraud Rate: {night_fraud_rate:.2%}")
print(f"   Day (9-17) Fraud Rate:     {day_fraud_rate:.2%}")
if night_fraud_rate > day_fraud_rate:
    print("   → Nachts höheres Risiko!")
else:
    print("   → Nachts kein höheres Risiko (Hypothese nicht bestätigt)")

# Distance (computed on the sample only)
fraud_avg_dist = sample_df[sample_df['is_fraud'] == 1]['distance_km'].mean()
legit_avg_dist = sample_df[sample_df['is_fraud'] == 0]['distance_km'].mean()
print("\n3. DISTANCE")
print(f"   Fraud Avg Distance:  {fraud_avg_dist:.0f} km")
print(f"   Legit Avg Distance:  {legit_avg_dist:.0f} km")
if fraud_avg_dist > legit_avg_dist:
    print("   → Fraud oft weiter entfernt")
else:
    print("   → Fraud nicht weiter entfernt (Hypothese nicht bestätigt)")

# Candidate rules for the rule engine. These are a fixed list of suggestions
# and are not derived from the numbers above.
print("\n" + "="*60)
print("EMPFOHLENE RULES:")
print("="*60)
print("1. High Amount: amt > 3 * user_average")
print("2. Night Transaction: hour BETWEEN 2 AND 5")
print("3. Large Distance: distance > 100 km")
print("4. High Frequency: > 5 txns per hour")
print("5. Round Amount: amt IN [100, 500, 1000, ...]")
print("6. Out-of-State: different state than usual")
print("7. Risky Category: category in high-fraud categories")
print("="*60)

## Next Steps

1. ✅ Dataset verstanden
2. ✅ Fraud-Patterns identifiziert
3. → **Nächstes Notebook:** `02_rule_analysis.ipynb` - Rule Engine implementieren und testen

In [None]:
# Optional: Speichere verarbeitete Daten für später
# df.to_csv('../data/processed/data_with_time_features.csv', index=False)
# print("✓ Data saved")