# Fraud Detection - Exploratory Data Analysis

## Objective
Analyze the `tx_train_gold` dataset to understand fraud patterns and identify key features for model building.

## Dataset
- **Source**: `workspace.fraud.tx_train_gold`
- **Size**: 8.9M transactions (labeled)
- **Fraud Rate**: ~0.15% (13,332 fraud cases)
- **Features**: 37 columns (transactions + user + card + MCC dimensions)

## Analysis Focus Areas
1. **Data Quality**: Missing values, outliers, data types
2. **Fraud Distribution**: Class imbalance, fraud rate
3. **Transaction Patterns**: Amount, time, geography
4. **MCC Categories**: High-risk merchant types
5. **User Demographics**: Age, income, credit score
6. **Card Characteristics**: Type, brand, chip usage
7. **Feature Correlations**: Which features predict fraud?

In [0]:
# Data manipulation
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Apply one shared look (seaborn grid theme + default figure size) to all plots
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

In [0]:
# Bind the gold training table to `df`; the later cells all read from it
df = spark.table("workspace.fraud.tx_train_gold")

# count() is a Spark action, so this line runs a job over the table
loaded_rows = df.count()
loaded_columns = df.columns

print(f"Dataset loaded: {loaded_rows:,} rows")
print(f"Columns: {len(loaded_columns)}")
print("\nFirst few column names:")
print(loaded_columns[:10])

---
## 1. Data Overview & Quality Check
Understand the dataset structure, data types, and quality issues.

In [0]:
# Print the column names and types of the loaded table
print("Dataset Schema:\n" + "=" * 80)
df.printSchema()

# Render a five-row preview through the notebook's display() helper
preview_rows = df.limit(5)
print("\nSample Data (first 5 rows):")
display(preview_rows)

In [0]:
# Check for missing values
# NOTE: this import is kept because a later cell uses the bare name `col`.
# The code below goes through `F.` instead, so it keeps working even after a
# later cell rebinds the bare name `count` to an integer.
from pyspark.sql.functions import col, count, when, isnan

print("Missing Values Analysis:")
print("=" * 80)

# Alias for the row total computed alongside the null counts
_TOTAL_KEY = '__total_rows__'

# One aggregation yields both the per-column null counts and the row total,
# so no separate df.count() scan is needed for the percentage denominator.
null_count_exprs = [
    F.count(F.when(F.col(c).isNull(), F.lit(1))).alias(c) for c in df.columns
]
missing_counts = df.select(F.count(F.lit(1)).alias(_TOTAL_KEY), *null_count_exprs)

# Convert to pandas for better display (a single row, one column per field)
counts_pd = missing_counts.toPandas()
total_rows = int(counts_pd.pop(_TOTAL_KEY).iloc[0])

missing_df = counts_pd.T
missing_df.columns = ['null_count']
missing_df['null_percentage'] = (missing_df['null_count'] / total_rows * 100).round(2)
missing_df = missing_df[missing_df['null_count'] > 0].sort_values('null_count', ascending=False)

if len(missing_df) > 0:
    print(f"\nColumns with missing values ({len(missing_df)} columns):")
    display(missing_df)
else:
    print("\n✅ No missing values found!")

In [0]:
# Print summary statistics for a fixed list of numeric columns
print("Basic Statistics for Key Numeric Columns:")
print("=" * 80)

numeric_cols = ['amount', 'current_age', 'credit_score', 'yearly_income', 'total_debt']
summary_metrics = ('count', 'mean', 'stddev', 'min', 'max')

# Columns missing from the table are skipped silently
table_columns = df.columns
for col_name in (name for name in numeric_cols if name in table_columns):
    stats = df.select(col_name).summary(*summary_metrics)
    print(f"\n{col_name.upper()}:")
    stats.show()

---
## 2. Fraud Distribution Analysis
Understand the class imbalance and fraud rate in the dataset.

In [0]:
# Analyze fraud distribution
print("Fraud Distribution:")
print("=" * 80)

fraud_dist = df.groupBy('label').count().orderBy('label')
fraud_dist.show()

# Every figure below is derived from the collected per-label counts, so no
# further scans of the full table are needed.
fraud_counts = fraud_dist.collect()
total_count = sum(row['count'] for row in fraud_counts)

# The loop variable is deliberately not named `count`: that would rebind the
# pyspark `count` function imported by the missing-values cell.
for row in fraud_counts:
    n_rows = row['count']
    percentage = (n_rows / total_count * 100) if total_count else 0.0
    # str() keeps the width spec working when the label is NULL (None)
    print(f"{str(row['label']):10} {n_rows:>10,} transactions ({percentage:.2f}%)")

# Calculate fraud rate
fraud_count = sum(row['count'] for row in fraud_counts if row['label'] == 'Yes')
fraud_rate = (fraud_count / total_count * 100) if total_count else 0.0
print(f"\n⚠️  Fraud Rate: {fraud_rate:.3f}%")
if fraud_count:
    print(f"⚠️  Class Imbalance Ratio: 1:{int(total_count/fraud_count)}")
else:
    # Avoid ZeroDivisionError when the table has no 'Yes' rows
    print("⚠️  Class Imbalance Ratio: n/a (no fraud rows)")

In [0]:
# Visualize fraud distribution
import matplotlib.pyplot as plt

# Bring the grouped per-label counts to the driver for plotting
fraud_pd = fraud_dist.toPandas()
label_names = fraud_pd['label']
label_totals = fraud_pd['count']
colors = ['green', 'red']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: raw counts on a log axis so both bars are visible
ax1.bar(label_names, label_totals, color=colors)
ax1.set(
    xlabel='Label',
    ylabel='Count',
    title='Fraud vs Legitimate Transactions (Count)',
    yscale='log',
)
for position, total in enumerate(label_totals):
    ax1.text(position, total, f'{total:,}', ha='center', va='bottom')

# Right panel: share of each label
ax2.pie(label_totals, labels=label_names, autopct='%1.2f%%',
        colors=colors, startangle=90)
ax2.set_title('Fraud Distribution (Percentage)')

plt.tight_layout()
plt.show()

print("✅ Visualization complete!")

---
## 3. Transaction Amount Analysis
Compare transaction amounts between fraud and legitimate transactions.

In [0]:
# Per-label summary of the `amount` column
print("Transaction Amount Statistics by Label:")
print("=" * 80)

amount_aggregates = [
    F.count('amount').alias('count'),
    F.mean('amount').alias('mean_amount'),
    F.stddev('amount').alias('std_amount'),
    F.min('amount').alias('min_amount'),
    F.percentile_approx('amount', 0.25).alias('q25'),
    F.percentile_approx('amount', 0.50).alias('median'),
    F.percentile_approx('amount', 0.75).alias('q75'),
    F.max('amount').alias('max_amount'),
]
amount_stats = df.groupBy('label').agg(*amount_aggregates).orderBy('label')

amount_stats.show()

# Repeat the headline numbers in a more readable format
print("\nKey Insights:")
for stats_row in amount_stats.collect():
    low, high = stats_row['min_amount'], stats_row['max_amount']
    print(f"\n{stats_row['label']} Transactions:")
    print(f"  Mean: ${stats_row['mean_amount']:.2f}")
    print(f"  Median: ${stats_row['median']:.2f}")
    print(f"  Range: ${low:.2f} to ${high:.2f}")

In [0]:
# Sample data for visualization (to avoid memory issues)
sample_size = 100000
table_rows = df.count()

# Sampling without replacement requires a fraction in [0, 1]. Clamp it so
# tables smaller than `sample_size` are taken whole instead of raising, and
# guard the division for an empty table.
sample_fraction = min(1.0, sample_size / max(table_rows, 1))
df_sample = (
    df.select('label', 'amount')
      .sample(withReplacement=False, fraction=sample_fraction, seed=42)
      .toPandas()
)
df_sample['amount'] = df_sample['amount'].astype(float)

# Split once; the medians are reused for the marker line and its legend text
fraud_amounts = df_sample[df_sample['label'] == 'Yes']['amount']
legit_amounts = df_sample[df_sample['label'] == 'No']['amount']
fraud_median = fraud_amounts.median()
legit_median = legit_amounts.median()

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Box plot
df_sample.boxplot(column='amount', by='label', ax=axes[0, 0])
axes[0, 0].set_title('Transaction Amount by Label (Box Plot)')
axes[0, 0].set_xlabel('Label')
axes[0, 0].set_ylabel('Amount ($)')
plt.sca(axes[0, 0])
plt.xticks(rotation=0)

# 2. Histogram - Fraud
axes[0, 1].hist(fraud_amounts, bins=50, color='red', alpha=0.7, edgecolor='black')
axes[0, 1].set_title('Fraud Transaction Amounts')
axes[0, 1].set_xlabel('Amount ($)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].axvline(fraud_median, color='darkred', linestyle='--', label=f'Median: ${fraud_median:.2f}')
axes[0, 1].legend()

# 3. Histogram - Legitimate
axes[1, 0].hist(legit_amounts, bins=50, color='green', alpha=0.7, edgecolor='black')
axes[1, 0].set_title('Legitimate Transaction Amounts')
axes[1, 0].set_xlabel('Amount ($)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].axvline(legit_median, color='darkgreen', linestyle='--', label=f'Median: ${legit_median:.2f}')
axes[1, 0].legend()

# 4. Overlapping distributions (density-normalised so the two classes are comparable)
axes[1, 1].hist(fraud_amounts, bins=50, alpha=0.5, label='Fraud', color='red', density=True)
axes[1, 1].hist(legit_amounts, bins=50, alpha=0.5, label='Legitimate', color='green', density=True)
axes[1, 1].set_title('Amount Distribution Comparison (Normalized)')
axes[1, 1].set_xlabel('Amount ($)')
axes[1, 1].set_ylabel('Density')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

print("✅ Amount analysis complete!")

---
## 4. Temporal Patterns
Analyze fraud patterns over time: hour of day, day of week, and month.

In [0]:
# Derive calendar parts from the `date` column, one new column per part
time_parts = {
    'hour': F.hour('date'),
    'day_of_week': F.dayofweek('date'),
    'month': F.month('date'),
    'year': F.year('date'),
}

df_time = df
for part_name, part_expr in time_parts.items():
    df_time = df_time.withColumn(part_name, part_expr)

print("Time features extracted successfully!")
print("\nSample with time features:")
df_time.select('date', 'hour', 'day_of_week', 'month', 'label').show(5)

In [0]:
# Analyze fraud rate by hour of day
print("Fraud Analysis by Hour of Day:")
print("=" * 80)

hourly_fraud = df_time.groupBy('hour').agg(
    F.count('*').alias('total_transactions'),
    F.sum(F.when(F.col('label') == 'Yes', 1).otherwise(0)).alias('fraud_count')
).withColumn('fraud_rate', (F.col('fraud_count') / F.col('total_transactions') * 100))

hourly_fraud = hourly_fraud.orderBy('hour')
hourly_fraud.show(24)

# Visualize
hourly_pd = hourly_fraud.toPandas()

# Reference line: overall fraud rate (total fraud / total transactions).
# A plain mean of the per-hour rates would weight a quiet hour the same as a
# busy one and so would not match the dataset-wide rate.
hourly_total_txns = hourly_pd['total_transactions'].sum()
if hourly_total_txns:
    overall_fraud_rate = hourly_pd['fraud_count'].sum() / hourly_total_txns * 100
else:
    overall_fraud_rate = float('nan')  # nothing to plot against

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Transaction volume by hour
ax1.bar(hourly_pd['hour'], hourly_pd['total_transactions'], color='steelblue', alpha=0.7)
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Number of Transactions')
ax1.set_title('Transaction Volume by Hour')
ax1.grid(axis='y', alpha=0.3)

# Fraud rate by hour
ax2.plot(hourly_pd['hour'], hourly_pd['fraud_rate'], marker='o', color='red', linewidth=2)
ax2.set_xlabel('Hour of Day')
ax2.set_ylabel('Fraud Rate (%)')
ax2.set_title('Fraud Rate by Hour of Day')
ax2.grid(alpha=0.3)
ax2.axhline(y=overall_fraud_rate, color='orange', linestyle='--', label='Average Fraud Rate')
ax2.legend()

plt.tight_layout()
plt.show()

In [0]:
# Analyze fraud rate by day of week
print("Fraud Analysis by Day of Week:")
print("=" * 80)

# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday)
day_names = {1: 'Sunday', 2: 'Monday', 3: 'Tuesday', 4: 'Wednesday', 5: 'Thursday', 6: 'Friday', 7: 'Saturday'}

daily_fraud = df_time.groupBy('day_of_week').agg(
    F.count('*').alias('total_transactions'),
    F.sum(F.when(F.col('label') == 'Yes', 1).otherwise(0)).alias('fraud_count')
).withColumn('fraud_rate', (F.col('fraud_count') / F.col('total_transactions') * 100))

daily_fraud = daily_fraud.orderBy('day_of_week')
daily_fraud.show()

# Visualize
daily_pd = daily_fraud.toPandas()
daily_pd['day_name'] = daily_pd['day_of_week'].map(day_names)

# Reference line: overall fraud rate (total fraud / total transactions)
# rather than the unweighted mean of the seven per-day rates.
daily_total_txns = daily_pd['total_transactions'].sum()
if daily_total_txns:
    overall_fraud_rate = daily_pd['fraud_count'].sum() / daily_total_txns * 100
else:
    overall_fraud_rate = float('nan')  # nothing to plot against

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Transaction volume by day
ax1.bar(daily_pd['day_name'], daily_pd['total_transactions'], color='steelblue', alpha=0.7)
ax1.set_xlabel('Day of Week')
ax1.set_ylabel('Number of Transactions')
ax1.set_title('Transaction Volume by Day of Week')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

# Fraud rate by day
ax2.bar(daily_pd['day_name'], daily_pd['fraud_rate'], color='red', alpha=0.7)
ax2.set_xlabel('Day of Week')
ax2.set_ylabel('Fraud Rate (%)')
ax2.set_title('Fraud Rate by Day of Week')
ax2.tick_params(axis='x', rotation=45)
ax2.axhline(y=overall_fraud_rate, color='orange', linestyle='--', label='Average')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [0]:
# Transaction volume and fraud rate per calendar month (all years pooled)
print("Fraud Analysis by Month:")
print("=" * 80)

month_names = dict(zip(
    range(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))

is_fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)
monthly_fraud = (
    df_time.groupBy('month')
           .agg(F.count('*').alias('total_transactions'),
                F.sum(is_fraud_flag).alias('fraud_count'))
           .withColumn('fraud_rate', F.col('fraud_count') / F.col('total_transactions') * 100)
           .orderBy('month')
)
monthly_fraud.show()

# Collect the twelve-row result and attach short month labels
monthly_pd = monthly_fraud.toPandas()
monthly_pd['month_name'] = monthly_pd['month'].map(month_names)
month_labels = monthly_pd['month_name']

fig, ax = plt.subplots(1, 1, figsize=(14, 6))

# Bars (volume) on the left axis, line (rate) on a twin right axis
ax2 = ax.twinx()
ax.bar(month_labels, monthly_pd['total_transactions'], color='steelblue', alpha=0.5, label='Transaction Volume')
ax2.plot(month_labels, monthly_pd['fraud_rate'], marker='o', color='red', linewidth=2, label='Fraud Rate')

ax.set(xlabel='Month', title='Transaction Volume and Fraud Rate by Month')
ax.set_ylabel('Number of Transactions', color='steelblue')
ax2.set_ylabel('Fraud Rate (%)', color='red')
ax.tick_params(axis='y', labelcolor='steelblue')
ax2.tick_params(axis='y', labelcolor='red')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ Temporal analysis complete!")

---
## 5. Geographic Patterns
Analyze fraud patterns by merchant location: per-state fraud rates, and online vs. physical merchants.

In [0]:
# Per-state transaction volume, fraud count and fraud rate
print("Fraud Analysis by Merchant State:")
print("=" * 80)

is_fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)

# Rows without a merchant state are dropped after aggregation
state_fraud = (
    df.groupBy('merchant_state')
      .agg(F.count('*').alias('total_transactions'),
           F.sum(is_fraud_flag).alias('fraud_count'))
      .withColumn('fraud_rate', F.col('fraud_count') / F.col('total_transactions') * 100)
      .filter(F.col('merchant_state').isNotNull())
)

print("\nTop 10 States by Transaction Volume:")
state_fraud.orderBy(F.desc('total_transactions')).show(10)

# The volume floor keeps tiny states from dominating the rate ranking
print("\nTop 10 States by Fraud Rate (min 1000 transactions):")
high_volume_states = state_fraud.filter(F.col('total_transactions') >= 1000)
high_volume_states.orderBy(F.desc('fraud_rate')).show(10)

In [0]:
# Plot the 15 largest states by volume and by fraud rate (>= 1000 txns only)
state_pd = state_fraud.filter(F.col('total_transactions') >= 1000).toPandas()
top_states = state_pd.nlargest(15, 'total_transactions')
top_fraud_states = state_pd.nlargest(15, 'fraud_rate')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# (axis, data, metric column, bar colour, x label, title) for each panel
state_panels = [
    (ax1, top_states, 'total_transactions', 'steelblue',
     'Number of Transactions', 'Top 15 States by Transaction Volume'),
    (ax2, top_fraud_states, 'fraud_rate', 'red',
     'Fraud Rate (%)', 'Top 15 States by Fraud Rate (min 1000 txns)'),
]
for axis, frame, metric, bar_color, x_label, panel_title in state_panels:
    axis.barh(frame['merchant_state'], frame[metric], color=bar_color, alpha=0.7)
    axis.set_xlabel(x_label)
    axis.set_ylabel('State')
    axis.set_title(panel_title)
    axis.invert_yaxis()  # largest value at the top
    axis.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ Geographic analysis complete!")

In [0]:
# Fraud rate for merchants whose city is the literal 'ONLINE' vs all others
print("Online vs Physical Merchant Fraud Analysis:")
print("=" * 80)

channel = F.when(F.col('merchant_city') == 'ONLINE', 'Online').otherwise('Physical')
is_fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)

online_fraud = (
    df.withColumn('is_online', channel)
      .groupBy('is_online')
      .agg(F.count('*').alias('total_transactions'),
           F.sum(is_fraud_flag).alias('fraud_count'))
      .withColumn('fraud_rate', F.col('fraud_count') / F.col('total_transactions') * 100)
)

online_fraud.show()

# Collect the grouped result for plotting
online_pd = online_fraud.toPandas()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, metric column, bar colours, y label, title, value formatter) per panel
channel_panels = (
    (ax1, 'total_transactions', ['steelblue', 'orange'], 'Number of Transactions',
     'Transaction Volume: Online vs Physical', lambda value: f'{value:,}'),
    (ax2, 'fraud_rate', ['red', 'darkred'], 'Fraud Rate (%)',
     'Fraud Rate: Online vs Physical', lambda value: f'{value:.3f}%'),
)
for axis, metric, bar_colors, y_label, panel_title, format_value in channel_panels:
    axis.bar(online_pd['is_online'], online_pd[metric], color=bar_colors, alpha=0.7)
    axis.set_ylabel(y_label)
    axis.set_title(panel_title)
    axis.grid(axis='y', alpha=0.3)
    # Annotate each bar with its value
    for position, value in enumerate(online_pd[metric]):
        axis.text(position, value, format_value(value), ha='center', va='bottom')

plt.tight_layout()
plt.show()

---
## 6. MCC (Merchant Category Code) Analysis
Identify high-risk merchant categories and fraud patterns by business type.

In [0]:
# Per-MCC transaction volume, fraud count and fraud rate
print("Fraud Analysis by MCC Category:")
print("=" * 80)

is_fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)

# Rows without an MCC are dropped after aggregation
mcc_fraud = (
    df.groupBy('mcc', 'mcc_description')
      .agg(F.count('*').alias('total_transactions'),
           F.sum(is_fraud_flag).alias('fraud_count'))
      .withColumn('fraud_rate', F.col('fraud_count') / F.col('total_transactions') * 100)
      .filter(F.col('mcc').isNotNull())
)

print("\nTop 10 MCC Categories by Transaction Volume:")
mcc_fraud.orderBy(F.desc('total_transactions')).show(10, truncate=False)

# The volume floor keeps rarely-seen categories out of the rate ranking
print("\nTop 10 MCC Categories by Fraud Rate (min 1000 transactions):")
high_volume_mcc = mcc_fraud.filter(F.col('total_transactions') >= 1000)
high_volume_mcc.orderBy(F.desc('fraud_rate')).show(10, truncate=False)

In [0]:
# Plot the 15 largest merchant categories by volume and by fraud rate
mcc_pd = mcc_fraud.filter(F.col('total_transactions') >= 1000).toPandas()
top_mcc_volume = mcc_pd.nlargest(15, 'total_transactions')
top_mcc_fraud = mcc_pd.nlargest(15, 'fraud_rate')

fig, axes = plt.subplots(2, 1, figsize=(14, 12))

# (axis, data, metric column, bar colour, x label, title) for each panel
mcc_panels = [
    (axes[0], top_mcc_volume, 'total_transactions', 'steelblue',
     'Number of Transactions', 'Top 15 Merchant Categories by Transaction Volume'),
    (axes[1], top_mcc_fraud, 'fraud_rate', 'red',
     'Fraud Rate (%)', 'Top 15 Merchant Categories by Fraud Rate (min 1000 txns)'),
]
for axis, frame, metric, bar_color, x_label, panel_title in mcc_panels:
    axis.barh(frame['mcc_description'], frame[metric], color=bar_color, alpha=0.7)
    axis.set_xlabel(x_label)
    axis.set_title(panel_title)
    axis.invert_yaxis()  # largest value at the top
    axis.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ MCC analysis complete!")

In [0]:
# Distribution of fraud rates across MCC categories with at least 100 txns
print("MCC Fraud Rate Summary Statistics:")
print("=" * 80)

mcc_eligible = mcc_fraud.filter(F.col('total_transactions') >= 100)

mcc_stats = mcc_eligible.select('fraud_rate').summary()
mcc_stats.show()

# Unweighted mean of the per-category rates; used as the high-risk cut-off
avg_fraud_rate = mcc_eligible.agg(F.avg('fraud_rate')).first()[0]

print(f"\nAverage Fraud Rate across MCC categories: {avg_fraud_rate:.3f}%")
print(f"\nHigh-Risk Categories (fraud rate > {avg_fraud_rate:.3f}%):")
above_average = mcc_eligible.filter(F.col('fraud_rate') > avg_fraud_rate)
above_average.orderBy(F.desc('fraud_rate')).show(10, truncate=False)

---
## 7. User Demographics Analysis
Analyze fraud patterns by user characteristics: age, income, credit score.

In [0]:
# Analyze fraud by age groups
print("Fraud Analysis by Age Groups:")
print("=" * 80)

age_col = F.col('current_age')

# Create age bins. NULL ages are matched first: a `<` comparison against NULL
# never matches, so without this branch such rows would fall through to the
# final `otherwise` and be counted as '65+'.
age_group_expr = (
    F.when(age_col.isNull(), 'Unknown')
     .when(age_col < 25, '18-24')
     .when(age_col < 35, '25-34')
     .when(age_col < 45, '35-44')
     .when(age_col < 55, '45-54')
     .when(age_col < 65, '55-64')
     .otherwise('65+')
)

age_fraud = df.withColumn('age_group', age_group_expr).groupBy('age_group').agg(
    F.count('*').alias('total_transactions'),
    F.sum(F.when(F.col('label') == 'Yes', 1).otherwise(0)).alias('fraud_count'),
    F.avg('current_age').alias('avg_age')
).withColumn('fraud_rate', (F.col('fraud_count') / F.col('total_transactions') * 100))

# Youngest to oldest; the 'Unknown' bucket has a NULL average and goes last
age_fraud = age_fraud.orderBy(F.col('avg_age').asc_nulls_last())
age_fraud.show()

# Visualize
age_pd = age_fraud.toPandas()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Transaction volume by age
ax1.bar(age_pd['age_group'], age_pd['total_transactions'], color='steelblue', alpha=0.7)
ax1.set_xlabel('Age Group')
ax1.set_ylabel('Number of Transactions')
ax1.set_title('Transaction Volume by Age Group')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

# Fraud rate by age
ax2.plot(age_pd['age_group'], age_pd['fraud_rate'], marker='o', color='red', linewidth=2, markersize=8)
ax2.set_xlabel('Age Group')
ax2.set_ylabel('Fraud Rate (%)')
ax2.set_title('Fraud Rate by Age Group')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [0]:
# Analyze fraud by income levels
print("Fraud Analysis by Income Levels:")
print("=" * 80)

income_col = F.col('yearly_income')

# Create income bins. NULL incomes are matched first: a `<` comparison against
# NULL never matches, so without this branch such rows would fall through to
# the final `otherwise` and be counted as 'Over $150K'.
income_group_expr = (
    F.when(income_col.isNull(), 'Unknown')
     .when(income_col < 30000, 'Under $30K')
     .when(income_col < 50000, '$30K-$50K')
     .when(income_col < 75000, '$50K-$75K')
     .when(income_col < 100000, '$75K-$100K')
     .when(income_col < 150000, '$100K-$150K')
     .otherwise('Over $150K')
)

income_fraud = df.withColumn('income_group', income_group_expr).groupBy('income_group').agg(
    F.count('*').alias('total_transactions'),
    F.sum(F.when(F.col('label') == 'Yes', 1).otherwise(0)).alias('fraud_count'),
    F.avg('yearly_income').alias('avg_income')
).withColumn('fraud_rate', (F.col('fraud_count') / F.col('total_transactions') * 100))

# Lowest to highest income; the 'Unknown' bucket has a NULL average and goes last
income_fraud = income_fraud.orderBy(F.col('avg_income').asc_nulls_last())
income_fraud.show()

# Visualize
income_pd = income_fraud.toPandas()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Transaction volume by income
ax1.bar(income_pd['income_group'], income_pd['total_transactions'], color='green', alpha=0.7)
ax1.set_xlabel('Income Group')
ax1.set_ylabel('Number of Transactions')
ax1.set_title('Transaction Volume by Income Level')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

# Fraud rate by income
ax2.plot(income_pd['income_group'], income_pd['fraud_rate'], marker='o', color='red', linewidth=2, markersize=8)
ax2.set_xlabel('Income Group')
ax2.set_ylabel('Fraud Rate (%)')
ax2.set_title('Fraud Rate by Income Level')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [0]:
# Analyze fraud by credit score ranges
print("Fraud Analysis by Credit Score:")
print("=" * 80)

credit_col = F.col('credit_score')

# Create credit score bins. NULL scores are matched first: a `<` comparison
# against NULL never matches, so without this branch such rows would fall
# through to the final `otherwise` and be counted as 'Excellent (800+)'.
credit_group_expr = (
    F.when(credit_col.isNull(), 'Unknown')
     .when(credit_col < 580, 'Poor (<580)')
     .when(credit_col < 670, 'Fair (580-669)')
     .when(credit_col < 740, 'Good (670-739)')
     .when(credit_col < 800, 'Very Good (740-799)')
     .otherwise('Excellent (800+)')
)

credit_fraud = df.withColumn('credit_group', credit_group_expr).groupBy('credit_group').agg(
    F.count('*').alias('total_transactions'),
    F.sum(F.when(F.col('label') == 'Yes', 1).otherwise(0)).alias('fraud_count'),
    F.avg('credit_score').alias('avg_credit_score')
).withColumn('fraud_rate', (F.col('fraud_count') / F.col('total_transactions') * 100))

# Lowest to highest score; the 'Unknown' bucket has a NULL average and goes last
credit_fraud = credit_fraud.orderBy(F.col('avg_credit_score').asc_nulls_last())
credit_fraud.show()

# Visualize
credit_pd = credit_fraud.toPandas()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Transaction volume by credit score
ax1.bar(credit_pd['credit_group'], credit_pd['total_transactions'], color='purple', alpha=0.7)
ax1.set_xlabel('Credit Score Range')
ax1.set_ylabel('Number of Transactions')
ax1.set_title('Transaction Volume by Credit Score')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

# Fraud rate by credit score
ax2.plot(credit_pd['credit_group'], credit_pd['fraud_rate'], marker='o', color='red', linewidth=2, markersize=8)
ax2.set_xlabel('Credit Score Range')
ax2.set_ylabel('Fraud Rate (%)')
ax2.set_title('Fraud Rate by Credit Score')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ Demographics analysis complete!")

print("\n✅ Demographics analysis complete!")

---
## 8. Card Characteristics Analysis
Analyze fraud patterns by card type, brand, chip usage, and dark web exposure.

In [0]:
def _fraud_rate_by(dimension):
    """Return volume, fraud count and fraud rate (%) of `df` grouped by `dimension`."""
    fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)
    grouped = df.groupBy(dimension).agg(
        F.count('*').alias('total_transactions'),
        F.sum(fraud_flag).alias('fraud_count'),
    )
    return grouped.withColumn(
        'fraud_rate', F.col('fraud_count') / F.col('total_transactions') * 100
    )


# Fraud rate per card type
print("Fraud Analysis by Card Type:")
print("=" * 80)

card_type_fraud = _fraud_rate_by('card_type')
card_type_fraud.orderBy(F.desc('fraud_rate')).show()

# Fraud rate per card brand
print("\nFraud Analysis by Card Brand:")
print("=" * 80)

card_brand_fraud = _fraud_rate_by('card_brand')
card_brand_fraud.orderBy(F.desc('fraud_rate')).show()

# Side-by-side bar charts of the two breakdowns
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

card_type_pd = card_type_fraud.toPandas()
ax1.bar(card_type_pd['card_type'], card_type_pd['fraud_rate'], color='orange', alpha=0.7)
ax1.set(xlabel='Card Type', ylabel='Fraud Rate (%)', title='Fraud Rate by Card Type')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

card_brand_pd = card_brand_fraud.toPandas()
ax2.bar(card_brand_pd['card_brand'], card_brand_pd['fraud_rate'], color='purple', alpha=0.7)
ax2.set(xlabel='Card Brand', ylabel='Fraud Rate (%)', title='Fraud Rate by Card Brand')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [0]:
# Transaction volume and fraud rate per value of `use_chip`
print("Fraud Analysis: Chip vs Swipe Transactions:")
print("=" * 80)

is_fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)
chip_fraud = (
    df.groupBy('use_chip')
      .agg(F.count('*').alias('total_transactions'),
           F.sum(is_fraud_flag).alias('fraud_count'))
      .withColumn('fraud_rate', F.col('fraud_count') / F.col('total_transactions') * 100)
)

chip_fraud.orderBy(F.desc('fraud_rate')).show()

# Collect the grouped result for plotting
chip_pd = chip_fraud.toPandas()
chip_labels = chip_pd['use_chip']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: volume per transaction type
ax1.bar(chip_labels, chip_pd['total_transactions'], color=['steelblue', 'orange', 'green'], alpha=0.7)
ax1.set(
    xlabel='Transaction Type',
    ylabel='Number of Transactions',
    title='Transaction Volume by Type',
)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

# Right panel: fraud rate per transaction type
ax2.bar(chip_labels, chip_pd['fraud_rate'], color=['red', 'darkred', 'crimson'], alpha=0.7)
ax2.set(
    xlabel='Transaction Type',
    ylabel='Fraud Rate (%)',
    title='Fraud Rate by Transaction Type',
)
ax2.tick_params(axis='x', rotation=45)
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [0]:
# Analyze fraud for cards on dark web
print("Fraud Analysis: Card Dark Web Exposure:")
print("=" * 80)

dark_web_fraud = df.groupBy('card_on_dark_web').agg(
    F.count('*').alias('total_transactions'),
    F.sum(F.when(F.col('label') == 'Yes', 1).otherwise(0)).alias('fraud_count')
).withColumn('fraud_rate', (F.col('fraud_count') / F.col('total_transactions') * 100))

dark_web_fraud.orderBy(F.desc('fraud_rate')).show()

# Visualize
# The bar colours below are assigned by position, and the row order of an
# unsorted groupBy result is not guaranteed. Sort by the category so the
# same colour always lands on the same value.
# NOTE(review): assumes the "not exposed" value sorts first (e.g. 'No' < 'Yes'
# or False < True) — confirm against the actual column values.
dark_web_pd = dark_web_fraud.orderBy('card_on_dark_web').toPandas()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Transaction volume
ax1.bar(dark_web_pd['card_on_dark_web'], dark_web_pd['total_transactions'], color=['green', 'red'], alpha=0.7)
ax1.set_xlabel('Card on Dark Web')
ax1.set_ylabel('Number of Transactions')
ax1.set_title('Transaction Volume by Dark Web Status')
ax1.grid(axis='y', alpha=0.3)
for i, v in enumerate(dark_web_pd['total_transactions']):
    ax1.text(i, v, f'{v:,}', ha='center', va='bottom')

# Fraud rate
ax2.bar(dark_web_pd['card_on_dark_web'], dark_web_pd['fraud_rate'], color=['green', 'darkred'], alpha=0.7)
ax2.set_xlabel('Card on Dark Web')
ax2.set_ylabel('Fraud Rate (%)')
ax2.set_title('Fraud Rate by Dark Web Status')
ax2.grid(axis='y', alpha=0.3)
for i, v in enumerate(dark_web_pd['fraud_rate']):
    ax2.text(i, v, f'{v:.3f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("\n✅ Card characteristics analysis complete!")

---
## 9. Feature Correlations & Key Insights
Identify which features correlate most with fraud for model building.

In [0]:
# Analyze correlations for numeric features
import pandas as pd

print("Correlation Analysis with Fraud Label:")
print("=" * 80)

# Convert label to numeric for correlation
df_corr = df.withColumn('is_fraud', F.when(F.col('label') == 'Yes', 1).otherwise(0))

# Select numeric columns (any that are absent from the table are skipped)
numeric_features = ['amount', 'current_age', 'credit_score', 'yearly_income', 'total_debt', 
                    'num_credit_cards', 'num_cards_issued', 'year_pin_last_changed']
present_features = [name for name in numeric_features if name in df_corr.columns]

# Calculate all Pearson correlations in ONE aggregation instead of launching
# a separate Spark job per feature.
correlations = []
if present_features:
    corr_row = df_corr.agg(
        *[F.corr('is_fraud', name).alias(name) for name in present_features]
    ).first()
    for feature in present_features:
        corr = corr_row[feature]
        # A None/NaN result cannot be ranked or formatted; report and skip it
        if corr is None or corr != corr:
            print(f"Skipping {feature}: correlation is undefined")
            continue
        correlations.append((feature, corr))

# Sort by absolute correlation
correlations_sorted = sorted(correlations, key=lambda x: abs(x[1]), reverse=True)

print("\nFeature Correlations with Fraud (sorted by absolute value):")
for feature, corr in correlations_sorted:
    print(f"{feature:30} {corr:>10.4f}")

# Visualize
corr_df = pd.DataFrame(correlations_sorted, columns=['Feature', 'Correlation'])

fig, ax = plt.subplots(figsize=(10, 6))
# Red bars mark negative correlations, green bars non-negative ones
colors = ['red' if x < 0 else 'green' for x in corr_df['Correlation']]
ax.barh(corr_df['Feature'], corr_df['Correlation'], color=colors, alpha=0.7)
ax.set_xlabel('Correlation with Fraud')
ax.set_title('Feature Correlations with Fraud Label')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [0]:
# Print the five highest-fraud-rate values of each categorical column
print("Fraud Rates by Categorical Features:")
print("=" * 80)

categorical_features = ['use_chip', 'card_type', 'card_brand', 'has_chip', 'card_on_dark_web', 'gender']

for feature in categorical_features:
    # Columns missing from the table are skipped silently
    if feature not in df.columns:
        continue

    print(f"\n{feature.upper()}:")
    fraud_flag = F.when(F.col('label') == 'Yes', 1).otherwise(0)
    per_value = df.groupBy(feature).agg(
        F.count('*').alias('count'),
        F.sum(fraud_flag).alias('fraud_count'),
    )
    feature_fraud = per_value.withColumn(
        'fraud_rate', F.col('fraud_count') / F.col('count') * 100
    )

    feature_fraud.orderBy(F.desc('fraud_rate')).show(5)

In [0]:
# Summary of key insights
def _usd(value):
    """Format a dollar amount with two decimals, or 'N/A' when it is missing."""
    return "N/A" if value is None else f"${value:.2f}"


print("\n" + "=" * 80)
print("KEY INSIGHTS SUMMARY")
print("=" * 80)

# One grouped aggregation supplies every number below (row counts and mean
# amount per label), replacing four separate scans of the table.
per_label = {
    row['label']: (row['txn_count'], row['avg_amount'])
    for row in df.groupBy('label').agg(
        F.count('*').alias('txn_count'),
        F.avg('amount').alias('avg_amount'),
    ).collect()
}

# 1. Overall fraud rate
total_txns = sum(txn_count for txn_count, _ in per_label.values())
fraud_txns, fraud_avg_amount = per_label.get('Yes', (0, None))
_, legit_avg_amount = per_label.get('No', (0, None))
fraud_rate = (fraud_txns / total_txns * 100) if total_txns else 0.0

print("\n1. FRAUD OVERVIEW:")
print(f"   - Total Transactions: {total_txns:,}")
print(f"   - Fraud Cases: {fraud_txns:,}")
print(f"   - Fraud Rate: {fraud_rate:.3f}%")
if fraud_txns:
    print(f"   - Class Imbalance: 1:{int(total_txns/fraud_txns)}")
else:
    # Avoid ZeroDivisionError when the table has no 'Yes' rows
    print("   - Class Imbalance: n/a (no fraud rows)")

# 2. Amount insights (an average is None when its label has no rows)
if fraud_avg_amount is None or legit_avg_amount is None:
    amount_gap = None
else:
    amount_gap = abs(fraud_avg_amount - legit_avg_amount)

print("\n2. TRANSACTION AMOUNTS:")
print(f"   - Average Fraud Amount: {_usd(fraud_avg_amount)}")
print(f"   - Average Legitimate Amount: {_usd(legit_avg_amount)}")
print(f"   - Difference: {_usd(amount_gap)}")

# 3. Top risk factors
print("\n3. TOP RISK INDICATORS:")
print("   - Check correlations above for numeric features")
print("   - Review categorical fraud rates for patterns")
print("   - Consider temporal, geographic, and MCC patterns")

print("\n4. RECOMMENDATIONS FOR MODELING:")
print("   - Address class imbalance (SMOTE, class weights, or undersampling)")
print("   - Feature engineering: time-based, geographic, behavioral features")
print("   - Consider ensemble methods for better fraud detection")
print("   - Focus on high-risk segments identified in EDA")

print("\n" + "=" * 80)
print("✅ EXPLORATORY DATA ANALYSIS COMPLETE!")
print("=" * 80)