# Data Exploration and Parsing Analysis

This notebook explores the synthetic transaction logs dataset and tests our robust parser implementation.

## Objectives:
1. Load and explore the raw dataset
2. Test the log parser on various formats
3. Analyze parsing success rates and data quality
4. Create visualizations for data understanding

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import our modules
from parser.log_parser import TransactionLogParser
from utils.config import config
from utils.visualization import viz

# Plot defaults applied to the figures created after this cell runs.
plt.style.use('seaborn-v0_8')
sns.set_palette('viridis')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Reaching this point means every import above resolved; echo the configured
# input file so the reader can see which setting was loaded.
print("✅ All imports successful!")
configured_input = config.get('data.input_file')
print(f"Configuration loaded: {configured_input}")

## 1. Load and Explore Raw Dataset

In [None]:
# Load raw data
# Every row is read as two '|'-separated fields: a line number and the raw log text.
# NOTE(review): some log formats themselves contain '|' (see the "usr:" test entry
# in the parser section) -- confirm this delimiter splits the file as intended.
data_file = '../synthetic_dirty_transaction_logs.csv'
df_raw = pd.read_csv(data_file, delimiter='|', header=None, names=['line_num', 'raw_log'])

print("📊 Dataset Overview:")
print(f"Total entries: {len(df_raw):,}")
print(f"Columns: {list(df_raw.columns)}")
print(f"Memory usage: {df_raw.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Display basic statistics
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("\n📈 Raw Data Info:")
df_raw.info()

In [None]:
# Show a handful of usable raw entries so the different log formats are visible.
print("🔍 Sample Log Entries:")
print("=" * 80)

# An entry is usable when it is present, is not the literal "" placeholder,
# and is not the MALFORMED_LOG marker.
is_usable = (
    df_raw['raw_log'].notna()
    & df_raw['raw_log'].ne('""')
    & df_raw['raw_log'].ne('MALFORMED_LOG')
)
valid_logs = df_raw.loc[is_usable, 'raw_log']

# Number the first 10 usable entries starting from 1.
for position, log_entry in enumerate(valid_logs.iloc[:10], start=1):
    print(f"{position:2d}. {log_entry}")

print("\n" + "=" * 80)

In [None]:
# Analyze log entry patterns
# Counts how many raw entries carry each placeholder value or separator marker.
print("📋 Log Entry Pattern Analysis:")
print("=" * 50)

raw_text = df_raw['raw_log']


def _count_literal(fragment):
    """Return how many raw log entries contain `fragment` as a literal substring."""
    # regex=False matches characters such as '|' literally, so no escaping
    # (and no invalid '\|' escape sequence in a normal string) is needed.
    return int(raw_text.str.contains(fragment, regex=False, na=False).sum())


# Count different patterns
pattern_counts = {
    'Empty ("")': int((raw_text == '""').sum()),
    'Malformed': int((raw_text == 'MALFORMED_LOG').sum()),
    'Null/NaN': int(raw_text.isna().sum()),
    'With |': _count_literal('|'),
    # Exactly two colons: without the look-arounds every ':::' entry would be
    # counted here as well, inflating the '::' figure.
    'With ::': int(raw_text.str.contains(r'(?<!:)::(?!:)', na=False).sum()),
    # The label now matches the substring that is actually searched for.
    'With >>': _count_literal('>>'),
    'With :::': _count_literal(':::'),
    'With usr:': _count_literal('usr:'),
    'Space separated': int(raw_text.str.startswith('user', na=False).sum()),
}

total_entries = len(df_raw)
for pattern, count in pattern_counts.items():
    # Guard against an empty input file instead of raising ZeroDivisionError.
    percentage = (count / total_entries) * 100 if total_entries else 0.0
    print(f"{pattern:15s}: {count:5d} ({percentage:5.1f}%)")

# valid_logs comes from the sampling cell above.
print(f"\nTotal valid logs: {len(valid_logs):,}")

## 2. Test Log Parser

In [None]:
# Build the parser once; it is reused for the full-dataset run further down.
parser = TransactionLogParser()
print("🔧 Parser initialized successfully!")

# Run the parser over one hand-picked entry per log format.
print("\n🧪 Testing Parser on Sample Entries:")
print("=" * 80)

test_entries = [
    "2025-07-05 19:18:10::user1069::withdrawal::2995.12::London::iPhone 13",
    "usr:user1076|cashout|€4821.85|Glasgow|2025-07-15 12:56:05|Pixel 6",
    "2025-07-20 05:38:14 >> [user1034] did top-up - amt=€2191.06 - None // dev:iPhone 13",
    "13/07/2025 14:53:36 ::: user1048 *** TOP-UP ::: amt:3248.15£ @ Manchester <iPhone 13>",
    "2025-06-23 14:45:58 - user=user1075 - action=debit $1215.74 - ATM: Leeds - device=Samsung Galaxy S10"
]

for test_number, raw_entry in enumerate(test_entries, start=1):
    print(f"\nTest {test_number}:")
    print(f"Raw: {raw_entry}")

    parsed = parser.parse_log_entry(raw_entry)
    print(f"Parsed: {parsed.is_parsed}")

    # Failed parses only report their errors; nothing else is printed for them.
    if not parsed.is_parsed:
        print(f"  Errors: {parsed.parse_errors}")
        continue

    print(f"  User: {parsed.user_id}")
    print(f"  Type: {parsed.transaction_type}")
    print(f"  Amount: {parsed.currency}{parsed.amount}")
    print(f"  Location: {parsed.location}")
    print(f"  Device: {parsed.device}")
    print(f"  Timestamp: {parsed.timestamp}")

## 3. Parse Complete Dataset

In [None]:
# Parse the complete dataset
# parse_dataset returns the parsed frame and a statistics mapping; the keys
# read below are the ones this cell reports.
print("🚀 Parsing complete dataset...")
print("This may take a few minutes...")

df_parsed, parsing_stats = parser.parse_dataset(data_file)

print("\n✅ Parsing Complete!")
print("=" * 50)
print(f"Total logs processed: {parsing_stats['total_logs']:,}")
print(f"Successfully parsed: {parsing_stats['parsed_successfully']:,}")
print(f"Parsing failed: {parsing_stats['parsing_failed']:,}")
print(f"Empty logs: {parsing_stats['empty_logs']:,}")
print(f"Malformed logs: {parsing_stats['malformed_logs']:,}")

# Report 0.0% for an empty input instead of raising ZeroDivisionError.
total_logs = parsing_stats['total_logs']
success_rate = (parsing_stats['parsed_successfully'] / total_logs) * 100 if total_logs else 0.0
print(f"\n📈 Overall Success Rate: {success_rate:.1f}%")

In [None]:
# Display comprehensive parsing statistics
print("\n" + "="*80)
print("📊 COMPREHENSIVE PARSING STATISTICS")
print("="*80)
parser.print_parsing_statistics(parsing_stats)

# Export detailed statistics report
# This is the first cell that writes into ../results, so make sure the folder
# exists; otherwise a fresh checkout fails here under Restart & Run All.
os.makedirs('../results', exist_ok=True)
parser.export_statistics_report(parsing_stats, '../results/detailed_parsing_report.json')
print("\n💾 Detailed parsing report saved to results/detailed_parsing_report.json")

# Create and display summary table
summary_table = parser.get_parsing_summary_table(parsing_stats)
print("\n📋 Parsing Summary Table:")
print(summary_table.to_string(index=False))

# Display parsed data overview
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
print("\n📊 Parsed Dataset Overview:")
df_parsed.info()

# Keep only rows the parser flagged as parsed; every later section works on df_valid.
print("\n🎯 Successfully Parsed Data:")
df_valid = df_parsed[df_parsed['is_parsed'] == True]
print(f"Valid transactions: {len(df_valid):,}")
print(f"Date range: {df_valid['timestamp'].min()} to {df_valid['timestamp'].max()}")
print(f"Unique users: {df_valid['user_id'].nunique():,}")
print(f"Transaction types: {df_valid['transaction_type'].nunique()}")
print(f"Unique locations: {df_valid['location'].nunique()}")
print(f"Unique devices: {df_valid['device'].nunique()}")

## 4. Data Quality Analysis

In [None]:
# Create data quality visualization
fig = viz.plot_data_quality_report(df_valid, parsing_stats)
plt.show()

# Save the figure
# savefig does not create missing directories, so ensure ../results exists
# before writing (keeps the notebook runnable on a fresh checkout).
os.makedirs('../results', exist_ok=True)
fig.savefig('../results/data_quality_report.png', dpi=300, bbox_inches='tight')
print("💾 Data quality report saved to results/data_quality_report.png")

In [None]:
# Per-field completeness: share of parsed rows holding a non-null value.
print("🔍 Data Completeness Analysis:")
print("=" * 40)

tracked_fields = ['timestamp', 'user_id', 'transaction_type', 'amount', 'currency', 'location', 'device']
row_total = len(df_valid)

completeness = {}
for field in tracked_fields:
    present = df_valid[field].notna().sum()
    completeness[field] = (present / row_total) * 100
    print(f"{field:16s}: {present:5d}/{row_total:5d} ({completeness[field]:.1f}%)")

# The overall score is the unweighted mean of the per-field percentages.
overall_score = np.mean(list(completeness.values()))
print(f"\nOverall completeness score: {overall_score:.1f}%")

In [None]:
# Frequency tables for the three categorical fields of the parsed data.
def _show_counts(heading, column):
    """Print `heading`, then the value counts of df_valid[column]; return the counts."""
    print(heading)
    counts = df_valid[column].value_counts()
    print(counts)
    return counts


type_counts = _show_counts("💳 Transaction Type Distribution:", 'transaction_type')
currency_counts = _show_counts("\n💰 Currency Distribution:", 'currency')
location_counts = _show_counts("\n🌍 Location Distribution:", 'location')

## 5. Temporal Analysis

In [None]:
# Add temporal features
# Work on a copy so the new columns are not written into a slice of df_parsed.
df_valid = df_valid.copy()
df_valid['hour'] = df_valid['timestamp'].dt.hour
df_valid['day_of_week'] = df_valid['timestamp'].dt.dayofweek
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday.
df_valid['is_weekend'] = df_valid['day_of_week'].isin([5, 6])

# Plot temporal patterns
fig = viz.plot_temporal_patterns(df_valid)
plt.show()

# Save the figure
# savefig does not create missing directories, so ensure ../results exists first.
os.makedirs('../results', exist_ok=True)
fig.savefig('../results/temporal_patterns.png', dpi=300, bbox_inches='tight')
print("💾 Temporal patterns saved to results/temporal_patterns.png")

## 6. User Behavior Analysis

In [None]:
# User behavior analysis
fig = viz.plot_user_behavior_analysis(df_valid)
plt.show()

# Save the figure
# savefig does not create missing directories, so ensure ../results exists first.
os.makedirs('../results', exist_ok=True)
fig.savefig('../results/user_behavior_analysis.png', dpi=300, bbox_inches='tight')
print("💾 User behavior analysis saved to results/user_behavior_analysis.png")

In [None]:
# Per-user aggregates: amount statistics plus the number of distinct
# locations, devices and transaction types each user appears with.
print("👤 User Statistics:")
print("=" * 30)

# Named aggregation gives each output column its final name directly, in the
# same order as before, instead of renaming the columns positionally afterwards.
per_user = df_valid.groupby('user_id')
user_stats = per_user.agg(
    tx_count=('amount', 'count'),
    avg_amount=('amount', 'mean'),
    std_amount=('amount', 'std'),
    min_amount=('amount', 'min'),
    max_amount=('amount', 'max'),
    unique_locations=('location', 'nunique'),
    unique_devices=('device', 'nunique'),
    unique_types=('transaction_type', 'nunique'),
).round(2)

print("Summary statistics for all users:")
print(user_stats.describe())

print("\nTop 10 most active users:")
most_active = user_stats.sort_values('tx_count', ascending=False).head(10)
print(most_active)

## 7. Amount Analysis

In [None]:
# Amount distribution analysis
print("💵 Amount Analysis:")
print("=" * 25)

amount_stats = df_valid['amount'].describe()
print("Amount statistics:")
print(amount_stats)

# Histograms cannot bin NaN values (the automatic range becomes non-finite and
# hist raises), so missing amounts are dropped for the two histogram panels.
amounts = df_valid['amount'].dropna()
# log1p is only finite for values above -1; keep just that domain for the log panel.
loggable_amounts = amounts[amounts > -1]

# Plot amount distribution
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Histogram
axes[0, 0].hist(amounts, bins=50, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Amount Distribution')
axes[0, 0].set_xlabel('Amount')
axes[0, 0].set_ylabel('Frequency')

# Log scale histogram
axes[0, 1].hist(np.log1p(loggable_amounts), bins=50, alpha=0.7, edgecolor='black', color='orange')
axes[0, 1].set_title('Amount Distribution (Log Scale)')
axes[0, 1].set_xlabel('Log(Amount + 1)')
axes[0, 1].set_ylabel('Frequency')

# Box plot by transaction type
df_valid.boxplot(column='amount', by='transaction_type', ax=axes[1, 0])
axes[1, 0].set_title('Amount by Transaction Type')
axes[1, 0].set_xlabel('Transaction Type')
axes[1, 0].set_ylabel('Amount')

# Box plot by currency
df_valid.boxplot(column='amount', by='currency', ax=axes[1, 1])
axes[1, 1].set_title('Amount by Currency')
axes[1, 1].set_xlabel('Currency')
axes[1, 1].set_ylabel('Amount')

# pandas' grouped boxplot replaces the figure-level title with
# "Boxplot grouped by ...", so the intended title is set after both calls.
fig.suptitle('Transaction Amount Analysis', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

# Save the figure
# savefig does not create missing directories, so ensure ../results exists first.
os.makedirs('../results', exist_ok=True)
fig.savefig('../results/amount_analysis.png', dpi=300, bbox_inches='tight')
print("💾 Amount analysis saved to results/amount_analysis.png")

## 8. Save Processed Data

In [None]:
# json is only needed by this cell; the import is kept here so the cell runs
# on its own (consider moving it to the import cell at the top of the notebook).
import json


def _json_default(value):
    """Convert values json cannot encode natively into JSON-friendly ones.

    Called by json.dump only for objects it does not know how to serialize.
    """
    # datetime / date / pandas Timestamp objects all expose isoformat().
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    # numpy scalars -> the equivalent built-in Python scalar.
    if isinstance(value, np.generic):
        return value.item()
    # numpy arrays -> plain (nested) lists.
    if isinstance(value, np.ndarray):
        return value.tolist()
    # Last resort: keep the report writable rather than failing the whole save.
    return str(value)


# Create results directory
os.makedirs('../results', exist_ok=True)

# Save processed data
df_valid.to_csv('../results/parsed_transactions.csv', index=False)
print(f"💾 Parsed transactions saved: {len(df_valid):,} records")

# Save parsing statistics
# The statistics may hold datetime or numpy values; _json_default converts
# them during serialization, so parsing_stats itself is left untouched.
with open('../results/parsing_stats.json', 'w', encoding='utf-8') as f:
    json.dump(parsing_stats, f, indent=2, default=_json_default)
print("💾 Parsing statistics saved")

print("\n✅ Data exploration complete!")
print(f"Ready for feature engineering with {len(df_valid):,} clean transactions")

## Summary

### Key Findings:
1. **Parsing Success**: Our robust parser achieved a high success rate across multiple log formats (see the overall success rate printed in Section 3)
2. **Data Quality**: The parsed dataset shows good completeness across key fields
3. **Temporal Patterns**: Clear patterns in transaction timing and user behavior
4. **User Diversity**: Wide variety in user transaction patterns and amounts
5. **Business Context**: Although synthetic, the data is intended to represent realistic financial transaction patterns

### Next Steps:
1. Feature engineering based on discovered patterns
2. Behavioral modeling for anomaly detection
3. Implementation of multiple detection approaches
4. Model evaluation and optimization