# Budget Lens - Data Analysis (Homework 3)

This notebook demonstrates NumPy operations, pandas data loading, summary statistics, and reusable functions for financial data analysis.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import time

# Make the project's src/ directory importable so the custom helpers resolve.
# The path is relative, so it is interpreted against the kernel's current
# working directory. The membership check keeps this cell idempotent:
# re-running it does not pile up duplicate entries on sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from utils import (
    get_summary_stats,
    category_analysis,
    save_summary_to_files,
    create_spending_plot,
    compare_performance,
)

# Set style for plots
plt.style.use('default')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")

## 1. NumPy Operations

### Array Creation and Element-wise Operations

In [None]:
# Six months of sample figures used for the calculations below
monthly_expenses = np.array([1200, 1350, 1180, 1420, 1290, 1380])
monthly_income = np.array([3000, 3000, 3000, 3000, 3000, 3000])

print("Monthly Expenses:", monthly_expenses)
print("Monthly Income:", monthly_income)

# Derived series: NumPy applies each operator element by element
savings = monthly_income - monthly_expenses
savings_fraction = savings / monthly_income
savings_rate = savings_fraction * 100
expense_ratio = monthly_expenses / monthly_income

print("\n=== Element-wise Operations ===")
print(f"Monthly Savings: {savings}")
print(f"Savings Rate (%): {np.round(savings_rate, 2)}")
print(f"Expense Ratio: {np.round(expense_ratio, 3)}")

# Aggregates over the expense series, computed first and then reported
expense_mean = monthly_expenses.mean()
expense_total = monthly_expenses.sum()
expense_max = monthly_expenses.max()
expense_min = monthly_expenses.min()
expense_std = monthly_expenses.std()

print("\n=== Array Statistics ===")
print(f"Average monthly expenses: ${expense_mean:.2f}")
print(f"Total expenses (6 months): ${expense_total:.2f}")
print(f"Highest expense month: ${expense_max:.2f}")
print(f"Lowest expense month: ${expense_min:.2f}")
print(f"Expense standard deviation: ${expense_std:.2f}")

### Performance Comparison: Loop vs Vectorized Operations

In [None]:
# Benchmark the loop-based and vectorized implementations through the
# shared helper; it returns a dict keyed by the names read below.
performance_results = compare_performance(data_size=100000)

print("=== Performance Comparison: Loop vs Vectorized ===")
print(f"Data size: {performance_results['data_size']:,} elements")
print(f"Loop time: {performance_results['loop_time']:.4f} seconds")
print(f"Vectorized time: {performance_results['vectorized_time']:.4f} seconds")
print(f"Speedup factor: {performance_results['speedup_factor']:.1f}x faster")

# Small hand-checkable example: evaluate x^2 + 2x + 1 both ways
print("\n=== Manual Demonstration ===")
test_array = np.array([1, 2, 3, 4, 5])
print(f"Original array: {test_array}")

# Element-at-a-time evaluation in pure Python
loop_result = [x ** 2 + 2 * x + 1 for x in test_array]
print(f"Loop result: {loop_result}")

# Whole-array evaluation in a single NumPy expression
vectorized_result = test_array ** 2 + 2 * test_array + 1
print(f"Vectorized result: {vectorized_result}")

# Both approaches must produce the same values
results_match = np.array_equal(loop_result, vectorized_result)
print(f"Results match: {results_match}")

## 2. Dataset Loading

### Load and Inspect CSV Data

In [None]:
# Read the starter CSV into a DataFrame (path is relative to the notebook's
# working directory)
df = pd.read_csv('../data/starter_data.csv')

print("=== Dataset Successfully Loaded ===")
dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset shape: {dataset_shape}")
print(f"Columns: {column_names}")

# Preview the first five records
print("\n=== First 5 rows ===", df.head(), sep="\n")

In [None]:
# Structural overview of the frame; .info() prints directly to stdout
print("=== Dataset Info ===")
df.info()

# Per-column count of null entries
missing_values = df.isnull().sum()
print("\n=== Missing Values ===", missing_values, sep="\n")

# dtype of every column
print("\n=== Data Types ===", df.dtypes, sep="\n")

## 3. Summary Statistics

### Basic Descriptive Statistics

In [None]:
# Descriptive statistics come from the shared helper in src/utils.py
summary_stats = get_summary_stats(df)

print("=== Summary Statistics for Numeric Columns ===", summary_stats, sep="\n")

# Headline figures taken directly from the 'amount' and 'category' columns
print("\n=== Additional Statistics ===")
amounts = df['amount']
print(f"Total spending: ${amounts.sum():.2f}")
print(f"Average transaction: ${amounts.mean():.2f}")
print(f"Median transaction: ${amounts.median():.2f}")
print(f"Number of transactions: {df.shape[0]}")
categories = df['category']
print(f"Number of unique categories: {categories.nunique()}")
print(f"Categories: {sorted(categories.unique())}")

### GroupBy Analysis by Category

In [None]:
# Per-category aggregation via the shared helper in src/utils.py
category_stats = category_analysis(df)

print("=== Spending Analysis by Category ===", category_stats, sep="\n")

# Rank categories from highest to lowest total; the helper's result is
# expected to expose 'count', 'sum' and 'mean' columns, which are shown here
print("\n=== Categories Ranked by Total Spending ===")
category_ranking = category_stats.sort_values(by='sum', ascending=False)
ranking_columns = ['count', 'sum', 'mean']
print(category_ranking[ranking_columns])

## 4. Save Outputs

### Save Summary Statistics to Files

In [None]:
# Collect the tables and headline figures that should be persisted
amounts = df['amount']
summary_data = {
    'overall_summary': summary_stats,
    'category_analysis': category_stats,
    'total_spending': amounts.sum(),
    'average_transaction': amounts.mean(),
    'transaction_count': df.shape[0],
    'unique_categories': df['category'].nunique(),
}

# Hand everything to the shared helper, which writes into the target directory
output_dir = '../data/processed'
save_summary_to_files(summary_data, output_dir)

print("✅ Summary statistics saved successfully!")

### Bonus: Create and Save Basic Plot

In [None]:
# Build the spending figure with the shared helper, passing the PNG
# destination, then render it inline
plot_path = '../data/processed/spending_analysis.png'
fig = create_spending_plot(df, plot_path)
plt.show()

print("✅ Spending analysis plot created and saved!")

## 5. Reusable Functions Demonstration

### Testing Our Custom Utility Functions

In [None]:
# Exercise get_summary_stats with both settings of its numeric_only flag
print("=== Testing get_summary_stats with different options ===")

# numeric_only=True: the helper's default mode
numeric_summary = get_summary_stats(df, numeric_only=True)
print("Numeric columns summary:", numeric_summary, sep="\n")

print("\n" + "="*50)

# numeric_only=False: summarize every column
all_summary = get_summary_stats(df, numeric_only=False)
print("All columns summary:", all_summary, sep="\n")

In [None]:
# Exercise category_analysis on the full frame, then drill into one category
print("=== Testing category_analysis function ===")

# Standard analysis over every category
standard_analysis = category_analysis(df)
print("Standard category analysis:")
print(standard_analysis)

# Keep only rows whose category is exactly 'Food' (the match is case-sensitive)
food_data = df[df['category'] == 'Food']
food_count = len(food_data)
print("\n=== Food Category Deep Dive ===")
print(f"Food transactions: {food_count}")
if food_count == 0:
    # Guard the empty case: the mean of an empty selection is NaN and would
    # otherwise be reported as "$nan".
    print("No 'Food' transactions found; skipping the spending breakdown.")
else:
    print(f"Total food spending: ${food_data['amount'].sum():.2f}")
    print(f"Average food transaction: ${food_data['amount'].mean():.2f}")
    print(f"Food merchants: {food_data['merchant'].unique()}")

## Summary

This notebook successfully demonstrates:

1. ✅ **NumPy Operations**: Array creation, element-wise operations, and performance comparison
2. ✅ **Dataset Loading**: CSV loading with pandas, inspection using `.info()` and `.head()`
3. ✅ **Summary Statistics**: `.describe()` for numeric columns and `.groupby()` aggregation
4. ✅ **Save Outputs**: Summary statistics saved to CSV and JSON formats, plus visualization
5. ✅ **Reusable Functions**: Custom utility functions in `src/utils.py` imported and used

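### Key Insights from the Data:
- Total spending across all categories is reported under "Additional Statistics" in Section 3
- Food is typically the largest expense category; the ranking by total spending in Section 3 shows whether that holds for this dataset
- Vectorized NumPy operations are significantly faster than equivalent Python loops (see the speedup factor in Section 1)
- Grouping transactions by category reveals clear spending patterns that can be turned into actionable budgeting decisions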