# Generate synthetic data

### Utilities

In [1]:
import pandas as pd
import random
import csv
from datetime import datetime, timedelta

# Define utility bill ranges as (low, high) bounds.
# generate_utility_expenses unpacks each tuple into random.uniform, so every
# monthly bill is drawn uniformly between the two values.
water_bill_range = (40, 70)
electric_bill_range = (100, 150)
gas_bill_range = (15, 40)

# Generate utility bills for each month of a year (2023 by default)
def generate_utility_expenses(year=2023):
    """Return twelve monthly utility-bill records for *year*.

    Each record is a dict with the keys "Month" (formatted "YYYY-MM"),
    "Water Bill", "Electric Bill" and "Gas Bill". Every bill is drawn with
    random.uniform from the matching module-level ``*_bill_range`` tuple and
    rounded to two decimal places.
    """
    bill_bounds = (
        ("Water Bill", water_bill_range),
        ("Electric Bill", electric_bill_range),
        ("Gas Bill", gas_bill_range),
    )

    monthly_records = []
    for month_number in range(1, 13):
        # Bills are drawn in water, electric, gas order for every month.
        bills = {
            label: round(random.uniform(*bounds), 2)
            for label, bounds in bill_bounds
        }
        # First day of the month; only used to format the "YYYY-MM" label.
        month_label = datetime(year, month_number, 1).strftime("%Y-%m")
        monthly_records.append({"Month": month_label, **bills})
    return monthly_records

# Generate the utility expenses for the default year (2023):
# a list with one dict per month
utility_expenses = generate_utility_expenses()


# Convert the list of dictionaries to a pandas DataFrame
# (one row per month, one column per dict key)
utility_expenses_df = pd.DataFrame(utility_expenses)

# Use .head() to view the first 5 rows; as the last expression of the cell
# its result is what the notebook renders below
utility_expenses_df.head()

Unnamed: 0,Month,Water Bill,Electric Bill,Gas Bill
0,2023-01,67.21,112.0,37.65
1,2023-02,55.76,124.34,25.93
2,2023-03,69.88,115.07,37.36
3,2023-04,66.66,110.1,28.82
4,2023-05,46.51,143.98,29.53


In [2]:
# Save the DataFrame to a CSV file; index=False leaves the row index out of the file
utility_expenses_df.to_csv('FakeExpenses-Utilities.csv', index=False)

### Recurring Expenses

In [4]:
# Define the recurring expenses: expense name -> amount charged every month
recurring_expenses = {
    "Health Insurance": 149.76,
    "Rent": 1200.00,
    "Savings": 1200.00,
    "Internet Bill": 63.42
}

# Define the payment dates for each expense: expense name -> day of the month.
# generate_expenses looks up payment_dates[expense] for every key of
# recurring_expenses, so both dicts must share the same keys.
payment_dates = {
    "Health Insurance": 1,  # 1st of the month
    "Rent": 1,              # 1st of the month
    "Savings": 1,           # 1st of the month
    "Internet Bill": 21     # 21st of the month
}

# Create a function to generate expense entries
def generate_expenses(start_year, end_year, recurring_expenses, payment_dates):
    """Build one dated entry per recurring expense for every month in a year range.

    Args:
        start_year: First calendar year to generate (inclusive).
        end_year: Last calendar year to generate (inclusive). A value smaller
            than ``start_year`` yields an empty list.
        recurring_expenses: Mapping of expense name to the amount charged
            each month.
        payment_dates: Mapping of expense name to the day of the month on
            which it is paid. Must contain every key of
            ``recurring_expenses``.

    Returns:
        A list of dicts with the keys "Date" (a "YYYY-MM-DD" string),
        "Expense" and "Amount", ordered by year, then month, then the
        iteration order of ``recurring_expenses``.

    Raises:
        KeyError: If an expense has no entry in ``payment_dates``.
        ValueError: If a payment day is smaller than 1.
    """
    # Imported locally so the function does not rely on a top-of-file import.
    import calendar

    expenses = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):  # Loop through months 1 to 12
            # monthrange returns (weekday of the 1st, number of days in month).
            last_day = calendar.monthrange(year, month)[1]
            for expense, amount in recurring_expenses.items():
                # Clamp days such as 29-31 to the month's last day so short
                # months (e.g. February) do not raise ValueError.
                day = min(payment_dates[expense], last_day)
                date_str = datetime(year, month, day).strftime('%Y-%m-%d')
                expenses.append({"Date": date_str, "Expense": expense, "Amount": amount})
    return expenses

# Generate expenses for 2023 (start and end year are both inclusive)
expense_data = generate_expenses(2023, 2023, recurring_expenses, payment_dates)

# Create a DataFrame with the columns Date, Expense and Amount
df = pd.DataFrame(expense_data)

# Display the DataFrame
# NOTE(review): this is not the last expression of the cell, so a notebook
# presumably renders nothing for it; consider moving it to the end of the cell.
df.head()

# Save to a file; index=False leaves the row index out of the CSV
df.to_csv("FakeExpenses-RecurringExpenses.csv", index=False)

### Variable Expenses

In [12]:
from faker import Faker
from datetime import date

# Initialize Faker instance (generate_expense_data uses it to pick random dates)
fake = Faker()

# Configuration for expense categories and their ranges.
# For each category:
#   "monthly_range"     - (low, high) bounds for the month's randomly drawn spending target
#   "transaction_range" - (low, high) bounds for each individual transaction amount
categories = {
    "Transportation": {
        "monthly_range": (125.00, 225.00),
        "transaction_range": (31.25, 56.25),
    },
    "Food": {
        "monthly_range": (325.00, 425.00),
        "transaction_range": (5.00, 100.00),
    },
    "Health": {
        "monthly_range": (25.00, 60.00),
        "transaction_range": (6.25, 15.00),
    },
    "Miscellaneous": {
        "monthly_range": (350.00, 450.00),
        "transaction_range": (3.25, 325.00),
    },
}

# Generate synthetic data for each month of 2023
def generate_expense_data(year=2023):
    """Generate random per-transaction expenses for every month of *year*.

    For each month and each entry of the module-level ``categories`` dict, a
    monthly spending target is drawn from the category's "monthly_range".
    Transaction amounts are then drawn from "transaction_range" and kept
    until the next one would push the running total above the target, so the
    kept transactions never sum to more than the target. Each kept
    transaction is assigned a random date within its month.

    Args:
        year: Calendar year to generate data for. Defaults to 2023.

    Returns:
        A list of dicts with the keys "Date" (the value returned by
        ``fake.date_between_dates``), "Category" and "Amount", grouped by
        month and category in generation order (not sorted by date).
    """
    # Imported locally so the function does not rely on a top-of-file import.
    import calendar

    data = []

    for month in range(1, 13):  # Iterate over months
        # Use the real month length so days 29-31 can also be picked;
        # monthrange returns (weekday of the 1st, number of days in month).
        month_start = date(year, month, 1)
        month_end = date(year, month, calendar.monthrange(year, month)[1])

        for category, ranges in categories.items():
            monthly_total = round(random.uniform(*ranges["monthly_range"]), 2)
            transactions = []
            spent = 0.0  # running total, avoids re-summing the list every pass

            while spent < monthly_total:
                transaction_amount = round(random.uniform(*ranges["transaction_range"]), 2)
                # Stop at the first draw that would overshoot the target.
                if spent + transaction_amount > monthly_total:
                    break
                transactions.append(transaction_amount)
                spent += transaction_amount

            # Create individual transactions with random dates in the month
            for transaction in transactions:
                transaction_date = fake.date_between_dates(
                    date_start=month_start,
                    date_end=month_end,
                )
                data.append({
                    "Date": transaction_date,
                    "Category": category,
                    "Amount": transaction,
                })

    return data

# Generate data and convert to a DataFrame
# (expense_data and df are rebound here, replacing the recurring-expense
# objects created under the same names earlier in the file)
expense_data = generate_expense_data()
df = pd.DataFrame(expense_data)

# Sort data by date for better readability; the sort is in place and the
# original row labels are kept, so the index is no longer sequential
df.sort_values(by="Date", inplace=True)

# Save the data to a CSV file; index=False leaves the row index out of the file
df.to_csv("FakeExpenses-VariableExpenses.csv", index=False)

In [13]:
# Preview the first five rows of the date-sorted variable expenses
df.head()

Unnamed: 0,Date,Category,Amount
12,2023-01-01,Health,8.51
7,2023-01-01,Food,96.07
1,2023-01-03,Transportation,48.82
8,2023-01-03,Food,71.19
3,2023-01-06,Transportation,37.66
