<a href="https://colab.research.google.com/github/jojrg/Star_Schema_Generator/blob/master/Star_Schema_Data_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install Faker

Collecting Faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-37.1.0


# Star Schema Data Generator

This Python code can be used as a foundation for generating synthetic data stored in a star schema data model.

The code generates data for the following tables:

*   Customers (Dimension Table)
*   Receipts (Fact Table)
*   Stores (Dimension Table)



In [2]:
import random
from faker import Faker

class DataFactory:
    """
    A factory class to generate synthetic data for a star schema,
    including receipts (fact table), customers, and stores (dimension tables).

    The customer and store IDs are drawn once in ``__init__`` so that the
    receipts returned by ``generate_receipts`` reference the same keys as the
    rows returned by ``generate_customers`` and ``generate_stores``.
    """

    # German bounding box for generating random points (approximate),
    # ordered as (min_lat, min_lon, max_lat, max_lon).
    GERMANY_BBOX = (47.2, 5.9, 55.0, 15.0)

    # Upper bound on phone-number redraws, so a locale that never produces a
    # '+49' number cannot hang generate_customers().
    _MAX_PHONE_ATTEMPTS = 100

    def __init__(self, num_distinct_customers=50, num_distinct_stores=10,
                 registered_customer_probability=0.6, locale='de_DE'):
        """
        Initializes the DataFactory with configuration parameters.

        Args:
            num_distinct_customers (int, optional): Number of distinct customer IDs. Defaults to 50.
            num_distinct_stores (int, optional): Number of distinct store IDs. Defaults to 10.
            registered_customer_probability (float, optional): Probability that a receipt
                is linked to a registered customer. Defaults to 0.6.
            locale (str, optional): Faker locale to use. Defaults to 'de_DE'.
        """
        self.num_distinct_customers = num_distinct_customers
        self.num_distinct_stores = num_distinct_stores
        self.registered_customer_probability = registered_customer_probability
        self.fake = Faker(locale)
        self.distinct_customer_ids = [self.fake.uuid4() for _ in range(self.num_distinct_customers)]
        # Store IDs are the key of the stores dimension, so they must be unique.
        self.distinct_store_ids = self._generate_distinct_store_ids(self.num_distinct_stores)
        self.product_categories = {
            "Electronics": ["Laptop", "Smartphone", "Tablet", "Headphones", "Smartwatch"],
            "Clothing": ["T-shirt", "Jeans", "Dress", "Jacket", "Shoes"],
            "Books": ["Novel", "Cookbook", "Textbook", "Biography", "Poetry"],
            "Groceries": ["Milk", "Eggs", "Bread", "Apples", "Chicken"],
            "Home Goods": ["Lamp", "Pillow", "Blanket", "Table", "Chair"],
            "Toys": ["Action Figure", "Board Game", "Puzzle", "Doll", "Building Blocks"],
            "Sports": ["Basketball", "Running Shoes", "Yoga Mat", "Tennis Racket", "Football"],
            "Beauty": ["Lipstick", "Shampoo", "Lotion", "Perfume", "Mascara"]
        }
        self.payment_types = ["Credit Card", "Debit Card", "Cash", "Online Payment"]

    @staticmethod
    def _generate_distinct_store_ids(count):
        """Draws ``count`` unique store IDs and returns them as strings.

        IDs are sampled without replacement from 1..100. When more than 100
        stores are requested the range grows to 1..count so that uniqueness
        can still be guaranteed.

        Args:
            count (int): Number of store IDs to draw.

        Returns:
            list: ``count`` distinct numeric strings; empty if ``count`` is not positive.
        """
        if count <= 0:
            return []
        upper_bound = max(100, count)
        return [str(number) for number in random.sample(range(1, upper_bound + 1), count)]

    def _generate_german_coordinates(self):
        """Generates random latitude and longitude within the GERMANY_BBOX rectangle."""
        min_lat, min_lon, max_lat, max_lon = self.GERMANY_BBOX
        latitude = random.uniform(min_lat, max_lat)
        longitude = random.uniform(min_lon, max_lon)
        return latitude, longitude

    def _generate_german_phone_number(self):
        """Generates a phone number, preferring one that starts with '+49'.

        Redraws up to ``_MAX_PHONE_ATTEMPTS`` times. If no '+49' number shows
        up (e.g. a non-German locale), the last drawn number is returned
        instead of looping forever.
        """
        phone_number = self.fake.phone_number()
        for _ in range(self._MAX_PHONE_ATTEMPTS):
            if phone_number.startswith('+49'):
                break
            phone_number = self.fake.phone_number()
        return phone_number

    def generate_receipts(self, num_receipts):
        """Generates a list of synthetic receipt records (fact table).

        Each record is a tuple of
        ``(receipt_id, timestamp, store_id, product, category, amount, price,
        price_sum, payment_type, creditcard_id, customer_id)``.
        ``creditcard_id`` is None unless the payment type is a card, and
        ``customer_id`` is None for receipts without a registered customer.

        Args:
            num_receipts (int): The number of receipt records to generate.

        Returns:
            list: A list of tuples, where each tuple represents a receipt record.
                   Returns an empty list if num_receipts is not positive.
        """
        if num_receipts <= 0:
            return []
        # Loop-invariant lookups, built once instead of per receipt.
        categories = list(self.product_categories)
        card_payment_types = ("Credit Card", "Debit Card")
        receipts_data = []
        for _ in range(num_receipts):
            timestamp = self.fake.date_time_between(start_date="-1y", end_date="now")
            amount = random.randint(1, 10)
            price = round(random.uniform(5, 100), 2)
            price_sum = round(price * amount, 2)
            category = random.choice(categories)
            product = random.choice(self.product_categories[category])
            payment_type = random.choice(self.payment_types)
            creditcard_id = self.fake.credit_card_number() if payment_type in card_payment_types else None
            # Without any known customers every receipt is anonymous; this
            # avoids random.choice() raising IndexError on an empty list.
            if self.distinct_customer_ids and random.random() < self.registered_customer_probability:
                customer_id = random.choice(self.distinct_customer_ids)
            else:
                customer_id = None
            store_id = random.choice(self.distinct_store_ids)
            receipts_data.append((
                self.fake.uuid4(), timestamp, store_id, product, category,
                amount, price, price_sum, payment_type, creditcard_id, customer_id
            ))
        return receipts_data

    def generate_customers(self):
        """Generates a list of synthetic customer records (dimension table).

        One record is produced per ID in ``distinct_customer_ids``. Each record
        is a tuple of ``(customer_id, first_name, last_name, city,
        phone_number, email)``.

        Returns:
            list: A list of tuples, where each tuple represents a customer record.
        """
        email_domains = ['example.com', 'fake-mail.net', 'test.org']
        customer_data = []
        for cid in self.distinct_customer_ids:
            first_name = self.fake.first_name().lower()
            last_name = self.fake.last_name().lower()
            city = self.fake.city()
            phone_number = self._generate_german_phone_number()
            # Only the username is needed, so ask Faker for it directly rather
            # than building a complete profile.
            generated_username = self.fake.user_name()
            # Username: 3 letters of each name plus the last 3 alphanumeric
            # characters of the generated username.
            suffix = ''.join(filter(str.isalnum, generated_username))[-3:]
            username = f"{first_name[:3]}{last_name[:3]}{suffix}".lower()
            domain = random.choice(email_domains)
            email = f"{username}@{domain}"
            customer_data.append((cid, first_name.capitalize(), last_name.capitalize(), city, phone_number, email))
        return customer_data

    def generate_stores(self):
        """Generates a list of synthetic store records (dimension table).

        One record is produced per ID in ``distinct_store_ids``. Each record is
        a tuple of ``(store_id, store_name, city, latitude, longitude)``. The
        city name and the coordinates are drawn independently of each other.

        Returns:
            list: A list of tuples, where each tuple represents a store record.
        """
        store_data = []
        for store_id in self.distinct_store_ids:
            store_name = f"Store #{store_id}"
            latitude, longitude = self._generate_german_coordinates()
            city = self.fake.city()
            store_data.append((store_id, store_name, city, latitude, longitude))
        return store_data

def main():
    """
    Demonstrates the DataFactory end to end.

    Builds a factory with 50 customers, 10 stores and a 0.6 registered-customer
    probability for the 'de_DE' locale, generates the receipts, customers and
    stores tables (in that order), and prints how many records each contains.
    """
    factory = DataFactory(
        num_distinct_customers=50,
        num_distinct_stores=10,
        registered_customer_probability=0.6,
        locale='de_DE',
    )

    receipt_count = 100000
    # Tuple elements are evaluated left to right: receipts, customers, stores.
    generated_tables = (
        ("receipt", factory.generate_receipts(receipt_count)),
        ("customer", factory.generate_customers()),
        ("store", factory.generate_stores()),
    )

    for label, rows in generated_tables:
        print(f"Generated {len(rows)} {label} records.")

    # The generated rows can be processed further here, e.g. inserted into database tables.

# Run the demo only when this module is executed as the main program,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Generated 100000 receipt records.
Generated 50 customer records.
Generated 10 store records.
