<a href="https://colab.research.google.com/github/SSSpock/skillspire/blob/main/ecomm_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this notebook is to create a realistic representation of an ecommerce data set for teaching datascience to students.

In [13]:
import pandas as pd
import numpy as np
import random
from datetime import date
from datetime import datetime
from dateutil.relativedelta import relativedelta
from ast import literal_eval

!pip install faker
import faker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Create Customer Data set

# Initialize Faker object
fake = faker.Faker()

# Set seed for reproducibility
random.seed(123)

# Set date range for customer creation dates
start_date = date.today() - relativedelta(months=24)

# Fake Cities
cities = []
for _ in range(15):
  cities.append(fake.city())

# Generate customer information
customers = []
customer_ids = set()
while len(customers) < 500 * 24:
  # Generate random customer ID (unique)
  customer_id = random.randint(0, 19999)
  
  while customer_id in customer_ids:
    customer_id = random.randint(0, 19999)
  
  customer_ids.add(customer_id)
  
  # Generate random gender
  gender = random.choice(["Male", "Female"])

  # Generate random first and last name
  if gender == "Male":
      first_name = fake.first_name_male()
      last_name = fake.last_name_male()
  else:
      first_name = fake.first_name_female()
      last_name = fake.last_name_female()
  
  # Generate random email address
  email = f"{first_name}.{last_name}@{fake.domain_name()}"
  
  # Generate random birthdate
  birthdate = fake.date_of_birth(minimum_age=18, maximum_age=65)
  
  # Generate random customer creation date
  creation_date = fake.date_between(start_date=start_date, end_date="today")

  # Generate random last purchase date (Todo)
  last_purchase_date = fake.date_between(start_date=start_date, end_date="today")

  # Generate City
  customer_city = random.choice(cities)

  # Append customer information to list
  customer_info = {"customer_id": customer_id, "gender": gender, "first_name": first_name, "last_name": last_name,
                    "email": email, "birthdate": birthdate, "creation_date": creation_date, "last_purchase_date": None,
                   "city":customer_city}
  customers.append(customer_info)

customer_df = pd.DataFrame(customers)

customer_df.head()

Unnamed: 0,customer_id,gender,first_name,last_name,email,birthdate,creation_date,last_purchase_date,city
0,1715,Female,Anne,Moody,Anne.Moody@woodward-anderson.net,2003-02-18,2021-03-17,,Monroeport
1,13344,Female,Annette,Holland,Annette.Holland@dougherty.com,1981-08-06,2022-02-06,,Monroeport
2,1250,Female,Jessica,Walsh,Jessica.Walsh@orr.info,1984-09-23,2021-08-05,,Lake Matthewhaven
3,18426,Female,Christine,Hodge,Christine.Hodge@stone.net,1975-07-21,2022-10-28,,North Clifford
4,1700,Male,Paul,Jimenez,Paul.Jimenez@conrad.org,1995-03-01,2022-02-12,,New Samuelchester


In [3]:
# Set seed for reproducibility
random.seed(123)

# Define product categories and names as dictionary
categories = {
    "TVs": [
        "Samsung 55-Inch 4K UHD Smart TV",
        "Sony 65-Inch 4K UHD Smart TV",
        "LG 43-Inch Full HD Smart TV"
    ],
    "Computers": [
        "Apple MacBook Air 13-Inch Laptop",
        "Dell XPS 13 Laptop",
        "HP Pavilion 15-Inch Laptop"
    ],
    "Home Audio": [
        "Sonos Beam Soundbar",
        "Bose SoundTouch 300 Soundbar"
    ],
    "Smart Home Devices": [
        "Amazon Echo Dot (3rd Gen)",
        "Google Nest Mini",
        "Ring Video Doorbell",
        "Nest Hello Video Doorbell"
    ],
    "Phones & Tablets": [
        "Apple iPhone 12",
        "Samsung Galaxy S21",
        "Google Pixel 5",
        "Apple iPad (8th Gen)",
        "Samsung Galaxy Tab S7",
        "Amazon Fire HD 10 Tablet"
    ]
}

# Generate product information
products = []
for category, product_names in categories.items():
    for product_name in product_names:
        # Generate unique 4-digit product ID
        product_id = random.randint(1000, 9999)
        while product_id in [p["product_id"] for p in products]:
            product_id = random.randint(1000, 9999)
        
        # Generate random gross price and retail price
        gross_price = round(random.uniform(50, 1000), 2)
        retail_price = round(gross_price * random.uniform(1.1, 1.5), 2)
        
        # Append product information to list
        product_info = {"product_id": product_id, "product_name": product_name, "gross_price": gross_price,
                        "retail_price": retail_price, "category": category}
        products.append(product_info)

print(products)

product_df = pd.DataFrame(products)

[{'product_id': 1857, 'product_name': 'Samsung 55-Inch 4K UHD Smart TV', 'gross_price': 304.29, 'retail_price': 428.31, 'category': 'TVs'}, {'product_id': 5367, 'product_name': 'Sony 65-Inch 4K UHD Smart TV', 'gross_price': 152.32, 'retail_price': 222.46, 'category': 'TVs'}, {'product_id': 1625, 'product_name': 'LG 43-Inch Full HD Smart TV', 'gross_price': 410.17, 'retail_price': 543.45, 'category': 'TVs'}, {'product_id': 6583, 'product_name': 'Apple MacBook Air 13-Inch Laptop', 'gross_price': 859.48, 'retail_price': 1000.32, 'category': 'Computers'}, {'product_id': 6524, 'product_name': 'Dell XPS 13 Laptop', 'gross_price': 582.91, 'retail_price': 804.81, 'category': 'Computers'}, {'product_id': 3683, 'product_name': 'HP Pavilion 15-Inch Laptop', 'gross_price': 51.59, 'retail_price': 65.75, 'category': 'Computers'}, {'product_id': 2435, 'product_name': 'Sonos Beam Soundbar', 'gross_price': 884.7, 'retail_price': 1106.79, 'category': 'Home Audio'}, {'product_id': 1108, 'product_name': '

In [10]:
# Set seed for reproducibility
random.seed(123)

# Set popularity probabilities for product categories
category_probs = {
    "TVs": 0.1,
    "Computers": 0.2,
    "Home Audio": 0.15,
    "Smart Home Devices": 0.3,
    "Phones & Tablets": 0.25
}

# Generate transaction information
transactions = []
transaction_id = 10000
start_date = date.today() - relativedelta(months=24)
end_date = date.today()
date_range = pd.date_range(start=start_date, end=end_date, freq="D")
for date in date_range:
    transaction_date = date
    # Determine number of transactions on this day based on a probability distribution
    num_transactions = int(random.normalvariate(200, 50))
    if num_transactions < 1:
        num_transactions = 1
    
    # Create a list of customer IDs for this day's transactions (weighted by gender)
    customer_ids = []
    for customer in customers:
        if customer["gender"] == "Male":
            customer_ids.extend([customer["customer_id"]] * 5)
        else:
            customer_ids.extend([customer["customer_id"]] * 6)
    random.shuffle(customer_ids)
    
    # Generate transactions for this day
    for i in range(num_transactions):
        # Generate random transaction ID
        transaction_id += 1
        transaction_id_str = str(transaction_id).zfill(5)
        
        # Select a customer ID (with replacement)
        customer_id = random.choice(customer_ids)
        
      # Calculate popularity probabilities for each product
        product_probs = [0] * len(products)
        for i, product in enumerate(products):
            category = product["category"]
            num_products_in_category = sum([p["category"] == category for p in products])
            product_probs[i] = category_probs[category] / num_products_in_category

        # Select a product category (weighted by popularity probability)
        category_idx = np.random.choice(len(products), p=product_probs)
        category_products = [p for p in products if p["category"] == products[category_idx]["category"]]
            
        # Select number of products purchased in this transaction
        num_products = np.random.choice([1, 2, 3], p=[0.7, 0.2, 0.1])
          
        # Select products purchased (weighted by retail price)
        retail_prices = [p["retail_price"] for p in category_products]
        
        if num_products > len(category_products):
          num_products = len(category_products)
          product_indices = np.arange(num_products)
        else:
          product_indices = np.random.choice(len(category_products), size=num_products, replace=False, p=np.array(retail_prices) / sum(retail_prices))
        
        product_ids = [category_products[i]["product_id"] for i in product_indices]
        
        # Select transaction amount (equal to total retail price of products purchased)
        transaction_amount = sum([p["retail_price"] for p in category_products if p["product_id"] in product_ids])
        
  # Determine transaction seasonality and holidays
        month = transaction_date.month
        day = transaction_date.day
        if month == 12 and day == 25:  # Christmas
            transaction_date += pd.Timedelta(days=random.randint(1, 3))
        elif month == 11 and day == 26:  # Black Friday
            transaction_date += pd.Timedelta(days=random.randint(0, 2))
        elif month == 7 and day == 4:  # Fourth of July
            transaction_date += pd.Timedelta(days=random.randint(0, 1))
        elif month == 5 and day == 31:  # Memorial Day
            transaction_date += pd.Timedelta(days=random.randint(0, 1))
        else:
            pass
        
        # Check that transaction date is not earlier than customer creation date
        creation_date = [c["creation_date"] for c in customers if c["customer_id"] == customer_id][0]
        if transaction_date < creation_date:
            transaction_date = creation_date
        
        # Add transaction to list
        transaction_info = {
            "transaction_id": transaction_id_str,
            "transaction_date": transaction_date,
            "customer_id": customer_id,
            "product_ids": product_ids,
            "num_products": num_products,
            "transaction_amount": transaction_amount
        }
        transactions.append(transaction_info)

# Explode Product ID columns
transaction_df['product_ids'] = transaction_df.product_ids.apply(literal_eval)
transaction_df = transaction_df.explode('product_ids')

# Output dataframe
transaction_df = pd.DataFrame(transactions)


  if transaction_date < creation_date:


In [16]:
from google.colab import drive
drive.mount('drive')

# Export DataFrames
transaction_df.to_csv('transactions.csv')
product_df.to_csv('products.csv')
customer_df.to_csv('customers.csv')

# Copy to drive
!cp transactions.csv "drive/My Drive/"
!cp products.csv "drive/My Drive/"
!cp customers.csv "drive/My Drive/"

Mounted at drive
