In [5]:
import os
import random

import numpy as np
import pandas as pd
from faker import Faker

# Initialize Faker
fake = Faker()

# Define product categories and products
categories = {
    "Electronics": ["Laptop", "Smartphone", "Headphones", "Camera"],
    "Clothing": ["Shirt", "Jeans", "Jacket", "Sweater"],
    "Home": ["Vacuum Cleaner", "Blender", "Toaster", "Microwave"],
    "Books": ["Fiction", "Non-Fiction", "Comics", "Biography"],
    "Beauty": ["Lipstick", "Perfume", "Lotion", "Makeup Kit"]
}

# Generate dataset
data = []
for _ in range(50000):  # 1000 rows
    category = random.choice(list(categories.keys()))
    product = random.choice(categories[category])
    price = round(random.uniform(10, 1000), 2)
    quantity = random.choice([1, 2, 3, None])  # Simulate missing values
    discount = random.choice([None, round(random.uniform(0, 50), 2)])  # Simulate missing values
    total_amount = round(price * (quantity if quantity else 1), 2)
    purchase_date = random.choice([
        fake.date_this_year(), 
        fake.date_this_decade(),
        None  # Simulate missing dates
    ])
    data.append({
        "Transaction ID": fake.uuid4(),
        "Customer Name": fake.name(),
        "Product Name": product,
        "Category": category,
        "Price": price,
        "Quantity": quantity,
        "Total Amount": total_amount,
        "Discount": discount,
        "Purchase Date": purchase_date,
        "Customer Location": fake.city() + ", " + fake.state(),
        "Payment Method": random.choice(["Credit Card", "PayPal", "Cash"])
    })

# Create duplicates
df = pd.DataFrame(data)
df = pd.concat([df, df.sample(0, replace=True)])  # Add duplicate rows

# Write to CSV
df.to_csv("dataset/sales_data.csv", index=False)
df.head()

Unnamed: 0,Transaction ID,Customer Name,Product Name,Category,Price,Quantity,Total Amount,Discount,Purchase Date,Customer Location,Payment Method
0,f4076d3d-fe31-45f3-a0e5-6e5cfd982037,Jessica Cortez,Shirt,Clothing,822.96,,822.96,,2021-10-12,"Kochberg, Vermont",Credit Card
1,5a045ef4-e0a5-4e33-b612-59583f221d67,Trevor Martin,Camera,Electronics,195.34,3.0,586.02,29.81,,"Morgantown, Oregon",Credit Card
2,4eb1af8f-0d7f-428f-9528-0ca36fa1f4c9,Judy Allen,Headphones,Electronics,766.73,1.0,766.73,32.8,2024-02-19,"West Beverly, Colorado",PayPal
3,0af6e29b-468e-4ed0-a5e4-a905dc0a1030,Bradley Blair,Perfume,Beauty,930.46,3.0,2791.38,,,"Jasonberg, Indiana",Credit Card
4,5f296419-af51-4402-8324-d645a98e2334,Stephanie Durham,Microwave,Home,948.88,3.0,2846.64,46.91,2021-06-21,"Lake Courtney, Iowa",Credit Card


## Load the CSV Data into SQL

In [6]:
import pandas as pd
from sqlalchemy import create_engine

def csv_to_sql(database_name,table_name,file_name):
    # Replace with your MySQL credentials
    # Example: 'mysql+mysqlconnector://username:password@localhost:3306/your_database'
    engine = create_engine(f'mysql+mysqlconnector://root:root@localhost:3306/{database_name}')

    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(file_name)

    # Write the DataFrame to MySQL (this will create the table if it doesn't exist)
    # 'if_exists' can be 'fail', 'replace', or 'append'. 'replace' drops and recreates the table.
    df.to_sql(table_name, engine, if_exists='replace', index=False)

csv_to_sql('demo_data','sales',"dataset/sales_data.csv")