In [2]:
import pandas as pd
import random
import datetime

In [3]:
# Generate a sample dataset
random.seed(42)
num_rows = 1000

In [4]:
# Generate random data for the dataset
ids = list(range(1, num_rows + 1))
customer_ids = [random.randint(1, 100) for _ in range(num_rows)]
timestamps = [datetime.datetime(2023, 1, 1) + datetime.timedelta(days=random.randint(0, 365)) for _ in range(num_rows)]
product_categories = [random.choice(['Electronics', 'Clothing', 'Groceries', 'Books', 'Furniture']) for _ in range(num_rows)]

In [5]:
# Introduce anomalies in the Amount column
amounts = [random.uniform(10, 500) for _ in range(num_rows)]
for i in range(10):  # Introduce 10 anomalies with excessively high amounts
    idx = random.randint(0, num_rows - 1)
    amounts[idx] = random.uniform(1000, 5000)

In [6]:
# Add anomalous customer behavior (frequent purchases)
anomalous_customer = random.choice(customer_ids)
for i in range(20):  # Introduce 20 rows for the same customer within a short time frame
    ids.append(num_rows + i + 1)
    customer_ids.append(anomalous_customer)
    timestamps.append(timestamps[random.randint(0, num_rows - 1)] + datetime.timedelta(hours=random.randint(0, 2)))
    product_categories.append(random.choice(['Electronics', 'Clothing', 'Groceries']))
    amounts.append(random.uniform(10, 50))  # Small amounts for frequent purchases

In [10]:
# Create the DataFrame
data = {
    "Id": ids,
    "CustomerId": customer_ids,
    "Timestamp": timestamps,
    "ProductCategory": product_categories,
    "Amount": amounts
}
df = pd.DataFrame(data)

In [15]:
df.head()

Unnamed: 0,Id,CustomerId,Timestamp,ProductCategory,Amount
0,1,82,2023-04-02,Clothing,363.874434
1,2,15,2023-06-04,Furniture,272.51167
2,3,4,2023-01-08,Books,150.477383
3,4,95,2023-12-29,Clothing,256.085856
4,5,36,2023-10-01,Clothing,351.950947


In [16]:
# Update timestamps to include hours, minutes, and seconds
timestamps_with_time = [ts + datetime.timedelta(
    hours=random.randint(0, 23),
    minutes=random.randint(0, 59),
    seconds=random.randint(0, 59)
) for ts in timestamps]

# Replace the existing timestamps with the updated ones
df["Timestamp"] = timestamps_with_time

In [17]:
df.head()

Unnamed: 0,Id,CustomerId,Timestamp,ProductCategory,Amount
0,1,82,2023-04-02 06:54:14,Clothing,363.874434
1,2,15,2023-06-04 20:14:43,Furniture,272.51167
2,3,4,2023-01-08 06:17:41,Books,150.477383
3,4,95,2023-12-29 13:32:01,Clothing,256.085856
4,5,36,2023-10-01 21:00:30,Clothing,351.950947
