In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generate dummy data for 72 days (Monday-Friday) starting from March 18th
start_date = datetime(2024, 3, 18)
dates = [
    start_date + timedelta(days=i)
    for i in range(72)
    if (start_date + timedelta(days=i)).weekday() < 5
]

# Create dummy data with potential errors
data = {
    "Date": dates,
    "Bank Transaction": np.random.choice(
        ["Deposit", "Withdrawal", "Fee", "Transfer", ""], size=72
    ),
    "Transaction Amount": np.random.randint(-500, 1000, size=72).astype(float),
    "Item Sold": np.random.choice(
        ["Candy", "Chips", "Soda", "Cigarettes", "Lottery", ""], size=72
    ),
    "Price": np.random.randint(1, 20, size=72).astype(float),
    "Payment Method": np.random.choice(["Cash", "Card", "Check", ""], size=72),
}

# Introduce some typos and invalid amounts
for i in range(10):
    data["Bank Transaction"][np.random.randint(0, 72)] = "Deposi"
    data["Item Sold"][np.random.randint(0, 72)] = "Sda"
    data["Transaction Amount"][np.random.randint(0, 72)] = "Invalid"
    data["Price"][np.random.randint(0, 72)] = "N/A"

df = pd.DataFrame(data)

# Data Cleaning
# --------------------------------------------------------------------------------

# Convert Date column to datetime objects
df["Date"] = pd.to_datetime(df["Date"])

# Clean 'Bank Transaction' column
df["Bank Transaction"] = df["Bank Transaction"].replace("Deposi", "Deposit")

# Clean 'Item Sold' column
df["Item Sold"] = df["Item Sold"].replace("Sda", "Soda")

# Clean 'Transaction Amount' and 'Price' columns
df["Transaction Amount"] = pd.to_numeric(df["Transaction Amount"], errors="coerce")
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")

# Fill missing values in 'Transaction Amount' and 'Price' with 0
df.fillna({"Transaction Amount": 0, "Price": 0}, inplace=True)

# Basic Statistical Analysis
# --------------------------------------------------------------------------------

# Calculate total revenue from items sold
total_revenue = df["Price"].sum()

# Calculate total bank transactions
total_bank_transactions = df["Transaction Amount"].sum()

# Calculate average daily revenue
average_daily_revenue = total_revenue / len(df["Date"].unique())

# Weekly Summary
# --------------------------------------------------------------------------------

# Create a new column for week number
df["Week"] = df["Date"].dt.isocalendar().week

# Group data by week and calculate weekly statistics
weekly_summary = df.groupby("Week").agg(
    Total_Revenue=("Price", "sum"),
    Total_Bank_Transactions=("Transaction Amount", "sum"),
    Average_Daily_Revenue=("Price", "mean"),
)

# Print the results
print("Cleaned Data:")
print(df)
print("\nBasic Statistical Analysis:")
print(f"Total Revenue: ${total_revenue:.2f}")
print(f"Total Bank Transactions: ${total_bank_transactions:.2f}")
print(f"Average Daily Revenue: ${average_daily_revenue:.2f}")
print("\nWeekly Summary:")
print(weekly_summary)

ValueError: could not convert string to float: 'Invalid'

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generate dummy data for 72 days (Monday-Friday) starting from March 18th
start_date = datetime(2024, 3, 18)
dates = [
    start_date + timedelta(days=i)
    for i in range(72)
    if (start_date + timedelta(days=i)).weekday() < 5
]

# Determine the length of the dates list
num_dates = len(dates)

# Create dummy data with potential errors
data = {
    "Date": dates,
    "Bank Transaction": np.random.choice(
        ["Deposit", "Withdrawal", "Fee", "Transfer", ""], size=num_dates
    ),
    "Transaction Amount": np.random.randint(-500, 1000, size=num_dates).astype(str),
    "Item Sold": np.random.choice(
        ["Candy", "Chips", "Soda", "Cigarettes", "Lottery", ""], size=num_dates
    ),
    "Price": np.random.randint(1, 20, size=num_dates).astype(str),
    "Payment Method": np.random.choice(["Cash", "Card", "Check", ""], size=num_dates),
}

# Introduce some typos and NaN values
for i in range(10):
    data["Bank Transaction"][np.random.randint(0, num_dates)] = "Deposi"
    data["Item Sold"][np.random.randint(0, num_dates)] = "Sda"
    data["Transaction Amount"][np.random.randint(0, num_dates)] = np.nan
    data["Price"][np.random.randint(0, num_dates)] = np.nan

df = pd.DataFrame(data)

# Data Cleaning
# --------------------------------------------------------------------------------

# Convert Date column to datetime objects
df["Date"] = pd.to_datetime(df["Date"])

# Clean 'Bank Transaction' column
df["Bank Transaction"] = df["Bank Transaction"].replace("Deposi", "Deposit")

# Clean 'Item Sold' column
df["Item Sold"] = df["Item Sold"].replace("Sda", "Soda")

# Clean 'Transaction Amount' and 'Price' columns
df["Transaction Amount"] = pd.to_numeric(df["Transaction Amount"], errors="coerce")
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")

# Fill missing values in 'Transaction Amount' and 'Price' with 0
df.fillna({"Transaction Amount": 0, "Price": 0}, inplace=True)

# Basic Statistical Analysis
# --------------------------------------------------------------------------------

# Calculate total revenue from items sold
total_revenue = df["Price"].sum()

# Calculate total bank transactions
total_bank_transactions = df["Transaction Amount"].sum()

# Calculate average daily revenue
average_daily_revenue = total_revenue / len(df["Date"].unique())

# Weekly Summary
# --------------------------------------------------------------------------------

# Create a new column for week number
df["Week"] = df["Date"].dt.isocalendar().week

# Group data by week and calculate weekly statistics
weekly_summary = df.groupby("Week").agg(
    Total_Revenue=("Price", "sum"),
    Total_Bank_Transactions=("Transaction Amount", "sum"),
    Average_Daily_Revenue=("Price", "mean"),
)

# Print the results
print("Cleaned Data:")
print(df)
print("\nBasic Statistical Analysis:")
print(f"Total Revenue: ${total_revenue:.2f}")
print(f"Total Bank Transactions: ${total_bank_transactions:.2f}")
print(f"Average Daily Revenue: ${average_daily_revenue:.2f}")
print("\nWeekly Summary:")
print(weekly_summary)

Cleaned Data:
         Date Bank Transaction  Transaction Amount   Item Sold  Price  \
0  2024-03-18          Deposit                 0.0        Soda   19.0   
1  2024-03-19          Deposit                 0.0        Soda    0.0   
2  2024-03-20         Transfer                14.0     Lottery    4.0   
3  2024-03-21                                846.0       Candy    8.0   
4  2024-03-22                                538.0               16.0   
5  2024-03-25         Transfer               282.0        Soda   11.0   
6  2024-03-26                                665.0  Cigarettes    6.0   
7  2024-03-27              Fee               377.0       Candy    0.0   
8  2024-03-28       Withdrawal               404.0               11.0   
9  2024-03-29         Transfer               934.0       Candy   12.0   
10 2024-04-01              Fee              -330.0     Lottery   15.0   
11 2024-04-02         Transfer               300.0        Soda    9.0   
12 2024-04-03       Withdrawal       